From aa10649bf4bdf954d407c30e0f69c78076ce6884 Mon Sep 17 00:00:00 2001 From: "Mark J. Olah" Date: Thu, 13 Mar 2025 09:04:49 -0600 Subject: [PATCH] nvmath-python-0.3.0 --- .gitignore | 2 +- .markdownlint.yaml | 4 + .pre-commit-config.yaml | 4 +- .talismanrc | 69 +- MANIFEST.in | 5 + builder/__init__.py | 2 +- builder/pep517.py | 4 +- builder/utils.py | 2 +- docs/Makefile | 2 +- docs/sphinx/_static/switcher.json | 4 + docs/sphinx/bindings/index.rst | 4 +- docs/sphinx/conf.py | 55 +- docs/sphinx/device-apis/cufft.rst | 2 +- docs/sphinx/device-apis/index.rst | 1 + docs/sphinx/{ => host-apis}/fft/index.rst | 0 docs/sphinx/host-apis/index.rst | 20 + docs/sphinx/{ => host-apis}/linalg/index.rst | 22 + .../{host-utils.rst => host-apis/utils.rst} | 23 +- docs/sphinx/index.rst | 18 +- docs/sphinx/installation.rst | 74 +- docs/sphinx/overview.rst | 78 +- docs/sphinx/quickstart.rst | 9 +- docs/sphinx/release-notes.rst | 41 +- docs/sphinx/tutorials/linalg.rst | 11 + .../notebooks/matmul/01_introduction.nblink | 3 + .../notebooks/matmul/02_epilogs.nblink | 3 + .../matmul/03_backpropagation.nblink | 3 + .../tutorials/notebooks/matmul/04_fp8.nblink | 3 + examples/device/common.py | 2 +- examples/device/common_cupy.py | 2 +- examples/device/common_numba.py | 2 +- examples/device/cublasdx_batched_gemm_fp64.py | 2 +- .../device/cublasdx_blockdim_gemm_fp16.py | 2 +- .../device/cublasdx_fused_gemm_performance.py | 2 +- examples/device/cublasdx_gemm_fft.py | 2 +- examples/device/cublasdx_gemm_fft_fp16.py | 2 +- .../device/cublasdx_gemm_fft_performance.py | 2 +- examples/device/cublasdx_gemm_fusion.py | 2 +- examples/device/cublasdx_simple_gemm_cfp16.py | 2 +- examples/device/cublasdx_simple_gemm_fp32.py | 2 +- ...cublasdx_simple_gemm_leading_dimensions.py | 2 +- .../cublasdx_single_gemm_performance.py | 2 +- examples/device/cufftdx_autotuning.py | 2 +- examples/device/cufftdx_block_fft.py | 2 +- .../device/cufftdx_block_fft_performance.py | 2 +- .../cufftdx_block_fft_performance_many.py 
| 2 +- examples/device/cufftdx_convolution.py | 2 +- .../device/cufftdx_convolution_performance.py | 2 +- .../device/cufftdx_convolution_r2c_c2r.py | 2 +- ...nvolution_r2c_c2r_packed_fold_optimized.py | 2 +- examples/device/cufftdx_convolution_signal.py | 2 +- examples/device/cufftdx_fft_2d.py | 2 +- examples/device/cufftdx_fft_2d_r2c_c2r.py | 2 +- .../device/cufftdx_fft_2d_single_kernel.py | 2 +- .../device/cufftdx_fft_3d_box_single_block.py | 2 +- .../cufftdx_fft_3d_cube_single_block.py | 2 +- examples/device/cufftdx_helloworld.py | 2 +- examples/device/cufftdx_simple_fft_block.py | 2 +- .../device/cufftdx_simple_fft_block_c2r.py | 2 +- .../cufftdx_simple_fft_block_c2r_fp16.py | 2 +- .../device/cufftdx_simple_fft_block_half2.py | 2 +- .../device/cufftdx_simple_fft_block_r2c.py | 2 +- .../cufftdx_simple_fft_block_r2c_fp16.py | 2 +- .../device/cufftdx_simple_fft_block_shared.py | 2 +- examples/device/cufftdx_simple_fft_thread.py | 2 +- .../device/cufftdx_simple_fft_thread_fp16.py | 2 +- examples/device/curand_cufftdx_block_fft.py | 5 +- examples/device/curand_philox_uniform4.py | 3 +- examples/device/curand_scrambled_sobol64.py | 2 +- examples/device/curand_xorwow_uniform.py | 3 +- examples/fft/caching.py | 4 +- examples/fft/example01_cupy.py | 2 +- examples/fft/example01_cupy_layouts.py | 2 +- examples/fft/example01_numpy.py | 2 +- examples/fft/example01_numpy_cpu_execution.py | 2 +- examples/fft/example01_torch_complex32.py | 2 +- examples/fft/example02_stateful_cupy.py | 2 +- examples/fft/example02_stateful_torch.py | 2 +- examples/fft/example02_stateful_torch_cpu.py | 2 +- .../example02_stateful_torch_cpu_execution.py | 2 +- examples/fft/example03_options.py | 2 +- .../fft/example03_options_cpu_execution.py | 2 +- examples/fft/example04_logging_global.py | 2 +- examples/fft/example04_logging_user.py | 2 +- examples/fft/example05_stateful_inplace.py | 2 +- examples/fft/example05_stateful_reset.py | 2 +- examples/fft/example06_r2c.py | 2 +- 
examples/fft/example07_c2r.py | 2 +- examples/fft/example07_c2r_odd.py | 2 +- examples/fft/example08_cupy_inplace.py | 2 +- examples/fft/example08_numpy_inplace.py | 2 +- .../example08_numpy_inplace_cpu_execution.py | 2 +- examples/fft/example09_streams.py | 2 +- examples/fft/example10_memory_allocator.py | 2 +- examples/fft/example11_resource_mgmt.py | 2 +- ...example12_stateful_unsupported_fallback.py | 2 +- .../fft/example12_unsupported_fallback.py | 2 +- examples/fft/example13_cupy_mt_mgpu.py | 2 +- examples/fft/example13_numpy_mp_mgpu.py | 2 +- examples/fft/example14_caching.py | 2 +- .../fft/example15_cupy_nd_fft_benchmark.py | 2 +- .../fft/example16_cupy_nd_fft_benchmark.py | 2 +- examples/fft/example17_trunc.py | 2 +- examples/fft/example18_5D_trunc.py | 2 +- .../example19_convolution_epilog_callback.py | 2 +- ...le19_convolution_memory_layout_callback.py | 2 +- .../example19_convolution_prolog_callback.py | 2 +- examples/fft/fftn1.py | 2 +- examples/fft/fftn2.py | 2 +- examples/fft/truncation.py | 2 +- .../linalg/advanced/matmul/example01_cupy.py | 2 +- .../matmul/example01_cupy_complex64.py | 2 +- .../linalg/advanced/matmul/example01_numpy.py | 2 +- .../linalg/advanced/matmul/example01_torch.py | 4 +- .../advanced/matmul/example02_options.py | 2 +- .../matmul/example03_logging_global.py | 2 +- .../advanced/matmul/example03_logging_user.py | 2 +- .../matmul/example04_stateful_cupy.py | 2 +- .../matmul/example04_stateful_torch.py | 2 +- .../matmul/example04_stateful_torch_cpu.py | 2 +- .../matmul/example05_stateful_inplace.py | 12 +- .../matmul/example05_stateful_reset.py | 12 +- .../linalg/advanced/matmul/example06_gemm.py | 2 +- .../advanced/matmul/example07_batched_a.py | 2 +- .../advanced/matmul/example07_batched_a_b.py | 2 +- .../matmul/example08_batched_a_bcast_c.py | 4 +- .../advanced/matmul/example08_batched_a_c.py | 2 +- .../advanced/matmul/example09_epilog_bias.py | 2 +- .../matmul/example09_epilog_gelu_bias.py | 2 +- 
.../advanced/matmul/example10_epilog_dgelu.py | 2 +- .../advanced/matmul/example10_epilog_drelu.py | 2 +- .../matmul/example10_epilog_relu_aux.py | 2 +- .../matmul/example11_epilog_drelu_bgrad.py | 2 +- .../matmul/example12_epilog_bgrada.py | 2 +- .../matmul/example12_epilog_bgradb.py | 2 +- .../matmul/example13_epilog_stateful_reset.py | 12 +- .../advanced/matmul/example14_autotune.py | 2 +- .../matmul/example15_manual_tuning.py | 2 +- .../matmul/example16_reuse_algorithms.py | 2 +- .../linalg/advanced/matmul/example17_fp8.py | 41 + .../advanced/matmul/example18_fp8_types.py | 69 ++ .../advanced/matmul/example19_fp8_reset.py | 46 + .../example20_fp8_inplace_scale_change.py | 52 + .../advanced/matmul/example21_fp8_amax.py | 54 + .../matmul/example22_fp8_delayed_scaling.py | 64 + .../advanced/matmul/example23_fp8_epilog.py | 45 + .../matmul/example24_fp8_epilog_aux.py | 69 ++ .../linalg/advanced/matmul/example25_mxfp8.py | 61 + .../advanced/matmul/example26_mxfp8_d_out.py | 78 ++ .../matmul/example27_mxfp8_chaining.py | 61 + .../advanced/matmul/example28_mxfp8_epilog.py | 37 + .../advanced/matmul/example29_mxfp8_layout.py | 41 + notebooks/matmul/01_introduction.ipynb | 8 +- notebooks/matmul/02_epilogs.ipynb | 6 +- notebooks/matmul/03_backpropagation.ipynb | 6 +- notebooks/matmul/04_fp8.ipynb | 1077 +++++++++++++++++ nvmath/__init__.py | 19 +- nvmath/_internal/enum_utils.py | 4 +- nvmath/_internal/formatters.py | 2 +- nvmath/_internal/layout.py | 2 +- nvmath/_internal/mem_limit.py | 2 +- nvmath/_internal/package_ifc.py | 2 +- nvmath/_internal/package_ifc_cupy.py | 2 +- nvmath/_internal/package_ifc_torch.py | 2 +- nvmath/_internal/package_wrapper.py | 4 +- nvmath/_internal/tensor_ifc.py | 7 +- nvmath/_internal/tensor_ifc_cupy.py | 6 +- nvmath/_internal/tensor_ifc_numpy.py | 6 +- nvmath/_internal/tensor_ifc_torch.py | 6 +- nvmath/_internal/tensor_wrapper.py | 2 +- nvmath/_internal/typemaps.py | 26 +- nvmath/_internal/utils.py | 4 +- nvmath/_utils.py | 2 +- 
nvmath/bindings/__init__.py | 16 +- nvmath/bindings/_internal/cublas.pxd | 4 +- nvmath/bindings/_internal/cublasLt.pxd | 4 +- nvmath/bindings/_internal/cublasLt_linux.pyx | 4 +- .../bindings/_internal/cublasLt_windows.pyx | 4 +- nvmath/bindings/_internal/cublas_linux.pyx | 4 +- nvmath/bindings/_internal/cublas_windows.pyx | 4 +- nvmath/bindings/_internal/cufft.pxd | 4 +- nvmath/bindings/_internal/cufft_linux.pyx | 4 +- nvmath/bindings/_internal/cufft_windows.pyx | 4 +- nvmath/bindings/_internal/curand.pxd | 4 +- nvmath/bindings/_internal/curand_linux.pyx | 4 +- nvmath/bindings/_internal/curand_windows.pyx | 4 +- nvmath/bindings/_internal/cusolver.pxd | 2 +- nvmath/bindings/_internal/cusolverDn.pxd | 4 +- .../bindings/_internal/cusolverDn_linux.pyx | 4 +- .../bindings/_internal/cusolverDn_windows.pyx | 4 +- nvmath/bindings/_internal/cusolver_linux.pyx | 2 +- .../bindings/_internal/cusolver_windows.pyx | 2 +- nvmath/bindings/_internal/cusparse.pxd | 4 +- nvmath/bindings/_internal/cusparse_linux.pyx | 4 +- .../bindings/_internal/cusparse_windows.pyx | 4 +- nvmath/bindings/_internal/utils.pxd | 4 + nvmath/bindings/_internal/utils.pyx | 4 + nvmath/bindings/cublas.pxd | 4 +- nvmath/bindings/cublas.pyx | 4 +- nvmath/bindings/cublasLt.pxd | 5 +- nvmath/bindings/cublasLt.pyx | 35 +- nvmath/bindings/cufft.pxd | 16 +- nvmath/bindings/cufft.pyx | 4 +- nvmath/bindings/curand.pxd | 4 +- nvmath/bindings/curand.pyx | 14 +- nvmath/bindings/cusolver.pxd | 2 +- nvmath/bindings/cusolver.pyx | 2 +- nvmath/bindings/cusolverDn.pxd | 4 +- nvmath/bindings/cusolverDn.pyx | 4 +- nvmath/bindings/cusparse.pxd | 4 +- nvmath/bindings/cusparse.pyx | 4 +- nvmath/bindings/cycublas.pxd | 4 +- nvmath/bindings/cycublas.pyx | 4 +- nvmath/bindings/cycublasLt.pxd | 31 +- nvmath/bindings/cycublasLt.pyx | 4 +- nvmath/bindings/cycufft.pxd | 74 +- nvmath/bindings/cycufft.pyx | 4 +- nvmath/bindings/cycurand.pxd | 4 +- nvmath/bindings/cycurand.pyx | 4 +- nvmath/bindings/cycusolver.pxd | 2 +- 
nvmath/bindings/cycusolver.pyx | 2 +- nvmath/bindings/cycusolverDn.pxd | 4 +- nvmath/bindings/cycusolverDn.pyx | 4 +- nvmath/bindings/cycusparse.pxd | 4 +- nvmath/bindings/cycusparse.pyx | 4 +- nvmath/bindings/nvpl/__init__.py | 6 +- nvmath/bindings/nvpl/_internal/fft.pxd | 2 +- nvmath/bindings/nvpl/_internal/fft_linux.pyx | 2 +- nvmath/bindings/nvpl/cyfft.pxd | 2 +- nvmath/bindings/nvpl/cyfft.pyx | 2 +- nvmath/bindings/nvpl/fft.pxd | 2 +- nvmath/bindings/nvpl/fft.pyx | 2 +- nvmath/device/__init__.py | 13 +- nvmath/device/caching.py | 2 +- nvmath/device/common.py | 4 +- nvmath/device/common_cpp.py | 2 +- nvmath/device/common_cuda.py | 11 +- nvmath/device/common_mathdx.py | 2 +- nvmath/device/common_numba.py | 2 +- nvmath/device/cublasdx.py | 7 +- nvmath/device/cublasdx_backend.py | 2 +- nvmath/device/cublasdx_numba.py | 2 +- nvmath/device/cufftdx.py | 21 +- nvmath/device/cufftdx_backend.py | 2 +- nvmath/device/cufftdx_db.py | 2 +- nvmath/device/cufftdx_numba.py | 2 +- nvmath/device/cufftdx_workspace.py | 2 +- nvmath/device/curand_kernel.py | 2 +- nvmath/device/nvrtc.py | 2 +- nvmath/device/patch.py | 5 +- nvmath/device/random.py | 2 +- nvmath/device/random_helpers.py | 2 +- nvmath/device/random_states.py | 10 +- nvmath/device/types.py | 2 +- nvmath/device/vector_types_numba.py | 2 +- nvmath/fft/__init__.py | 8 +- nvmath/fft/_configuration.py | 19 +- nvmath/fft/_exec_utils.py | 4 +- nvmath/fft/_helpers.py | 4 +- nvmath/fft/fft.py | 20 +- nvmath/linalg/__init__.py | 8 +- nvmath/linalg/_internal/__init__.py | 4 +- nvmath/linalg/_internal/algo_cap_ifc.py | 2 +- nvmath/linalg/_internal/algo_config_ifc.py | 2 +- nvmath/linalg/_internal/enum_to_tuples.py | 2 +- nvmath/linalg/_internal/epilog_protocol.py | 36 +- nvmath/linalg/_internal/matmul_desc_ifc.py | 9 +- nvmath/linalg/_internal/matmul_pref_ifc.py | 10 +- nvmath/linalg/_internal/matrix_layout_ifc.py | 6 +- nvmath/linalg/_internal/typemaps.py | 69 +- nvmath/linalg/_internal/utils.py | 2 +- 
nvmath/linalg/advanced/__init__.py | 9 +- nvmath/linalg/advanced/_algorithmmod.py | 3 +- nvmath/linalg/advanced/_configuration.py | 113 +- .../advanced/helpers/__init__.py} | 2 +- nvmath/linalg/advanced/helpers/matmul.py | 173 +++ nvmath/linalg/advanced/matmulmod.py | 595 +++++++-- nvmath/memory.py | 2 +- pyproject.toml | 13 +- requirements/README.md | 1 + requirements/pip/docs.txt | 4 + requirements/pip/torch-cu12-nightly.txt | 5 + setup.py | 8 +- tests/conftest.py | 3 +- tests/docstring_tests/test_docstrings.py | 4 + .../device_tests/test_device_samples.py | 4 +- .../fft_tests/test_fft_samples.py | 4 +- .../test_advanced_matmul_samples.py | 38 +- tests/example_tests/test_utils.py | 2 +- tests/nvmath_tests/device/cpp_conv.py | 2 +- tests/nvmath_tests/device/cpp_gemm_batched.py | 2 +- tests/nvmath_tests/device/cpp_gemm_loop.py | 2 +- .../device/curand/compiled_apis.py | 2 +- .../device/curand/distributions.py | 144 +-- .../nvmath_tests/device/curand/generators.py | 3 +- .../nvmath_tests/device/curand/test_random.py | 43 +- tests/nvmath_tests/device/curand/utils.py | 14 +- tests/nvmath_tests/device/helpers.py | 6 +- tests/nvmath_tests/device/helpers_cpp.py | 13 +- tests/nvmath_tests/device/helpers_numba.py | 2 +- tests/nvmath_tests/device/numba_conv.py | 2 +- .../nvmath_tests/device/numba_gemm_batched.py | 3 +- tests/nvmath_tests/device/numba_gemm_loop.py | 2 +- .../device/test_cublasdx_generic.py | 2 +- .../device/test_cublasdx_numba.py | 3 +- .../device/test_cublasdx_numba_perf.py | 2 +- .../device/test_cufftdx_generic.py | 2 +- .../nvmath_tests/device/test_cufftdx_numba.py | 2 +- .../device/test_cufftdx_numba_perf.py | 2 +- .../device/test_vector_types_numba.py | 2 +- .../nvmath_tests/fft/test_default_backend.py | 2 +- .../fft/test_fft_with_hypothesis.py | 6 +- tests/nvmath_tests/fft/test_lto_callbacks.py | 68 +- tests/nvmath_tests/fft/test_perf.py | 2 +- tests/nvmath_tests/fft/test_perf_2d.py | 2 +- tests/nvmath_tests/fft/test_perf_4-5d.py | 5 +- 
tests/nvmath_tests/fft/test_stateful.py | 4 +- tests/nvmath_tests/fft/test_stateless_1d.py | 30 +- tests/nvmath_tests/fft/test_stateless_nd.py | 2 +- tests/nvmath_tests/fft/utils/axes_utils.py | 2 +- tests/nvmath_tests/fft/utils/check_helpers.py | 2 +- tests/nvmath_tests/fft/utils/common_axes.py | 2 +- .../nvmath_tests/fft/utils/input_fixtures.py | 12 +- .../nvmath_tests/fft/utils/support_matrix.py | 2 +- tests/nvmath_tests/helpers.py | 2 +- .../linalg/advanced/matmul/fp8_utils.py | 264 ++++ .../linalg/advanced/matmul/test_epilog.py | 90 +- .../linalg/advanced/matmul/test_fp8.py | 647 ++++++++++ .../advanced/matmul/test_fp8_epilogs.py | 310 +++++ .../linalg/advanced/matmul/test_fp8_utils.py | 88 ++ .../linalg/advanced/matmul/test_ifc.py | 37 +- .../linalg/advanced/matmul/test_input.py | 39 +- .../matmul/test_matmul_with_hypothesis.py | 128 +- .../linalg/advanced/matmul/test_mxfp8.py | 788 ++++++++++++ .../linalg/advanced/matmul/test_options.py | 14 +- .../linalg/advanced/matmul/test_perf.py | 2 +- .../linalg/advanced/matmul/test_planning.py | 86 +- .../linalg/advanced/matmul/test_reset.py | 5 +- .../linalg/advanced/matmul/utils.py | 55 +- 339 files changed, 6294 insertions(+), 967 deletions(-) create mode 100644 MANIFEST.in rename docs/sphinx/{ => host-apis}/fft/index.rst (100%) create mode 100644 docs/sphinx/host-apis/index.rst rename docs/sphinx/{ => host-apis}/linalg/index.rst (63%) rename docs/sphinx/{host-utils.rst => host-apis/utils.rst} (64%) create mode 100644 docs/sphinx/tutorials/linalg.rst create mode 100644 docs/sphinx/tutorials/notebooks/matmul/01_introduction.nblink create mode 100644 docs/sphinx/tutorials/notebooks/matmul/02_epilogs.nblink create mode 100644 docs/sphinx/tutorials/notebooks/matmul/03_backpropagation.nblink create mode 100644 docs/sphinx/tutorials/notebooks/matmul/04_fp8.nblink create mode 100644 examples/linalg/advanced/matmul/example17_fp8.py create mode 100644 examples/linalg/advanced/matmul/example18_fp8_types.py create mode 100644 
examples/linalg/advanced/matmul/example19_fp8_reset.py create mode 100644 examples/linalg/advanced/matmul/example20_fp8_inplace_scale_change.py create mode 100644 examples/linalg/advanced/matmul/example21_fp8_amax.py create mode 100644 examples/linalg/advanced/matmul/example22_fp8_delayed_scaling.py create mode 100644 examples/linalg/advanced/matmul/example23_fp8_epilog.py create mode 100644 examples/linalg/advanced/matmul/example24_fp8_epilog_aux.py create mode 100644 examples/linalg/advanced/matmul/example25_mxfp8.py create mode 100644 examples/linalg/advanced/matmul/example26_mxfp8_d_out.py create mode 100644 examples/linalg/advanced/matmul/example27_mxfp8_chaining.py create mode 100644 examples/linalg/advanced/matmul/example28_mxfp8_epilog.py create mode 100644 examples/linalg/advanced/matmul/example29_mxfp8_layout.py create mode 100644 notebooks/matmul/04_fp8.ipynb rename nvmath/{_version.py => linalg/advanced/helpers/__init__.py} (72%) create mode 100644 nvmath/linalg/advanced/helpers/matmul.py create mode 100644 requirements/pip/torch-cu12-nightly.txt create mode 100644 tests/nvmath_tests/linalg/advanced/matmul/fp8_utils.py create mode 100644 tests/nvmath_tests/linalg/advanced/matmul/test_fp8.py create mode 100644 tests/nvmath_tests/linalg/advanced/matmul/test_fp8_epilogs.py create mode 100644 tests/nvmath_tests/linalg/advanced/matmul/test_fp8_utils.py create mode 100644 tests/nvmath_tests/linalg/advanced/matmul/test_mxfp8.py diff --git a/.gitignore b/.gitignore index 419f809..a58b87a 100644 --- a/.gitignore +++ b/.gitignore @@ -7,7 +7,7 @@ __pycache__ *.so *.pyd docs/_build -docs/sphinx/*/generated +docs/sphinx/**/generated docs/sphinx/generated dist build diff --git a/.markdownlint.yaml b/.markdownlint.yaml index 859154b..56025fb 100644 --- a/.markdownlint.yaml +++ b/.markdownlint.yaml @@ -1,3 +1,7 @@ +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. 
+# +# SPDX-License-Identifier: Apache-2.0 + MD013: line_length: 92 code_block_line_length: 88 diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 740be91..30a16dd 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,4 +1,6 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 repos: - repo: https://github.com/pre-commit/pre-commit-hooks diff --git a/.talismanrc b/.talismanrc index 4b8d727..960fa89 100644 --- a/.talismanrc +++ b/.talismanrc @@ -18,5 +18,72 @@ fileignoreconfig: checksum: 01022d56aafb7c98d5af05a3e9e87ce4d267781def6f1844470fd4cd59d6b26b - filename: nvmath/device/random.py checksum: c534d9a475521cfcbfa6b048904f8495ff70e2a9ccdf3f2710e050cf75fafa35 + +- filename: .ci/pipenv/manylinux_2_28_x86_64-py311-test-sysctk12-Pipfile.lock + checksum: ac3e74b0d9d8e36c9400aaccda328a23eae6abb09b39813d3767fcca4f7314c9 +- filename: .ci/pipenv/manylinux_2_28_x86_64-py311-test-sysctk11-Pipfile.lock + checksum: 7d7fe899d77a9b3cddd67b7ad6cedd3b0fd508e403dc750ad8d4b186f3e0e470 +- filename: .ci/pipenv/manylinux_2_28_x86_64-py310-test-cu12-cpu-Pipfile.lock + checksum: dc0f70918f75d9a336d748eade983ff4b46ad03149565e0740efe8a4aadfdc10 +- filename: .ci/pipenv/manylinux_2_28_x86_64-py310-test-cu12-dx-Pipfile.lock + checksum: 3bc632416be184605b6dcb3c1ec28af1e26e68df6f3232ad40a02a4091153a0a +- filename: .ci/pipenv/manylinux_2_28_x86_64-py312-test-cpu-Pipfile.lock + checksum: f11933df76dcc98ae3b25e5f356cc9060afdd9a838cf705a4ceca6bdd7161d01 +- filename: .ci/pipenv/manylinux_2_28_x86_64-py311-test-sysctk12-dx-Pipfile.lock + checksum: 357c92fecf447f5640da599d25def0ac04490b03adc859798b2eba937a546f09 +- filename: .ci/pipenv/manylinux_2_28_x86_64-py312-build-Pipfile.lock + checksum: 224516e0451196831d93512a6ff9e26d8dee14b83064270ce6cbe9567dcb5753 +- filename: .ci/pipenv/manylinux_2_28_x86_64-py312-test-cu12-Pipfile.lock + checksum: 
61d43b09f08b7f6e965194ade06c41896827cd5868c3d64e95b4524e3a1d98b9 +- filename: .ci/pipenv/manylinux_2_28_x86_64-py310-test-cpu-Pipfile.lock + checksum: 73cfd9f66cfa1c7252cc16904b90d79faf7e06cd27b0fe653fd290e1c20a819c +- filename: .ci/pipenv/manylinux_2_28_x86_64-py312-test-cu12-cpu-Pipfile.lock + checksum: 88c76a2fd790a1f20f0bd68fd47c30cd99df07603ff8e82353855e18ffc16e75 +- filename: .ci/pipenv/manylinux_2_28_x86_64-py312-test-cu12-dx-Pipfile.lock + checksum: 8a0d52aff956ac7f241e09b79af3ca5bddcf0aa040476a861f7a499a5681c410 +- filename: .ci/pipenv/manylinux_2_28_x86_64-py310-test-cu11-Pipfile.lock + checksum: 7333ea9adfb7d931a5a4f4056e26deab90ded9caa054ff7baadc03722577b2d6 +- filename: .ci/pipenv/manylinux_2_28_x86_64-py310-build-Pipfile.lock + checksum: 7d9895b83fe7051b9a0fb146a8f47cb25f87a8deccf4862400941f7c61196ed4 +- filename: .ci/pipenv/manylinux_2_28_x86_64-py312-test-cu12-dx-torch-Pipfile.lock + checksum: 1e3fce2ab1065d2feef0e714a06dfe172033c10cade32de030749ee466b45423 +- filename: .ci/pipenv/manylinux_2_28_x86_64-py312-test-cu11-Pipfile.lock + checksum: 555f9e4ba8b76f3ea912e1df46fee0ed5a0a809b01cceb9e79abe5138ad6e1c0 +- filename: .ci/pipenv/manylinux_2_28_x86_64-py311-test-docstrings-Pipfile.lock + checksum: 2dc4d248779a72d1aad7b7119f0eae9cf5430f58b60cd84a1eb5b28dc6602bd9 +- filename: .ci/pipenv/manylinux_2_28_x86_64-py311-test-cpu-Pipfile.lock + checksum: d5a24df09c39349a868443a7da14840eb0d503132402cc11d916c0928eb41286 +- filename: .ci/pipenv/manylinux_2_28_x86_64-py310-test-cu11-torch-Pipfile.lock + checksum: 14172b0e1f856fbaeb3075613bbdc8d08fed7db84c8017e914656d0f41d1c2bf +- filename: .ci/pipenv/manylinux_2_28_x86_64-py312-test-sysctk12-dx-Pipfile.lock + checksum: f4ee267a48ed091c1daf92ab57a6ee65a62182e531ac3f6d351ff379a94e03ef +- filename: .ci/pipenv/manylinux_2_28_x86_64-py310-test-cu12-dx-torch-Pipfile.lock + checksum: fe0b0af438bc668bb1ecfeee126cb5d5e1aaa3ab4997bdb88191c65e8e70fcb2 +- filename: 
.ci/pipenv/manylinux_2_28_x86_64-py311-test-cu12-cpu-Pipfile.lock + checksum: 9a26295b25c524d38ee32dc15225a60a1f78a42c0fbe831881603c4528c3b79d +- filename: .ci/pipenv/manylinux_2_28_x86_64-py310-test-sysctk11-Pipfile.lock + checksum: 15710d3872d0ff5ad83750e0c15f5d05128a9b2e131f3e91a645f78e6651f450 +- filename: .ci/pipenv/manylinux_2_28_x86_64-py312-test-sysctk11-Pipfile.lock + checksum: 27b5fed2e2beac1530ca3c7a2e14ce058ba0fbbc58585775c12c9f1f7b479dbc +- filename: .ci/pipenv/manylinux_2_28_x86_64-py311-test-cu12-Pipfile.lock + checksum: eeeb2c572ffe4cf2f5c84f9e3ff6a66f6dfbe54ce0523fd13a309fb8151e061e +- filename: .ci/pipenv/manylinux_2_28_x86_64-py310-test-sysctk12-Pipfile.lock + checksum: a341c44c5bcfabbf8faab106dfe4d331eef55e9b4895b34d6d26b4ea975579b2 +- filename: .ci/pipenv/manylinux_2_28_x86_64-py310-test-sysctk12-dx-Pipfile.lock + checksum: e0fb537be5bde7c0a52c551198fe21085ffac9bed1b30700afd5e05659506fb3 +- filename: .ci/pipenv/manylinux_2_28_x86_64-py310-test-cu12-Pipfile.lock + checksum: a42cbc8a03f5a82494b44226903cee90dfc4208320385b6eccfbc72cc8e508f7 +- filename: .ci/pipenv/manylinux_2_28_x86_64-py311-test-cu11-torch-Pipfile.lock + checksum: 64ca8c7987f08a2de952a99833010f14a7180948ba23b46c17fb04c35cfc9ba1 +- filename: .ci/pipenv/manylinux_2_28_x86_64-py311-test-cu11-Pipfile.lock + checksum: d88bb28ca54a8f79d3ce4ae0575658de62c4fbd49048f8844a740372b2483e82 +- filename: .ci/pipenv/manylinux_2_28_x86_64-py311-build-Pipfile.lock + checksum: 68a465110297077c4e07616dc055b73a5e37ad412d9c5ca4bb5800024d3fa273 +- filename: .ci/pipenv/manylinux_2_28_x86_64-py311-test-cu12-dx-Pipfile.lock + checksum: 669ce9cf12b07e98b1955008ea90fbeb333a087269ed05b5c5319f4ea9c5988b +- filename: .ci/pipenv/manylinux_2_28_x86_64-py311-test-cu12-dx-torch-Pipfile.lock + checksum: b89939ae0ac554d89f5a333f37da8d9e7fb37c0ad832fb3e6f9b73b00b21bf99 +- filename: .ci/pipenv/manylinux_2_28_x86_64-py312-test-sysctk12-Pipfile.lock + checksum: 
62280af6c8aa520138f636a09d0330f3cda9efa2ac4e9337d7e995c93bd10c06 - filename: .ci/pipenv/manylinux_2_28_x86_64-py311-docs-Pipfile.lock - checksum: 50417e87baee9d7aa17765525d21c48ea99f9d3c0b2a2b25d401102b5c5bb32a + checksum: acfaab5ffb3098a96645323d7879d8d1df69a549b40a09a590bf8fe1315dc839 diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 0000000..39aca1a --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1,5 @@ +graft nvmath +global-include *.pyd +global-include *.pyi +global-exclude *.cpp +global-exclude *.pyx diff --git a/builder/__init__.py b/builder/__init__.py index 4f1a2f2..6ff03ac 100644 --- a/builder/__init__.py +++ b/builder/__init__.py @@ -1,3 +1,3 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 diff --git a/builder/pep517.py b/builder/pep517.py index c7a21c1..4e9b7a8 100644 --- a/builder/pep517.py +++ b/builder/pep517.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 @@ -15,8 +15,6 @@ from setuptools import build_meta as _build_meta -import utils # this is builder.utils (the build system has sys.path set up) - prepare_metadata_for_build_wheel = _build_meta.prepare_metadata_for_build_wheel build_wheel = _build_meta.build_wheel diff --git a/builder/utils.py b/builder/utils.py index 7c123f5..9c517b8 100644 --- a/builder/utils.py +++ b/builder/utils.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. 
# # SPDX-License-Identifier: Apache-2.0 diff --git a/docs/Makefile b/docs/Makefile index de6a9ce..0a3e2e4 100644 --- a/docs/Makefile +++ b/docs/Makefile @@ -2,7 +2,7 @@ SHELL=/bin/bash # You can set these variables from the command line or environment -SPHINX_NVMATH_PYTHON_VER ?= $(shell [[ $$(< ../nvmath/_version.py) =~ __version__[^0-9.]*([0-9.]*) ]] && echo $${BASH_REMATCH[1]}) +SPHINX_NVMATH_PYTHON_VER ?= $(shell [[ $$(< ../pyproject.toml) =~ [^a-zA-Z_]version\ =\ [^0-9.]*([0-9.]*) ]] && echo $${BASH_REMATCH[1]}) SPHINXOPTS ?= -W SPHINXBUILD ?= sphinx-build SOURCEDIR = sphinx diff --git a/docs/sphinx/_static/switcher.json b/docs/sphinx/_static/switcher.json index aabddac..55122ce 100644 --- a/docs/sphinx/_static/switcher.json +++ b/docs/sphinx/_static/switcher.json @@ -3,6 +3,10 @@ "version": "latest", "url": "https://docs.nvidia.com/cuda/nvmath-python/latest" }, + { + "version": "0.3.0", + "url": "https://docs.nvidia.com/cuda/nvmath-python/0.3.0" + }, { "version": "0.2.1", "url": "https://docs.nvidia.com/cuda/nvmath-python/0.2.1" diff --git a/docs/sphinx/bindings/index.rst b/docs/sphinx/bindings/index.rst index d87532f..799601d 100644 --- a/docs/sphinx/bindings/index.rst +++ b/docs/sphinx/bindings/index.rst @@ -157,13 +157,13 @@ require a sequence or a nested sequence, the following operations are equivalent my_func(..., buf, ...) # the underlying data type is determined by the C API which is particularly useful when users need to pass multiple sequences or nested sequences -to C (ex: :func:`nvmath.bindings.cufft.plan_many`). +to C (for example, :func:`nvmath.bindings.cufft.plan_many`). .. note:: Some functions require their arguments to be in the device memory. You need to pass device memory (for example, :class:`cupy.ndarray`) to such arguments. nvmath-python - does not validate the memory pointers passed and does not implicitly transfer the data. + neither validates the memory pointers nor implicitly transfers the data.
Passing host memory where device memory is expected (and vice versa) results in undefined behavior. diff --git a/docs/sphinx/conf.py b/docs/sphinx/conf.py index ad66638..0bde383 100644 --- a/docs/sphinx/conf.py +++ b/docs/sphinx/conf.py @@ -17,11 +17,12 @@ import os import re import sys +import tomllib +import tempfile +import json sys.path.insert(0, os.path.abspath(".")) -import pkg_resources import warnings -import json from sphinx.writers.html import HTMLTranslator from docutils.transforms import Transform @@ -56,10 +57,8 @@ # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the # built documents. -with open("../../nvmath/_version.py") as f: - exec(f.read()) - nvmath_py_ver = __version__ # noqa: F821 - del __version__ # noqa: F821 +with open("../../pyproject.toml", "rb") as f: + nvmath_py_ver = tomllib.load(f)["project"]["version"] # The short X.Y version. version = nvmath_py_ver @@ -90,6 +89,8 @@ #'sphinxcontrib.autoprogram', "sphinxcontrib.programoutput", "sphinx_favicon", + "nbsphinx", + "nbsphinx_link", ] imgmath_latex_preamble = r"\usepackage{braket}" @@ -101,6 +102,9 @@ # Add any paths that contain templates here, relative to this directory. templates_path = ["_templates"] +# Silence a warning about unpicklable value +nbsphinx_custom_formats = {} + # The suffix(es) of source filenames. 
# You can specify multiple suffix as a list of string: # @@ -181,7 +185,7 @@ def autodoc_process_docstring(app, what, name, obj, options, lines): struct = snake_to_camel([mod] + struct.split("_")[:-1]) line = f"NumPy dtype object that represents the `{struct}` struct.\n" else: - # handle dtype in high-level pythonic APIs + # handle dtype in high-level Pythonic APIs struct = " ".join(struct.split("_")[:-1]) line = f"NumPy dtype object that encapsulates the {struct} in {mod}.\n" lines.clear() @@ -250,9 +254,42 @@ def default_departure(self, node): default_priority = 800 +class NotebookHandler: + def __init__(self): + # Keep the TemporaryDirectory object alive: its finalizer removes the whole tree, + # whereas os.unlink() cannot delete a directory and left the files behind. + self._tmpdir = tempfile.TemporaryDirectory() + self.tmpdir = self._tmpdir.name + + def remove_notebook_copyright(self, app, docname, content): + if os.path.exists(os.path.join("sphinx", docname + ".nblink")): + link = json.loads(content[0]) + notebook_path = os.path.join("sphinx", os.path.dirname(docname), link["path"]) + + with open(notebook_path) as original_notebook_file: + notebook_content = json.load(original_notebook_file) + copyright_regex = ( + r"\s*Copyright \(c\) [0-9-]+, NVIDIA CORPORATION & AFFILIATES\s*SPDX-License-Identifier: BSD-3-Clause\s*" + ) + if re.match(copyright_regex, "".join(notebook_content["cells"][0]["source"])): + # Remove first cell if it's a copyright notice + notebook_content["cells"] = notebook_content["cells"][1:] + + new_notebook_path = os.path.join(self.tmpdir, docname.replace(".nblink", ".ipynb").replace("/", "__")) + with open(new_notebook_path, "w") as new_notebook_file: + json.dump(notebook_content, new_notebook_file) + + link["path"] = os.path.relpath(new_notebook_path, os.path.join("sphinx", os.path.dirname(docname))) + content[0] = json.dumps(link) + + +notebook_handler = NotebookHandler() + + def setup(app): app.add_css_file("nvmath_override.css") app.connect("autodoc-process-docstring", autodoc_process_docstring) + app.connect("source-read", lambda *args, **kwargs:
notebook_handler.remove_notebook_copyright(*args, **kwargs)) app.set_translator("html", DotBreakHtmlTranslator) app.add_autodocumenter(PatchedEnumDocumenter, override=True) app.add_post_transform(UnqualifiedTitlesTransform) @@ -283,6 +320,10 @@ def setup(app): # sweetspot value determined by trial & error to suppress all warnings autosectionlabel_maxdepth = 2 +show_warning_types = True +suppress_warnings = [ + "config.cache", # nbsphinx_link makes nbsphinx_custom_formats unpicklable +] doctest_global_setup = """ import numpy as np diff --git a/docs/sphinx/device-apis/cufft.rst b/docs/sphinx/device-apis/cufft.rst index 065ff0a..81078fc 100644 --- a/docs/sphinx/device-apis/cufft.rst +++ b/docs/sphinx/device-apis/cufft.rst @@ -9,7 +9,7 @@ Overview ======== These APIs offer integration with the NVIDIA cuFFTDx library. -Detailed documentation of cuBLASDx can be found in the +Detailed documentation of cuFFTDx can be found in the `cuFFTDx documentation `_. .. note:: diff --git a/docs/sphinx/device-apis/index.rst b/docs/sphinx/device-apis/index.rst index d25a257..bdddcc7 100644 --- a/docs/sphinx/device-apis/index.rst +++ b/docs/sphinx/device-apis/index.rst @@ -13,6 +13,7 @@ Detailed documentation for these libraries can be found at `cuFFTDx `_, `cuBLASDx `_, and `cuRAND device APIs `_ respectively. +Device APIs can only be called from CUDA device or kernel code, and execute on the GPU. 
Users may take advantage of the device module via the two approaches below: diff --git a/docs/sphinx/fft/index.rst b/docs/sphinx/host-apis/fft/index.rst similarity index 100% rename from docs/sphinx/fft/index.rst rename to docs/sphinx/host-apis/fft/index.rst diff --git a/docs/sphinx/host-apis/index.rst b/docs/sphinx/host-apis/index.rst new file mode 100644 index 0000000..7f1f7f2 --- /dev/null +++ b/docs/sphinx/host-apis/index.rst @@ -0,0 +1,20 @@ +********* +Host APIs +********* + +The following modules of nvmath-python offer integration with NVIDIA's +high-performance computing libraries through host APIs for cuBLAS and cuFFT. +Host APIs are called from host code but can execute in any supported execution +space (CPU or GPU). + +======== +Contents +======== + +.. toctree:: + :caption: API Reference + :maxdepth: 2 + + Linear Algebra + Fast Fourier Transform + Host API Utilities diff --git a/docs/sphinx/linalg/index.rst b/docs/sphinx/host-apis/linalg/index.rst similarity index 63% rename from docs/sphinx/linalg/index.rst rename to docs/sphinx/host-apis/linalg/index.rst index d005942..8da23c2 100644 --- a/docs/sphinx/linalg/index.rst +++ b/docs/sphinx/host-apis/linalg/index.rst @@ -43,5 +43,27 @@ Specialized Linear Algebra APIs (:mod:`nvmath.linalg.advanced`) :template: dataclass.rst + MatmulEpilogPreferences MatmulOptions MatmulPlanPreferences + MatmulQuantizationScales + +Helpers +^^^^^^^ + +The Specialized Linear Algebra helpers module :mod:`nvmath.linalg.advanced.helpers` +provides helper functions to facilitate working with some of the complex features of +the :mod:`nvmath.linalg.advanced` module. + +Matmul helpers (:mod:`nvmath.linalg.advanced.helpers.matmul`) +""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" + +.. module:: nvmath.linalg.advanced.helpers.matmul + +.. 
autosummary:: + :toctree: generated/ + + create_mxfp8_scale + invert_mxfp8_scale + apply_mxfp8_scale + get_mxfp8_scale_offset diff --git a/docs/sphinx/host-utils.rst b/docs/sphinx/host-apis/utils.rst similarity index 64% rename from docs/sphinx/host-utils.rst rename to docs/sphinx/host-apis/utils.rst index 1654166..2d72f43 100644 --- a/docs/sphinx/host-utils.rst +++ b/docs/sphinx/host-apis/utils.rst @@ -2,13 +2,6 @@ Host API Utilities ********************** -.. _host-api-util-overview: - -Overview -======== - -nvmath-python provides host-side APIs for managing device-side memory. - .. _host-api-util-reference: API Reference @@ -16,8 +9,24 @@ API Reference .. module:: nvmath +Memory utilities +---------------- + +nvmath-python provides host-side APIs for managing device-side memory. + .. autosummary:: :toctree: generated/ BaseCUDAMemoryManager MemoryPointer + +Data types +---------- + +nvmath-python provides the following data types. + + +.. autosummary:: + :toctree: generated/ + + CudaDataType diff --git a/docs/sphinx/index.rst b/docs/sphinx/index.rst index 0245cc7..295d189 100644 --- a/docs/sphinx/index.rst +++ b/docs/sphinx/index.rst @@ -10,9 +10,10 @@ Welcome to the nvmath-python documentation! interoperability within the Python computational ecosystem through NVIDIA's high-performance math libraries. -To quickly get started with *nvmath-python*, take a look at our +To quickly get started with nvmath-python, take a look at our :doc:`Getting Started ` manual. -Refer to our :doc:`Installation Guide ` for detailed instructions on the various installation choices available. +Refer to our :doc:`Installation Guide ` for detailed instructions on the +various installation choices available. ======== Contents @@ -32,12 +33,17 @@ Contents Installation .. toctree:: - :caption: API Reference + :caption: Examples and tutorials :maxdepth: 2 - Host API Utilities - Linear Algebra - Fast Fourier Transform + Linear Algebra Host APIs Tutorial + Examples on GitHub + +.. 
toctree:: + :caption: API Reference + :maxdepth: 3 + + Host APIs Device APIs Bindings diff --git a/docs/sphinx/installation.rst b/docs/sphinx/installation.rst index c9062d7..03631aa 100644 --- a/docs/sphinx/installation.rst +++ b/docs/sphinx/installation.rst @@ -5,13 +5,13 @@ Install nvmath-python ===================== nvmath-python, like most modern Python packages, provides pre-built binaries (wheels and -later conda packages) to the end users. The full source code is hosted in the +conda packages) to the end users. The full source code is hosted in the `NVIDIA/nvmath-python `_ repository. In terms of CUDA Toolkit (CTK) choices, nvmath-python is designed and implemented to allow building and running against 1. ``pip``-wheel, 2. ``conda``, or 3. system installation of -CTK. Having a full CTK installation at either build- or run- time is not necessary; just a -small fraction as explained below is enough. +CTK. Having a full CTK installation at either build- or run-time is not necessary; only a +small subset, as explained below, is enough. Host & device APIs (see :ref:`nvmath overview`) have different run-time dependencies and requirements. Even among host APIs the needed underlying libraries are different (for @@ -20,9 +20,9 @@ loaded when only needed. Therefore, nvmath-python is designed to have most of it dependencies *optional*, but provides convenient installation commands for users to quickly spin up a working Python environment. -The :ref:`cheatsheet ` below captures nvmath-python's required/optional, -build-/run- time dependencies. Using the installation commands from the sections below -should support most of your needs. +The :ref:`cheatsheet ` below captures nvmath-python's required and optional +build-time and run-time dependencies. Using the installation commands from the sections +below should support most of your needs. .. _install from pypi: @@ -60,12 +60,12 @@ needed; the dependencies are pulled via extras). **Note**: - 1. 
NVPL is for ARM architecture only. MKL or another FFTW3 [9]_ compatible - library may be substituted for x86 architecture. + 1. NVPL supports only ARM architecture, while MKL or another FFTW3 [9]_ + compatible library may be substituted for x86 architecture. 2. The environment variable ``NVMATH_FFT_CPU_LIBRARY`` may be used to provide the path to an alternate shared object which implements the - FFTW3 (non-guru) API. ``LD_LIBRARY_PATH`` should be set properly to - include this library if it is not already in the PATH. + FFTW3 (non-guru) API. Ensure ``LD_LIBRARY_PATH`` includes this + library if it is not already in the PATH. The options below are for adventurous users who want to manage most of the dependencies themselves. The following assumes that **system CTK is installed**. @@ -80,15 +80,13 @@ themselves. The following assumes that **system CTK is installed**. - Install nvmath-python along with CuPy for CUDA 11 to support nvmath host APIs. - **Note**: ``LD_LIBRARY_PATH`` should be set - properly to include CUDA libraries. + **Note**: Set ``LD_LIBRARY_PATH`` to include the CUDA libraries. * - ``pip install nvmath-python[sysctk12]`` - Install nvmath-python along with CuPy for CUDA 12 to support nvmath host APIs. - **Note**: ``LD_LIBRARY_PATH`` should be set - properly to include CUDA libraries. + **Note**: Set ``LD_LIBRARY_PATH`` to include the CUDA libraries. * - ``pip install nvmath-python[sysctk12-dx]`` - Install nvmath-python along with CuPy for CUDA 12 to support @@ -96,9 +94,9 @@ themselves. The following assumes that **system CTK is installed**. **Note**: - 1. ``LD_LIBRARY_PATH`` should be set properly to include CUDA libraries. - 2. For using :mod:`nvmath.device` APIs, ``CUDA_HOME`` (or ``CUDA_PATH``) should be - set to point to the system CTK. + 1. Set ``LD_LIBRARY_PATH`` to include the CUDA libraries. + 2. To use :mod:`nvmath.device` APIs, set ``CUDA_HOME`` (or ``CUDA_PATH``) + to point to the system CTK. 
For system admins or expert users, ``pip install nvmath-python`` would be a bare minimal installation (very lightweight). This allows fully explicit control of all dependencies. @@ -132,10 +130,10 @@ Conda packages can be installed from the `conda-forge ` **Note**: - 1. ``nvmath-python-dx`` is a meta-package for ease of installing + 1. ``nvmath-python-dx`` is a metapackage for ease of installing ``nvmath-python`` and other dependencies. - 2. ``pynvjitlink`` currently only lives on the rapidsai channel, - not the conda-forge channel. + 2. Currently, ``pynvjitlink`` is only available on the rapidsai channel, + and not on conda-forge. * - ``conda install -c conda-forge nvmath-python-cpu`` - Install nvmath-python along with all CPU optional dependencies (NVPL or other) to support optimized CPU FFT APIs. [1]_ @@ -183,9 +181,9 @@ source. There are several ways to build it since we need some CUDA headers at bu **Note**: in this case we get CUDA headers by installing pip wheels to the isolated build environment. * - ``CUDA_PATH=/path/to/your/cuda/installation pip install --no-build-isolation -v .`` - - Skip creating a build isolation (it'd use CUDA headers from ``$CUDA_PATH/include`` - instead), build the project, and install it to the current - user environment together with the run-time dependencies. One can use: + - Skip creating a build isolation (it would use CUDA headers from + ``$CUDA_PATH/include`` instead), build the project, and install it to the current + user environment together with the run-time dependencies. Use: - conda: After installing CUDA 12 conda packages, set the environment variable ``CUDA_PATH`` @@ -198,9 +196,9 @@ source. There are several ways to build it since we need some CUDA headers at bu **Notes**: -- If you add the "extras" notation after the dot ``.`` (for example ``.[cu11]``, ``.[cu12,dx]``, - ...), it has the same meaning as explained in the :ref:`previous section `. 
+- If you add the "extras" notation after the dot ``.`` (for example ``.[cu11]``, + ``.[cu12,dx]``, ...), it has the same meaning as explained in the :ref:`previous section + `. - If you don't want the run-time dependencies to be automatically handled, add ``--no-deps`` after the ``pip install`` command above; in this case, however, it's your responsibility to make sure that all the run-time requirements are met. @@ -299,7 +297,7 @@ dependency is *required* unless stated otherwise. * - | PyTorch | (see `PyTorch installation guide `_) - - - >=1.10 (optional) + - >=1.10 (optional) [10]_ - - >=1.10 (optional) * - MathDx (cuBLASDx, cuFFTDx, ...) @@ -319,7 +317,7 @@ dependency is *required* unless stated otherwise. - * - Math Kernel Library (MKL) - - - 2024.4 (optional) + - >=2024 (optional) - - * - NVIDIA Performance Libraries (NVPL) @@ -343,9 +341,9 @@ nvmath-python is tested in the following environments: * - CUDA - 11.x (latest), 12.x (latest) * - Driver - - R450, R520, R525, R560 + - R520, R525, R570 * - GPU model - - A100, H100, RTX 4090, CG1 (Grace-Hopper) + - H100, B100, RTX 4090, CG1 (Grace-Hopper) * - Python - 3.10, 3.11, 3.12 * - CPU architecture @@ -486,18 +484,18 @@ libraries, there are user-visible caveats. 3. CuPy installed from ``pip`` currently (as of v13.3.0) only supports conda and system CTK, and not ``pip``-installed CUDA wheels. nvmath-python can help CuPy use the CUDA libraries installed to ``site-packages`` (where wheels are installed to) if ``nvmath`` is imported. - From beta 2 (v0.2.0) onwards the libraries are "soft-loaded" (no error is raised if a library is - not installed) when ``import nvmath`` happens. This behavior may change in a future - release. + From beta 2 (v0.2.0) onwards the libraries are "soft-loaded" (no error is raised if a + library is not installed) when ``import nvmath`` happens. This behavior may change in a + future release. 4. 
Numba installed from ``pip`` currently (as of v0.60.0) only supports conda and system CTK, and not ``pip``-installed CUDA wheels. nvmath-python can also help Numba use the CUDA compilers installed to ``site-packages`` if ``nvmath`` is imported. Same as above, this behavior may change in a future release. In general, mixing-and-matching CTK packages from ``pip``, ``conda``, and the system is -possible but can be very fragile, so it's important to understand what you're doing. The nvmath-python -internals are designed to work with everything installed either via ``pip``, ``conda``, or -local system (system CTK, including `tarball extractions +possible but can be very fragile, so it's important to understand what you're doing. The +nvmath-python internals are designed to work with everything installed either via ``pip``, +``conda``, or local system (system CTK, including `tarball extractions `_, are the fallback solution in the detection logic), but mix-n-match makes the detection logic impossible to get right. @@ -563,7 +561,7 @@ For more information with regard to the new CUDA 12+ package layout on conda-for .. [2] nvmath-python relies on `CUDA minor version compatibility `_. -.. [4] As of beta 2.1 (v0.2.1), CuPy is a required run-time dependency except for CPU-only +.. [4] As of beta 3.0 (v0.3.0), CuPy is a required run-time dependency except for CPU-only execution. In a future release it will be turned into an optional run-time dependency. .. [5] For example, Hopper GPUs are supported starting CUDA 11.8, so they would not work with libraries from CUDA 11.7 or below. @@ -576,3 +574,5 @@ For more information with regard to the new CUDA 12+ package layout on conda-for already takes care of this. .. [9] The library must ship FFTW3 symbols for single and double precision transforms in a single ``so`` file. +.. 
[10] To use ``matmul`` with FP8 or MXFP8, you need a PyTorch version built with + CUDA 12.8 (``>=2.7.0`` or a nightly version). diff --git a/docs/sphinx/overview.rst b/docs/sphinx/overview.rst index fe4dd0d..8d3b550 100644 --- a/docs/sphinx/overview.rst +++ b/docs/sphinx/overview.rst @@ -4,7 +4,7 @@ Overview ******** The primary goal of nvmath-python is to bring the power of the NVIDIA math libraries to the -Python ecosystem. The package aims to provide intuitive pythonic APIs that provide users +Python ecosystem. The package aims to provide intuitive Pythonic APIs that provide users full access to all the features offered by our libraries in a variety of execution spaces. We hope to empower a wide range of Python users by providing easy access to high-performance @@ -26,7 +26,7 @@ The APIs provided by nvmath-python can be categorized into: nvmath-python is dedicated to delivering the following key features and commitments: -1. **Logical Feature Parity**: While the pythonic API surface (the number of APIs and the +1. **Logical Feature Parity**: While the Pythonic API surface (the number of APIs and the complexity of each) is more concise compared to that of the C libraries, it provides access to their complete functionality. 2. **Consistent Design Patterns**: Uniform design across all modules to simplify user @@ -67,12 +67,12 @@ flexibility allows: Additionally, we offer :doc:`Python bindings ` that provide a 1:1 mapping with the C APIs. These bindings, which serve as wrappers with API signatures similar to their C counterparts, are ideal for library developers looking to integrate the capabilities -of the NVIDIA Math Libraries in a customized manner, in the event that the pythonic APIs -don't meet their specific requirements. Conversely, our high-level pythonic APIs deliver a +of the NVIDIA Math Libraries in a customized manner, in the event that the Pythonic APIs +don't meet their specific requirements. 
Conversely, our high-level Pythonic APIs deliver a fully integrated solution suitable for native Python users as well as library developers, -encompassing both host and device APIs. In the future, select host APIs will accept -**callback functions written in Python** and compiled into supported formats such as LTO-IR, -using compilers like `Numba`_. +encompassing both host and device APIs. Select host APIs accept **callback functions +written in Python**, which are compiled into supported formats such as LTO-IR, using +compilers like `Numba`_. .. _host api section: @@ -86,9 +86,9 @@ nvmath-python provides a collection of APIs that can be directly invoked from th categories: - Fast Fourier Transform in :mod:`nvmath.fft`. Refer to :doc:`Fast Fourier Transform - ` for details. + ` for details. - Linear Algebra in :mod:`nvmath.linalg`. Refer to :doc:`Linear Algebra - ` for details. + ` for details. .. _host api interop: @@ -108,7 +108,7 @@ frameworks. One example for the interoperability is shown below: # Create a numpy.ndarray as input a = np.random.random(128) + 1.j * np.random.random(128) - # Call nvmath-python pythonic APIs + # Call nvmath-python Pythonic APIs b = nvmath.fft.fft(a) # Verify that output is also a numpy.ndarray @@ -134,24 +134,24 @@ potentially enhancing performance significantly. The design pattern for all stateful APIs in nvmath-python consists of several key phases: - - Problem Specification: This initial phase involves defining the operation and setting - options that affect its execution. It's designed to be as lightweight as possible, - ensuring the problem is well-defined and supported by the current implementation. - - Preparation: Using FFT as an example, this phase includes a planning step to select - the optimal algorithm for the defined FFT operation. An optional autotuning operation, - when available, also falls within the preparation phase. 
The preparation phase is - generally the most resource-intensive and may incorporate user-specified planning and - autotuning options. - - Execution: This phase allows for repeated execution, where the operand can be either - modified in-place or explicitly reset using the ``reset_operand``/``reset_operands`` - method. The costs associated with the first two phases are therefore amortized over - these multiple executions. - - Resource Release: Users are advised to use stateful objects from within a context - using the `with statement - `_, which - automatically handles the release of internal resources upon exit. If the object is - not used as a context manager using ``with``, it is necessary to explicitly call the - ``free`` method to ensure all resources are properly released. +- Problem Specification: This initial phase involves defining the operation and setting + options that affect its execution. It's designed to be as lightweight as possible, + ensuring the problem is well-defined and supported by the current implementation. +- Preparation: Using FFT as an example, this phase includes a planning step to select + the optimal algorithm for the defined FFT operation. An optional autotuning operation, + when available, also falls within the preparation phase. The preparation phase is + generally the most resource-intensive and may incorporate user-specified planning and + autotuning options. +- Execution: This phase allows for repeated execution, where the operand can be either + modified in-place or explicitly reset using the ``reset_operand``/``reset_operands`` + method. The costs associated with the first two phases are therefore amortized over + these multiple executions. +- Resource Release: Users are advised to use stateful objects from within a context + using the `with statement + `_, which + automatically handles the release of internal resources upon exit. 
If the object is + not used as a context manager using ``with``, it is necessary to explicitly call the + ``free`` method to ensure all resources are properly released. .. note:: @@ -166,7 +166,7 @@ The design pattern for all stateful APIs in nvmath-python consists of several ke .. note:: The decision to require explicit ``free`` calls for resource release is driven by the - fact that Python's garbage collector can delay freeing object resources when the object + fact that Python's garbage collector may delay freeing object resources when the object goes out of scope or its reference count drops to zero. For details, refer to the `__del__ method Python documentation `_. @@ -206,8 +206,8 @@ Full Logging Support nvmath-python provides integration with the Python standard library logger from the `logging module `_ to offer full logging of the -computational details at various levels, for example debug, information, warning and error. An -example illustrating the use of the global Python logger is shown below: +computational details at various levels, for example debug, information, warning and error. +An example illustrating the use of the global Python logger is shown below: .. code-block:: python @@ -220,7 +220,7 @@ example illustrating the use of the global Python logger is shown below: datefmt='%m-%d %H:%M:%S' ) - # Call nvmath-python pythonic APIs + # Call nvmath-python Pythonic APIs out = nvmath.linalg.advanced.matmul(...) Alternatively, for APIs that contain the ``options`` argument, users can set a custom logger @@ -236,7 +236,7 @@ object, for example :attr:`nvmath.fft.FFTOptions.logger` for :func:`nvmath.fft.f logger = logging.getLogger('userlogger') ... - # Call nvmath-python pythonic APIs + # Call nvmath-python Pythonic APIs out = nvmath.fft.fft(..., options={'logger': logger}) For the complete examples, refer to `global logging example @@ -260,7 +260,7 @@ example04_logging_user.py>`_. 
Call Blocking Behavior ---------------------- -By default, calls to all pythonic host APIs that require GPU execution are *not* blocking if +By default, calls to all Pythonic host APIs that require GPU execution are *not* blocking if the input operands reside on the device. This means that functions like :func:`nvmath.linalg.advanced.matmul`, :meth:`nvmath.fft.FFT.execute`, and :meth:`nvmath.linalg.advanced.Matmul.execute` will return immediately after the operation is @@ -269,7 +269,7 @@ properly synchronizing the stream when needed. The default behavior can be modif setting the ``blocking`` attribute (default ``'auto'``) of the relevant ``Options`` object to ``True``. For example, users may set :attr:`nvmath.fft.FFTOptions.blocking` to ``True`` and pass this options object to the corresponding FFT API calls. If the input operands are -on the host, the pythonic API calls will always block since the computation yields an output +on the host, the Pythonic API calls will always block since the computation yields an output operand that will also reside on the host. Meanwhile, APIs that execute on the host (such as :meth:`nvmath.fft.FFT.create_key`) always block. @@ -295,7 +295,7 @@ provided for two reasons: For non-blocking behavior, it is the user's responsibility to ensure correct stream ordering between the execution API calls. -In any case, the execution APIs are launched on the provided stream. +The execution APIs are always launched on the provided stream. For examples on stream ordering, refer to `FFT with multiple streams `_. @@ -307,8 +307,8 @@ Memory Management By default, the host APIs use the memory pool from the package that their operands belong to. This ensures that there is no contention for memory or spurious out-of-memory errors. -However the user also has the ability to provide their own memory allocator if they choose -to do so. 
In our pythonic APIs, we support an `EMM`_-like interface as proposed and +However, the user also has the ability to provide their own memory allocator if they choose +to do so. In our Pythonic APIs, we support an `EMM`_-like interface as proposed and supported by Numba for users to set their Python mempool. Taking FFT as an example, users can set the option :attr:`nvmath.fft.FFTOptions.allocator` to a Python object complying with the :class:`nvmath.BaseCUDAMemoryManager` protocol, and pass the options to the high-level @@ -397,7 +397,7 @@ considerations, we strive to meet the following commitments: Note that all bindings are currently *experimental*. -2. For the high-level pythonic APIs, we maintain backward compatibility to the greatest +2. For the high-level Pythonic APIs, we maintain backward compatibility to the greatest extent feasible. When a breaking change is necessary, we issue a runtime warning to alert users of the upcoming changes in the next major release. This practice ensures that breaking changes are clearly communicated and reserved for major version updates, diff --git a/docs/sphinx/quickstart.rst b/docs/sphinx/quickstart.rst index ee70b8b..6a6dd43 100644 --- a/docs/sphinx/quickstart.rst +++ b/docs/sphinx/quickstart.rst @@ -2,7 +2,7 @@ Getting Started *************** nvmath-python brings the power of the NVIDIA math libraries to the Python ecosystem. -The package aims to provide intuitive pythonic APIs that provide users full access +The package aims to provide intuitive Pythonic APIs that provide users full access to all the features offered by NVIDIA's libraries in a variety of execution spaces. nvmath-python works seamlessly with existing Python array/tensor frameworks and focuses on providing functionality that is missing from those frameworks. @@ -32,7 +32,8 @@ Matrix multiplication Using the nvmath-python API allows access to all parameters of the underlying NVIDIA cuBLASLt library. 
-Some of these parameters are unavailable in other wrappings of NVIDIA's C-API libraries. +Some of these parameters are unavailable in other packages that wrap NVIDIA's C-API +libraries. .. doctest:: @@ -76,7 +77,7 @@ prolog to the IFFT operation. >>> B, N = 256, 1024 >>> a = cp.random.rand(B, N, dtype=cp.float64) + 1j * cp.random.rand(B, N, dtype=cp.float64) >>> - >>> # Create the data to use as filter. + >>> # Create the data to use as a filter. >>> filter_data = cp.sin(a) >>> >>> # Define the prolog function for the inverse FFT. @@ -84,7 +85,7 @@ prolog to the IFFT operation. >>> def convolve(data_in, offset, filter_data, unused): ... # Note we are accessing `data_out` and `filter_data` with a single `offset` integer, ... # even though the input and `filter_data` are 2D tensors (batches of samples). - ... # Care must be taken to assure that both arrays accessed here have the same memory + ... # Care must be taken to ensure that both arrays accessed here have the same memory ... # layout. ... return data_in[offset] * filter_data[offset] / N >>> diff --git a/docs/sphinx/release-notes.rst b/docs/sphinx/release-notes.rst index 94852b0..6c7d45c 100644 --- a/docs/sphinx/release-notes.rst +++ b/docs/sphinx/release-notes.rst @@ -1,14 +1,37 @@ nvmath-python Release Notes *************************** +nvmath-python v0.3.0 +==================== + +Beta3 release. + +* FP8 and MXFP8 support for the advanced matrix multiplication API. +* Notebook to illustrate use of FP8 and MXFP8 in the advanced matrix multiplication API. +* Added bindings for new APIs introduced in CTK version 12.8. + +Bugs Fixed +---------- + +* The advanced matrix multiplication API may return an incorrect result when a bias vector + is used along with 1-D A and C operands. + +API Changes +----------- + +* The ``last_axis_size`` option in :class:`nvmath.fft.FFTOptions` is removed in favor of + ``last_axis_parity`` to better reflect its semantics. 
+ nvmath-python v0.2.1 ==================== Beta2 update 1 with improved diagnostics, testing enhancements, and bug fixes. -* New tests for batched epilogs and autotuning with epilogs for the advanced matrix multiplication APIs. +* New tests for batched epilogs and autotuning with epilogs for the advanced matrix + multiplication APIs. * Added more hypothesis-based tests for host APIs. -* Improved algorithm for detecting overlapping memory operands for certain sliced tensors, thereby supporting such layouts for FFTs. +* Improved algorithm for detecting overlapping memory operands for certain sliced tensors, + thereby supporting such layouts for FFTs. * Added bindings for new APIs introduced in CTK versions 12.5 and 12.6. * Further coding style fixes toward meeting PEP8 recommendations. * Clarified batched semantics for matrix multiplication epilogs in the documentation. @@ -18,13 +41,16 @@ Bugs Fixed ---------- * C2R FFT may fail with "illegal memory access" on sliced tensors. -* Improved diagnostics to detect incompatible combinations of scale and compute types for matrix multiplication, that previously may have resulted in incorrect results. -* Matrix multiplication provided incorrect results when operand A is a vector (number of dimensions=1). +* Improved diagnostics to detect incompatible combinations of scale and compute types for + matrix multiplication, that previously may have resulted in incorrect results. +* Matrix multiplication provided incorrect results when operand A is a vector (number of + dimensions=1). API Changes ----------- -* The ``last_axis_size`` option in :class:`nvmath.fft.FFTOptions` is now deprecated in favor of `last_axis_parity` to better reflect its semantics. +* The ``last_axis_size`` option in :class:`nvmath.fft.FFTOptions` is now deprecated in favor + of ``last_axis_parity`` to better reflect its semantics. .. note:: @@ -35,7 +61,8 @@ nvmath-python v0.2.0 Beta2 release. 
-* CPU execution space support for FFT libraries that conform to FFTW3 API (for example MKL, NVPL). +* CPU execution space support for FFT libraries that conform to FFTW3 API (for example MKL, + NVPL). * Support for prolog and epilog callback for FFT, written in Python. * New device APIs for random number generation. * Notebooks to illustrate use of advanced matrix multiplication APIs. @@ -78,7 +105,7 @@ The required and optional dependencies are summarized in the :ref:`cheatsheet `_ diff --git a/docs/sphinx/tutorials/linalg.rst b/docs/sphinx/tutorials/linalg.rst new file mode 100644 index 0000000..8e1f6f9 --- /dev/null +++ b/docs/sphinx/tutorials/linalg.rst @@ -0,0 +1,11 @@ +********************************* +Linear Algebra Host APIs Tutorial +********************************* + +.. toctree:: + :maxdepth: 1 + + Introduction to GEMM with nvmath-python + Fused Epilogs + Implementing a simple neural network + Narrow-precision operations \ No newline at end of file diff --git a/docs/sphinx/tutorials/notebooks/matmul/01_introduction.nblink b/docs/sphinx/tutorials/notebooks/matmul/01_introduction.nblink new file mode 100644 index 0000000..a9d07c6 --- /dev/null +++ b/docs/sphinx/tutorials/notebooks/matmul/01_introduction.nblink @@ -0,0 +1,3 @@ +{ + "path": "../../../../../notebooks/matmul/01_introduction.ipynb" +} diff --git a/docs/sphinx/tutorials/notebooks/matmul/02_epilogs.nblink b/docs/sphinx/tutorials/notebooks/matmul/02_epilogs.nblink new file mode 100644 index 0000000..c50805b --- /dev/null +++ b/docs/sphinx/tutorials/notebooks/matmul/02_epilogs.nblink @@ -0,0 +1,3 @@ +{ + "path": "../../../../../notebooks/matmul/02_epilogs.ipynb" +} diff --git a/docs/sphinx/tutorials/notebooks/matmul/03_backpropagation.nblink b/docs/sphinx/tutorials/notebooks/matmul/03_backpropagation.nblink new file mode 100644 index 0000000..d135b43 --- /dev/null +++ b/docs/sphinx/tutorials/notebooks/matmul/03_backpropagation.nblink @@ -0,0 +1,3 @@ +{ + "path": 
"../../../../../notebooks/matmul/03_backpropagation.ipynb" +} diff --git a/docs/sphinx/tutorials/notebooks/matmul/04_fp8.nblink b/docs/sphinx/tutorials/notebooks/matmul/04_fp8.nblink new file mode 100644 index 0000000..9f755da --- /dev/null +++ b/docs/sphinx/tutorials/notebooks/matmul/04_fp8.nblink @@ -0,0 +1,3 @@ +{ + "path": "../../../../../notebooks/matmul/04_fp8.ipynb" +} diff --git a/examples/device/common.py b/examples/device/common.py index c633945..e173bab 100644 --- a/examples/device/common.py +++ b/examples/device/common.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 diff --git a/examples/device/common_cupy.py b/examples/device/common_cupy.py index 43eed9b..96f7248 100644 --- a/examples/device/common_cupy.py +++ b/examples/device/common_cupy.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 diff --git a/examples/device/common_numba.py b/examples/device/common_numba.py index 4f323bd..1d81edb 100644 --- a/examples/device/common_numba.py +++ b/examples/device/common_numba.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 diff --git a/examples/device/cublasdx_batched_gemm_fp64.py b/examples/device/cublasdx_batched_gemm_fp64.py index 973f18b..fb3df22 100644 --- a/examples/device/cublasdx_batched_gemm_fp64.py +++ b/examples/device/cublasdx_batched_gemm_fp64.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. 
# # SPDX-License-Identifier: Apache-2.0 diff --git a/examples/device/cublasdx_blockdim_gemm_fp16.py b/examples/device/cublasdx_blockdim_gemm_fp16.py index b620a99..98abe48 100644 --- a/examples/device/cublasdx_blockdim_gemm_fp16.py +++ b/examples/device/cublasdx_blockdim_gemm_fp16.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 diff --git a/examples/device/cublasdx_fused_gemm_performance.py b/examples/device/cublasdx_fused_gemm_performance.py index 2f93af1..1db8c9d 100644 --- a/examples/device/cublasdx_fused_gemm_performance.py +++ b/examples/device/cublasdx_fused_gemm_performance.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 diff --git a/examples/device/cublasdx_gemm_fft.py b/examples/device/cublasdx_gemm_fft.py index fe4f819..38a8749 100644 --- a/examples/device/cublasdx_gemm_fft.py +++ b/examples/device/cublasdx_gemm_fft.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 diff --git a/examples/device/cublasdx_gemm_fft_fp16.py b/examples/device/cublasdx_gemm_fft_fp16.py index b183549..e7447cf 100644 --- a/examples/device/cublasdx_gemm_fft_fp16.py +++ b/examples/device/cublasdx_gemm_fft_fp16.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. 
# # SPDX-License-Identifier: Apache-2.0 diff --git a/examples/device/cublasdx_gemm_fft_performance.py b/examples/device/cublasdx_gemm_fft_performance.py index 9ae4de7..095bd8b 100644 --- a/examples/device/cublasdx_gemm_fft_performance.py +++ b/examples/device/cublasdx_gemm_fft_performance.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 diff --git a/examples/device/cublasdx_gemm_fusion.py b/examples/device/cublasdx_gemm_fusion.py index 7b7b0a4..8b849b1 100644 --- a/examples/device/cublasdx_gemm_fusion.py +++ b/examples/device/cublasdx_gemm_fusion.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 diff --git a/examples/device/cublasdx_simple_gemm_cfp16.py b/examples/device/cublasdx_simple_gemm_cfp16.py index bcf9520..6dfecef 100644 --- a/examples/device/cublasdx_simple_gemm_cfp16.py +++ b/examples/device/cublasdx_simple_gemm_cfp16.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 diff --git a/examples/device/cublasdx_simple_gemm_fp32.py b/examples/device/cublasdx_simple_gemm_fp32.py index 18ee520..94f3bd7 100644 --- a/examples/device/cublasdx_simple_gemm_fp32.py +++ b/examples/device/cublasdx_simple_gemm_fp32.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. 
# # SPDX-License-Identifier: Apache-2.0 diff --git a/examples/device/cublasdx_simple_gemm_leading_dimensions.py b/examples/device/cublasdx_simple_gemm_leading_dimensions.py index ec20560..75b5c1c 100644 --- a/examples/device/cublasdx_simple_gemm_leading_dimensions.py +++ b/examples/device/cublasdx_simple_gemm_leading_dimensions.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 diff --git a/examples/device/cublasdx_single_gemm_performance.py b/examples/device/cublasdx_single_gemm_performance.py index 738efea..a11d50b 100644 --- a/examples/device/cublasdx_single_gemm_performance.py +++ b/examples/device/cublasdx_single_gemm_performance.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 diff --git a/examples/device/cufftdx_autotuning.py b/examples/device/cufftdx_autotuning.py index f7aba72..dfb581e 100644 --- a/examples/device/cufftdx_autotuning.py +++ b/examples/device/cufftdx_autotuning.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 diff --git a/examples/device/cufftdx_block_fft.py b/examples/device/cufftdx_block_fft.py index ea587bc..5aac1bb 100644 --- a/examples/device/cufftdx_block_fft.py +++ b/examples/device/cufftdx_block_fft.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. 
# # SPDX-License-Identifier: Apache-2.0 diff --git a/examples/device/cufftdx_block_fft_performance.py b/examples/device/cufftdx_block_fft_performance.py index 894ca5b..bd97482 100644 --- a/examples/device/cufftdx_block_fft_performance.py +++ b/examples/device/cufftdx_block_fft_performance.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 diff --git a/examples/device/cufftdx_block_fft_performance_many.py b/examples/device/cufftdx_block_fft_performance_many.py index acc6e6b..f3b6ea0 100644 --- a/examples/device/cufftdx_block_fft_performance_many.py +++ b/examples/device/cufftdx_block_fft_performance_many.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 diff --git a/examples/device/cufftdx_convolution.py b/examples/device/cufftdx_convolution.py index 28411ad..52684b2 100644 --- a/examples/device/cufftdx_convolution.py +++ b/examples/device/cufftdx_convolution.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 diff --git a/examples/device/cufftdx_convolution_performance.py b/examples/device/cufftdx_convolution_performance.py index 5cf743e..1045d90 100644 --- a/examples/device/cufftdx_convolution_performance.py +++ b/examples/device/cufftdx_convolution_performance.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. 
# # SPDX-License-Identifier: Apache-2.0 diff --git a/examples/device/cufftdx_convolution_r2c_c2r.py b/examples/device/cufftdx_convolution_r2c_c2r.py index 08c1008..83e2545 100644 --- a/examples/device/cufftdx_convolution_r2c_c2r.py +++ b/examples/device/cufftdx_convolution_r2c_c2r.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 diff --git a/examples/device/cufftdx_convolution_r2c_c2r_packed_fold_optimized.py b/examples/device/cufftdx_convolution_r2c_c2r_packed_fold_optimized.py index 6b5c329..e2edff2 100644 --- a/examples/device/cufftdx_convolution_r2c_c2r_packed_fold_optimized.py +++ b/examples/device/cufftdx_convolution_r2c_c2r_packed_fold_optimized.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 diff --git a/examples/device/cufftdx_convolution_signal.py b/examples/device/cufftdx_convolution_signal.py index c5e9d33..4d9c6dd 100644 --- a/examples/device/cufftdx_convolution_signal.py +++ b/examples/device/cufftdx_convolution_signal.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 diff --git a/examples/device/cufftdx_fft_2d.py b/examples/device/cufftdx_fft_2d.py index 2811b4a..840badb 100644 --- a/examples/device/cufftdx_fft_2d.py +++ b/examples/device/cufftdx_fft_2d.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. 
# # SPDX-License-Identifier: Apache-2.0 diff --git a/examples/device/cufftdx_fft_2d_r2c_c2r.py b/examples/device/cufftdx_fft_2d_r2c_c2r.py index 716a7b2..49d41bc 100644 --- a/examples/device/cufftdx_fft_2d_r2c_c2r.py +++ b/examples/device/cufftdx_fft_2d_r2c_c2r.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 diff --git a/examples/device/cufftdx_fft_2d_single_kernel.py b/examples/device/cufftdx_fft_2d_single_kernel.py index a50c167..fcfd3fc 100644 --- a/examples/device/cufftdx_fft_2d_single_kernel.py +++ b/examples/device/cufftdx_fft_2d_single_kernel.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 diff --git a/examples/device/cufftdx_fft_3d_box_single_block.py b/examples/device/cufftdx_fft_3d_box_single_block.py index 2396742..5df13b8 100644 --- a/examples/device/cufftdx_fft_3d_box_single_block.py +++ b/examples/device/cufftdx_fft_3d_box_single_block.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 diff --git a/examples/device/cufftdx_fft_3d_cube_single_block.py b/examples/device/cufftdx_fft_3d_cube_single_block.py index 8b51f5a..c6d02c8 100644 --- a/examples/device/cufftdx_fft_3d_cube_single_block.py +++ b/examples/device/cufftdx_fft_3d_cube_single_block.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. 
# # SPDX-License-Identifier: Apache-2.0 diff --git a/examples/device/cufftdx_helloworld.py b/examples/device/cufftdx_helloworld.py index ab5cc60..e873e48 100644 --- a/examples/device/cufftdx_helloworld.py +++ b/examples/device/cufftdx_helloworld.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 diff --git a/examples/device/cufftdx_simple_fft_block.py b/examples/device/cufftdx_simple_fft_block.py index b29c86f..957eeba 100644 --- a/examples/device/cufftdx_simple_fft_block.py +++ b/examples/device/cufftdx_simple_fft_block.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 diff --git a/examples/device/cufftdx_simple_fft_block_c2r.py b/examples/device/cufftdx_simple_fft_block_c2r.py index 39aefaa..b92444f 100644 --- a/examples/device/cufftdx_simple_fft_block_c2r.py +++ b/examples/device/cufftdx_simple_fft_block_c2r.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 diff --git a/examples/device/cufftdx_simple_fft_block_c2r_fp16.py b/examples/device/cufftdx_simple_fft_block_c2r_fp16.py index cc72edb..9e7d6e7 100644 --- a/examples/device/cufftdx_simple_fft_block_c2r_fp16.py +++ b/examples/device/cufftdx_simple_fft_block_c2r_fp16.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. 
# # SPDX-License-Identifier: Apache-2.0 diff --git a/examples/device/cufftdx_simple_fft_block_half2.py b/examples/device/cufftdx_simple_fft_block_half2.py index 38ec629..f2a4d3b 100644 --- a/examples/device/cufftdx_simple_fft_block_half2.py +++ b/examples/device/cufftdx_simple_fft_block_half2.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 diff --git a/examples/device/cufftdx_simple_fft_block_r2c.py b/examples/device/cufftdx_simple_fft_block_r2c.py index f95c740..38232d8 100644 --- a/examples/device/cufftdx_simple_fft_block_r2c.py +++ b/examples/device/cufftdx_simple_fft_block_r2c.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 diff --git a/examples/device/cufftdx_simple_fft_block_r2c_fp16.py b/examples/device/cufftdx_simple_fft_block_r2c_fp16.py index 4835417..78c15f9 100644 --- a/examples/device/cufftdx_simple_fft_block_r2c_fp16.py +++ b/examples/device/cufftdx_simple_fft_block_r2c_fp16.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 diff --git a/examples/device/cufftdx_simple_fft_block_shared.py b/examples/device/cufftdx_simple_fft_block_shared.py index 553e0e7..a5432a6 100644 --- a/examples/device/cufftdx_simple_fft_block_shared.py +++ b/examples/device/cufftdx_simple_fft_block_shared.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. 
# # SPDX-License-Identifier: Apache-2.0 diff --git a/examples/device/cufftdx_simple_fft_thread.py b/examples/device/cufftdx_simple_fft_thread.py index 9b9ae71..2a848c3 100644 --- a/examples/device/cufftdx_simple_fft_thread.py +++ b/examples/device/cufftdx_simple_fft_thread.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 diff --git a/examples/device/cufftdx_simple_fft_thread_fp16.py b/examples/device/cufftdx_simple_fft_thread_fp16.py index 0bffc89..51be9cc 100644 --- a/examples/device/cufftdx_simple_fft_thread_fp16.py +++ b/examples/device/cufftdx_simple_fft_thread_fp16.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 diff --git a/examples/device/curand_cufftdx_block_fft.py b/examples/device/curand_cufftdx_block_fft.py index 2a5086f..d2b0774 100644 --- a/examples/device/curand_cufftdx_block_fft.py +++ b/examples/device/curand_cufftdx_block_fft.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 @@ -10,8 +10,7 @@ import numpy as np from numba import cuda -from nvmath.device import fft, random -from nvmath.device.vector_types_numba import float32x2 +from nvmath.device import fft, random, float32x2 # Compile the random device APIs for the current device. 
compiled_random_apis = random.Compile(cc=None) diff --git a/examples/device/curand_philox_uniform4.py b/examples/device/curand_philox_uniform4.py index 4aa4edb..2dd883f 100644 --- a/examples/device/curand_philox_uniform4.py +++ b/examples/device/curand_philox_uniform4.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 @@ -12,7 +12,6 @@ """ import numpy as np -from numpy.testing import assert_allclose from numba import cuda diff --git a/examples/device/curand_scrambled_sobol64.py b/examples/device/curand_scrambled_sobol64.py index 3d87023..2b54b25 100644 --- a/examples/device/curand_scrambled_sobol64.py +++ b/examples/device/curand_scrambled_sobol64.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 diff --git a/examples/device/curand_xorwow_uniform.py b/examples/device/curand_xorwow_uniform.py index 85e856f..0ec4845 100644 --- a/examples/device/curand_xorwow_uniform.py +++ b/examples/device/curand_xorwow_uniform.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 @@ -11,7 +11,6 @@ """ import numpy as np -from numpy.testing import assert_allclose from numba import cuda diff --git a/examples/fft/caching.py b/examples/fft/caching.py index 8a18e8a..fab93bc 100644 --- a/examples/fft/caching.py +++ b/examples/fft/caching.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. 
# # SPDX-License-Identifier: Apache-2.0 @@ -36,7 +36,7 @@ def fft( cache: dict | None = None, ): """ - A cached version of FFT, taking a cache argument in addition the the regular arguments + A cached version of FFT, taking a cache argument in addition to the regular arguments for fft(). The stateful objects are cached in the provided cache, and reused. Args: diff --git a/examples/fft/example01_cupy.py b/examples/fft/example01_cupy.py index cb53175..6b74382 100644 --- a/examples/fft/example01_cupy.py +++ b/examples/fft/example01_cupy.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 diff --git a/examples/fft/example01_cupy_layouts.py b/examples/fft/example01_cupy_layouts.py index bc7c3b4..c47cd09 100644 --- a/examples/fft/example01_cupy_layouts.py +++ b/examples/fft/example01_cupy_layouts.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 diff --git a/examples/fft/example01_numpy.py b/examples/fft/example01_numpy.py index ec86624..b9a6e49 100644 --- a/examples/fft/example01_numpy.py +++ b/examples/fft/example01_numpy.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 diff --git a/examples/fft/example01_numpy_cpu_execution.py b/examples/fft/example01_numpy_cpu_execution.py index 7556537..5946652 100644 --- a/examples/fft/example01_numpy_cpu_execution.py +++ b/examples/fft/example01_numpy_cpu_execution.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. 
ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 diff --git a/examples/fft/example01_torch_complex32.py b/examples/fft/example01_torch_complex32.py index ab19051..8488d6c 100644 --- a/examples/fft/example01_torch_complex32.py +++ b/examples/fft/example01_torch_complex32.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 diff --git a/examples/fft/example02_stateful_cupy.py b/examples/fft/example02_stateful_cupy.py index f9a1d2c..fb5727f 100644 --- a/examples/fft/example02_stateful_cupy.py +++ b/examples/fft/example02_stateful_cupy.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 diff --git a/examples/fft/example02_stateful_torch.py b/examples/fft/example02_stateful_torch.py index 7e1f61c..2724238 100644 --- a/examples/fft/example02_stateful_torch.py +++ b/examples/fft/example02_stateful_torch.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 diff --git a/examples/fft/example02_stateful_torch_cpu.py b/examples/fft/example02_stateful_torch_cpu.py index f3da766..c5bd048 100644 --- a/examples/fft/example02_stateful_torch_cpu.py +++ b/examples/fft/example02_stateful_torch_cpu.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. 
# # SPDX-License-Identifier: Apache-2.0 diff --git a/examples/fft/example02_stateful_torch_cpu_execution.py b/examples/fft/example02_stateful_torch_cpu_execution.py index d2a40cf..9c9812b 100644 --- a/examples/fft/example02_stateful_torch_cpu_execution.py +++ b/examples/fft/example02_stateful_torch_cpu_execution.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 diff --git a/examples/fft/example03_options.py b/examples/fft/example03_options.py index 70e3496..3514fe0 100644 --- a/examples/fft/example03_options.py +++ b/examples/fft/example03_options.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 diff --git a/examples/fft/example03_options_cpu_execution.py b/examples/fft/example03_options_cpu_execution.py index 0bffe74..7d1f82d 100644 --- a/examples/fft/example03_options_cpu_execution.py +++ b/examples/fft/example03_options_cpu_execution.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 diff --git a/examples/fft/example04_logging_global.py b/examples/fft/example04_logging_global.py index 763a894..d77b8ba 100644 --- a/examples/fft/example04_logging_global.py +++ b/examples/fft/example04_logging_global.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. 
# # SPDX-License-Identifier: Apache-2.0 diff --git a/examples/fft/example04_logging_user.py b/examples/fft/example04_logging_user.py index 02b1da6..bf969e2 100644 --- a/examples/fft/example04_logging_user.py +++ b/examples/fft/example04_logging_user.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 diff --git a/examples/fft/example05_stateful_inplace.py b/examples/fft/example05_stateful_inplace.py index 633efe7..398ed0b 100644 --- a/examples/fft/example05_stateful_inplace.py +++ b/examples/fft/example05_stateful_inplace.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 diff --git a/examples/fft/example05_stateful_reset.py b/examples/fft/example05_stateful_reset.py index 2a8a310..e81ec88 100644 --- a/examples/fft/example05_stateful_reset.py +++ b/examples/fft/example05_stateful_reset.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 diff --git a/examples/fft/example06_r2c.py b/examples/fft/example06_r2c.py index bcaf983..b9c3d52 100644 --- a/examples/fft/example06_r2c.py +++ b/examples/fft/example06_r2c.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 diff --git a/examples/fft/example07_c2r.py b/examples/fft/example07_c2r.py index ad286aa..c15108a 100644 --- a/examples/fft/example07_c2r.py +++ b/examples/fft/example07_c2r.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. 
ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 diff --git a/examples/fft/example07_c2r_odd.py b/examples/fft/example07_c2r_odd.py index 78a3984..2eec232 100644 --- a/examples/fft/example07_c2r_odd.py +++ b/examples/fft/example07_c2r_odd.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 diff --git a/examples/fft/example08_cupy_inplace.py b/examples/fft/example08_cupy_inplace.py index 689619f..e16b5b1 100644 --- a/examples/fft/example08_cupy_inplace.py +++ b/examples/fft/example08_cupy_inplace.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 diff --git a/examples/fft/example08_numpy_inplace.py b/examples/fft/example08_numpy_inplace.py index 9b48608..0d930be 100644 --- a/examples/fft/example08_numpy_inplace.py +++ b/examples/fft/example08_numpy_inplace.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 diff --git a/examples/fft/example08_numpy_inplace_cpu_execution.py b/examples/fft/example08_numpy_inplace_cpu_execution.py index 4bbd491..d5d9db3 100644 --- a/examples/fft/example08_numpy_inplace_cpu_execution.py +++ b/examples/fft/example08_numpy_inplace_cpu_execution.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. 
# # SPDX-License-Identifier: Apache-2.0 diff --git a/examples/fft/example09_streams.py b/examples/fft/example09_streams.py index a13369e..6eea6fb 100644 --- a/examples/fft/example09_streams.py +++ b/examples/fft/example09_streams.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 diff --git a/examples/fft/example10_memory_allocator.py b/examples/fft/example10_memory_allocator.py index 1c4b271..5d45930 100644 --- a/examples/fft/example10_memory_allocator.py +++ b/examples/fft/example10_memory_allocator.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 diff --git a/examples/fft/example11_resource_mgmt.py b/examples/fft/example11_resource_mgmt.py index 40b90d9..2335b3f 100644 --- a/examples/fft/example11_resource_mgmt.py +++ b/examples/fft/example11_resource_mgmt.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 diff --git a/examples/fft/example12_stateful_unsupported_fallback.py b/examples/fft/example12_stateful_unsupported_fallback.py index 632bbfc..d7e468c 100644 --- a/examples/fft/example12_stateful_unsupported_fallback.py +++ b/examples/fft/example12_stateful_unsupported_fallback.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. 
# # SPDX-License-Identifier: Apache-2.0 diff --git a/examples/fft/example12_unsupported_fallback.py b/examples/fft/example12_unsupported_fallback.py index 9fd7522..d48eca2 100644 --- a/examples/fft/example12_unsupported_fallback.py +++ b/examples/fft/example12_unsupported_fallback.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 diff --git a/examples/fft/example13_cupy_mt_mgpu.py b/examples/fft/example13_cupy_mt_mgpu.py index d6bc1e3..6c31908 100644 --- a/examples/fft/example13_cupy_mt_mgpu.py +++ b/examples/fft/example13_cupy_mt_mgpu.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 diff --git a/examples/fft/example13_numpy_mp_mgpu.py b/examples/fft/example13_numpy_mp_mgpu.py index c3cfefe..d523bc4 100644 --- a/examples/fft/example13_numpy_mp_mgpu.py +++ b/examples/fft/example13_numpy_mp_mgpu.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 diff --git a/examples/fft/example14_caching.py b/examples/fft/example14_caching.py index cb33745..8f1623a 100644 --- a/examples/fft/example14_caching.py +++ b/examples/fft/example14_caching.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. 
# # SPDX-License-Identifier: Apache-2.0 diff --git a/examples/fft/example15_cupy_nd_fft_benchmark.py b/examples/fft/example15_cupy_nd_fft_benchmark.py index adcb257..9201229 100644 --- a/examples/fft/example15_cupy_nd_fft_benchmark.py +++ b/examples/fft/example15_cupy_nd_fft_benchmark.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 diff --git a/examples/fft/example16_cupy_nd_fft_benchmark.py b/examples/fft/example16_cupy_nd_fft_benchmark.py index 104eccc..787cfa2 100644 --- a/examples/fft/example16_cupy_nd_fft_benchmark.py +++ b/examples/fft/example16_cupy_nd_fft_benchmark.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 diff --git a/examples/fft/example17_trunc.py b/examples/fft/example17_trunc.py index 45ed02a..dc57084 100644 --- a/examples/fft/example17_trunc.py +++ b/examples/fft/example17_trunc.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 diff --git a/examples/fft/example18_5D_trunc.py b/examples/fft/example18_5D_trunc.py index 0989465..f9f1f3f 100644 --- a/examples/fft/example18_5D_trunc.py +++ b/examples/fft/example18_5D_trunc.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. 
# # SPDX-License-Identifier: Apache-2.0 diff --git a/examples/fft/example19_convolution_epilog_callback.py b/examples/fft/example19_convolution_epilog_callback.py index d5a95d2..c068092 100644 --- a/examples/fft/example19_convolution_epilog_callback.py +++ b/examples/fft/example19_convolution_epilog_callback.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 diff --git a/examples/fft/example19_convolution_memory_layout_callback.py b/examples/fft/example19_convolution_memory_layout_callback.py index fa251b9..3b6b48d 100644 --- a/examples/fft/example19_convolution_memory_layout_callback.py +++ b/examples/fft/example19_convolution_memory_layout_callback.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 diff --git a/examples/fft/example19_convolution_prolog_callback.py b/examples/fft/example19_convolution_prolog_callback.py index 8decd42..81a9052 100644 --- a/examples/fft/example19_convolution_prolog_callback.py +++ b/examples/fft/example19_convolution_prolog_callback.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 diff --git a/examples/fft/fftn1.py b/examples/fft/fftn1.py index 8a034c2..8401aff 100644 --- a/examples/fft/fftn1.py +++ b/examples/fft/fftn1.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. 
# # SPDX-License-Identifier: Apache-2.0 diff --git a/examples/fft/fftn2.py b/examples/fft/fftn2.py index 298943a..f6f7af3 100644 --- a/examples/fft/fftn2.py +++ b/examples/fft/fftn2.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 diff --git a/examples/fft/truncation.py b/examples/fft/truncation.py index 8aeb93d..4c39ef2 100644 --- a/examples/fft/truncation.py +++ b/examples/fft/truncation.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 diff --git a/examples/linalg/advanced/matmul/example01_cupy.py b/examples/linalg/advanced/matmul/example01_cupy.py index 25a4fa0..b692c1c 100644 --- a/examples/linalg/advanced/matmul/example01_cupy.py +++ b/examples/linalg/advanced/matmul/example01_cupy.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 diff --git a/examples/linalg/advanced/matmul/example01_cupy_complex64.py b/examples/linalg/advanced/matmul/example01_cupy_complex64.py index c52ff53..687115d 100644 --- a/examples/linalg/advanced/matmul/example01_cupy_complex64.py +++ b/examples/linalg/advanced/matmul/example01_cupy_complex64.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. 
# # SPDX-License-Identifier: Apache-2.0 diff --git a/examples/linalg/advanced/matmul/example01_numpy.py b/examples/linalg/advanced/matmul/example01_numpy.py index 258b744..7d03653 100644 --- a/examples/linalg/advanced/matmul/example01_numpy.py +++ b/examples/linalg/advanced/matmul/example01_numpy.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 diff --git a/examples/linalg/advanced/matmul/example01_torch.py b/examples/linalg/advanced/matmul/example01_torch.py index d6151cc..34b542a 100644 --- a/examples/linalg/advanced/matmul/example01_torch.py +++ b/examples/linalg/advanced/matmul/example01_torch.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 @@ -25,7 +25,7 @@ # No synchronization is needed for CPU tensors, since the execution always blocks. -# Check if the result is numpy array as well. +# Check if the result is a torch tensor as well. print(f"Inputs were of types {type(a)} and {type(b)} and the result is of type {type(result)}.") print(f"Inputs were located on devices {a.device} and {b.device} and the result is on {result.device}") diff --git a/examples/linalg/advanced/matmul/example02_options.py b/examples/linalg/advanced/matmul/example02_options.py index 395a595..f3ebdf7 100644 --- a/examples/linalg/advanced/matmul/example02_options.py +++ b/examples/linalg/advanced/matmul/example02_options.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. 
# # SPDX-License-Identifier: Apache-2.0 diff --git a/examples/linalg/advanced/matmul/example03_logging_global.py b/examples/linalg/advanced/matmul/example03_logging_global.py index c7d995d..bf4dd43 100644 --- a/examples/linalg/advanced/matmul/example03_logging_global.py +++ b/examples/linalg/advanced/matmul/example03_logging_global.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 diff --git a/examples/linalg/advanced/matmul/example03_logging_user.py b/examples/linalg/advanced/matmul/example03_logging_user.py index 7a30b3b..333d3d1 100644 --- a/examples/linalg/advanced/matmul/example03_logging_user.py +++ b/examples/linalg/advanced/matmul/example03_logging_user.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 diff --git a/examples/linalg/advanced/matmul/example04_stateful_cupy.py b/examples/linalg/advanced/matmul/example04_stateful_cupy.py index a39ec0b..b84c011 100644 --- a/examples/linalg/advanced/matmul/example04_stateful_cupy.py +++ b/examples/linalg/advanced/matmul/example04_stateful_cupy.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 diff --git a/examples/linalg/advanced/matmul/example04_stateful_torch.py b/examples/linalg/advanced/matmul/example04_stateful_torch.py index 87e6917..ae37fc9 100644 --- a/examples/linalg/advanced/matmul/example04_stateful_torch.py +++ b/examples/linalg/advanced/matmul/example04_stateful_torch.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. 
+# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 diff --git a/examples/linalg/advanced/matmul/example04_stateful_torch_cpu.py b/examples/linalg/advanced/matmul/example04_stateful_torch_cpu.py index 3ad4aa0..5540c28 100644 --- a/examples/linalg/advanced/matmul/example04_stateful_torch_cpu.py +++ b/examples/linalg/advanced/matmul/example04_stateful_torch_cpu.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 diff --git a/examples/linalg/advanced/matmul/example05_stateful_inplace.py b/examples/linalg/advanced/matmul/example05_stateful_inplace.py index 5d66faa..2721f7f 100644 --- a/examples/linalg/advanced/matmul/example05_stateful_inplace.py +++ b/examples/linalg/advanced/matmul/example05_stateful_inplace.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 @@ -13,20 +13,20 @@ execution also happens on the GPU. """ +import logging + import cupy as cp import nvmath +# Turn on logging to see what's happening. +logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)-8s %(message)s", datefmt="%m-%d %H:%M:%S") + # Prepare sample input data m, n, k = 123, 456, 789 a = cp.random.rand(m, k) b = cp.random.rand(k, n) -# Turn on logging to see what's happening. -import logging - -logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)-8s %(message)s", datefmt="%m-%d %H:%M:%S") - # Use the stateful object as a context manager to automatically release resources. with nvmath.linalg.advanced.Matmul(a, b) as mm: # Plan the matrix multiplication. 
Planning returns a sequence of algorithms that can be diff --git a/examples/linalg/advanced/matmul/example05_stateful_reset.py b/examples/linalg/advanced/matmul/example05_stateful_reset.py index 3f838ee..462b8b6 100644 --- a/examples/linalg/advanced/matmul/example05_stateful_reset.py +++ b/examples/linalg/advanced/matmul/example05_stateful_reset.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 @@ -11,21 +11,21 @@ The inputs as well as the result are NumPy ndarrays. """ +import logging + import numpy as np import nvmath +# Turn on logging to see what's happening. +logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)-8s %(message)s", datefmt="%m-%d %H:%M:%S") + # Prepare sample input data m, n, k = 123, 456, 789 m, n, k = 2, 3, 4 a = np.random.rand(m, k) b = np.random.rand(k, n) -# Turn on logging to see what's happening. -import logging - -logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)-8s %(message)s", datefmt="%m-%d %H:%M:%S") - # Use the stateful object as a context manager to automatically release resources. with nvmath.linalg.advanced.Matmul(a, b) as mm: # Plan the matrix multiplication. Planning returns a sequence of algorithms that can be diff --git a/examples/linalg/advanced/matmul/example06_gemm.py b/examples/linalg/advanced/matmul/example06_gemm.py index de60b9a..358512f 100644 --- a/examples/linalg/advanced/matmul/example06_gemm.py +++ b/examples/linalg/advanced/matmul/example06_gemm.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. 
# # SPDX-License-Identifier: Apache-2.0 diff --git a/examples/linalg/advanced/matmul/example07_batched_a.py b/examples/linalg/advanced/matmul/example07_batched_a.py index 6404851..fff7a19 100644 --- a/examples/linalg/advanced/matmul/example07_batched_a.py +++ b/examples/linalg/advanced/matmul/example07_batched_a.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 diff --git a/examples/linalg/advanced/matmul/example07_batched_a_b.py b/examples/linalg/advanced/matmul/example07_batched_a_b.py index 176cd02..90235a3 100644 --- a/examples/linalg/advanced/matmul/example07_batched_a_b.py +++ b/examples/linalg/advanced/matmul/example07_batched_a_b.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 diff --git a/examples/linalg/advanced/matmul/example08_batched_a_bcast_c.py b/examples/linalg/advanced/matmul/example08_batched_a_bcast_c.py index ff63ca5..bfc8017 100644 --- a/examples/linalg/advanced/matmul/example08_batched_a_bcast_c.py +++ b/examples/linalg/advanced/matmul/example08_batched_a_bcast_c.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. 
# # SPDX-License-Identifier: Apache-2.0 @@ -23,7 +23,7 @@ m = n = k = 2000 a_batch = cp.random.rand(batch_size, m, k) b = cp.random.rand(k, n) -c = cp.random.rand(m) +c = cp.random.rand(m, 1) beta = 1.2 print(f"a shape is: {a_batch.shape}, b shape is: {b.shape} and c shape is: {c.shape}") diff --git a/examples/linalg/advanced/matmul/example08_batched_a_c.py b/examples/linalg/advanced/matmul/example08_batched_a_c.py index 9307946..a9a3234 100644 --- a/examples/linalg/advanced/matmul/example08_batched_a_c.py +++ b/examples/linalg/advanced/matmul/example08_batched_a_c.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 diff --git a/examples/linalg/advanced/matmul/example09_epilog_bias.py b/examples/linalg/advanced/matmul/example09_epilog_bias.py index e3cab2a..7bfc7f6 100644 --- a/examples/linalg/advanced/matmul/example09_epilog_bias.py +++ b/examples/linalg/advanced/matmul/example09_epilog_bias.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 diff --git a/examples/linalg/advanced/matmul/example09_epilog_gelu_bias.py b/examples/linalg/advanced/matmul/example09_epilog_gelu_bias.py index fd356cf..ae2731c 100644 --- a/examples/linalg/advanced/matmul/example09_epilog_gelu_bias.py +++ b/examples/linalg/advanced/matmul/example09_epilog_gelu_bias.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. 
# # SPDX-License-Identifier: Apache-2.0 diff --git a/examples/linalg/advanced/matmul/example10_epilog_dgelu.py b/examples/linalg/advanced/matmul/example10_epilog_dgelu.py index 5d2a2f0..9b48981 100644 --- a/examples/linalg/advanced/matmul/example10_epilog_dgelu.py +++ b/examples/linalg/advanced/matmul/example10_epilog_dgelu.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 diff --git a/examples/linalg/advanced/matmul/example10_epilog_drelu.py b/examples/linalg/advanced/matmul/example10_epilog_drelu.py index c86f8c0..80e8639 100644 --- a/examples/linalg/advanced/matmul/example10_epilog_drelu.py +++ b/examples/linalg/advanced/matmul/example10_epilog_drelu.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 diff --git a/examples/linalg/advanced/matmul/example10_epilog_relu_aux.py b/examples/linalg/advanced/matmul/example10_epilog_relu_aux.py index 3e8879f..fc68b99 100644 --- a/examples/linalg/advanced/matmul/example10_epilog_relu_aux.py +++ b/examples/linalg/advanced/matmul/example10_epilog_relu_aux.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 diff --git a/examples/linalg/advanced/matmul/example11_epilog_drelu_bgrad.py b/examples/linalg/advanced/matmul/example11_epilog_drelu_bgrad.py index d002697..25af423 100644 --- a/examples/linalg/advanced/matmul/example11_epilog_drelu_bgrad.py +++ b/examples/linalg/advanced/matmul/example11_epilog_drelu_bgrad.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. 
+# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 diff --git a/examples/linalg/advanced/matmul/example12_epilog_bgrada.py b/examples/linalg/advanced/matmul/example12_epilog_bgrada.py index c13ed63..5b610d8 100644 --- a/examples/linalg/advanced/matmul/example12_epilog_bgrada.py +++ b/examples/linalg/advanced/matmul/example12_epilog_bgrada.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 diff --git a/examples/linalg/advanced/matmul/example12_epilog_bgradb.py b/examples/linalg/advanced/matmul/example12_epilog_bgradb.py index 51e1b41..a8a9cd6 100644 --- a/examples/linalg/advanced/matmul/example12_epilog_bgradb.py +++ b/examples/linalg/advanced/matmul/example12_epilog_bgradb.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 diff --git a/examples/linalg/advanced/matmul/example13_epilog_stateful_reset.py b/examples/linalg/advanced/matmul/example13_epilog_stateful_reset.py index 87cf5bd..638dac1 100644 --- a/examples/linalg/advanced/matmul/example13_epilog_stateful_reset.py +++ b/examples/linalg/advanced/matmul/example13_epilog_stateful_reset.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 @@ -11,21 +11,21 @@ The inputs as well as the result are NumPy ndarrays. """ +import logging + import numpy as np import nvmath +# Turn on logging to see what's happening. +logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)-8s %(message)s", datefmt="%m-%d %H:%M:%S") + # Prepare sample input data. 
m, n, k = 123, 456, 789 a = np.random.rand(m, k) b = np.random.rand(k, n) bias = np.random.rand(m, 1) -# Turn on logging to see what's happening. -import logging - -logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)-8s %(message)s", datefmt="%m-%d %H:%M:%S") - # Use the stateful object as a context manager to automatically release resources. with nvmath.linalg.advanced.Matmul(a, b) as mm: # Plan the matrix multiplication for the BIAS epilog. diff --git a/examples/linalg/advanced/matmul/example14_autotune.py b/examples/linalg/advanced/matmul/example14_autotune.py index b027f98..5aaf1a5 100644 --- a/examples/linalg/advanced/matmul/example14_autotune.py +++ b/examples/linalg/advanced/matmul/example14_autotune.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 diff --git a/examples/linalg/advanced/matmul/example15_manual_tuning.py b/examples/linalg/advanced/matmul/example15_manual_tuning.py index 8593864..047804e 100644 --- a/examples/linalg/advanced/matmul/example15_manual_tuning.py +++ b/examples/linalg/advanced/matmul/example15_manual_tuning.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 diff --git a/examples/linalg/advanced/matmul/example16_reuse_algorithms.py b/examples/linalg/advanced/matmul/example16_reuse_algorithms.py index 6009d9e..1f32868 100644 --- a/examples/linalg/advanced/matmul/example16_reuse_algorithms.py +++ b/examples/linalg/advanced/matmul/example16_reuse_algorithms.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. 
# # SPDX-License-Identifier: Apache-2.0 diff --git a/examples/linalg/advanced/matmul/example17_fp8.py b/examples/linalg/advanced/matmul/example17_fp8.py new file mode 100644 index 0000000..417a38e --- /dev/null +++ b/examples/linalg/advanced/matmul/example17_fp8.py @@ -0,0 +1,41 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 + +""" +This example demonstrates basic matrix multiplication of FP8 tensors. + +In narrow-precision operations, quantization scales must be provided for each tensor. These +scales are used to dequantize input operands and quantize the result. Without proper +scaling, the results of FP8 operations will likely exceed the type's range. + +FP8 is only supported with cuBLAS 12.8 or newer and on devices with compute +capability 8.9 or higher. +""" + +import torch + +import nvmath + +# Prepare sample input data. Note that N, M and K must be divisible by 16 for FP8. +# cuBLAS requires B to be column-major, so we first create a row-major tensor and then +# transpose it. +m, n, k = 64, 32, 48 +a = (torch.rand(m, k, device="cuda") * 10).type(torch.float8_e4m3fn) +b = (torch.rand(n, k, device="cuda") * 10).type(torch.float8_e4m3fn).T + +# Prepare quantization scales. The scales must allow the result to fit within the dynamic +# range of the data type used. Scales can be provided either as a dictionary or as a +# MatmulQuantizationScales object. Note that scales are only allowed for FP8 operands. +scales = {"a": 1, "b": 1, "d": 0.1} + +# Perform the multiplication. The result of the multiplication will be: +# (scales.a * A) @ (scales.b * B) * scales.d +result = nvmath.linalg.advanced.matmul(a, b, quantization_scales=scales) + +# Check how scaling helped to fit into the dynamic range of float8_e4m3fn type. 
+result_without_scaling = nvmath.linalg.advanced.matmul(a, b, quantization_scales={"a": 1, "b": 1, "d": 1}) +print("Without scaling, most of the elements were clamped to the maximum value of float8_e4m3fn type (448):") +print(result_without_scaling) +print(f"\nWith D scale set to {scales['d']}, they were scaled down to fit into the dynamic range of float8_e4m3fn:") +print(result) diff --git a/examples/linalg/advanced/matmul/example18_fp8_types.py b/examples/linalg/advanced/matmul/example18_fp8_types.py new file mode 100644 index 0000000..be657f9 --- /dev/null +++ b/examples/linalg/advanced/matmul/example18_fp8_types.py @@ -0,0 +1,69 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 + +""" +This example demonstrates how different data types can be used in FP8 multiplication. + +Two kinds of FP8 are supported: float8_e4m3fn and float8_e5m2, which have 4 and 5 bits for +the exponent respectively. We support e4m3*e4m3, e4m3*e5m2, and e5m2*e4m3 operations. + +In FP8 operations, the MatmulOptions.result_type option can be used to specify the desired +output type. For the full list of supported type combinations, please visit the cuBLAS +documentation at https://docs.nvidia.com/cuda/cublas/#cublasltmatmul. + +FP8 is only supported with cuBLAS 12.8 or newer and on devices with compute +capability 8.9 or higher. +""" + +import torch + +import nvmath + +# Prepare sample input data. Note that A and B are FP8 numbers of different types. +m, n, k = 64, 32, 48 +a = torch.rand(m, k, device="cuda").type(torch.float8_e5m2) +b = torch.rand(n, k, device="cuda").type(torch.float8_e4m3fn).T + +# Perform the multiplication, requesting FP32 output. Note that a scale for the result (D) +# is not specified because it is not FP8. 
+result_fp32 = nvmath.linalg.advanced.matmul( + a, b, quantization_scales={"a": 1, "b": 1}, options={"result_type": nvmath.CudaDataType.CUDA_R_32F} +) + +# Perform the multiplication, requesting FP16 output. +result_fp16 = nvmath.linalg.advanced.matmul( + a, b, quantization_scales={"a": 1, "b": 1}, options={"result_type": nvmath.CudaDataType.CUDA_R_16F} +) + +# Now, request FP8 (e4m3fn) output. We set the scale for D to 1 for simplicity - with small +# values in A and B, we won't exceed the range of the type anyway. +result_fp8_e4m3fn = nvmath.linalg.advanced.matmul( + a, b, quantization_scales={"a": 1, "b": 1, "d": 1}, options={"result_type": nvmath.CudaDataType.CUDA_R_8F_E4M3} +) + +# Finally, request FP8 (e5m2) output. +result_fp8_e5m2fn = nvmath.linalg.advanced.matmul( + a, b, quantization_scales={"a": 1, "b": 1, "d": 1}, options={"result_type": nvmath.CudaDataType.CUDA_R_8F_E5M2} +) + + +# Print mean relative error for each of the types +def mean_relative_error_vs_fp32(x): + reference = result_fp32.cpu() + actual = x.cpu().type(torch.float32) + return ((reference - actual).abs() / reference.abs()).mean() + + +print(f"{result_fp32.dtype=}.") +print( + f"{result_fp16.dtype=}. The mean relative error to the FP32 reference is {mean_relative_error_vs_fp32(result_fp16):.07f}." +) +print( + f"{result_fp8_e4m3fn.dtype=}. The mean relative error to the FP32 reference is " + f"{mean_relative_error_vs_fp32(result_fp8_e4m3fn):.07f}." +) +print( + f"{result_fp8_e5m2fn.dtype=}. The mean relative error to the FP32 reference is " + f"{mean_relative_error_vs_fp32(result_fp8_e5m2fn):.07f}." +) diff --git a/examples/linalg/advanced/matmul/example19_fp8_reset.py b/examples/linalg/advanced/matmul/example19_fp8_reset.py new file mode 100644 index 0000000..18822db --- /dev/null +++ b/examples/linalg/advanced/matmul/example19_fp8_reset.py @@ -0,0 +1,46 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. 
+# +# SPDX-License-Identifier: Apache-2.0 + +""" +This example demonstrates how the reset_operands method of a Matmul object can be used to +change both the operands and their quantization scales. + +FP8 is only supported with cuBLAS 12.8 or newer and on devices with compute +capability 8.9 or higher. +""" + +import torch + +import nvmath + +# Prepare sample input data +m, n, k = 128, 256, 16 +a = torch.ones(m, k, device="cuda", dtype=torch.float8_e5m2) +b = torch.ones(n, k, device="cuda", dtype=torch.float8_e4m3fn).T +print(f"A = \n{a}") +print(f"\nB = \n{b}") + +scales = {"a": 3, "b": 2, "d": 1} + +with nvmath.linalg.advanced.Matmul( + a, b, quantization_scales=scales, options={"result_type": nvmath.CudaDataType.CUDA_R_8F_E5M2} +) as mm: + # Plan the multiplication + mm.plan() + + # Execute the multiplication and print the result + result = mm.execute() + print(f"\nA (A scale: {scales['a']}) @ B (B scale: {scales['b']}) = (D scale: {scales['d']}) \n{result}") + + # Replace A with a matrix filled with 128 and adjust A and D scales. + # Note that since no new scale for B is specified, it will remain unchanged. + new_a = torch.full((m, k), 128, device="cuda").type(torch.float8_e5m2) + print(f"\nnew A = \n{new_a}") + new_a_scale = 1 + new_d_scale = 0.01 + mm.reset_operands(a=new_a, quantization_scales={"a": new_a_scale, "d": new_d_scale}) + + # Execute the multiplication again and print the new result + result2 = mm.execute() + print(f"\nA (A scale: {new_a_scale}) @ B (B scale: {scales['b']}) = (D scale: {new_d_scale}) \n{result2}") diff --git a/examples/linalg/advanced/matmul/example20_fp8_inplace_scale_change.py b/examples/linalg/advanced/matmul/example20_fp8_inplace_scale_change.py new file mode 100644 index 0000000..7f9e915 --- /dev/null +++ b/examples/linalg/advanced/matmul/example20_fp8_inplace_scale_change.py @@ -0,0 +1,52 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. 
+# +# SPDX-License-Identifier: Apache-2.0 + +""" +This example demonstrates how quantization scales passed as GPU tensors can be modified +in-place without needing to call reset_operands(). + +FP8 is only supported with cuBLAS 12.8 or newer and on devices with compute +capability 8.9 or higher. +""" + +import torch + +import nvmath + +# Prepare sample input data +m, n, k = 128, 256, 16 +a = torch.ones(m, k, device="cuda", dtype=torch.float8_e4m3fn) +b = torch.ones(n, k, device="cuda", dtype=torch.float8_e5m2).T +print(f"A = \n{a}") +print(f"\nB = \n{b}") + +# Create 1D single-element float32 GPU tensors to hold the quantization scales. +# These will be modified in-place later. +scales = { + "a": torch.full((1,), 3, dtype=torch.float32, device="cuda"), + "b": torch.full((1,), 2, dtype=torch.float32, device="cuda"), + "d": torch.full((1,), 1, dtype=torch.float32, device="cuda"), +} + +with nvmath.linalg.advanced.Matmul( + a, b, quantization_scales=scales, options={"result_type": nvmath.CudaDataType.CUDA_R_8F_E5M2} +) as mm: + # Plan the multiplication + mm.plan() + + # Execute the multiplication and print the result + result = mm.execute() + print( + f"\nA (A scale: {scales['a'].item()}) @ B (B scale: {scales['b'].item()}) = (D scale: {scales['d'].item()}) \n{result}" + ) + + # Modify the quantization scales for A and D in-place + scales["a"][:] = 2 + scales["d"][:] = 0.25 + + # Execute the multiplication again with the new quantization scales and print the result + result2 = mm.execute() + print( + f"\nA (A scale: {scales['a'].item()}) @ B (B scale: {scales['b'].item()}) = (D scale: {scales['d'].item()}) \n{result2}" + ) diff --git a/examples/linalg/advanced/matmul/example21_fp8_amax.py b/examples/linalg/advanced/matmul/example21_fp8_amax.py new file mode 100644 index 0000000..0176a4a --- /dev/null +++ b/examples/linalg/advanced/matmul/example21_fp8_amax.py @@ -0,0 +1,54 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. 
+# +# SPDX-License-Identifier: Apache-2.0 + +""" +This example demonstrates how to obtain the maximum absolute value (amax) in the result, +computed before quantization. + +In previous examples, quantization scales were set manually to appropriate values. Amax can +be used to automatically set proper scales in FP8 operations, as it indicates how much the +result needs to be scaled to fit into the dynamic range of the result type. In this example, +we first compute the result without scaling, then use amax to compute the correct scale, and +repeat the multiplication. While this approach is inefficient, it demonstrates the concept. +For a more practical example, see the `fp8_delayed_scaling` example, which uses amax from +previous iterations to choose scales for subsequent multiplications. + +FP8 is only supported with cuBLAS 12.8 or newer and on devices with compute +capability 8.9 or higher. +""" + +import torch + +import nvmath + +# Fill the input tensors with random numbers from (0, 30). +m, n, k = 128, 128, 16 +a = (torch.rand(m, k, device="cuda") * 30).type(torch.float8_e4m3fn) +b = (torch.rand(n, k, device="cuda") * 30).type(torch.float8_e4m3fn).T + +# To request amax, set `result_amax` option to True. +options = {"result_amax": True} + +# When result_amax is set, a tuple containing the actual result and the auxiliary outputs +# will be returned instead of just the result. +result, aux = nvmath.linalg.advanced.matmul(a, b, quantization_scales={"a": 1, "b": 1, "d": 1}, options=options) + +# With all quantization scales set to 1, most of the elements are clamped to the maximum +# value: +print("Result is:") +print(result) + +# Amax will be present in the auxiliary outputs dictionary as "result_amax". +print(f"Matmul returned the result and the auxiliary outputs of type {type(aux)}: {aux}") + +# Compute the proper scale by dividing the maximum representable value by amax. 
+max_representable_value = 448 +amax = aux["result_amax"].item() +d_scale = max_representable_value / amax +print(f"d_scale = max_representable_value / amax = {max_representable_value} / {amax:.5f} = {d_scale:.5f}") + +# Repeat the computation, this time using the proper scale for D. +result2 = nvmath.linalg.advanced.matmul(a, b, quantization_scales={"a": 1, "b": 1, "d": d_scale}) +print(f"Result (with D scale set to {d_scale:.5f}) is:") +print(result2) diff --git a/examples/linalg/advanced/matmul/example22_fp8_delayed_scaling.py b/examples/linalg/advanced/matmul/example22_fp8_delayed_scaling.py new file mode 100644 index 0000000..78629b4 --- /dev/null +++ b/examples/linalg/advanced/matmul/example22_fp8_delayed_scaling.py @@ -0,0 +1,64 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 + +""" +This example demonstrates how to implement a simple delayed scaling algorithm. We use the +amax value from the previous iteration to set the scale for the next iteration. In a more +advanced setup, an average amax from N previous iterations could be used as well. In each +iteration, we multiply two normally-distributed matrices A and B and add matrix C to the +result. + +FP8 is only supported with cuBLAS 12.8 or newer and on devices with compute +capability 8.9 or higher. 
+""" + +import torch + +import nvmath + +m, n, k = 256, 256, 256 +a = torch.zeros(m, k, device="cuda", dtype=torch.float8_e4m3fn) +b = torch.zeros(n, k, device="cuda", dtype=torch.float8_e4m3fn).T +c = torch.zeros(m, n, device="cuda", dtype=torch.float16) + + +def regenerate_inputs(): + a[:] = torch.randn(a.shape, device="cuda") * 10 + b[:] = torch.randn(b.shape, device="cuda") * 10 + c[:] = torch.randn(c.shape, device="cuda") * 10 + return a, b, c + + +# Keep D scale in a GPU tensor instead of a Python float to allow in-place changes +dscale = torch.ones((1,), dtype=torch.float32, device="cuda") +scales = {"a": 1, "b": 1, "d": dscale} + +# Request FP8 output and AMAX calculation +options = {"result_type": nvmath.CudaDataType.CUDA_R_8F_E4M3, "result_amax": True} + +with nvmath.linalg.advanced.Matmul(a, b, c=c, beta=1, quantization_scales=scales, options=options) as mm: + mm.plan() + + for iteration in range(10): + # Populate a, b, and c with fresh random data + regenerate_inputs() + + # Execute the matrix multiplication + result, aux = mm.execute() + amax = aux["result_amax"] + + # Calculate the percentage of clamped values + max_representable_value = 448 + clamped_percent = ( + 100 * ((result == max_representable_value) | (result == -max_representable_value)).sum().item() / result.nelement() + ) + + # Print a report. Note that the percentage of clamped values will rapidly decrease + print( + f"Iteration {iteration} with dscale={dscale.item():05f}: " + f"amax={amax.item():.2f}, {clamped_percent:.02f}% of values were clamped to the max value." + ) + + # Update D scale for the next iteration + dscale[:] = max_representable_value / amax diff --git a/examples/linalg/advanced/matmul/example23_fp8_epilog.py b/examples/linalg/advanced/matmul/example23_fp8_epilog.py new file mode 100644 index 0000000..2f94144 --- /dev/null +++ b/examples/linalg/advanced/matmul/example23_fp8_epilog.py @@ -0,0 +1,45 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. 
ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 + +""" +This example demonstrates using RELU epilog with FP8 matrix multiplication. + +In FP8 operations, quantization scales must be provided for each tensor. These scales are +used to dequantize input operands and quantize the result. The RELU epilog is applied +after scaling but before final quantization. + +FP8 is only supported with cuBLAS 12.8 or newer and on devices with compute +capability 8.9 or higher. +""" + +import torch + +import nvmath + +# Prepare sample input data with some negative values +m, n, k = 64, 32, 48 +a = (torch.rand(m, k, device="cuda") * 20 - 10).type(torch.float8_e4m3fn) +b = (torch.rand(n, k, device="cuda") * 20 - 10).type(torch.float8_e4m3fn).T + +# Set quantization scales to keep values in range +scales = {"a": 1, "b": 1, "d": 0.1} + +# First perform multiplication without RELU +result_no_relu = nvmath.linalg.advanced.matmul(a, b, quantization_scales=scales) + +# Now perform multiplication with RELU epilog +result_with_relu = nvmath.linalg.advanced.matmul( + a, + b, + epilog=nvmath.linalg.advanced.MatmulEpilog.RELU, + quantization_scales=scales, +) + +print("Result without RELU (notice negative values):") +print(result_no_relu) +print("\nResult with RELU (all values >= 0):") +print(result_with_relu) + +# Verify that all values in the RELU result are non-negative +assert torch.all(result_with_relu.type(torch.float32) >= 0), "RELU result contains negative values!" diff --git a/examples/linalg/advanced/matmul/example24_fp8_epilog_aux.py b/examples/linalg/advanced/matmul/example24_fp8_epilog_aux.py new file mode 100644 index 0000000..2df34be --- /dev/null +++ b/examples/linalg/advanced/matmul/example24_fp8_epilog_aux.py @@ -0,0 +1,69 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 + +""" +This example demonstrates using GELU_AUX epilog with FP8 outputs. 
+ +For GELU_AUX epilog, when A and B are e4m3fn, you can request the auxiliary output to +be returned as FP8. To request FP8 auxiliary output, set epilog.aux_type to an FP8 type +in MatmulPlanPreferences. + +You can specify the scale for this auxiliary output by passing the scale +as "aux_quantization_scale" input in `epilog_inputs`. Additionally, you can request amax to be +computed for this output by setting `epilog.aux_amax=True` in MatmulPlanPreferences. + +Note that FP8 auxiliary outputs are supported only for particular epilogs and type +combinations. For more details on the supported configurations, please see the cuBLAS +documentation. + +FP8 is only supported with cuBLAS 12.8 or newer and on devices with compute +capability 8.9 or higher. +""" + +import torch + +import nvmath + +m, n, k = 64, 64, 64 +a = (torch.randn(m, k, device="cuda") - 0.5).type(torch.float8_e4m3fn) +b = (torch.randn(n, k, device="cuda") - 0.5).type(torch.float8_e4m3fn).T + +scales = {"a": 1, "b": 1, "d": 1} + +# Specify quantization scale to use for auxiliary epilog output +epilog_inputs = {"aux_quantization_scale": 0.1} + +# Instead of a Dict, you may instantiate MatmulPlanPreferences object. +preferences = { + "epilog": { + "aux_type": nvmath.CudaDataType.CUDA_R_8F_E4M3, + "aux_amax": True, + } +} + +# Execute the operation. Note that we pass `preferences` argument. +result, aux = nvmath.linalg.advanced.matmul( + a, + b, + epilog=nvmath.linalg.advanced.MatmulEpilog.GELU_AUX, + epilog_inputs=epilog_inputs, + preferences=preferences, + quantization_scales=scales, +) + +# Print the result. +print("Result:") +print(result) +print() + +# Print the auxiliary values returned. There should be "gelu_aux" (scaled by 0.1) and +# "gelu_aux_amax" containing the maximum absolute value before scaling (amax). 
+assert set(aux.keys()) == {"gelu_aux", "gelu_aux_amax"} +print(f"Auxiliary outputs are {set(aux.keys())}:") +print(aux) +print() + +print(f"Note that gelu_aux is an FP8 tensor: {aux['gelu_aux'].dtype=}") +print(f"Also, amax has been returned: {aux['gelu_aux_amax']=}") +print(f"Also, amax has been returned: {aux['gelu_aux_amax']=}") diff --git a/examples/linalg/advanced/matmul/example25_mxfp8.py b/examples/linalg/advanced/matmul/example25_mxfp8.py new file mode 100644 index 0000000..09ce8be --- /dev/null +++ b/examples/linalg/advanced/matmul/example25_mxfp8.py @@ -0,0 +1,61 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 + +""" +This example demonstrates basic matrix multiplication of FP8 tensors using MXFP8 +(microscaled FP8) quantization scales. + +Key differences from FP8: +- MXFP8 scales are applied to each 32-element block of the tensors, rather than using a + single tensor-wide scaling factor. This allows more fine-grained control over scaling + and improves the accuracy of MXFP8 operations. +- MXFP8 scales are uint8 numbers in exponent-only format, representing values of the form + 2^n, where n is an integer between -127 and 128. +- In MXFP8 mode, if D is FP8, it is scaled automatically during the matmul operation and + the quantization scales used are returned as "d_out_scale". This is covered in the next + example. + +To use MXFP8, set the `block_scaling` option to True. + +The layout of the quantization scales is relatively complex. To facilitate working with +MXFP8, we provide helper functions in `nvmath.linalg.advanced.helpers.matmul`. + +FP8 is only supported with cuBLAS 12.8 or newer and on devices with compute +capability 10.0 or higher. +""" + +import torch + +import nvmath + +# Prepare sample input data. Note that N, M and K must be divisible by 128 for MXFP8. +# cuBLAS requires B to be column-major, so we first create a row-major tensor and then +# transpose it. 
+a = torch.eye(256, device="cuda", dtype=torch.float8_e4m3fn) # A is an identity matrix +b = torch.ones((256, 256), device="cuda", dtype=torch.float8_e4m3fn).T # B is filled with ones + +# Prepare quantization scales for A and B using the `create_mxfp8_scale` helper. +# While MXFP8 allows different scales for different blocks in A and B, +# this helper creates uniform scaling across all blocks. +# For more advanced scale configurations, see the cuBLAS documentation and +# the `get_mxfp8_scale_offset` helper. +scales = { + "a": nvmath.linalg.advanced.helpers.matmul.create_mxfp8_scale(a, -1), # 2^-1 = 0.5 + "b": nvmath.linalg.advanced.helpers.matmul.create_mxfp8_scale(b, 3), # 2^3 = 8 +} + +# Enable block scaling by setting the `block_scaling` option to True. For simplicity, we +# request FP16 output. For FP8 output scaling, see the mxfp8_d_out example. +options = {"block_scaling": True, "result_type": nvmath.CudaDataType.CUDA_R_16F} + +# Perform the multiplication. Because a non-FP8 result type (FP16) is requested, only +# the result is returned here; the call is not unpacked into a (result, aux) tuple. +result = nvmath.linalg.advanced.matmul(a, b, quantization_scales=scales, options=options) + +# Compute reference result without scaling +reference = a.type(torch.float16) @ b.type(torch.float16) +print(f"Reference result (without scaling):\n{reference}") + +# Print the result with scaling applied +print(f"Result with scaling (A scaled by 0.5, B scaled by 8):\n{result}") diff --git a/examples/linalg/advanced/matmul/example26_mxfp8_d_out.py b/examples/linalg/advanced/matmul/example26_mxfp8_d_out.py new file mode 100644 index 0000000..92d3c1f --- /dev/null +++ b/examples/linalg/advanced/matmul/example26_mxfp8_d_out.py @@ -0,0 +1,78 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 + +""" +This example demonstrates automatic output scaling in MXFP8 mode. 
+When using MXFP8, D is automatically scaled during the matmul operation and the scale used +is returned as "d_out_scale". This scale can be used as input for subsequent matrix +multiplications (see mxfp8_chaining example) or applied to the result using a helper +function. + +To use MXFP8, set the `block_scaling` option to True. + +The layout of MXFP8 scales is complex. To simplify working with them, we provide helper +functions in `nvmath.linalg.advanced.helpers.matmul`. For more advanced operations on +MXFP8 scales, please refer to the cuBLAS documentation. + +FP8 is only supported with cuBLAS 12.8 or newer and on devices with compute +capability 10.0 or higher. +""" + +import torch + +import nvmath + +# Prepare sample input data. Note that N, M and K must be divisible by 128 for MXFP8. +m, n, k = 256, 256, 512 + +# Create matrix A with values increasing by row to demonstrate scaling with different +# magnitudes +a = torch.zeros(m, k, device="cuda", dtype=torch.float8_e4m3fn) +a[:] = torch.arange(m)[:, None] # Each row will have progressively larger values +print("Matrix A:") +print(a) +print() + +# cuBLAS requires B to be column-major, so we first create a row-major tensor and then +# transpose it. +b = torch.rand(n, k, device="cuda").type(torch.float8_e4m3fn).T +print("Matrix B:") +print(b) +print() + +# Prepare quantization scales for A and B using the create_mxfp8_scale helper. +# Note: We don't set a scale for D since MXFP8 automatically scales the result to fit +# within the output type's dynamic range. +scales = { + "a": nvmath.linalg.advanced.helpers.matmul.create_mxfp8_scale(a, -6), # 2^-6 = 0.015625 + "b": nvmath.linalg.advanced.helpers.matmul.create_mxfp8_scale(b, 0), # 2^0 = 1 +} + +# Enable block scaling +options = { + "block_scaling": True, +} + +# Perform the multiplication. The result is a tuple containing (result, aux). +# The aux dictionary contains "d_out_scale" - the scale used for the result. 
+result, aux = nvmath.linalg.advanced.matmul(a, b, quantization_scales=scales, options=options) + +# Display results +print("Result (each block scaled to fit within float8_e4m3fn range):") +print(result) +print() + +# Examine the D_OUT quantization scales +print(f"Auxiliary output contains these keys: {list(aux.keys())}") +print( + f"D scale tensor shape: {aux['d_out_scale'].shape}, type: {aux['d_out_scale'].dtype}. " + f"Contains {len(aux['d_out_scale'].unique())} unique scale factors." +) + +# Apply the scale to get the actual result. Note: This helper function is for demonstration +# purposes and may use significant memory. For production use, set result_type to a +# non-FP8 type instead. +actual_result = nvmath.linalg.advanced.helpers.matmul.apply_mxfp8_scale(result, aux["d_out_scale"]) +print("Final result (with quantization scales applied):") +print(actual_result) diff --git a/examples/linalg/advanced/matmul/example27_mxfp8_chaining.py b/examples/linalg/advanced/matmul/example27_mxfp8_chaining.py new file mode 100644 index 0000000..0b535df --- /dev/null +++ b/examples/linalg/advanced/matmul/example27_mxfp8_chaining.py @@ -0,0 +1,61 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 + +""" +This example demonstrates how D_OUT quantization scale can be reused as input scale for +subsequent matrix multiplications. In this example, we compute matrix exponentiation by +chaining multiple matrix multiplications, while feeding D_OUT scale as A scale. + +FP8 is only supported with cuBLAS 12.8 or newer and on devices with compute +capability 10.0 or higher. 
+""" + +import torch + +import nvmath + +size = 256 + +p = 4 + +# We will compute B^p = A*B*B*...*B +a = torch.eye(size, device="cuda", dtype=torch.float8_e4m3fn) # Identity matrix +print("Initial value of A (identity matrix):") +print(a) +print() + +b = ( + (torch.eye(size, device="cuda") * (1 + torch.arange(size, device="cuda"))).type(torch.float8_e4m3fn).T +) # Diagonal matrix with ascending values +print("Initial value of B (diagonal matrix):") +print(b) +print() + +b_scale = nvmath.linalg.advanced.helpers.matmul.create_mxfp8_scale(b, 0) # 2^0 = 1 + +init_scales = { + "a": nvmath.linalg.advanced.helpers.matmul.create_mxfp8_scale(a, 0), # 2^0 = 1 + "b": b_scale, +} + +options = { + "block_scaling": True, +} + +torch.set_printoptions(sci_mode=False) + +with nvmath.linalg.advanced.Matmul(a, b, quantization_scales=init_scales, options=options) as mm: + mm.plan() + for i in range(1, p + 1): + d, aux = mm.execute() + + # Replace A with A*B and use the D_OUT scale as input scale for the new A + d_out_scale = aux["d_out_scale"] + print(f"{d_out_scale=}") + mm.reset_operands(a=d, quantization_scales={"a": d_out_scale}) + + # Print the result with quantization scales applied + print(f"Result of B^{i} (with quantization scales applied):") + print(nvmath.linalg.advanced.helpers.matmul.apply_mxfp8_scale(d, d_out_scale)) + print() diff --git a/examples/linalg/advanced/matmul/example28_mxfp8_epilog.py b/examples/linalg/advanced/matmul/example28_mxfp8_epilog.py new file mode 100644 index 0000000..a0185b3 --- /dev/null +++ b/examples/linalg/advanced/matmul/example28_mxfp8_epilog.py @@ -0,0 +1,37 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 + +""" +In this example, we perform MXFP8 matrix multiplication with ReLU activation. + +FP8 is only supported with cuBLAS 12.8 or newer and on devices with compute +capability 10.0 or higher. 
+""" + +import torch + +import nvmath + +# Prepare sample input data with dimensions m=256, n=256, k=512 +m, n, k = 256, 256, 512 +a = torch.randn(m, k, device="cuda").type(torch.float8_e4m3fn) +b = torch.randn(n, k, device="cuda").type(torch.float8_e4m3fn).T + +scales = { + "a": nvmath.linalg.advanced.helpers.matmul.create_mxfp8_scale(a, 0), # Scale factor 2^0 = 1 + "b": nvmath.linalg.advanced.helpers.matmul.create_mxfp8_scale(b, 0), # Scale factor 2^0 = 1 +} + +options = { + "block_scaling": True, +} + +result, aux = nvmath.linalg.advanced.matmul( + a, b, quantization_scales=scales, options=options, epilog=nvmath.linalg.advanced.MatmulEpilog.RELU +) + +# Display the results +print("Result after applying D_OUT scales:") +print(nvmath.linalg.advanced.helpers.matmul.apply_mxfp8_scale(result, aux["d_out_scale"])) +print("All values are non-negative due to the ReLU activation.") diff --git a/examples/linalg/advanced/matmul/example29_mxfp8_layout.py b/examples/linalg/advanced/matmul/example29_mxfp8_layout.py new file mode 100644 index 0000000..ee9b6a1 --- /dev/null +++ b/examples/linalg/advanced/matmul/example29_mxfp8_layout.py @@ -0,0 +1,41 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 + +""" +This example demonstrates how to use the get_mxfp8_scale_offset helper function to modify +individual scaling factors in MXFP8 matrix multiplication. + +FP8 is only supported with cuBLAS 12.8 or newer and on devices with compute +capability 10.0 or higher. 
+""" + +import torch + +import nvmath + +# Prepare sample input data +size = 256 +a = torch.eye(size, device="cuda", dtype=torch.float8_e4m3fn) +b = torch.ones(size, size, device="cuda", dtype=torch.float8_e4m3fn).T + +a_scale = nvmath.linalg.advanced.helpers.matmul.create_mxfp8_scale(a, 0) +b_scale = nvmath.linalg.advanced.helpers.matmul.create_mxfp8_scale(b, 0) + +options = {"block_scaling": True, "result_type": nvmath.CudaDataType.CUDA_R_32F} + +# Compute initial result with all scale factors set to 1 +result = nvmath.linalg.advanced.matmul(a, b, quantization_scales={"a": a_scale, "b": b_scale}, options=options) +print("Initial result with all scale factors set to 1:") +print(result) + +# Use get_mxfp8_scale_offset helper to modify the scale factor for the block containing +# position (2, 1) +offset = nvmath.linalg.advanced.helpers.matmul.get_mxfp8_scale_offset(b, (2, 1)) +b_scale[offset] += 4 # Increase the exponent by 4 + +# Compute result with modified scale factor +result2 = nvmath.linalg.advanced.matmul(a, b, quantization_scales={"a": a_scale, "b": b_scale}, options=options) +print("\nResult after modifying one scale factor:") +print(result2) +print(f"\nThe scale factor modification affected {(result2 != 1).sum().item()} elements in the block.") diff --git a/notebooks/matmul/01_introduction.ipynb b/notebooks/matmul/01_introduction.ipynb index ba6a3f0..719c990 100644 --- a/notebooks/matmul/01_introduction.ipynb +++ b/notebooks/matmul/01_introduction.ipynb @@ -5,7 +5,7 @@ "id": "88073684-ba4e-42eb-9d9e-f7541473ce4f", "metadata": {}, "source": [ - "Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES\n", + "Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES\n", "\n", "SPDX-License-Identifier: BSD-3-Clause" ] @@ -23,7 +23,7 @@ "id": "292c2024-bd73-48de-8320-62c0ec0df645", "metadata": {}, "source": [ - "In this notebook we will demonstrate how to perform GEMM (General Matrix Multiply) with nvmath-python library.\n", + "In this tutorial we will demonstrate 
how to perform GEMM (General Matrix Multiply) with nvmath-python library.\n", "\n", "We will demonstrate two APIs to execute matrix multiplication with nvmath-python:\n", "- `matmul` function (*stateless API*), which performs a single GEMM on its arguments and returns the result.\n", @@ -402,7 +402,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": null, "id": "2444b8d9-473c-40cf-ba0c-10002598d57e", "metadata": {}, "outputs": [ @@ -423,7 +423,7 @@ "with Matmul(a, b, c=c, alpha=2, beta=0.7) as mm:\n", " mm.plan()\n", " result = mm.execute()\n", - " # mm.free() is no longer needed, the resourced are freed here.\n", + " # mm.free() is no longer needed, the resources are freed by the context manager.\n", "print(result)" ] }, diff --git a/notebooks/matmul/02_epilogs.ipynb b/notebooks/matmul/02_epilogs.ipynb index 3c4d100..fc98742 100644 --- a/notebooks/matmul/02_epilogs.ipynb +++ b/notebooks/matmul/02_epilogs.ipynb @@ -5,7 +5,7 @@ "id": "fb0742d2-785b-4def-9939-9309b0f5c3e7", "metadata": {}, "source": [ - "Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES\n", + "Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES\n", "\n", "SPDX-License-Identifier: BSD-3-Clause" ] @@ -23,7 +23,7 @@ "id": "292c2024-bd73-48de-8320-62c0ec0df645", "metadata": {}, "source": [ - "In this notebook, we will demonstrate the use of cuBLAS *epilogs*. An epilog is a simple computation executed after performing the matrix multiplication. Epilogs are usually much faster than the same computation executed manually on the result." + "In this tutorial, we will demonstrate the use of cuBLAS *epilogs*. An epilog is a simple computation executed after performing the matrix multiplication. Epilogs are usually much faster than the same computation executed manually on the result." 
] }, { @@ -315,7 +315,7 @@ "source": [ "## Learning more\n", "\n", - "We will show a practical use case for the epilogs in the next notebook, in which we will implement a simple digit recognition neural network using nvmath-python `matmul` and its epilogs.\n", + "We will show a practical use case for the epilogs in the next tutorial, in which we will implement a simple digit recognition neural network using nvmath-python `matmul` and its epilogs.\n", "\n", "To learn more about the available epilogs, you can visit [cuBLAS documentation on epilogs](https://docs.nvidia.com/cuda/cublas/#cublasltepilogue-t)." ] diff --git a/notebooks/matmul/03_backpropagation.ipynb b/notebooks/matmul/03_backpropagation.ipynb index 2801b56..37ddb0c 100644 --- a/notebooks/matmul/03_backpropagation.ipynb +++ b/notebooks/matmul/03_backpropagation.ipynb @@ -5,7 +5,7 @@ "id": "7853d721-ee63-4177-a01a-0c07f835814d", "metadata": {}, "source": [ - "Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES\n", + "Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES\n", "\n", "SPDX-License-Identifier: BSD-3-Clause" ] @@ -23,7 +23,7 @@ "id": "292c2024-bd73-48de-8320-62c0ec0df645", "metadata": {}, "source": [ - "In this notebook we will demonstrate how you can use nvmath-python matrix multiplication capabilities to implement a simple neural network recognizing digits from MNIST dataset. We will show how matmul epilogs can be used to simplify implementation and improve performance of both forward and backward pass. To learn more about how to use nvmath-python, please refer to the previous notebooks.\n", + "In this tutorial we will demonstrate how you can use nvmath-python matrix multiplication capabilities to implement a simple neural network recognizing digits from MNIST dataset. We will show how matmul epilogs can be used to simplify implementation and improve performance of both forward and backward pass. 
To learn more about how to use nvmath-python, please refer to the previous tutorials.\n", "\n", "We will use PyTorch to conveniently load and preprocess the data, but the model itself will be implemented in CuPy and nvmath-python." ] @@ -164,7 +164,7 @@ "source": [ "## Implementation baseclass\n", "\n", - "`MnistNetBase` will serve as the base class for our implementation of the model. It handles weight and bias initialization, but doesn't implement forward and backward pass. We will provide implementations for them later in the notebook." + "`MnistNetBase` will serve as the base class for our implementation of the model. It handles weight and bias initialization, but doesn't implement forward and backward pass. We will provide implementations for them later in the tutorial." ] }, { diff --git a/notebooks/matmul/04_fp8.ipynb b/notebooks/matmul/04_fp8.ipynb new file mode 100644 index 0000000..c4b2984 --- /dev/null +++ b/notebooks/matmul/04_fp8.ipynb @@ -0,0 +1,1077 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "7f359b74-cfdf-4f89-9ab4-26316904cafb", + "metadata": {}, + "source": [ + "Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES\n", + "\n", + "SPDX-License-Identifier: BSD-3-Clause" + ] + }, + { + "cell_type": "markdown", + "id": "8fecddc0-de29-4943-b0c9-f9be659d5ee4", + "metadata": {}, + "source": [ + "# FP8 computations with nvmath-python\n", + "\n", + "In this tutorial we will introduce the narrow-precision FP8 formats and demonstrate how to use nvmath-python to perform computations in FP8 and MXFP8 formats.\n", + "\n", + "## Table of Contents\n", + "1. Classic floating-point formats\n", + "2. Narrow-precision floating-point formats (FP8)\n", + "3. FP8 operations with nvmath-python\n", + "4. 
MXFP8 (microscaling FP8) operations with nvmath-python\n", + "\n", + "## Classic floating-point formats\n", + "[Floating point numbers](https://en.wikipedia.org/wiki/Floating-point_arithmetic) consist of a *significand* (also known as *mantissa*) multiplied by an integral power of 2 (*exponent*). Except for certain special cases (such as infinity, NaN, or [subnormal numbers](https://en.wikipedia.org/wiki/Subnormal_number)), the value of the number is:\n", + "$$\n", + "\\pm\\;\\text{significand} \\cdot 2 ^ {\\;\\text{exponent}}\n", + "$$\n", + "\n", + "In a typical floating point number, a fixed number of bits store the exponent and the significand. For example, in the [IEEE 754 32-bit float](https://en.wikipedia.org/wiki/Single-precision_floating-point_format) (also known as single-precision float), 23 bits store the significand, 8 bits store the exponent, and one last bit indicates the sign of the number. \n", + "\n", + "The two most important characteristics of a floating point number are:\n", + "- *Precision*, which is the number of bits in the significand. Precision determines the number of representable values on a certain interval.\n", + "- *Dynamic range*, which is the range of representable values. The dynamic range is determined by the number of bits for the exponent.\n", + "\n", + "### High-precision formats\n", + "\n", + "In scientific computations, where precision is extremely important, single-precision formats are sometimes not enough. For this reason, some formats aim to increase precision and range by using more bits for both the significand and exponent. 
This includes the [double-precision 64-bit floats](https://en.wikipedia.org/wiki/Double-precision_floating-point_format), which use 11 bits for the exponent and 52 bits for the significand, but also even wider formats, such as [quadruple-precision 128-bit floats](https://en.wikipedia.org/wiki/Quadruple-precision_floating-point_format) or [octuple-precision 256-bit floats](https://en.wikipedia.org/wiki/Octuple-precision_floating-point_format).\n", + "\n", + "### FP16 formats\n", + "\n", + "In certain use cases, most notably in the training of deep learning networks, the exact computations are not as important, and lower-precision formats can offer a better trade-off between performance and precision. Using 16-bit floating point numbers, which occupy half the memory of a classic float, can significantly speed up memory-bound computations, while maintaining an acceptable precision. The two most common 16-bit floating point types are:\n", + "\n", + "- [Half-precision](https://en.wikipedia.org/wiki/Half-precision_floating-point_format), with 5 bits for the exponent and 10 bits for the significand.\n", + "- [Bfloat16](https://en.wikipedia.org/wiki/Bfloat16_floating-point_format) (*brain* float), with 8 bits for the exponent and 7 bits for the significand. The dynamic range of `bfloat16` matches that of a single-precision float since both have 8 bits for the exponent. However, this comes at the cost of significantly reduced precision.\n", + "\n", + "An interesting alternative to 16-bit data types is the [TF32](https://en.wikipedia.org/wiki/TensorFloat-32) format, which is an internal computation format available on NVIDIA Tensor Cores.\n", + "\n", + "## Narrow-precision (FP8 and lower) formats\n", + "\n", + "Even smaller floating point formats have recently gained popularity. While narrow-precision formats are not practical in many areas, they still perform well in Deep Learning applications, especially in the inference. 
In nvmath-python, we support 8-bit floating point types. However, even smaller types are used, such as 6-bit FP6 type or even 4-bit FP4.\n", + "\n", + "The two most commonly used FP8 formats are `float8_e4m3fn` and `float8_e5m2`.\n", + "\n", + "The `float8_e4m3fn` format has 4 bits for the exponent (`e4`) and 3 bits for the significand (`m3`). The `fn` suffix indicates that the number is finite, meaning there's no special value indicating infinity. With just 4 bits for the exponent, the dynamic range of `float8_e4m3fn` is very small - its maximum value is just 448.\n", + "\n", + "The second format, `float8_e5m2`, has one more bit reserved for the exponent, allowing it to store values up to 57344. However, this comes at the cost of reduced precision due to having one fewer bit for the significand.\n", + "\n", + "To illustrate the precision of these types, let's plot their representable values between 0 and 20 and between 0 and 2000. Note how `float8_e5m2` has half the density of `float8_e4m3fn` due to having one less bit for the significand, but has much wider dynamic range. Also notice that 9 is not representable in `float8_e5m2`, and 17 is not representable in `float8_e4m3fn`." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "6395b056-59d9-4be8-8956-9db2e73d4809", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import torch\n", + "from matplotlib import pyplot as plt\n", + "\n", + "\n", + "fig, axs = plt.subplots(2, 1, figsize=(8,5))\n", + "for ax, limit in zip(axs, (20, 2000), strict=True):\n", + " x = torch.linspace(0, limit, 10**4)\n", + " for i, dtype in enumerate((torch.float8_e4m3fn, torch.float8_e5m2)):\n", + " representable_values = x.type(dtype).type(torch.float32).unique() # Cast to float8 dtype and back\n", + " ax.scatter(representable_values, torch.full_like(representable_values, i), label=repr(dtype))\n", + " ax.set_ylim(-0.5, 2.5)\n", + " ax.set_yticks([])\n", + " ax.set_xticks(torch.linspace(0, limit, 11, dtype=torch.int32))\n", + " ax.legend()\n", + "fig.suptitle(\"Representable values of float8 types\");" + ] + }, + { + "cell_type": "markdown", + "id": "e5f255ea-c58e-43a0-8bbc-04653372fe5e", + "metadata": {}, + "source": [ + "### FP8 scaling\n", + "\n", + "Due to the very limited dynamic range of FP8 types, the results of FP8 GEMM rarely fit into the dynamic range of FP8. To address this issue, we use *scaling factors* for each operand to ensure they fit into the dynamic range of FP8 type. The internal computations of GEMM are performed in higher precision.\n", + "\n", + "We first scale the input FP8 operands to expand them from the FP8 type into the desired range (*dequantize*). Then, we scale the result down to make it fit into the dynamic range (*quantize*) again. With scales, matrix multiplication becomes:\n", + "$$\n", + "D = \\text{scale}_D\\cdot\\left( (\\text{scale}_A \\cdot A)\\;@\\;(\\text{scale}_B \\cdot B) \\right) \n", + "$$\n", + "\n", + "We will discuss methods for properly choosing the scales later." 
+ ] + }, + { + "cell_type": "markdown", + "id": "ecdd8446-1dff-48eb-905f-d221caedb140", + "metadata": {}, + "source": [ + "## FP8 operations with nvmath-python" + ] + }, + { + "cell_type": "markdown", + "id": "a756e092-904b-48f2-a862-ac6f119b6a6b", + "metadata": {}, + "source": [ + "\n", + "
\n", + " FP8 matrix multiplication in nvmath-python requires CUDA Toolkit 12.8 or newer and a device with compute capability 8.9 or higher. Devices with compute capability 8.9 or higher include devices based on Ada, Hopper or Blackwell architectures, such as L40, H100, H200, B100, B200. Please refer to the table at https://developer.nvidia.com/cuda-gpus to check the compute capability of your device. Note that you also need a PyTorch version built with CUDA 12.8.\n", + "
" + ] + }, + { + "cell_type": "markdown", + "id": "bddce74d-eac5-4c3c-9fc8-070336a0a3d4", + "metadata": {}, + "source": [ + "Let's begin by computing a simple FP8 GEMM with nvmath-python. The only API difference between this simple FP8 operation and a classic GEMM with nvmath-python is the presence of the `quantization_scales` argument. It can be either a `MatmulQuantizationScales` object or a dictionary. Scales are allowed (and required) only for FP8 operands.\n", + "\n", + "There are certain limitations regarding the shapes and layouts of the operands. All dimensions must be divisible by 16, and `b` needs to be in column-major layout while `a` needs to be in row-major layout. Since PyTorch creates row-major tensors by default, we will transpose `b` after creation.\n", + "\n", + "Let's create `a` and `b` with values sampled from a normal distribution with mean 0 and standard deviation 5, i.e. $\\mathcal{N}(0, 5^2)$." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "127c1137-7c7c-44f8-84b5-b3592588d09b", + "metadata": {}, + "outputs": [], + "source": [ + "import nvmath\n", + "\n", + "m, n, k = 256, 240, 128\n", + "a = torch.normal(mean=torch.zeros(m, k), std=5).cuda().type(torch.float8_e4m3fn)\n", + "b = torch.normal(mean=torch.zeros(n, k), std=5).cuda().type(torch.float8_e4m3fn).T" + ] + }, + { + "cell_type": "markdown", + "id": "2208fd8a-7e67-4141-be6e-3d9989d9fc42", + "metadata": {}, + "source": [ + "To demonstrate why scaling is so important for FP8 operations, let's first compute the result with all scales set to 1. Note that we're using the stateless `matmul` function for simplicity, but the stateful `Matmul` class should be used for repeated computations." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "b76dddc7-6bdb-4f61-8e1c-a0be9f7357e1", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "tensor([[-320., -448., 2., ..., 104., 8., -36.],\n", + " [-288., -120., -144., ..., 448., 448., 288.],\n", + " [-448., 176., -60., ..., -176., 96., -448.],\n", + " ...,\n", + " [ 192., 384., 104., ..., 320., 20., -144.],\n", + " [-192., -448., 288., ..., 176., 112., 384.],\n", + " [-160., -320., 192., ..., 448., -120., 96.]], device='cuda:0',\n", + " dtype=torch.float8_e4m3fn)" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "result_unscaled = nvmath.linalg.advanced.matmul(a, b, quantization_scales={\"a\": 1, \"b\": 1, \"d\": 1})\n", + "result_unscaled" + ] + }, + { + "cell_type": "markdown", + "id": "2499c680-4bde-4315-b33c-21651127ccfd", + "metadata": {}, + "source": [ + "Note that many values reached the maximum/minimum value of `float8_e4m3fn` ($\\pm 448$). 
To avoid this, let's lower the scale for the result (D):" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "e13fa8ac-f5a2-4a40-8dbf-581ef36bbd01", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "tensor([[-160., -288., 1., ..., 52., 4., -18.],\n", + " [-144., -60., -72., ..., 224., 224., 144.],\n", + " [-288., 88., -30., ..., -88., 48., -240.],\n", + " ...,\n", + " [ 96., 192., 52., ..., 160., 10., -72.],\n", + " [ -96., -256., 144., ..., 88., 56., 192.],\n", + " [ -80., -160., 96., ..., 240., -60., 48.]], device='cuda:0',\n", + " dtype=torch.float8_e4m3fn)" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "result_scaled = nvmath.linalg.advanced.matmul(a, b, quantization_scales={\"a\": 1, \"b\": 1, \"d\": 0.5})\n", + "result_scaled" + ] + }, + { + "cell_type": "markdown", + "id": "bc45d256-6e46-4b75-97f8-40f45ad929c6", + "metadata": {}, + "source": [ + "Now all values have been scaled down by a factor of $0.5$. As a result, most of them fit into the dynamic range of `float8_e4m3fn`. Let's illustrate this with a histogram:" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "ff104d7e-e38e-4028-b5b3-04fc28c6ff41", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "fig, axs = plt.subplots(2, 1)\n", + "bins = range(-550, 551, 50)\n", + "axs[0].hist(result_unscaled.flatten().cpu().type(torch.float32), density=True, bins=bins,label=\"D scale=1\")\n", + "axs[1].hist(result_scaled.flatten().cpu().type(torch.float32), density=True, bins=bins, label=\"D scale=0.5\")\n", + "for ax in axs:\n", + " ax.set_yticks([])\n", + " ax.axvline(448, color=\"tab:red\")\n", + " ax.axvline(-448, color=\"tab:red\")\n", + " ax.legend()" + ] + }, + { + "cell_type": "markdown", + "id": "1e30578d-74fe-412f-a695-37ebc54ca688", + "metadata": {}, + "source": [ + "Notice how values are clamped to ±448 when the scale is set to 1." + ] + }, + { + "cell_type": "markdown", + "id": "8884013b-a995-4e71-b6d9-2113e1aec12f", + "metadata": {}, + "source": [ + "### Mixing different data types for A, B, C and D\n", + "\n", + "Unlike higher precision operations, for FP8 operations the types of all operands don't have to be equal. 
Specifically:\n", + "- A and B can both be `float8_e4m3fn`, or one of them can be `float8_e5m2`.\n", + "- C can be `float16`, `bfloat16` or `float32` (but not `float8`).\n", + "- If C is a 16-bit float, D can have the type of A, B or C.\n", + "\n", + "For more details on the supported types, please refer to the [cublasLtMatmul documentation in cuBLASLt docs](https://docs.nvidia.com/cuda/cublas/#cublasltmatmul).\n", + "\n", + "In nvmath-python, the default result type for D is the type of C:" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "98f3789e-7089-4625-aa94-39174da46acb", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "a.dtype=torch.float8_e5m2 b.dtype=torch.float8_e4m3fn c.dtype=torch.bfloat16 => d.dtype=torch.bfloat16\n" + ] + } + ], + "source": [ + "a = torch.rand(m, k).type(torch.float8_e5m2)\n", + "b = torch.rand(n, k).type(torch.float8_e4m3fn).T\n", + "c = torch.rand(m, n).type(torch.bfloat16)\n", + "d = nvmath.linalg.advanced.matmul(a, b, c=c, beta=1, quantization_scales={\"a\": 1, \"b\": 1})\n", + "print(f\"{a.dtype=} {b.dtype=} {c.dtype=} => {d.dtype=}\")" + ] + }, + { + "cell_type": "markdown", + "id": "184af9ab-0b19-4636-bb1d-c04b76f98e8f", + "metadata": {}, + "source": [ + "If C is not specified, the default type for D is the type of A:" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "17cba1da-efa7-464a-a3c6-ed5d5ece4018", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "a.dtype=torch.float8_e5m2 b.dtype=torch.float8_e4m3fn => d.dtype=torch.float8_e5m2\n" + ] + } + ], + "source": [ + "a = torch.rand(m, k).type(torch.float8_e5m2)\n", + "b = torch.rand(n, k).type(torch.float8_e4m3fn).T\n", + "d = nvmath.linalg.advanced.matmul(a, b, quantization_scales={\"a\": 1, \"b\": 1, \"d\": 1})\n", + "print(f\"{a.dtype=} {b.dtype=} => {d.dtype=}\")" + ] + }, + { + "cell_type": "markdown", + "id": 
"cea06ad0-ef17-438d-94cd-d4e0ccfd6ff5", + "metadata": {}, + "source": [ + "However, you can also request a specific type for D by setting the `result_type` option:" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "20d19166-70ec-40ae-864a-28e440364426", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "a.dtype=torch.float8_e5m2 b.dtype=torch.float8_e4m3fn options={'result_type': } => d.dtype=torch.bfloat16\n" + ] + } + ], + "source": [ + "a = torch.rand(m, k).type(torch.float8_e5m2)\n", + "b = torch.rand(n, k).type(torch.float8_e4m3fn).T\n", + "options = {\n", + " \"result_type\": nvmath.CudaDataType.CUDA_R_16BF # bfloat16\n", + "}\n", + "d = nvmath.linalg.advanced.matmul(a, b, quantization_scales={\"a\": 1, \"b\": 1}, options=options)\n", + "print(f\"{a.dtype=} {b.dtype=} {options=} => {d.dtype=}\")" + ] + }, + { + "cell_type": "markdown", + "id": "2ae843db-bfd1-49f2-bd3b-2e47ced9308b", + "metadata": {}, + "source": [ + "Note that we provide a D scale only when D is FP8." + ] + }, + { + "cell_type": "markdown", + "id": "c95c17c5-806e-49df-b7e2-05804fcefe25", + "metadata": {}, + "source": [ + "### Scale modification in stateful Matmul\n", + "\n", + "Until now, we were using stateless `matmul` function to perform the multiplication. However, for repeated computations, stateful `Matmul` class should be used. For more details on the stateful and stateless APIs, visit the [nvmath-python documentation](https://docs.nvidia.com/cuda/nvmath-python/latest/overview.html#stateless-and-stateful-apis)\n", + ".\n", + "When performing multiple multiplications with a stateful `Matmul` object, you'll likely need to change the scales along with the operands. 
You can do this by specifying new scales as the `quantization_scales` argument of `reset_operands`:" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "0d2b1749-888c-4fb1-9cef-e94d4c44868a", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Mean value of result with A scale set to 2: 126.25104522705078\n", + "Mean value of result with A scale set to 0.0001: 0.006436571013182402\n" + ] + } + ], + "source": [ + "a = torch.rand(m, k, device=\"cuda\").type(torch.float8_e5m2)\n", + "b = torch.rand(n, k, device=\"cuda\").type(torch.float8_e4m3fn).T\n", + "with nvmath.linalg.advanced.Matmul(a, b, quantization_scales={\"a\": 2, \"b\": 2, \"d\": 1}) as mm:\n", + " mm.plan()\n", + " result1 = mm.execute()\n", + " print(f\"Mean value of result with A scale set to 2: {result1.type(torch.float32).mean().item()}\")\n", + " new_a = torch.rand(m, k, device=\"cuda\").type(torch.float8_e5m2)\n", + " mm.reset_operands(a=new_a, quantization_scales={\"a\": 0.0001}) # Note that not all scales need to be reset\n", + " result2 = mm.execute()\n", + " print(f\"Mean value of result with A scale set to 0.0001: {result2.type(torch.float32).mean().item()}\")" + ] + }, + { + "cell_type": "markdown", + "id": "e4581f18-bdec-46d3-b79b-dd2ea8f4c542", + "metadata": {}, + "source": [ + "Alternatively, you can provide the quantization scales as single-element GPU tensors (instead of Python scalars). 
This allows you to change them in-place:" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "081c14f3-3df9-48f7-bd3a-2a994e299cf4", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Mean value of result with A scale set to 2: 127.38880920410156\n", + "Mean value of result with A scale set to 0.0001: 0.006453291978687048\n" + ] + } + ], + "source": [ + "a = torch.rand(m, k, device=\"cuda\").type(torch.float8_e5m2)\n", + "b = torch.rand(n, k, device=\"cuda\").type(torch.float8_e4m3fn).T\n", + "a_scale = torch.full((1,), 2., device=\"cuda\", dtype=torch.float32) # We'll change it in-place\n", + "with nvmath.linalg.advanced.Matmul(a, b, quantization_scales={\"a\": a_scale, \"b\": 2, \"d\": 1}) as mm:\n", + " mm.plan()\n", + " result1 = mm.execute()\n", + " print(f\"Mean value of result with A scale set to 2: {result1.type(torch.float32).mean().item()}\")\n", + " a_scale[:] = 0.0001 # Note: no reset_operands\n", + " result2 = mm.execute()\n", + " print(f\"Mean value of result with A scale set to 0.0001: {result2.type(torch.float32).mean().item()}\")" + ] + }, + { + "cell_type": "markdown", + "id": "7bd981a0-75e4-4c54-8f56-318350295264", + "metadata": {}, + "source": [ + "### Delayed scaling with amax\n", + "\n", + "In real-world applications, choosing the scales might be non-trivial. However, in some cases, such as during the training of deep learning models, the distribution of inputs in subsequent iterations tends to be similar, which allows estimating the scales needed based on the results from previous iterations.\n", + "\n", + "To facilitate choosing scales automatically, nvmath-python can output the maximum absolute value in the result, abbreviated as *amax*. To request the amax, set the `result_amax` option to `True`. 
When amax is requested, nvmath-python will return a tuple containing the actual result and an auxiliary output dictionary containing the amax as a single-element tensor (possibly along with other outputs)." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "ad0262b4-e0b6-4c07-8b49-8f437ed92145", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "type(ret)=\n", + "aux={'result_amax': tensor([42.3923], device='cuda:0')}\n" + ] + } + ], + "source": [ + "a = torch.rand(m, k, device=\"cuda\").type(torch.float8_e4m3fn)\n", + "b = torch.rand(n, k, device=\"cuda\").type(torch.float8_e4m3fn).T\n", + "options = {\n", + " \"result_amax\": True\n", + "}\n", + "ret = nvmath.linalg.advanced.matmul(a, b, quantization_scales={\"a\": 1, \"b\": 1, \"d\": 1}, options=options)\n", + "print(f\"{type(ret)=}\")\n", + "result, aux = ret\n", + "print(f\"{aux=}\")" + ] + }, + { + "cell_type": "markdown", + "id": "22a6e556-9e39-4e19-86a3-f831d8a18198", + "metadata": {}, + "source": [ + "Let's now implement a simple delayed scaling approach. With delayed scaling, the amax values from previous iterations are used to compute the correct D scales for subsequent iterations. In each iteration, we will multiply two tensors with values sampled from $\\mathcal{N}(0, 1)$, with each tensor scaled by a random factor sampled uniformly from $(0, 10)$.\n", + "\n", + "We'll start with all scales set to 1. In each iteration, we will adjust the scale for D to:\n", + "$$\n", + "\\text{scale}_D = \\frac{\\max(\\mathtt{float8\\_e4m3fn})}{\\text{amax}} = \\frac{448}{\\text{amax}}\n", + "$$" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "f1cbd081-6c55-44d1-84e9-e65b34839e19", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Iteration 1: amax=tensor([2117.1724], device='cuda:0'), 28.85% values are 448. 
Setting D scale to 0.21160298585891724\n", + "Iteration 2: amax=tensor([1987.5466], device='cuda:0'), 0.00% values are 448. Setting D scale to 0.2254035323858261\n", + "Iteration 3: amax=tensor([2071.7327], device='cuda:0'), 0.00% values are 448. Setting D scale to 0.21624411642551422\n", + "Iteration 4: amax=tensor([2219.4202], device='cuda:0'), 0.00% values are 448. Setting D scale to 0.20185452699661255\n", + "Iteration 5: amax=tensor([1961.7855], device='cuda:0'), 0.00% values are 448. Setting D scale to 0.22836339473724365\n", + "Iteration 6: amax=tensor([2017.6495], device='cuda:0'), 0.00% values are 448. Setting D scale to 0.2220405638217926\n", + "Iteration 7: amax=tensor([2089.1074], device='cuda:0'), 0.00% values are 448. Setting D scale to 0.21444565057754517\n", + "Iteration 8: amax=tensor([2168.7864], device='cuda:0'), 0.00% values are 448. Setting D scale to 0.20656713843345642\n", + "Iteration 9: amax=tensor([2081.2173], device='cuda:0'), 0.00% values are 448. Setting D scale to 0.2152586430311203\n", + "Iteration 10: amax=tensor([2324.1553], device='cuda:0'), 0.00% values are 448. 
Setting D scale to 0.1927582025527954\n" + ] + } + ], + "source": [ + "m, n, k = 1024, 1024, 2048\n", + "a = torch.zeros(m, k, device=\"cuda\").type(torch.float8_e4m3fn)\n", + "b = torch.zeros(n, k, device=\"cuda\").type(torch.float8_e4m3fn).T\n", + "\n", + "def regenerate_inputs():\n", + " global a, b\n", + " a[:] = 3 * torch.randn(a.shape, device=\"cuda\")\n", + " b[:] = 3 * torch.randn(b.shape, device=\"cuda\")\n", + "\n", + "scales = {\"a\": 1, \"b\": 1, \"d\": torch.ones((1,), device=\"cuda\", dtype=torch.float32)} # D scale is a GPU tensor to allow for in-place change\n", + "options = {\"result_amax\": True}\n", + "num_iters = 10\n", + "with nvmath.linalg.advanced.Matmul(a, b, quantization_scales=scales, options=options) as mm:\n", + " mm.plan()\n", + " for iteration in range(1, num_iters+1):\n", + " regenerate_inputs()\n", + " d, aux = mm.execute()\n", + " amax = aux[\"result_amax\"]\n", + " overflow_frac = (d.cpu().abs() == 448).sum() / d.nelement()\n", + " scales[\"d\"][:] = 448 / amax\n", + " print(f\"Iteration {iteration}: amax={amax}, {overflow_frac*100:.02f}% values are 448. Setting D scale to {scales['d'].item()}\")" + ] + }, + { + "cell_type": "markdown", + "id": "0ab4fa2b-a778-4fef-81b8-1d9897c0499c", + "metadata": {}, + "source": [ + "Depending on the use case, it might be beneficial to compute the new scale based not on a single amax value, but on the history of amax values." + ] + }, + { + "cell_type": "markdown", + "id": "3d5bd83a-085b-47b0-b334-a884199938ff", + "metadata": {}, + "source": [ + "## MXFP8 (microscaling FP8) operations with nvmath-python\n", + "\n", + "
\n", + "MXFP8 matrix multiplication in nvmath-python requires CUDA Toolkit 12.8 or newer and a device with compute capability 10.0 or higher. Devices with compute capability 10.0 or higher include devices based on Blackwell architecture, such as B100 and B200. Please refer to the table at https://developer.nvidia.com/cuda-gpus to check the compute capability of your device. Note that you also need PyTorch version built with CUDA 12.8.\n", + "
" + ] + }, + { + "cell_type": "markdown", + "id": "ca86db73-f598-4856-a68c-a72bfaac4bec", + "metadata": {}, + "source": [ + "### The limits of tensor-wide scaling\n", + "\n", + "Sometimes, even with scaling, getting results clamped is unavoidable. The D scale factor needs to be small enough to squeeze all (or almost all) values into the dynamic range of the FP8 type used. However, if it's too small, numbers very close to 0 turn into 0 when scaled down. To illustrate this, let's consider the following (artificial) example.\n", + "\n", + "Let A and B be matrices sampled from $\\mathcal{N}(0, 1)$. However, to expand the range of values, let's scale each row by a random power of 2, sampled from $2^{\\;\\mathcal{U}(-8, 8)}$. This example is exaggerated for learning purposes, but illustrates the problems with FP8 tensor-wide scaling well." + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "da80b7c2-9b6d-4911-b8b6-a761b3797d9d", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Absolute values in the result range from 1.998816878767684e-05 to 12908.6708984375.\n" + ] + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "m, n, k = 256, 256, 256\n", + "\n", + "a_fp32 = torch.randn(m, k) * (2 ** (torch.rand(m) * 16 - 8)[:,None])\n", + "b_fp32 = torch.randn(n, k).T\n", + "d_fp32 = a_fp32 @ b_fp32\n", + "\n", + "fig, axs = plt.subplots(1, 3, figsize=(12, 3))\n", + "plt.colorbar(axs[0].imshow(a_fp32.abs().log2()), ax=axs[0])\n", + "axs[0].set_title(\"A (log2 of abs)\")\n", + "axs[0].axis(\"off\")\n", + "plt.colorbar(axs[1].imshow(b_fp32.abs().log2()), ax=axs[1])\n", + "axs[1].set_title(\"B (log2 of abs)\")\n", + "axs[1].axis(\"off\")\n", + "plt.colorbar(axs[2].imshow(d_fp32.abs().log2()), ax=axs[2])\n", + "# ax=axs[2])\n", + "axs[2].set_title(\"A @ B (log2 of abs)\")\n", + "axs[2].axis(\"off\")\n", + "\n", + "print(f\"Absolute values in the result range from {d_fp32.abs().min()} to {d_fp32.abs().max()}.\")" + ] + }, + { + "cell_type": "markdown", + "id": "e311db38-d240-4c2a-8986-844073093046", + "metadata": {}, + "source": [ + "With such a huge range of values, regardless of the scale we choose, either some very small values will be scaled down to 0, or some big values will be clamped to 448. Let's see on a plot how the choice of the quantization scale for D affects the number of errors of each kind." + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "e871cb38-474d-4e43-ae5e-c92a25aaaddd", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "a = a_fp32.cuda().type(torch.float8_e4m3fn)\n", + "b = b_fp32.cuda().type(torch.float8_e4m3fn)\n", + "\n", + "def count_errors(dscale):\n", + " scales = {\"a\": 1, \"b\": 1, \"d\": dscale}\n", + " d = nvmath.linalg.advanced.matmul(a, b, quantization_scales=scales)\n", + " return (d == 0).sum().item() / d.nelement(), (d.cpu().abs() == 448).sum().item() / d.nelement()\n", + "\n", + "dscales = 2**torch.linspace(-5, 0, 100)\n", + "errors = torch.as_tensor([count_errors(dscale) for dscale in dscales])\n", + "underflows = errors[:,0] * 100\n", + "overflows = errors[:,1] * 100\n", + "total_fp8_errors = underflows + overflows\n", + "\n", + "fig, ax = plt.subplots(1, 1, figsize=(6,4))\n", + "ax.plot(dscales, underflows, label=\"Underflow errors\")\n", + "ax.plot(dscales, overflows, label=\"Overflow errors\")\n", + "ax.plot(dscales, total_fp8_errors, color=\"black\", linestyle=\":\", label=\"Total errors\")\n", + "ax.set_xscale(\"log\", base=2)\n", + "ax.set_xlabel(\"Applied D scale\")\n", + "ax.set_ylabel(\"Errors [%]\")\n", + "ax.legend();\n", + "plt.ylim(ymin=0);" + ] + }, + { + "cell_type": "markdown", + "id": "9c7e2a81-5eb1-4eab-b2e8-4589fb78679f", + "metadata": {}, + "source": [ + "### Basics of MXFP8\n", + "\n", + "To address this issue, MXFP8, which stands for microscaling FP8, uses *block scaling* instead of *tensor-wide scaling*: instead of applying a single scale factor to the whole matrix, separate scales are applied to blocks of 32 elements. This way, even when there are both very big and very small numbers in the matrix, it is possible keep both of them - unless, of course, both big and small values happen to be placed in the same scaling block.\n", + "\n", + "To enable MXFP8, set `block_scaling` option to `True`.\n", + "\n", + "Also, in MXFP8 mode, result scaling works differently. 
Instead of expecting the user to provide the scale for D, cuBLAS will compute an appropriate scale for each block, apply the scales, and return the scaled output together with the scales used. The scales will be returned as `\"d_out_scale\"` in the auxiliary outputs dictionary.\n", + "\n", + "As in the case of FP8, you have to provide scales for A and B. Because the layout of the block scaling factor is quite complex, nvmath provides the `create_mxfp8_scale` helper, which allows you to create block scaling factors for a given tensor, filled with a constant value. The second argument of `create_mxfp8_scale` is a base 2 exponent of the scale to be used. In this case, by passing $0$ we request the scale to be $2^0=1$." + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "6e4151e8-393e-4df3-b9e5-3c24ad8ebbd8", + "metadata": {}, + "outputs": [], + "source": [ + "scales = {\n", + " \"a\": nvmath.linalg.advanced.helpers.matmul.create_mxfp8_scale(a, 0),\n", + " \"b\": nvmath.linalg.advanced.helpers.matmul.create_mxfp8_scale(b, 0),\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "c628c8c2-78df-4df5-991e-058bc38ebfd2", + "metadata": {}, + "source": [ + "Let's try that:" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "9da4bff5-433c-4d4e-838b-da67311eee4e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(tensor([[-384., 160., 24., ..., 416., 208., 224.],\n", + " [-120., -120., -104., ..., -22., 144., 160.],\n", + " [-192., -288., -120., ..., 224., 96., 15.],\n", + " ...,\n", + " [-416., 416., 384., ..., 18., 32., -72.],\n", + " [ 112., 320., -60., ..., 112., 13., 64.],\n", + " [ 192., 128., -192., ..., 160., 176., -56.]], device='cuda:0',\n", + " dtype=torch.float8_e4m3fn),\n", + " {'d_out_scale': tensor([128, 128, 129, ..., 117, 117, 117], device='cuda:0',\n", + " dtype=torch.uint8)})" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "options = 
{\"block_scaling\": True}\n", + "result, aux = nvmath.linalg.advanced.matmul(a, b, options=options, quantization_scales=scales) # Note: a tuple is returned\n", + "result, aux" + ] + }, + { + "cell_type": "markdown", + "id": "46252c4a-e3f1-4838-bf0e-d8bc5d025877", + "metadata": {}, + "source": [ + "We will discuss the meaning of the block scales tensor later. For now, let's just use the `apply_mxfp8_scale` helper to apply the returned scales to the returned D:" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "e80b4308-c1c5-4ef9-a6a3-f66117a3ad68", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "tensor([[-7.6800e+02, 3.2000e+02, 4.8000e+01, ..., 8.3200e+02,\n", + " 4.1600e+02, 4.4800e+02],\n", + " [-3.7500e+00, -3.7500e+00, -3.2500e+00, ..., -6.8750e-01,\n", + " 4.5000e+00, 5.0000e+00],\n", + " [-1.8750e-01, -2.8125e-01, -1.1719e-01, ..., 4.3750e-01,\n", + " 1.8750e-01, 2.9297e-02],\n", + " ...,\n", + " [-1.3000e+01, 1.3000e+01, 1.2000e+01, ..., 1.1250e+00,\n", + " 2.0000e+00, -4.5000e+00],\n", + " [ 2.1875e-01, 6.2500e-01, -1.1719e-01, ..., 4.3750e-01,\n", + " 5.0781e-02, 2.5000e-01],\n", + " [ 1.8750e-01, 1.2500e-01, -1.8750e-01, ..., 1.5625e-01,\n", + " 1.7188e-01, -5.4688e-02]], device='cuda:0')" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "nvmath.linalg.advanced.helpers.matmul.apply_mxfp8_scale(result, aux[\"d_out_scale\"])" + ] + }, + { + "cell_type": "markdown", + "id": "e98a2fc1-5d6f-470d-ac0f-23ef78083d0e", + "metadata": {}, + "source": [ + "Let's see how many underflow/overflow errors happened in the MXFP8 computation:" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "7fafc3e4-5882-455e-820c-a8efe80f0c51", + "metadata": {}, + "outputs": [], + "source": [ + "mxfp8_underflows = 100 * (result == 0).sum() / result.nelement()\n", + "mxfp8_overflows = 100 * (result.cpu().abs() == 448).sum() / result.nelement()\n", + "mxfp8_errors = 
mxfp8_underflows + mxfp8_overflows" + ] + }, + { + "cell_type": "markdown", + "id": "94e8c513-31f5-4089-b7a6-64ce4ac53863", + "metadata": {}, + "source": [ + "Let's see how it compares to FP8:" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "278362e0-373b-477f-983b-d2c8a0b7fa27", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "fig, ax = plt.subplots(1, 1, figsize=(6,4))\n", + "ax.plot(dscales, total_fp8_errors, label=\"FP8 errors\", color=\"black\", linestyle=\":\")\n", + "ax.axhline(mxfp8_errors.item(), label=\"MXFP8 errors\", color=\"#76b900\")\n", + "ax.set_xscale(\"log\", base=2)\n", + "ax.set_xlabel(\"Applied D scale for FP8\")\n", + "ax.set_ylabel(\"Errors [%]\")\n", + "plt.ylim(ymin=0)\n", + "ax.legend();" + ] + }, + { + "cell_type": "markdown", + "id": "43625c5f-f9a5-4027-97eb-79b93d132a53", + "metadata": {}, + "source": [ + "In this particular case, MXFP8 performed so well, because by scaling each row by a single factor we ensured that the numbers in each MXFP8 block are of similar magnitude. If we scaled each element individually by a random power of 2, the gains from MXFP8 would be much less visible. However, in the Deep Learning workloads, MXFP8 has been demonstrated to improve the accuracy." + ] + }, + { + "cell_type": "markdown", + "id": "1d69ac06-2a69-4659-b089-fc810997f626", + "metadata": {}, + "source": [ + "### MXFP8 block scales value encoding\n", + "\n", + "#### Value encoding\n", + "As you may have noticed, the scales used in MXFP8 are small integers:" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "19f89955-7ee7-499d-912f-43f09d03564d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "tensor([128, 128, 129, ..., 117, 117, 117], device='cuda:0',\n", + " dtype=torch.uint8)" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "aux[\"d_out_scale\"]" + ] + }, + { + "cell_type": "markdown", + "id": "a9b0659a-d48a-4ce0-82f5-17820d93fa33", + "metadata": {}, + "source": [ + "Each scale is encoded in the UE8M0 format, which means an unsigned 8-bit exponent, without any explicit significand bits. We interpret the value $x$ as:\n", + "$\n", + "2^{x - 127}\n", + "$. 
For example, a value of `132` means that the scale used was $2^{132-127}=2^5=32$.\n", + "\n", + "### [Advanced] MXFP8 block scales layout\n", + "\n", + "The layout of the scales is relatively complex. In typical use cases, you shouldn't have to care about it. However, in advanced use cases you may need to tweak particular scaling factors. This section is optional and for curious users. In normal use cases, the details of the MXFP8 layout should be opaque to the user and are subject to change in future releases of nvmath-python.\n", + "\n", + "We provide `get_mxfp8_scale_offset`, which given a tuple of indices in the matrix, returns the index in the scales vector containing the scale for this item. To illustrate its usage, let's create two simple matrices, and prepare unit scales for both:" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "9d6289a6-5909-4439-beea-eb3a4511f19c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "tensor([[1., 1., 1., ..., 1., 1., 1.],\n", + " [1., 1., 1., ..., 1., 1., 1.],\n", + " [1., 1., 1., ..., 1., 1., 1.],\n", + " ...,\n", + " [1., 1., 1., ..., 1., 1., 1.],\n", + " [1., 1., 1., ..., 1., 1., 1.],\n", + " [1., 1., 1., ..., 1., 1., 1.]])" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "a = torch.ones(128, 128).type(torch.float8_e4m3fn)\n", + "b = torch.eye(128).type(torch.float8_e4m3fn).T # identity matrix\n", + "scales = {\n", + " \"a\": nvmath.linalg.advanced.helpers.matmul.create_mxfp8_scale(a, 0),\n", + " \"b\": nvmath.linalg.advanced.helpers.matmul.create_mxfp8_scale(b, 0),\n", + "}\n", + "result, aux = nvmath.linalg.advanced.matmul(a, b, quantization_scales=scales, options={\"block_scaling\": True})\n", + "actual_result = nvmath.linalg.advanced.helpers.matmul.apply_mxfp8_scale(result, aux[\"d_out_scale\"])\n", + "actual_result" + ] + }, + { + "cell_type": "markdown", + "id": "304b3d9b-5a72-40b2-86af-afe2ad0ea4a6", + "metadata": {}, + 
"source": [ + "Now, let's use `get_mxfp8_scale_offset` to get the scale for element at $(40, 70)$ in `a` and set the scale to 16x bigger value. Let's repeat the matmul and display the results. Note how changing one scale affected the whole block of 32 elements." + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "b57a80c2-40d4-443c-8e8f-aebe8cc65dca", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Offset is 134\n" + ] + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "offset = nvmath.linalg.advanced.helpers.matmul.get_mxfp8_scale_offset(a, (40, 70))\n", + "print(\"Offset is\", offset)\n", + "scales[\"a\"][offset] += 4 # Increasing the exponent by 4 means scaling 16x.\n", + "\n", + "result2, aux = nvmath.linalg.advanced.matmul(a, b, quantization_scales=scales, options={\"block_scaling\": True})\n", + "actual_result2 = nvmath.linalg.advanced.helpers.matmul.apply_mxfp8_scale(result, aux[\"d_out_scale\"])\n", + "\n", + "fig, axs = plt.subplots(1, 2, figsize=(6, 4))\n", + "axs[0].imshow(actual_result)\n", + "axs[1].imshow(actual_result2)\n", + "axs[0].set_title(\"Before scale change\")\n", + "axs[1].set_title(\"After scale change\");" + ] + }, + { + "cell_type": "markdown", + "id": "7bd2ee42-bb37-4dd0-b3b9-a10ad992413d", + "metadata": {}, + "source": [ + "To visualize the layout of MXFP8 scales, let's use `get_mxfp8_scale_offset` again, but this time computing the offset for each element. Note how the blocks for A (row-major layout) are laid out horizontally, while blocks of B (column-major layout) are laid out vertically. This is because the MXFP8 scales are always applied to blocks of consecutive elements in the memory." + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "bd4c3000-7f2a-46f8-b5a8-07514dc02344", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "all_indices = torch.meshgrid(torch.arange(128), torch.arange(128), indexing=\"ij\")\n", + "offsets_a = nvmath.linalg.advanced.helpers.matmul.get_mxfp8_scale_offset(a, all_indices)\n", + "offsets_b = nvmath.linalg.advanced.helpers.matmul.get_mxfp8_scale_offset(b, all_indices)\n", + "\n", + "fig, ax = plt.subplots(1, 2, figsize=(8, 4))\n", + "ax[0].imshow(offsets_a * 10 % 93, cmap=\"gray\")\n", + "ax[1].imshow(offsets_b * 10 % 93, cmap=\"gray\")\n", + "ax[0].set_title(\"Blocks in A (row-major)\")\n", + "ax[1].set_title(\"Blocks in B (column-major)\");" + ] + }, + { + "cell_type": "markdown", + "id": "53917462-7d60-4370-b6c0-4d90a2c27d45", + "metadata": {}, + "source": [ + "For more details about the MXFP8 scales layout, please visit the cuBLASLt documentation." + ] + }, + { + "cell_type": "markdown", + "id": "6686d5f8-701e-4cd4-a6ab-170b761e62c4", + "metadata": {}, + "source": [ + "## Learning more\n", + "- For more examples of FP8 and MXFP8 usage, see the [examples on our GitHub](https://github.com/NVIDIA/nvmath-python/tree/main/examples/linalg/advanced/matmul).\n", + "- For more details on narrow-precision types, visit [cuBLASLt documentation](https://docs.nvidia.com/cuda/cublas/#narrow-precision-data-types-usage)." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.11" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/nvmath/__init__.py b/nvmath/__init__.py index 5755be5..c508d69 100644 --- a/nvmath/__init__.py +++ b/nvmath/__init__.py @@ -1,14 +1,16 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. 
+# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 +import importlib.metadata + from nvmath import bindings from nvmath._utils import ComputeType from nvmath._utils import CudaDataType from nvmath._utils import LibraryPropertyType from nvmath import fft, linalg -from nvmath.memory import * +from nvmath.memory import BaseCUDAMemoryManager, MemoryPointer # Attempt to preload libraries. Fail silently if preload fails. @@ -19,3 +21,16 @@ def _force_lib_load(): _force_lib_load() + +__all__ = [ + "BaseCUDAMemoryManager", + "bindings", + "ComputeType", + "CudaDataType", + "fft", + "LibraryPropertyType", + "linalg", + "MemoryPointer", +] + +__version__ = importlib.metadata.version("nvmath-python") diff --git a/nvmath/_internal/enum_utils.py b/nvmath/_internal/enum_utils.py index 03a9eee..54f4335 100644 --- a/nvmath/_internal/enum_utils.py +++ b/nvmath/_internal/enum_utils.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 @@ -10,7 +10,7 @@ import dataclasses from enum import IntEnum import re -from typing import Any, ClassVar, Optional +from typing import Any, ClassVar from collections.abc import Callable import numpy diff --git a/nvmath/_internal/formatters.py b/nvmath/_internal/formatters.py index f1b785a..d1bafde 100644 --- a/nvmath/_internal/formatters.py +++ b/nvmath/_internal/formatters.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 diff --git a/nvmath/_internal/layout.py b/nvmath/_internal/layout.py index fa497ef..2e5d6c1 100644 --- a/nvmath/_internal/layout.py +++ b/nvmath/_internal/layout.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. 
ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 diff --git a/nvmath/_internal/mem_limit.py b/nvmath/_internal/mem_limit.py index 69d6b5b..609a874 100644 --- a/nvmath/_internal/mem_limit.py +++ b/nvmath/_internal/mem_limit.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 diff --git a/nvmath/_internal/package_ifc.py b/nvmath/_internal/package_ifc.py index 855af2b..ed9d76a 100644 --- a/nvmath/_internal/package_ifc.py +++ b/nvmath/_internal/package_ifc.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 diff --git a/nvmath/_internal/package_ifc_cupy.py b/nvmath/_internal/package_ifc_cupy.py index 328d602..92773c6 100644 --- a/nvmath/_internal/package_ifc_cupy.py +++ b/nvmath/_internal/package_ifc_cupy.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 diff --git a/nvmath/_internal/package_ifc_torch.py b/nvmath/_internal/package_ifc_torch.py index 0b704ec..251f281 100644 --- a/nvmath/_internal/package_ifc_torch.py +++ b/nvmath/_internal/package_ifc_torch.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. 
# # SPDX-License-Identifier: Apache-2.0 diff --git a/nvmath/_internal/package_wrapper.py b/nvmath/_internal/package_wrapper.py index af22414..986b7f5 100644 --- a/nvmath/_internal/package_wrapper.py +++ b/nvmath/_internal/package_wrapper.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 @@ -13,7 +13,7 @@ PACKAGE: dict[str, type[Package]] = {"cupy": CupyPackage} try: - import torch + import torch # noqa: F401 from .package_ifc_torch import TorchPackage PACKAGE["torch"] = TorchPackage diff --git a/nvmath/_internal/tensor_ifc.py b/nvmath/_internal/tensor_ifc.py index 767e7a4..d797ff8 100644 --- a/nvmath/_internal/tensor_ifc.py +++ b/nvmath/_internal/tensor_ifc.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 @@ -57,6 +57,11 @@ def numpy(self, stream_holder): def shape(self) -> Sequence[int]: raise NotImplementedError + @property + @abstractmethod + def size(self) -> int: + raise NotImplementedError + @property @abstractmethod def strides(self) -> Sequence[int]: diff --git a/nvmath/_internal/tensor_ifc_cupy.py b/nvmath/_internal/tensor_ifc_cupy.py index c8a8f98..84fc212 100644 --- a/nvmath/_internal/tensor_ifc_cupy.py +++ b/nvmath/_internal/tensor_ifc_cupy.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. 
# # SPDX-License-Identifier: Apache-2.0 @@ -53,6 +53,10 @@ def dtype(self): def shape(self): return tuple(self.tensor.shape) + @property + def size(self): + return self.tensor.size + @property def strides(self): return tuple(stride_in_bytes // self.tensor.itemsize for stride_in_bytes in self.tensor.strides) diff --git a/nvmath/_internal/tensor_ifc_numpy.py b/nvmath/_internal/tensor_ifc_numpy.py index c945234..6808204 100644 --- a/nvmath/_internal/tensor_ifc_numpy.py +++ b/nvmath/_internal/tensor_ifc_numpy.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 @@ -52,6 +52,10 @@ def dtype(self): def shape(self): return tuple(self.tensor.shape) + @property + def size(self): + return self.tensor.size + @property def strides(self): return tuple(stride_in_bytes // self.tensor.itemsize for stride_in_bytes in self.tensor.strides) diff --git a/nvmath/_internal/tensor_ifc_torch.py b/nvmath/_internal/tensor_ifc_torch.py index 23f8119..894a7ad 100644 --- a/nvmath/_internal/tensor_ifc_torch.py +++ b/nvmath/_internal/tensor_ifc_torch.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 @@ -51,6 +51,10 @@ def dtype(self): def shape(self): return tuple(self.tensor.shape) + @property + def size(self): + return self.tensor.nelement() + @property def strides(self): return self.tensor.stride() diff --git a/nvmath/_internal/tensor_wrapper.py b/nvmath/_internal/tensor_wrapper.py index 58367ab..e5e5945 100644 --- a/nvmath/_internal/tensor_wrapper.py +++ b/nvmath/_internal/tensor_wrapper.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. 
# # SPDX-License-Identifier: Apache-2.0 diff --git a/nvmath/_internal/typemaps.py b/nvmath/_internal/typemaps.py index 268b591..1ffb45a 100644 --- a/nvmath/_internal/typemaps.py +++ b/nvmath/_internal/typemaps.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 @@ -6,7 +6,7 @@ Functions to link type names with CUDA data and compute types. """ -__all__ = ["COMPUTE_TYPE_TO_NAME", "DATA_TYPE_TO_NAME", "NAME_TO_DATA_TYPE", "NAME_TO_COMPUTE_TYPE"] +__all__ = ["COMPUTE_TYPE_TO_NAME", "DATA_TYPE_TO_NAME", "NAME_TO_DATA_TYPE", "NAME_TO_COMPUTE_TYPE", "NAME_TO_DATA_WIDTH"] from enum import IntEnum import re @@ -58,19 +58,25 @@ class cudaDataType(IntEnum): CUDA_C_64I = 25 CUDA_R_64U = 26 CUDA_C_64U = 27 + CUDA_R_8F_E4M3 = 28 + CUDA_R_8F_E5M2 = 29 def create_cuda_data_type_map(cuda_data_type_enum_class): """ Map the data type name to the corresponding CUDA data type. """ - cuda_data_type_pattern = re.compile(r"CUDA_(?P<cr>C|R)_(?P<width>\d+)(?P<type>F|I|U|BF)") + cuda_data_type_pattern = re.compile(r"CUDA_(?P<cr>C|R)_(?P<width>\d+)(?P<type>F|I|U|BF)_?(?P<kind>(E\dM\d)?)") type_code_map = {"i": "int", "u": "uint", "f": "float", "bf": "bfloat"} + # A map from (width, exponent kind) to qualifiers (finite, unsigned zero, ...) for data + # types. + type_qualifier_map = {(8, "e4m3"): "fn"} complex_types = {"float": "complex", "bfloat": "bcomplex"} cuda_data_type_map = dict() + data_type_width_map = dict() for d in cuda_data_type_enum_class: m = cuda_data_type_pattern.match(d.name) @@ -87,9 +93,19 @@ def create_cuda_data_type_map(cuda_data_type_enum_class): type_code = complex_types[type_code] name = type_code + str(width) + + # Handle narrow type kinds. + if width <= 8: + kind = m.group("kind").lower() + # Handle type qualifiers for narrow types.
+ kind += type_qualifier_map.get((width, kind), "") + if kind: + name += "_" + kind + cuda_data_type_map[name] = d + data_type_width_map[name] = width - return cuda_data_type_map + return cuda_data_type_map, data_type_width_map def create_cuda_compute_type_map(cuda_compute_type_enum_class): @@ -125,7 +141,7 @@ def create_cuda_compute_type_map(cuda_compute_type_enum_class): return cuda_compute_type_map -NAME_TO_DATA_TYPE = create_cuda_data_type_map(cudaDataType) +NAME_TO_DATA_TYPE, NAME_TO_DATA_WIDTH = create_cuda_data_type_map(cudaDataType) DATA_TYPE_TO_NAME = {v: k for k, v in NAME_TO_DATA_TYPE.items()} NAME_TO_COMPUTE_TYPE = create_cuda_compute_type_map(ComputeType) COMPUTE_TYPE_TO_NAME = {v: k for k, v in NAME_TO_COMPUTE_TYPE.items()} diff --git a/nvmath/_internal/utils.py b/nvmath/_internal/utils.py index 59ee5d0..560ac98 100644 --- a/nvmath/_internal/utils.py +++ b/nvmath/_internal/utils.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 @@ -22,7 +22,6 @@ from . import formatters from . import mem_limit from . import package_wrapper -from . import tensor_wrapper from .package_ifc import StreamHolder from .tensor_ifc import Tensor from .layout import is_contiguous_and_dense @@ -236,7 +235,6 @@ def get_memory_limit(memory_limit, device): """ Parse user provided memory limit and return the memory limit in bytes. """ - import re _, total_memory = device.mem_info if isinstance(memory_limit, int): diff --git a/nvmath/_utils.py b/nvmath/_utils.py index 8c0531b..db56760 100644 --- a/nvmath/_utils.py +++ b/nvmath/_utils.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. 
# # SPDX-License-Identifier: Apache-2.0 diff --git a/nvmath/bindings/__init__.py b/nvmath/bindings/__init__.py index defbe92..6988162 100644 --- a/nvmath/bindings/__init__.py +++ b/nvmath/bindings/__init__.py @@ -1,17 +1,27 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 # type: ignore from nvmath.bindings import cublas +from nvmath.bindings import cufft +from nvmath.bindings import curand from nvmath.bindings import cusolver from nvmath.bindings import cusolverDn -from nvmath.bindings import cufft from nvmath.bindings import cusparse -from nvmath.bindings import curand try: from nvmath.bindings import nvpl except ImportError: nvpl = None + +__all__ = [ + "cublas", + "cufft", + "curand", + "cusolver", + "cusolverDn", + "cusparse", + "nvpl", +] diff --git a/nvmath/bindings/_internal/cublas.pxd b/nvmath/bindings/_internal/cublas.pxd index 00d1cf0..f589f58 100644 --- a/nvmath/bindings/_internal/cublas.pxd +++ b/nvmath/bindings/_internal/cublas.pxd @@ -1,8 +1,8 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 # -# This code was automatically generated across versions from 11.0.3 to 12.6.2. Do not modify it directly. +# This code was automatically generated across versions from 11.0.3 to 12.8.0. Do not modify it directly. from ..cycublas cimport * diff --git a/nvmath/bindings/_internal/cublasLt.pxd b/nvmath/bindings/_internal/cublasLt.pxd index 6a2fcb1..3e4859f 100644 --- a/nvmath/bindings/_internal/cublasLt.pxd +++ b/nvmath/bindings/_internal/cublasLt.pxd @@ -1,8 +1,8 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. 
# # SPDX-License-Identifier: Apache-2.0 # -# This code was automatically generated across versions from 11.0.3 to 12.6.2. Do not modify it directly. +# This code was automatically generated across versions from 11.0.3 to 12.8.0. Do not modify it directly. from ..cycublasLt cimport * diff --git a/nvmath/bindings/_internal/cublasLt_linux.pyx b/nvmath/bindings/_internal/cublasLt_linux.pyx index 19cc443..8e69249 100644 --- a/nvmath/bindings/_internal/cublasLt_linux.pyx +++ b/nvmath/bindings/_internal/cublasLt_linux.pyx @@ -1,8 +1,8 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 # -# This code was automatically generated across versions from 11.0.3 to 12.6.2. Do not modify it directly. +# This code was automatically generated across versions from 11.0.3 to 12.8.0. Do not modify it directly. from libc.stdint cimport intptr_t diff --git a/nvmath/bindings/_internal/cublasLt_windows.pyx b/nvmath/bindings/_internal/cublasLt_windows.pyx index 95f2182..3db7c89 100644 --- a/nvmath/bindings/_internal/cublasLt_windows.pyx +++ b/nvmath/bindings/_internal/cublasLt_windows.pyx @@ -1,8 +1,8 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 # -# This code was automatically generated across versions from 11.0.3 to 12.6.2. Do not modify it directly. +# This code was automatically generated across versions from 11.0.3 to 12.8.0. Do not modify it directly. from libc.stdint cimport intptr_t diff --git a/nvmath/bindings/_internal/cublas_linux.pyx b/nvmath/bindings/_internal/cublas_linux.pyx index 1bf35f4..d4101ea 100644 --- a/nvmath/bindings/_internal/cublas_linux.pyx +++ b/nvmath/bindings/_internal/cublas_linux.pyx @@ -1,8 +1,8 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. 
ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 # -# This code was automatically generated across versions from 11.0.3 to 12.6.2. Do not modify it directly. +# This code was automatically generated across versions from 11.0.3 to 12.8.0. Do not modify it directly. from libc.stdint cimport intptr_t diff --git a/nvmath/bindings/_internal/cublas_windows.pyx b/nvmath/bindings/_internal/cublas_windows.pyx index 46e2a23..722d2ea 100644 --- a/nvmath/bindings/_internal/cublas_windows.pyx +++ b/nvmath/bindings/_internal/cublas_windows.pyx @@ -1,8 +1,8 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 # -# This code was automatically generated across versions from 11.0.3 to 12.6.2. Do not modify it directly. +# This code was automatically generated across versions from 11.0.3 to 12.8.0. Do not modify it directly. from libc.stdint cimport intptr_t diff --git a/nvmath/bindings/_internal/cufft.pxd b/nvmath/bindings/_internal/cufft.pxd index c3f319d..8d23c3d 100644 --- a/nvmath/bindings/_internal/cufft.pxd +++ b/nvmath/bindings/_internal/cufft.pxd @@ -1,8 +1,8 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 # -# This code was automatically generated across versions from 11.0.3 to 12.6.2. Do not modify it directly. +# This code was automatically generated across versions from 11.0.3 to 12.8.0. Do not modify it directly. 
from ..cycufft cimport * diff --git a/nvmath/bindings/_internal/cufft_linux.pyx b/nvmath/bindings/_internal/cufft_linux.pyx index 7d358a9..8fe8719 100644 --- a/nvmath/bindings/_internal/cufft_linux.pyx +++ b/nvmath/bindings/_internal/cufft_linux.pyx @@ -1,8 +1,8 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 # -# This code was automatically generated across versions from 11.0.3 to 12.6.2. Do not modify it directly. +# This code was automatically generated across versions from 11.0.3 to 12.8.0. Do not modify it directly. from libc.stdint cimport intptr_t diff --git a/nvmath/bindings/_internal/cufft_windows.pyx b/nvmath/bindings/_internal/cufft_windows.pyx index d732860..2485629 100644 --- a/nvmath/bindings/_internal/cufft_windows.pyx +++ b/nvmath/bindings/_internal/cufft_windows.pyx @@ -1,8 +1,8 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 # -# This code was automatically generated across versions from 11.0.3 to 12.6.2. Do not modify it directly. +# This code was automatically generated across versions from 11.0.3 to 12.8.0. Do not modify it directly. from libc.stdint cimport intptr_t diff --git a/nvmath/bindings/_internal/curand.pxd b/nvmath/bindings/_internal/curand.pxd index 6a085d0..2542cb2 100644 --- a/nvmath/bindings/_internal/curand.pxd +++ b/nvmath/bindings/_internal/curand.pxd @@ -1,8 +1,8 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 # -# This code was automatically generated across versions from 11.0.3 to 12.6.2. Do not modify it directly. 
+# This code was automatically generated across versions from 11.0.3 to 12.8.0. Do not modify it directly. from ..cycurand cimport * diff --git a/nvmath/bindings/_internal/curand_linux.pyx b/nvmath/bindings/_internal/curand_linux.pyx index d3d215c..d94b22c 100644 --- a/nvmath/bindings/_internal/curand_linux.pyx +++ b/nvmath/bindings/_internal/curand_linux.pyx @@ -1,8 +1,8 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 # -# This code was automatically generated across versions from 11.0.3 to 12.6.2. Do not modify it directly. +# This code was automatically generated across versions from 11.0.3 to 12.8.0. Do not modify it directly. from libc.stdint cimport intptr_t diff --git a/nvmath/bindings/_internal/curand_windows.pyx b/nvmath/bindings/_internal/curand_windows.pyx index ba73a8b..daeea75 100644 --- a/nvmath/bindings/_internal/curand_windows.pyx +++ b/nvmath/bindings/_internal/curand_windows.pyx @@ -1,8 +1,8 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 # -# This code was automatically generated across versions from 11.0.3 to 12.6.2. Do not modify it directly. +# This code was automatically generated across versions from 11.0.3 to 12.8.0. Do not modify it directly. from libc.stdint cimport intptr_t diff --git a/nvmath/bindings/_internal/cusolver.pxd b/nvmath/bindings/_internal/cusolver.pxd index 9ee36b3..3aee07a 100644 --- a/nvmath/bindings/_internal/cusolver.pxd +++ b/nvmath/bindings/_internal/cusolver.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. 
# # SPDX-License-Identifier: Apache-2.0 # diff --git a/nvmath/bindings/_internal/cusolverDn.pxd b/nvmath/bindings/_internal/cusolverDn.pxd index 49fe8ca..f05b99a 100644 --- a/nvmath/bindings/_internal/cusolverDn.pxd +++ b/nvmath/bindings/_internal/cusolverDn.pxd @@ -1,8 +1,8 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 # -# This code was automatically generated across versions from 11.0.3 to 12.6.2. Do not modify it directly. +# This code was automatically generated across versions from 11.0.3 to 12.8.0. Do not modify it directly. from ..cycusolverDn cimport * diff --git a/nvmath/bindings/_internal/cusolverDn_linux.pyx b/nvmath/bindings/_internal/cusolverDn_linux.pyx index c99ca26..4fa178c 100644 --- a/nvmath/bindings/_internal/cusolverDn_linux.pyx +++ b/nvmath/bindings/_internal/cusolverDn_linux.pyx @@ -1,8 +1,8 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 # -# This code was automatically generated across versions from 11.0.3 to 12.6.2. Do not modify it directly. +# This code was automatically generated across versions from 11.0.3 to 12.8.0. Do not modify it directly. from libc.stdint cimport intptr_t diff --git a/nvmath/bindings/_internal/cusolverDn_windows.pyx b/nvmath/bindings/_internal/cusolverDn_windows.pyx index f705fa6..5ece197 100644 --- a/nvmath/bindings/_internal/cusolverDn_windows.pyx +++ b/nvmath/bindings/_internal/cusolverDn_windows.pyx @@ -1,8 +1,8 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 # -# This code was automatically generated across versions from 11.0.3 to 12.6.2. 
Do not modify it directly. +# This code was automatically generated across versions from 11.0.3 to 12.8.0. Do not modify it directly. from libc.stdint cimport intptr_t diff --git a/nvmath/bindings/_internal/cusolver_linux.pyx b/nvmath/bindings/_internal/cusolver_linux.pyx index 87ef093..ea07066 100644 --- a/nvmath/bindings/_internal/cusolver_linux.pyx +++ b/nvmath/bindings/_internal/cusolver_linux.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 # diff --git a/nvmath/bindings/_internal/cusolver_windows.pyx b/nvmath/bindings/_internal/cusolver_windows.pyx index 07c5cfd..580d6c8 100644 --- a/nvmath/bindings/_internal/cusolver_windows.pyx +++ b/nvmath/bindings/_internal/cusolver_windows.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 # diff --git a/nvmath/bindings/_internal/cusparse.pxd b/nvmath/bindings/_internal/cusparse.pxd index 8f8025a..7669424 100644 --- a/nvmath/bindings/_internal/cusparse.pxd +++ b/nvmath/bindings/_internal/cusparse.pxd @@ -1,8 +1,8 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 # -# This code was automatically generated across versions from 11.0.3 to 12.6.2. Do not modify it directly. +# This code was automatically generated across versions from 11.0.3 to 12.8.0. Do not modify it directly. 
from ..cycusparse cimport * diff --git a/nvmath/bindings/_internal/cusparse_linux.pyx b/nvmath/bindings/_internal/cusparse_linux.pyx index 41e25c8..831a2ef 100644 --- a/nvmath/bindings/_internal/cusparse_linux.pyx +++ b/nvmath/bindings/_internal/cusparse_linux.pyx @@ -1,8 +1,8 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 # -# This code was automatically generated across versions from 11.0.3 to 12.6.2. Do not modify it directly. +# This code was automatically generated across versions from 11.0.3 to 12.8.0. Do not modify it directly. from libc.stdint cimport intptr_t diff --git a/nvmath/bindings/_internal/cusparse_windows.pyx b/nvmath/bindings/_internal/cusparse_windows.pyx index 2601c03..44a4ab6 100644 --- a/nvmath/bindings/_internal/cusparse_windows.pyx +++ b/nvmath/bindings/_internal/cusparse_windows.pyx @@ -1,8 +1,8 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 # -# This code was automatically generated across versions from 11.0.3 to 12.6.2. Do not modify it directly. +# This code was automatically generated across versions from 11.0.3 to 12.8.0. Do not modify it directly. from libc.stdint cimport intptr_t diff --git a/nvmath/bindings/_internal/utils.pxd b/nvmath/bindings/_internal/utils.pxd index e3b82e9..546f268 100644 --- a/nvmath/bindings/_internal/utils.pxd +++ b/nvmath/bindings/_internal/utils.pxd @@ -1,3 +1,7 @@ +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. 
+# +# SPDX-License-Identifier: Apache-2.0 + from libc.stdint cimport int32_t, int64_t, intptr_t from libcpp.vector cimport vector from libcpp cimport bool as cppbool diff --git a/nvmath/bindings/_internal/utils.pyx b/nvmath/bindings/_internal/utils.pyx index b69e2e6..c5b4761 100644 --- a/nvmath/bindings/_internal/utils.pyx +++ b/nvmath/bindings/_internal/utils.pyx @@ -1,3 +1,7 @@ +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 + cimport cpython from libc.stdint cimport intptr_t from libcpp.utility cimport move diff --git a/nvmath/bindings/cublas.pxd b/nvmath/bindings/cublas.pxd index 906294d..f96952a 100644 --- a/nvmath/bindings/cublas.pxd +++ b/nvmath/bindings/cublas.pxd @@ -1,8 +1,8 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 # -# This code was automatically generated across versions from 11.0.3 to 12.6.2. Do not modify it directly. +# This code was automatically generated across versions from 11.0.3 to 12.8.0. Do not modify it directly. from libc.stdint cimport intptr_t diff --git a/nvmath/bindings/cublas.pyx b/nvmath/bindings/cublas.pyx index 726a2eb..2654e38 100644 --- a/nvmath/bindings/cublas.pyx +++ b/nvmath/bindings/cublas.pyx @@ -1,8 +1,8 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 # -# This code was automatically generated across versions from 11.0.3 to 12.6.2. Do not modify it directly. +# This code was automatically generated across versions from 11.0.3 to 12.8.0. Do not modify it directly. 
cimport cython # NOQA from libcpp.vector cimport vector diff --git a/nvmath/bindings/cublasLt.pxd b/nvmath/bindings/cublasLt.pxd index f1e47e2..00c79f6 100644 --- a/nvmath/bindings/cublasLt.pxd +++ b/nvmath/bindings/cublasLt.pxd @@ -1,8 +1,8 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 # -# This code was automatically generated across versions from 11.0.3 to 12.6.2. Do not modify it directly. +# This code was automatically generated across versions from 11.0.3 to 12.8.0. Do not modify it directly. from libc.stdint cimport intptr_t @@ -45,6 +45,7 @@ ctypedef cublasLtMatmulAlgoCapAttributes_t _MatmulAlgoCapAttribute ctypedef cublasLtMatmulAlgoConfigAttributes_t _MatmulAlgoConfigAttribute ctypedef cublasLtClusterShape_t _ClusterShape ctypedef cublasLtMatmulInnerShape_t _MatmulInnerShape +ctypedef cublasLtMatmulMatrixScale_t _MatmulMatrixScale ############################################################################### diff --git a/nvmath/bindings/cublasLt.pyx b/nvmath/bindings/cublasLt.pyx index 2f5f392..98f6c98 100644 --- a/nvmath/bindings/cublasLt.pyx +++ b/nvmath/bindings/cublasLt.pyx @@ -1,13 +1,13 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 # -# This code was automatically generated across versions from 11.0.3 to 12.6.2. Do not modify it directly. +# This code was automatically generated across versions from 11.0.3 to 12.8.0. Do not modify it directly. 
cimport cython # NOQA from libcpp.vector cimport vector from cpython cimport buffer as _buffer -from cpython cimport memoryview as _memoryview +from cpython.memoryview cimport PyMemoryView_FromMemory from enum import IntEnum as _IntEnum @@ -128,7 +128,7 @@ cdef class MatmulAlgo: raise ValueError("ptr must not be null (0)") cdef MatmulAlgo obj = MatmulAlgo.__new__(MatmulAlgo) cdef flag = _buffer.PyBUF_READ if readonly else _buffer.PyBUF_WRITE - cdef object buf = _memoryview.PyMemoryView_FromMemory( + cdef object buf = PyMemoryView_FromMemory( ptr, sizeof(cublasLtMatmulAlgo_t) * size, flag) data = _numpy.ndarray((size,), buffer=buf, dtype=matmul_algo_dtype) @@ -283,7 +283,7 @@ cdef class MatmulHeuristicResult: raise ValueError("ptr must not be null (0)") cdef MatmulHeuristicResult obj = MatmulHeuristicResult.__new__(MatmulHeuristicResult) cdef flag = _buffer.PyBUF_READ if readonly else _buffer.PyBUF_WRITE - cdef object buf = _memoryview.PyMemoryView_FromMemory( + cdef object buf = PyMemoryView_FromMemory( ptr, sizeof(cublasLtMatmulHeuristicResult_t) * size, flag) data = _numpy.ndarray((size,), buffer=buf, dtype=matmul_heuristic_result_dtype) @@ -929,6 +929,10 @@ class MatmulTile(_IntEnum): TILE_768x56 = CUBLASLT_MATMUL_TILE_768x56 TILE_768x72 = CUBLASLT_MATMUL_TILE_768x72 TILE_768x80 = CUBLASLT_MATMUL_TILE_768x80 + TILE_256x512 = CUBLASLT_MATMUL_TILE_256x512 + TILE_256x1024 = CUBLASLT_MATMUL_TILE_256x1024 + TILE_512x512 = CUBLASLT_MATMUL_TILE_512x512 + TILE_512x1024 = CUBLASLT_MATMUL_TILE_512x1024 class MatmulStages(_IntEnum): """See `cublasLtMatmulStages_t`.""" @@ -967,6 +971,7 @@ class MatmulStages(_IntEnum): STAGES_32xAUTO = CUBLASLT_MATMUL_STAGES_32xAUTO STAGES_64xAUTO = CUBLASLT_MATMUL_STAGES_64xAUTO STAGES_128xAUTO = CUBLASLT_MATMUL_STAGES_128xAUTO + STAGES_256xAUTO = CUBLASLT_MATMUL_STAGES_256xAUTO STAGES_16x80 = CUBLASLT_MATMUL_STAGES_16x80 STAGES_64x80 = CUBLASLT_MATMUL_STAGES_64x80 @@ -1037,6 +1042,13 @@ class MatmulDescAttribute(_IntEnum): 
ATOMIC_SYNC_NUM_CHUNKS_D_COLS = CUBLASLT_MATMUL_DESC_ATOMIC_SYNC_NUM_CHUNKS_D_COLS ATOMIC_SYNC_IN_COUNTERS_POINTER = CUBLASLT_MATMUL_DESC_ATOMIC_SYNC_IN_COUNTERS_POINTER ATOMIC_SYNC_OUT_COUNTERS_POINTER = CUBLASLT_MATMUL_DESC_ATOMIC_SYNC_OUT_COUNTERS_POINTER + A_SCALE_MODE = CUBLASLT_MATMUL_DESC_A_SCALE_MODE + B_SCALE_MODE = CUBLASLT_MATMUL_DESC_B_SCALE_MODE + C_SCALE_MODE = CUBLASLT_MATMUL_DESC_C_SCALE_MODE + D_SCALE_MODE = CUBLASLT_MATMUL_DESC_D_SCALE_MODE + EPILOGUE_AUX_SCALE_MODE = CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_SCALE_MODE + D_OUT_SCALE_POINTER = CUBLASLT_MATMUL_DESC_D_OUT_SCALE_POINTER + D_OUT_SCALE_MODE = CUBLASLT_MATMUL_DESC_D_OUT_SCALE_MODE class MatrixTransformDescAttribute(_IntEnum): """See `cublasLtMatrixTransformDescAttributes_t`.""" @@ -1200,6 +1212,12 @@ class MatmulInnerShape(_IntEnum): MMA1688 = CUBLASLT_MATMUL_INNER_SHAPE_MMA1688 MMA16816 = CUBLASLT_MATMUL_INNER_SHAPE_MMA16816 +class MatmulMatrixScale(_IntEnum): + """See `cublasLtMatmulMatrixScale_t`.""" + SCALAR_32F = CUBLASLT_MATMUL_MATRIX_SCALE_SCALAR_32F + VEC16_UE4M3 = CUBLASLT_MATMUL_MATRIX_SCALE_VEC16_UE4M3 + VEC32_UE8M0 = CUBLASLT_MATMUL_MATRIX_SCALE_VEC32_UE8M0 + ############################################################################### # Error handling @@ -1393,6 +1411,13 @@ cdef dict matmul_desc_attribute_sizes = { CUBLASLT_MATMUL_DESC_ATOMIC_SYNC_NUM_CHUNKS_D_COLS: _numpy.int32, CUBLASLT_MATMUL_DESC_ATOMIC_SYNC_IN_COUNTERS_POINTER: _numpy.int32, CUBLASLT_MATMUL_DESC_ATOMIC_SYNC_OUT_COUNTERS_POINTER: _numpy.int32, + CUBLASLT_MATMUL_DESC_A_SCALE_MODE: _numpy.int32, + CUBLASLT_MATMUL_DESC_B_SCALE_MODE: _numpy.int32, + CUBLASLT_MATMUL_DESC_C_SCALE_MODE: _numpy.int32, + CUBLASLT_MATMUL_DESC_D_SCALE_MODE: _numpy.int32, + CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_SCALE_MODE: _numpy.int32, + CUBLASLT_MATMUL_DESC_D_OUT_SCALE_POINTER: _numpy.intp, + CUBLASLT_MATMUL_DESC_D_OUT_SCALE_MODE: _numpy.int32, } cpdef get_matmul_desc_attribute_dtype(int attr): diff --git a/nvmath/bindings/cufft.pxd 
b/nvmath/bindings/cufft.pxd index e24af44..fbe9281 100644 --- a/nvmath/bindings/cufft.pxd +++ b/nvmath/bindings/cufft.pxd @@ -1,8 +1,8 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 # -# This code was automatically generated across versions from 11.0.3 to 12.6.2. Do not modify it directly. +# This code was automatically generated across versions from 11.0.3 to 12.8.0. Do not modify it directly. from libc.stdint cimport intptr_t @@ -18,18 +18,18 @@ ctypedef cufftCallbackLoadC CallbackLoadC ctypedef cufftCallbackLoadZ CallbackLoadZ ctypedef cufftCallbackLoadR CallbackLoadR ctypedef cufftCallbackLoadD CallbackLoadD -ctypedef cufftCallbackStoreC CallbackStoreC -ctypedef cufftCallbackStoreZ CallbackStoreZ -ctypedef cufftCallbackStoreR CallbackStoreR -ctypedef cufftCallbackStoreD CallbackStoreD ctypedef cufftJITCallbackLoadC JITCallbackLoadC ctypedef cufftJITCallbackLoadZ JITCallbackLoadZ ctypedef cufftJITCallbackLoadR JITCallbackLoadR ctypedef cufftJITCallbackLoadD JITCallbackLoadD -ctypedef cufftJITCallbackStoreC JITCallbackStoreC -ctypedef cufftJITCallbackStoreZ JITCallbackStoreZ +ctypedef cufftCallbackStoreR CallbackStoreR ctypedef cufftJITCallbackStoreR JITCallbackStoreR +ctypedef cufftCallbackStoreD CallbackStoreD ctypedef cufftJITCallbackStoreD JITCallbackStoreD +ctypedef cufftCallbackStoreC CallbackStoreC +ctypedef cufftJITCallbackStoreC JITCallbackStoreC +ctypedef cufftCallbackStoreZ CallbackStoreZ +ctypedef cufftJITCallbackStoreZ JITCallbackStoreZ ctypedef cudaStream_t Stream ctypedef cudaDataType DataType diff --git a/nvmath/bindings/cufft.pyx b/nvmath/bindings/cufft.pyx index 884bc21..16d9f38 100644 --- a/nvmath/bindings/cufft.pyx +++ b/nvmath/bindings/cufft.pyx @@ -1,8 +1,8 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. 
+# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 # -# This code was automatically generated across versions from 11.0.3 to 12.6.2. Do not modify it directly. +# This code was automatically generated across versions from 11.0.3 to 12.8.0. Do not modify it directly. cimport cython # NOQA from libc.stdint cimport int64_t diff --git a/nvmath/bindings/curand.pxd b/nvmath/bindings/curand.pxd index a22451c..673162d 100644 --- a/nvmath/bindings/curand.pxd +++ b/nvmath/bindings/curand.pxd @@ -1,8 +1,8 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 # -# This code was automatically generated across versions from 11.0.3 to 12.6.2. Do not modify it directly. +# This code was automatically generated across versions from 11.0.3 to 12.8.0. Do not modify it directly. cimport cython diff --git a/nvmath/bindings/curand.pyx b/nvmath/bindings/curand.pyx index 5d22a02..caf9520 100644 --- a/nvmath/bindings/curand.pyx +++ b/nvmath/bindings/curand.pyx @@ -1,12 +1,12 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 # -# This code was automatically generated across versions from 11.0.3 to 12.6.2. Do not modify it directly. +# This code was automatically generated across versions from 11.0.3 to 12.8.0. Do not modify it directly. 
cimport cython # NOQA cimport cpython -from cpython cimport memoryview as _memoryview +from cpython.memoryview cimport PyMemoryView_FromMemory from enum import IntEnum as _IntEnum @@ -479,7 +479,7 @@ cpdef get_scramble_constants32(size_t size): with nogil: status = curandGetScrambleConstants32(&constants) check_status(status) - cdef object buf = _memoryview.PyMemoryView_FromMemory( + cdef object buf = PyMemoryView_FromMemory( constants, size * sizeof(unsigned int), cpython.PyBUF_READ) return _numpy.ndarray((size,), buffer=buf, dtype=_numpy.uint32) @@ -501,7 +501,7 @@ cpdef get_scramble_constants64(size_t size): with nogil: status = curandGetScrambleConstants64(&constants) check_status(status) - cdef object buf = _memoryview.PyMemoryView_FromMemory( + cdef object buf = PyMemoryView_FromMemory( constants, size * sizeof(unsigned long long), cpython.PyBUF_READ) return _numpy.ndarray((size,), buffer=buf, dtype=_numpy.uint64) @@ -524,7 +524,7 @@ cpdef get_direction_vectors32(int set_, size_t size): with nogil: status = curandGetDirectionVectors32(&vec, <_DirectionVectorSet>set_) check_status(status) - cdef object buf = _memoryview.PyMemoryView_FromMemory( + cdef object buf = PyMemoryView_FromMemory( vec, size * sizeof(curandDirectionVectors32_t), cpython.PyBUF_READ) return _numpy.ndarray((size, 32,), buffer=buf, dtype=_numpy.uint32) @@ -547,6 +547,6 @@ cpdef get_direction_vectors64(int set_, size_t size): with nogil: status = curandGetDirectionVectors64(&vec, <_DirectionVectorSet>set_) check_status(status) - cdef object buf = _memoryview.PyMemoryView_FromMemory( + cdef object buf = PyMemoryView_FromMemory( vec, size * sizeof(curandDirectionVectors64_t), cpython.PyBUF_READ) return _numpy.ndarray((size, 64,), buffer=buf, dtype=_numpy.uint64) diff --git a/nvmath/bindings/cusolver.pxd b/nvmath/bindings/cusolver.pxd index 51940da..7a89543 100644 --- a/nvmath/bindings/cusolver.pxd +++ b/nvmath/bindings/cusolver.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & 
AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 # diff --git a/nvmath/bindings/cusolver.pyx b/nvmath/bindings/cusolver.pyx index 30f5e3e..cd42b2f 100644 --- a/nvmath/bindings/cusolver.pyx +++ b/nvmath/bindings/cusolver.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 # diff --git a/nvmath/bindings/cusolverDn.pxd b/nvmath/bindings/cusolverDn.pxd index fc68205..a17ba82 100644 --- a/nvmath/bindings/cusolverDn.pxd +++ b/nvmath/bindings/cusolverDn.pxd @@ -1,8 +1,8 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 # -# This code was automatically generated across versions from 11.0.3 to 12.6.2. Do not modify it directly. +# This code was automatically generated across versions from 11.0.3 to 12.8.0. Do not modify it directly. from libc.stdint cimport intptr_t diff --git a/nvmath/bindings/cusolverDn.pyx b/nvmath/bindings/cusolverDn.pyx index 9c1b5f6..4b7c363 100644 --- a/nvmath/bindings/cusolverDn.pyx +++ b/nvmath/bindings/cusolverDn.pyx @@ -1,8 +1,8 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 # -# This code was automatically generated across versions from 11.0.3 to 12.6.2. Do not modify it directly. +# This code was automatically generated across versions from 11.0.3 to 12.8.0. Do not modify it directly. 
from libcpp.vector cimport vector diff --git a/nvmath/bindings/cusparse.pxd b/nvmath/bindings/cusparse.pxd index 3df3661..bb433aa 100644 --- a/nvmath/bindings/cusparse.pxd +++ b/nvmath/bindings/cusparse.pxd @@ -1,8 +1,8 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 # -# This code was automatically generated across versions from 11.0.3 to 12.6.2. Do not modify it directly. +# This code was automatically generated across versions from 11.0.3 to 12.8.0. Do not modify it directly. cimport cython diff --git a/nvmath/bindings/cusparse.pyx b/nvmath/bindings/cusparse.pyx index c66bfbb..f25e9cd 100644 --- a/nvmath/bindings/cusparse.pyx +++ b/nvmath/bindings/cusparse.pyx @@ -1,8 +1,8 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 # -# This code was automatically generated across versions from 11.0.3 to 12.6.2. Do not modify it directly. +# This code was automatically generated across versions from 11.0.3 to 12.8.0. Do not modify it directly. cimport cython # NOQA diff --git a/nvmath/bindings/cycublas.pxd b/nvmath/bindings/cycublas.pxd index cc984a1..bf81dec 100644 --- a/nvmath/bindings/cycublas.pxd +++ b/nvmath/bindings/cycublas.pxd @@ -1,8 +1,8 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 # -# This code was automatically generated across versions from 11.0.3 to 12.6.2. Do not modify it directly. +# This code was automatically generated across versions from 11.0.3 to 12.8.0. Do not modify it directly. # This layer exposes the C header to Cython as-is. 
from libc.stdint cimport int64_t diff --git a/nvmath/bindings/cycublas.pyx b/nvmath/bindings/cycublas.pyx index 242dae3..19065c4 100644 --- a/nvmath/bindings/cycublas.pyx +++ b/nvmath/bindings/cycublas.pyx @@ -1,8 +1,8 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 # -# This code was automatically generated across versions from 11.0.3 to 12.6.2. Do not modify it directly. +# This code was automatically generated across versions from 11.0.3 to 12.8.0. Do not modify it directly. from ._internal cimport cublas as _cublas diff --git a/nvmath/bindings/cycublasLt.pxd b/nvmath/bindings/cycublasLt.pxd index c3879b9..6574006 100644 --- a/nvmath/bindings/cycublasLt.pxd +++ b/nvmath/bindings/cycublasLt.pxd @@ -1,8 +1,8 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 # -# This code was automatically generated across versions from 11.0.3 to 12.6.2. Do not modify it directly. +# This code was automatically generated across versions from 11.0.3 to 12.8.0. Do not modify it directly. # This layer exposes the C header to Cython as-is. 
from libc.stdint cimport int64_t, uint64_t @@ -648,6 +648,10 @@ ctypedef enum cublasLtMatmulTile_t "cublasLtMatmulTile_t": CUBLASLT_MATMUL_TILE_768x56 "CUBLASLT_MATMUL_TILE_768x56" = 628 CUBLASLT_MATMUL_TILE_768x72 "CUBLASLT_MATMUL_TILE_768x72" = 629 CUBLASLT_MATMUL_TILE_768x80 "CUBLASLT_MATMUL_TILE_768x80" = 630 + CUBLASLT_MATMUL_TILE_256x512 "CUBLASLT_MATMUL_TILE_256x512" = 631 + CUBLASLT_MATMUL_TILE_256x1024 "CUBLASLT_MATMUL_TILE_256x1024" = 632 + CUBLASLT_MATMUL_TILE_512x512 "CUBLASLT_MATMUL_TILE_512x512" = 633 + CUBLASLT_MATMUL_TILE_512x1024 "CUBLASLT_MATMUL_TILE_512x1024" = 634 ctypedef enum cublasLtMatmulStages_t "cublasLtMatmulStages_t": CUBLASLT_MATMUL_STAGES_UNDEFINED "CUBLASLT_MATMUL_STAGES_UNDEFINED" = 0 @@ -685,6 +689,7 @@ ctypedef enum cublasLtMatmulStages_t "cublasLtMatmulStages_t": CUBLASLT_MATMUL_STAGES_32xAUTO "CUBLASLT_MATMUL_STAGES_32xAUTO" = 34 CUBLASLT_MATMUL_STAGES_64xAUTO "CUBLASLT_MATMUL_STAGES_64xAUTO" = 35 CUBLASLT_MATMUL_STAGES_128xAUTO "CUBLASLT_MATMUL_STAGES_128xAUTO" = 36 + CUBLASLT_MATMUL_STAGES_256xAUTO "CUBLASLT_MATMUL_STAGES_256xAUTO" = 37 CUBLASLT_MATMUL_STAGES_16x80 "CUBLASLT_MATMUL_STAGES_16x80" = 29 CUBLASLT_MATMUL_STAGES_64x80 "CUBLASLT_MATMUL_STAGES_64x80" = 30 @@ -750,6 +755,13 @@ ctypedef enum cublasLtMatmulDescAttributes_t "cublasLtMatmulDescAttributes_t": CUBLASLT_MATMUL_DESC_ATOMIC_SYNC_NUM_CHUNKS_D_COLS "CUBLASLT_MATMUL_DESC_ATOMIC_SYNC_NUM_CHUNKS_D_COLS" = 28 CUBLASLT_MATMUL_DESC_ATOMIC_SYNC_IN_COUNTERS_POINTER "CUBLASLT_MATMUL_DESC_ATOMIC_SYNC_IN_COUNTERS_POINTER" = 29 CUBLASLT_MATMUL_DESC_ATOMIC_SYNC_OUT_COUNTERS_POINTER "CUBLASLT_MATMUL_DESC_ATOMIC_SYNC_OUT_COUNTERS_POINTER" = 30 + CUBLASLT_MATMUL_DESC_A_SCALE_MODE "CUBLASLT_MATMUL_DESC_A_SCALE_MODE" = 31 + CUBLASLT_MATMUL_DESC_B_SCALE_MODE "CUBLASLT_MATMUL_DESC_B_SCALE_MODE" = 32 + CUBLASLT_MATMUL_DESC_C_SCALE_MODE "CUBLASLT_MATMUL_DESC_C_SCALE_MODE" = 33 + CUBLASLT_MATMUL_DESC_D_SCALE_MODE "CUBLASLT_MATMUL_DESC_D_SCALE_MODE" = 34 + 
CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_SCALE_MODE "CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_SCALE_MODE" = 35 + CUBLASLT_MATMUL_DESC_D_OUT_SCALE_POINTER "CUBLASLT_MATMUL_DESC_D_OUT_SCALE_POINTER" = 36 + CUBLASLT_MATMUL_DESC_D_OUT_SCALE_MODE "CUBLASLT_MATMUL_DESC_D_OUT_SCALE_MODE" = 37 ctypedef enum cublasLtMatrixTransformDescAttributes_t "cublasLtMatrixTransformDescAttributes_t": CUBLASLT_MATRIX_TRANSFORM_DESC_SCALE_TYPE "CUBLASLT_MATRIX_TRANSFORM_DESC_SCALE_TYPE" @@ -904,6 +916,11 @@ ctypedef enum cublasLtMatmulInnerShape_t "cublasLtMatmulInnerShape_t": CUBLASLT_MATMUL_INNER_SHAPE_MMA1688 "CUBLASLT_MATMUL_INNER_SHAPE_MMA1688" = 3 CUBLASLT_MATMUL_INNER_SHAPE_MMA16816 "CUBLASLT_MATMUL_INNER_SHAPE_MMA16816" = 4 +ctypedef enum cublasLtMatmulMatrixScale_t "cublasLtMatmulMatrixScale_t": + CUBLASLT_MATMUL_MATRIX_SCALE_SCALAR_32F "CUBLASLT_MATMUL_MATRIX_SCALE_SCALAR_32F" = 0 + CUBLASLT_MATMUL_MATRIX_SCALE_VEC16_UE4M3 "CUBLASLT_MATMUL_MATRIX_SCALE_VEC16_UE4M3" = 1 + CUBLASLT_MATMUL_MATRIX_SCALE_VEC32_UE8M0 "CUBLASLT_MATMUL_MATRIX_SCALE_VEC32_UE8M0" = 2 + # types cdef extern from *: @@ -932,17 +949,17 @@ ctypedef void* cublasLtMatrixTransformDesc_t 'cublasLtMatrixTransformDesc_t' ctypedef void* cublasLtMatmulPreference_t 'cublasLtMatmulPreference_t' ctypedef struct cublasLtMatmulAlgo_t 'cublasLtMatmulAlgo_t': uint64_t data[8] +ctypedef void (*cublasLtLoggerCallback_t 'cublasLtLoggerCallback_t')( + int logLevel, + const char* functionName, + const char* message +) ctypedef struct cublasLtMatmulHeuristicResult_t 'cublasLtMatmulHeuristicResult_t': cublasLtMatmulAlgo_t algo size_t workspaceSize cublasStatus_t state float wavesCount int reserved[4] -ctypedef void (*cublasLtLoggerCallback_t 'cublasLtLoggerCallback_t')( - int logLevel, - const char* functionName, - const char* message -) ############################################################################### diff --git a/nvmath/bindings/cycublasLt.pyx b/nvmath/bindings/cycublasLt.pyx index 966e6b3..e625f49 100644 --- 
a/nvmath/bindings/cycublasLt.pyx +++ b/nvmath/bindings/cycublasLt.pyx @@ -1,8 +1,8 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 # -# This code was automatically generated across versions from 11.0.3 to 12.6.2. Do not modify it directly. +# This code was automatically generated across versions from 11.0.3 to 12.8.0. Do not modify it directly. from ._internal cimport cublasLt as _cublasLt diff --git a/nvmath/bindings/cycufft.pxd b/nvmath/bindings/cycufft.pxd index 94b53c4..b523fea 100644 --- a/nvmath/bindings/cycufft.pxd +++ b/nvmath/bindings/cycufft.pxd @@ -1,8 +1,8 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 # -# This code was automatically generated across versions from 11.0.3 to 12.6.2. Do not modify it directly. +# This code was automatically generated across versions from 11.0.3 to 12.8.0. Do not modify it directly. # This layer exposes the C header to Cython as-is. 
############################################################################### @@ -158,83 +158,83 @@ ctypedef cufftDoubleReal (*cufftCallbackLoadD 'cufftCallbackLoadD')( void* callerInfo, void* sharedPointer ) -ctypedef void (*cufftCallbackStoreC 'cufftCallbackStoreC')( - void* dataOut, - size_t offset, - cufftComplex element, +ctypedef cufftComplex (*cufftJITCallbackLoadC 'cufftJITCallbackLoadC')( + void* dataIn, + unsigned long long offset, void* callerInfo, void* sharedPointer ) -ctypedef void (*cufftCallbackStoreZ 'cufftCallbackStoreZ')( - void* dataOut, - size_t offset, - cufftDoubleComplex element, +ctypedef cufftDoubleComplex (*cufftJITCallbackLoadZ 'cufftJITCallbackLoadZ')( + void* dataIn, + unsigned long long offset, void* callerInfo, void* sharedPointer ) -ctypedef void (*cufftCallbackStoreR 'cufftCallbackStoreR')( - void* dataOut, - size_t offset, - cufftReal element, +ctypedef cufftReal (*cufftJITCallbackLoadR 'cufftJITCallbackLoadR')( + void* dataIn, + unsigned long long offset, void* callerInfo, void* sharedPointer ) -ctypedef void (*cufftCallbackStoreD 'cufftCallbackStoreD')( - void* dataOut, - size_t offset, - cufftDoubleReal element, +ctypedef cufftDoubleReal (*cufftJITCallbackLoadD 'cufftJITCallbackLoadD')( + void* dataIn, + unsigned long long offset, void* callerInfo, void* sharedPointer ) -ctypedef cufftComplex (*cufftJITCallbackLoadC 'cufftJITCallbackLoadC')( - void* dataIn, - unsigned long long offset, +ctypedef void (*cufftCallbackStoreR 'cufftCallbackStoreR')( + void* dataOut, + size_t offset, + cufftReal element, void* callerInfo, void* sharedPointer ) -ctypedef cufftDoubleComplex (*cufftJITCallbackLoadZ 'cufftJITCallbackLoadZ')( - void* dataIn, +ctypedef void (*cufftJITCallbackStoreR 'cufftJITCallbackStoreR')( + void* dataOut, unsigned long long offset, + cufftReal element, void* callerInfo, void* sharedPointer ) -ctypedef cufftReal (*cufftJITCallbackLoadR 'cufftJITCallbackLoadR')( - void* dataIn, - unsigned long long offset, +ctypedef 
void (*cufftCallbackStoreD 'cufftCallbackStoreD')( + void* dataOut, + size_t offset, + cufftDoubleReal element, void* callerInfo, void* sharedPointer ) -ctypedef cufftDoubleReal (*cufftJITCallbackLoadD 'cufftJITCallbackLoadD')( - void* dataIn, +ctypedef void (*cufftJITCallbackStoreD 'cufftJITCallbackStoreD')( + void* dataOut, unsigned long long offset, + cufftDoubleReal element, void* callerInfo, void* sharedPointer ) -ctypedef void (*cufftJITCallbackStoreC 'cufftJITCallbackStoreC')( +ctypedef void (*cufftCallbackStoreC 'cufftCallbackStoreC')( void* dataOut, - unsigned long long offset, + size_t offset, cufftComplex element, void* callerInfo, void* sharedPointer ) -ctypedef void (*cufftJITCallbackStoreZ 'cufftJITCallbackStoreZ')( +ctypedef void (*cufftJITCallbackStoreC 'cufftJITCallbackStoreC')( void* dataOut, unsigned long long offset, - cufftDoubleComplex element, + cufftComplex element, void* callerInfo, void* sharedPointer ) -ctypedef void (*cufftJITCallbackStoreR 'cufftJITCallbackStoreR')( +ctypedef void (*cufftCallbackStoreZ 'cufftCallbackStoreZ')( void* dataOut, - unsigned long long offset, - cufftReal element, + size_t offset, + cufftDoubleComplex element, void* callerInfo, void* sharedPointer ) -ctypedef void (*cufftJITCallbackStoreD 'cufftJITCallbackStoreD')( +ctypedef void (*cufftJITCallbackStoreZ 'cufftJITCallbackStoreZ')( void* dataOut, unsigned long long offset, - cufftDoubleReal element, + cufftDoubleComplex element, void* callerInfo, void* sharedPointer ) diff --git a/nvmath/bindings/cycufft.pyx b/nvmath/bindings/cycufft.pyx index 561a30a..5e249f9 100644 --- a/nvmath/bindings/cycufft.pyx +++ b/nvmath/bindings/cycufft.pyx @@ -1,8 +1,8 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 # -# This code was automatically generated across versions from 11.0.3 to 12.6.2. Do not modify it directly. 
+# This code was automatically generated across versions from 11.0.3 to 12.8.0. Do not modify it directly. from ._internal cimport cufft as _cufft diff --git a/nvmath/bindings/cycurand.pxd b/nvmath/bindings/cycurand.pxd index e191e39..c377a61 100644 --- a/nvmath/bindings/cycurand.pxd +++ b/nvmath/bindings/cycurand.pxd @@ -1,8 +1,8 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 # -# This code was automatically generated across versions from 11.0.3 to 12.6.2. Do not modify it directly. +# This code was automatically generated across versions from 11.0.3 to 12.8.0. Do not modify it directly. # This layer exposes the C header to Cython as-is. from libc.stdint cimport int64_t diff --git a/nvmath/bindings/cycurand.pyx b/nvmath/bindings/cycurand.pyx index d2a5ec8..e9d44f2 100644 --- a/nvmath/bindings/cycurand.pyx +++ b/nvmath/bindings/cycurand.pyx @@ -1,8 +1,8 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 # -# This code was automatically generated across versions from 11.0.3 to 12.6.2. Do not modify it directly. +# This code was automatically generated across versions from 11.0.3 to 12.8.0. Do not modify it directly. from ._internal cimport curand as _curand diff --git a/nvmath/bindings/cycusolver.pxd b/nvmath/bindings/cycusolver.pxd index 85b2af7..85f3d80 100644 --- a/nvmath/bindings/cycusolver.pxd +++ b/nvmath/bindings/cycusolver.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. 
# # SPDX-License-Identifier: Apache-2.0 # diff --git a/nvmath/bindings/cycusolver.pyx b/nvmath/bindings/cycusolver.pyx index 10d66ab..69f772d 100644 --- a/nvmath/bindings/cycusolver.pyx +++ b/nvmath/bindings/cycusolver.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 # diff --git a/nvmath/bindings/cycusolverDn.pxd b/nvmath/bindings/cycusolverDn.pxd index 2c94637..377de4b 100644 --- a/nvmath/bindings/cycusolverDn.pxd +++ b/nvmath/bindings/cycusolverDn.pxd @@ -1,8 +1,8 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 # -# This code was automatically generated across versions from 11.0.3 to 12.6.2. Do not modify it directly. +# This code was automatically generated across versions from 11.0.3 to 12.8.0. Do not modify it directly. # This layer exposes the C header to Cython as-is. from libc.stdint cimport int64_t diff --git a/nvmath/bindings/cycusolverDn.pyx b/nvmath/bindings/cycusolverDn.pyx index 3d03f3c..be4f61c 100644 --- a/nvmath/bindings/cycusolverDn.pyx +++ b/nvmath/bindings/cycusolverDn.pyx @@ -1,8 +1,8 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 # -# This code was automatically generated across versions from 11.0.3 to 12.6.2. Do not modify it directly. +# This code was automatically generated across versions from 11.0.3 to 12.8.0. Do not modify it directly. 
from ._internal cimport cusolverDn as _cusolverDn diff --git a/nvmath/bindings/cycusparse.pxd b/nvmath/bindings/cycusparse.pxd index de2f01b..e9279fd 100644 --- a/nvmath/bindings/cycusparse.pxd +++ b/nvmath/bindings/cycusparse.pxd @@ -1,8 +1,8 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 # -# This code was automatically generated across versions from 11.0.3 to 12.6.2. Do not modify it directly. +# This code was automatically generated across versions from 11.0.3 to 12.8.0. Do not modify it directly. # This layer exposes the C header to Cython as-is. from libc.stdint cimport int64_t diff --git a/nvmath/bindings/cycusparse.pyx b/nvmath/bindings/cycusparse.pyx index b59feb7..0907680 100644 --- a/nvmath/bindings/cycusparse.pyx +++ b/nvmath/bindings/cycusparse.pyx @@ -1,8 +1,8 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 # -# This code was automatically generated across versions from 11.0.3 to 12.6.2. Do not modify it directly. +# This code was automatically generated across versions from 11.0.3 to 12.8.0. Do not modify it directly. from ._internal cimport cusparse as _cusparse diff --git a/nvmath/bindings/nvpl/__init__.py b/nvmath/bindings/nvpl/__init__.py index dfcdae0..6172ecb 100644 --- a/nvmath/bindings/nvpl/__init__.py +++ b/nvmath/bindings/nvpl/__init__.py @@ -1,7 +1,11 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 # type: ignore from . 
import fft + +__all__ = [ + "fft", +] diff --git a/nvmath/bindings/nvpl/_internal/fft.pxd b/nvmath/bindings/nvpl/_internal/fft.pxd index a422030..c5ba5eb 100644 --- a/nvmath/bindings/nvpl/_internal/fft.pxd +++ b/nvmath/bindings/nvpl/_internal/fft.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 # diff --git a/nvmath/bindings/nvpl/_internal/fft_linux.pyx b/nvmath/bindings/nvpl/_internal/fft_linux.pyx index 7421bc0..041d312 100644 --- a/nvmath/bindings/nvpl/_internal/fft_linux.pyx +++ b/nvmath/bindings/nvpl/_internal/fft_linux.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 # diff --git a/nvmath/bindings/nvpl/cyfft.pxd b/nvmath/bindings/nvpl/cyfft.pxd index c227697..d2530a5 100644 --- a/nvmath/bindings/nvpl/cyfft.pxd +++ b/nvmath/bindings/nvpl/cyfft.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 # diff --git a/nvmath/bindings/nvpl/cyfft.pyx b/nvmath/bindings/nvpl/cyfft.pyx index 052d7e8..dfb122e 100644 --- a/nvmath/bindings/nvpl/cyfft.pyx +++ b/nvmath/bindings/nvpl/cyfft.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 # diff --git a/nvmath/bindings/nvpl/fft.pxd b/nvmath/bindings/nvpl/fft.pxd index 449da11..3380a13 100644 --- a/nvmath/bindings/nvpl/fft.pxd +++ b/nvmath/bindings/nvpl/fft.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. 
+# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 # diff --git a/nvmath/bindings/nvpl/fft.pyx b/nvmath/bindings/nvpl/fft.pyx index 01d84ac..db58f06 100644 --- a/nvmath/bindings/nvpl/fft.pyx +++ b/nvmath/bindings/nvpl/fft.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 # diff --git a/nvmath/device/__init__.py b/nvmath/device/__init__.py index 6b167aa..d62c03f 100644 --- a/nvmath/device/__init__.py +++ b/nvmath/device/__init__.py @@ -1,14 +1,15 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 from .patch import patch_codegen patch_codegen() -from .common_cuda import * -from .cufftdx import * -from .cublasdx import * -from .cublasdx_backend import * -from .vector_types_numba import * + +from .common_cuda import * # noqa: E402, F403 +from .cufftdx import * # noqa: E402, F403 +from .cublasdx import * # noqa: E402, F403 +from .cublasdx_backend import * # noqa: E402, F403 +from .vector_types_numba import * # noqa: E402, F403 del patch_codegen diff --git a/nvmath/device/caching.py b/nvmath/device/caching.py index 1f31fe4..1c4d5aa 100644 --- a/nvmath/device/caching.py +++ b/nvmath/device/caching.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 diff --git a/nvmath/device/common.py b/nvmath/device/common.py index 53dd12a..42a241a 100644 --- a/nvmath/device/common.py +++ b/nvmath/device/common.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. 
+# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 @@ -9,7 +9,7 @@ SHARED_DEVICE_DOCSTRINGS = { - "compiler": "A string to specify the compiler for the device code, currently supports ``None`` (default) and ``'Numba'``", + "compiler": "A string to specify the compiler for the device code, currently supports ``None`` (default) and ``'numba'``", # "precision": """\ The computation precision specified as a numpy float dtype, currently supports ``numpy.float16``, ``numpy.float32`` and diff --git a/nvmath/device/common_cpp.py b/nvmath/device/common_cpp.py index 62e7990..a7c3781 100644 --- a/nvmath/device/common_cpp.py +++ b/nvmath/device/common_cpp.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 diff --git a/nvmath/device/common_cuda.py b/nvmath/device/common_cuda.py index adf9b25..ad87d71 100644 --- a/nvmath/device/common_cuda.py +++ b/nvmath/device/common_cuda.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 @@ -113,8 +113,15 @@ def get_current_device_cc(): CHECK_CUDART(err) err, prop = cudart.cudaGetDeviceProperties(device) CHECK_CUDART(err) + major, minor = prop.major, prop.minor + # TODO: dx does not support platforms > arch90 for now + if (major, minor) > (9, 0): + logging.info( + f"The current device supports compute capability {prop.major}.{prop.minor}, but the generated LTO version is capped at 9.0." 
+ ) + major, minor = 9, 0 logging.info(f"Using device {device} for default compute capability, found cc = {prop.major}.{prop.minor}") - return ComputeCapability(prop.major, prop.minor) + return ComputeCapability(major, minor) def get_default_code_type(): diff --git a/nvmath/device/common_mathdx.py b/nvmath/device/common_mathdx.py index 12c95d0..aed233e 100644 --- a/nvmath/device/common_mathdx.py +++ b/nvmath/device/common_mathdx.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 diff --git a/nvmath/device/common_numba.py b/nvmath/device/common_numba.py index b3f1287..f542f02 100644 --- a/nvmath/device/common_numba.py +++ b/nvmath/device/common_numba.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 diff --git a/nvmath/device/cublasdx.py b/nvmath/device/cublasdx.py index cc9e68c..65e8688 100644 --- a/nvmath/device/cublasdx.py +++ b/nvmath/device/cublasdx.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. 
# # SPDX-License-Identifier: Apache-2.0 @@ -117,6 +117,11 @@ def __init__( raise RuntimeError( "Minimal compute capability 7.0 is required by cuBLASDx, got " f"{code_type.cc.major}.{code_type.cc.minor}" ) + # TODO: cublasdx does not support platforms > arch90 for now + if (code_type.cc.major, code_type.cc.minor) > (9, 0): + raise RuntimeError( + f"The maximum compute capability currently supported by device APIs is 9.0, got {code_type.cc.major}.{code_type.cc.minor}" + ) if len(transpose_mode) != 2: raise ValueError( diff --git a/nvmath/device/cublasdx_backend.py b/nvmath/device/cublasdx_backend.py index 1cc27dd..6016485 100644 --- a/nvmath/device/cublasdx_backend.py +++ b/nvmath/device/cublasdx_backend.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 diff --git a/nvmath/device/cublasdx_numba.py b/nvmath/device/cublasdx_numba.py index 66b925c..73f374a 100644 --- a/nvmath/device/cublasdx_numba.py +++ b/nvmath/device/cublasdx_numba.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 diff --git a/nvmath/device/cufftdx.py b/nvmath/device/cufftdx.py index 264ff03..3df3e80 100644 --- a/nvmath/device/cufftdx.py +++ b/nvmath/device/cufftdx.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 @@ -87,8 +87,8 @@ class FFTOptions: - ``'real_mode'``, currently supports ``'normal'`` and ``'folded``. Note: - The class is not meant to used directly with its constructor. Users are instead - advised to use :func:`fft` create the object. 
+ The class is not meant to be used directly with its constructor. Users are instead + advised to use :func:`fft` to create the object. See Also: The attributes of this class provide a 1:1 mapping with the CUDA C++ cuFFTDx APIs. @@ -116,6 +116,11 @@ def __init__( raise RuntimeError( f"Minimal compute capability 7.0 is required by cuFFTDx, got {code_type.cc.major}.{code_type.cc.minor}" ) + # TODO: cufftdx does not support platforms > arch90 for now + if (code_type.cc.major, code_type.cc.minor) > (9, 0): + raise RuntimeError( + f"The maximum compute capability currently supported by device APIs is 9.0, got {code_type.cc.major}.{code_type.cc.minor}" + ) # # Check that the knobs are, individually, valid @@ -246,14 +251,6 @@ def create(self, **kwargs): # Private implementations # - def _valid(self, knob): - if knob == "elements_per_thread": - return [self._suggested_elements_per_thread] - elif knob == "ffts_per_block": - return [1, self._suggested_ffts_per_block] - else: - raise ValueError("Unsupported knob") - def _suggested(self, what): # Generate full PTX cpp = generate_block( @@ -449,7 +446,7 @@ def fft(*, compiler=None, **kwargs): compiler (str): {compiler} - code_type (CodeType): {code_type}. Optional if compiler is specified as ``'Numba'``. + code_type (CodeType): {code_type}. Optional if compiler is specified as ``'numba'``. execution (str): {execution} diff --git a/nvmath/device/cufftdx_backend.py b/nvmath/device/cufftdx_backend.py index f4e04e6..c24ddbc 100644 --- a/nvmath/device/cufftdx_backend.py +++ b/nvmath/device/cufftdx_backend.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. 
# # SPDX-License-Identifier: Apache-2.0 diff --git a/nvmath/device/cufftdx_db.py b/nvmath/device/cufftdx_db.py index 8857a59..ec079b2 100644 --- a/nvmath/device/cufftdx_db.py +++ b/nvmath/device/cufftdx_db.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 diff --git a/nvmath/device/cufftdx_numba.py b/nvmath/device/cufftdx_numba.py index 9c85741..e69b33b 100644 --- a/nvmath/device/cufftdx_numba.py +++ b/nvmath/device/cufftdx_numba.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 diff --git a/nvmath/device/cufftdx_workspace.py b/nvmath/device/cufftdx_workspace.py index 92a29fb..9ca0bd8 100644 --- a/nvmath/device/cufftdx_workspace.py +++ b/nvmath/device/cufftdx_workspace.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 diff --git a/nvmath/device/curand_kernel.py b/nvmath/device/curand_kernel.py index c811eeb..d446ed8 100644 --- a/nvmath/device/curand_kernel.py +++ b/nvmath/device/curand_kernel.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 diff --git a/nvmath/device/nvrtc.py b/nvmath/device/nvrtc.py index 3c48389..94609bc 100644 --- a/nvmath/device/nvrtc.py +++ b/nvmath/device/nvrtc.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. 
# # SPDX-License-Identifier: Apache-2.0 diff --git a/nvmath/device/patch.py b/nvmath/device/patch.py index 733fcbd..adf9682 100644 --- a/nvmath/device/patch.py +++ b/nvmath/device/patch.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 @@ -7,12 +7,9 @@ # support LTO code generation and linking # find libnvvm located in PYPI wheels # - -import os import functools import numba -from numba.cuda.cudadrv import libs import numba.cuda.cudadrv.nvrtc as nvrtc import numba.cuda.cudadrv.nvvm as nvvm import pynvjitlink.patch # type: ignore diff --git a/nvmath/device/random.py b/nvmath/device/random.py index 7892faa..4efc763 100644 --- a/nvmath/device/random.py +++ b/nvmath/device/random.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 diff --git a/nvmath/device/random_helpers.py b/nvmath/device/random_helpers.py index d063e27..3508c4a 100644 --- a/nvmath/device/random_helpers.py +++ b/nvmath/device/random_helpers.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 diff --git a/nvmath/device/random_states.py b/nvmath/device/random_states.py index a544149..edcad17 100644 --- a/nvmath/device/random_states.py +++ b/nvmath/device/random_states.py @@ -1,12 +1,11 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. 
# # SPDX-License-Identifier: Apache-2.0 import operator -import re from nvmath.device import curand_kernel -from nvmath.device.common_mathdx import CURAND_HOME +from nvmath.device.common_mathdx import CURAND_HOME # noqa: F401 from numba import cuda, types from numba.extending import models, register_model, typeof_impl @@ -17,11 +16,6 @@ from llvmlite import ir -import os -import logging - -from numba import config - xorwow_dtype = np.dtype( [ ("d", np.uint32), diff --git a/nvmath/device/types.py b/nvmath/device/types.py index 89fdcb2..c653b94 100644 --- a/nvmath/device/types.py +++ b/nvmath/device/types.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 diff --git a/nvmath/device/vector_types_numba.py b/nvmath/device/vector_types_numba.py index 1426ced..50e3ca0 100644 --- a/nvmath/device/vector_types_numba.py +++ b/nvmath/device/vector_types_numba.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 diff --git a/nvmath/fft/__init__.py b/nvmath/fft/__init__.py index 874dce5..f9ec48d 100644 --- a/nvmath/fft/__init__.py +++ b/nvmath/fft/__init__.py @@ -1,7 +1,7 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. 
# # SPDX-License-Identifier: Apache-2.0 -from ._configuration import * -from ._helpers import * -from .fft import * +from ._configuration import * # noqa: F403 +from ._helpers import * # noqa: F403 +from .fft import * # noqa: F403 diff --git a/nvmath/fft/_configuration.py b/nvmath/fft/_configuration.py index 7c583a6..6abddaa 100644 --- a/nvmath/fft/_configuration.py +++ b/nvmath/fft/_configuration.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 @@ -8,9 +8,7 @@ from enum import IntEnum from logging import Logger from typing import Literal -import warnings -from nvmath._internal.mem_limit import MEM_LIMIT_RE_PCT, MEM_LIMIT_RE_VAL, MEM_LIMIT_DOC from nvmath.memory import BaseCUDAMemoryManager @@ -74,12 +72,6 @@ class FFTOptions: * (m - 1) + 1`. The specified value should be either ``'even'`` or ``'odd'``, with the default being ``'even'``. - last_axis_size: See :attr:`last_axis_parity`. - - .. deprecated:: 0.2.1 - :attr:`last_axis_size` will be removed in 0.3.0. Use - :attr:`last_axis_parity` instead. - result_layout: The layout to use for the result, either ``'natural'`` or ``'optimized'``. For the ``'natural'`` option, the result layout is the same as that of the operand. 
The default is ``'optimized'``, which generally provides @@ -123,7 +115,6 @@ class FFTOptions: fft_type: Literal["C2C", "C2R", "R2C"] | None = None inplace: bool = False last_axis_parity: Literal["even", "odd"] | None = "even" - last_axis_size: None = None result_layout: Literal["natural", "optimized"] | None = "optimized" device_id: int | None = None logger: Logger | None = None @@ -138,14 +129,6 @@ def __post_init__(self): if not isinstance(self.inplace, bool): raise ValueError("The value specified for 'inplace' must be of type bool (True or False).") - # TODO: Remove in version 0.3.0 - if self.last_axis_size is not None: - warnings.warn( - "FFTOptions.last_axis_size is deprecated and will be removed in version 0.3.0. " - "Use FFTOptions.last_axis_parity instead.", - DeprecationWarning, - ) - self.last_axis_parity = self.last_axis_size valid_last_axis_parity = ["even", "odd"] if self.last_axis_parity not in valid_last_axis_parity: raise ValueError(f"The value specified for 'last_axis_parity' must be one of {valid_last_axis_parity}.") diff --git a/nvmath/fft/_exec_utils.py b/nvmath/fft/_exec_utils.py index dfcc05c..2076f34 100644 --- a/nvmath/fft/_exec_utils.py +++ b/nvmath/fft/_exec_utils.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 @@ -31,7 +31,7 @@ def _check_init_cufft(): ) from e try: - import cupy + import cupy # noqa: F401 except ImportError as e: raise RuntimeError("Currently, the FFT CUDA execution requires cupy. Please make sure cupy is installed.") from e diff --git a/nvmath/fft/_helpers.py b/nvmath/fft/_helpers.py index 9a0c4c9..393bdbc 100644 --- a/nvmath/fft/_helpers.py +++ b/nvmath/fft/_helpers.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. 
# # SPDX-License-Identifier: Apache-2.0 @@ -24,7 +24,7 @@ def _check_numba_available(): try: - import numba + import numba # noqa: F401 except ModuleNotFoundError as e: raise RuntimeError("Numba is required to compile FFT prolog and epilog functions.") from e diff --git a/nvmath/fft/fft.py b/nvmath/fft/fft.py index 8af5838..6d6ca15 100644 --- a/nvmath/fft/fft.py +++ b/nvmath/fft/fft.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 @@ -464,7 +464,7 @@ def get_fft_plan_traits( if last_axis_size == 0: raise ValueError( f"The size of the last FFT axis in the result for FFT type '{fft_abstract_type}' is 0 for operand shape = " - f"{operand_shape} and axes = {axes}. To fix this, provide 'last_axis_size' = 'odd' to the FFT options." + f"{operand_shape} and axes = {axes}. To fix this, provide 'last_axis_parity' = 'odd' to the FFT options." ) ordered_fft_out_shape = list(ordered_fft_in_shape) index = ordered_axes.index(last_axis_id) @@ -839,7 +839,12 @@ def create_fft_key(operand, *, axes=None, options=None, execution=None, inplace= # Prolog and epilog, if used. if prolog is not None or epilog is not None: - get_data = lambda device_callable: None if device_callable is None else (device_callable.ltoir, device_callable.data) + prolog = utils.check_or_create_options(_configuration.DeviceCallable, prolog, "prolog", keep_none=True) + epilog = utils.check_or_create_options(_configuration.DeviceCallable, epilog, "epilog", keep_none=True) + + def get_data(device_callable): + return None if device_callable is None else (device_callable.ltoir, device_callable.data) + callable_data = get_data(prolog), get_data(epilog) else: callable_data = None @@ -1071,7 +1076,7 @@ class FFT: - The input must be Hermitian-symmetric when :attr:`FFTOptions.fft_type` is ``'C2R'``, otherwise the result is undefined. 
As a specific example, if the input for a C2R FFT was generated using an R2C FFT with an odd last axis size, then - :attr:`FFTOptions.last_axis_size` must be set to `odd` to recover the original + :attr:`FFTOptions.last_axis_parity` must be set to `odd` to recover the original signal. """ @@ -1823,6 +1828,7 @@ def _release_workspace_memory_perhaps(self, exception: Exception | None = None) "the value of 'workspace_allocated_here'." ) self._free_workspace_memory_perhaps(release_workspace) + self._workspace_allocated_here = False return True @utils.precondition(_check_valid_fft) @@ -2199,9 +2205,9 @@ def irfft(x, *, axes=None, options=None, execution=None, prolog=None, epilog=Non Returns: A real tensor that remains on the same device and belongs to the same package as the input operand. The extent of the last transformed axis in the result will be - ``(operand.shape[axes[-1]] - 1) * 2`` if :attr:`FFTOptions.last_axis_size` is + ``(operand.shape[axes[-1]] - 1) * 2`` if :attr:`FFTOptions.last_axis_parity` is ``even``, or ``operand.shape[axes[-1]] * 2 - 1`` if - :attr:`FFTOptions.last_axis_size` is ``odd``. + :attr:`FFTOptions.last_axis_parity` is ``odd``. See Also: :func:`fft`, :func:`ifft`, :class:`FFT`. @@ -2239,7 +2245,7 @@ def irfft(x, *, axes=None, options=None, execution=None, prolog=None, epilog=Non example, 1-D transforms require the first element (and the last element, if the extent is even) of the input to be purely real-valued. In addition, if the input to `irfft` was generated using an R2C FFT with an odd last axis size, - :attr:`FFTOptions.last_axis_size` must be set to ``odd`` to recover the original + :attr:`FFTOptions.last_axis_parity` must be set to ``odd`` to recover the original signal. 
- For more details, please refer to `C2R example `_ diff --git a/nvmath/linalg/__init__.py b/nvmath/linalg/__init__.py index 3422703..ebe6de8 100644 --- a/nvmath/linalg/__init__.py +++ b/nvmath/linalg/__init__.py @@ -1,5 +1,11 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 from . import advanced +from nvmath.bindings.cublas import ComputeType # type: ignore + +__all__ = [ + "advanced", + "ComputeType", +] diff --git a/nvmath/linalg/_internal/__init__.py b/nvmath/linalg/_internal/__init__.py index 281f836..6ff03ac 100644 --- a/nvmath/linalg/_internal/__init__.py +++ b/nvmath/linalg/_internal/__init__.py @@ -1,5 +1,3 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 - -from .enum_to_tuples import * diff --git a/nvmath/linalg/_internal/algo_cap_ifc.py b/nvmath/linalg/_internal/algo_cap_ifc.py index 90bb56f..59a7b4c 100644 --- a/nvmath/linalg/_internal/algo_cap_ifc.py +++ b/nvmath/linalg/_internal/algo_cap_ifc.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 diff --git a/nvmath/linalg/_internal/algo_config_ifc.py b/nvmath/linalg/_internal/algo_config_ifc.py index ccc7837..442d8e4 100644 --- a/nvmath/linalg/_internal/algo_config_ifc.py +++ b/nvmath/linalg/_internal/algo_config_ifc.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. 
# # SPDX-License-Identifier: Apache-2.0 diff --git a/nvmath/linalg/_internal/enum_to_tuples.py b/nvmath/linalg/_internal/enum_to_tuples.py index 7ec8360..42fba08 100644 --- a/nvmath/linalg/_internal/enum_to_tuples.py +++ b/nvmath/linalg/_internal/enum_to_tuples.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 diff --git a/nvmath/linalg/_internal/epilog_protocol.py b/nvmath/linalg/_internal/epilog_protocol.py index 3e284f6..a44ec04 100644 --- a/nvmath/linalg/_internal/epilog_protocol.py +++ b/nvmath/linalg/_internal/epilog_protocol.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 @@ -135,7 +135,7 @@ def update_ptr(self, mm_desc_ifc, ptr): class BiasHandler(EpilogInputHandler): - def __init__(self, logger, mm_traits, enumerator, d_dtype_name): + def __init__(self, logger, mm_traits, enumerator, c_dtype_name, d_dtype_name, aux_dtype_name): self.logger = logger self.mm_traits = mm_traits @@ -264,7 +264,7 @@ def gelu_aux_mm_shape(m, n): class ReluAuxHandler(EpilogOutputHandler): - def __init__(self, logger, mm_traits, enumerator, d_dtype_name): + def __init__(self, logger, mm_traits, enumerator, c_dtype_name, d_dtype_name, aux_dtype_name): self.logger = logger self.mm_traits = mm_traits @@ -279,6 +279,8 @@ def __init__(self, logger, mm_traits, enumerator, d_dtype_name): self.aux_shape = mm_traits.batch_shape + [m, n] aux_axis_order = [batch_len, batch_len + 1] + list(mm_traits.batch_axis_order) # Column order for the bitmask. 
self.aux_strides = calculate_strides(self.aux_shape, aux_axis_order) + if aux_dtype_name is not None: + raise ValueError("Custom type for auxiliary outputs is not supported for RELU epilogs.") self.aux_dtype_name = "uint8" # We store bitmask using int8 dtype but the values below are in number of elements. @@ -309,7 +311,7 @@ def update_ptr(self, mm_desc_ifc, ptr): class GeluAuxHandler(EpilogOutputHandler): - def __init__(self, logger, mm_traits, enumerator, d_dtype_name): + def __init__(self, logger, mm_traits, enumerator, c_dtype_name, d_dtype_name, aux_dtype_name): self.logger = logger self.mm_traits = mm_traits @@ -326,7 +328,15 @@ def __init__(self, logger, mm_traits, enumerator, d_dtype_name): self.aux_shape = mm_traits.batch_shape + [m, n] aux_axis_order = [batch_len, batch_len + 1] + list(mm_traits.batch_axis_order) # Column order for the GELU inputs. self.aux_strides = calculate_strides(self.aux_shape, aux_axis_order) - self.aux_dtype_name = d_dtype_name + + self.version = cublaslt.get_version() + + if aux_dtype_name: + if self.version < 120800: + raise ValueError("Specifying custom AUX data type is not supported for cuBLAS < 12.8.") + self.aux_dtype_name = aux_dtype_name + else: + self.aux_dtype_name = c_dtype_name if "float8" in d_dtype_name else d_dtype_name self.aux_ld = m # should be consistent with order (currently COL). self.aux_batch_offset = m * n @@ -347,9 +357,10 @@ def update(self, mm_desc_ifc): mm_desc_ifc.epilogue_aux_ld = self.aux_ld # Set the aux batch offset. mm_desc_ifc.epilogue_aux_batch_stride = self.aux_batch_offset - # The aux data type is by default the data type of the result for all the cases we - # support. - assert self.aux_dtype_name == self.d_dtype_name, "Internal error." + # Set the pointer to 0x1 to bypass the cuBLAS check. 
+ mm_desc_ifc.epilogue_aux_pointer = 0x1 + if self.aux_dtype_name is not None and self.version >= 120800: + mm_desc_ifc.epilogue_aux_data_type = typemaps.NAME_TO_DATA_TYPE[self.aux_dtype_name] def update_ptr(self, mm_desc_ifc, ptr): # Set the aux pointer. @@ -357,7 +368,7 @@ def update_ptr(self, mm_desc_ifc, ptr): class BgradHandler(EpilogOutputHandler): - def __init__(self, logger, mm_traits, enumerator, d_dtype_name): + def __init__(self, logger, mm_traits, enumerator, c_dtype_name, d_dtype_name, aux_dtype_name): self.logger = logger self.mm_traits = mm_traits @@ -372,6 +383,9 @@ def __init__(self, logger, mm_traits, enumerator, d_dtype_name): self._name = enumerator.name.lower() + if aux_dtype_name is not None: + raise ValueError("Custom type for auxiliary outputs is not supported for RELU epilogs.") + m = mm_traits.N if enumerator == Epilog.BGRADB else mm_traits.M batch_len = len(mm_traits.batch_axis_order) @@ -418,7 +432,7 @@ def update_ptr(self, mm_desc_ifc, ptr): class DReluAuxHandler(EpilogInputHandler): - def __init__(self, logger, mm_traits, enumerator, d_dtype_name): + def __init__(self, logger, mm_traits, enumerator, c_dtype_name, d_dtype_name, aux_dtype_name): self.logger = logger self.mm_traits = mm_traits @@ -520,7 +534,7 @@ def update(self, mm_desc_ifc, relu_aux_tensor): class DGeluAuxHandler(EpilogInputHandler): - def __init__(self, logger, mm_traits, enumerator, d_dtype_name): + def __init__(self, logger, mm_traits, enumerator, c_dtype_name, d_dtype_name, aux_dtype_name): self.logger = logger self.mm_traits = mm_traits diff --git a/nvmath/linalg/_internal/matmul_desc_ifc.py b/nvmath/linalg/_internal/matmul_desc_ifc.py index 5a3bb39..a70860c 100644 --- a/nvmath/linalg/_internal/matmul_desc_ifc.py +++ b/nvmath/linalg/_internal/matmul_desc_ifc.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. 
# # SPDX-License-Identifier: Apache-2.0 @@ -8,19 +8,14 @@ __all__ = ["MatmulDescInterface"] -from collections.abc import Sequence -import itertools -import numbers -import operator import ctypes import logging -logger = logging.getLogger() - import numpy as np from nvmath.bindings import cublasLt as cublaslt +logger = logging.getLogger() DescEnum = cublaslt.MatmulDescAttribute diff --git a/nvmath/linalg/_internal/matmul_pref_ifc.py b/nvmath/linalg/_internal/matmul_pref_ifc.py index fbea322..5b06d72 100644 --- a/nvmath/linalg/_internal/matmul_pref_ifc.py +++ b/nvmath/linalg/_internal/matmul_pref_ifc.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 @@ -9,19 +9,15 @@ __all__ = ["MatmulPreferenceInterface"] -from collections.abc import Sequence -import itertools -import numbers -import operator import logging -logger = logging.getLogger() - import numpy as np from nvmath.bindings import cublasLt as cublaslt +logger = logging.getLogger() + PreferenceEnum = cublaslt.MatmulPreferenceAttribute diff --git a/nvmath/linalg/_internal/matrix_layout_ifc.py b/nvmath/linalg/_internal/matrix_layout_ifc.py index 2a2bf22..b615117 100644 --- a/nvmath/linalg/_internal/matrix_layout_ifc.py +++ b/nvmath/linalg/_internal/matrix_layout_ifc.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. 
# # SPDX-License-Identifier: Apache-2.0 @@ -8,10 +8,6 @@ __all__ = ["MatrixLayoutInterface"] -from collections.abc import Sequence -import itertools -import numbers -import operator import numpy as np diff --git a/nvmath/linalg/_internal/typemaps.py b/nvmath/linalg/_internal/typemaps.py index ea17b11..736c57a 100644 --- a/nvmath/linalg/_internal/typemaps.py +++ b/nvmath/linalg/_internal/typemaps.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 @@ -6,11 +6,16 @@ Functions to link type names with CUBLAS compute types. """ -__all__ = ["NAME_TO_DEFAULT_SCALE_TYPE", "NAME_TO_DEFAULT_COMPUTE_TYPE"] +__all__ = [ + "NAMES_TO_DEFAULT_SCALE_TYPE", + "NAMES_TO_DEFAULT_COMPUTE_TYPE", + "SUPPORTED_TYPES", + "COMPUTE_TYPE_TO_DEFAULT_SCALE_TYPE", + "SCALE_TYPE_TO_DEFAULT_COMPUTE_TYPE", +] from nvmath.bindings import cublas # type: ignore from nvmath._internal.typemaps import cudaDataType -import re def create_default_scale_type_map(): @@ -22,22 +27,23 @@ def create_default_scale_type_map(): dt = cudaDataType scale_type_map = dict() - # scale_type_map['float8'] = dt.CUDA_R_32F # both CUDA_R_8F_E4M3 and CUDA_R_8F_E5M2 -> - # CUDA_R_32F - scale_type_map["bfloat16"] = dt.CUDA_R_32F - scale_type_map["float16"] = dt.CUDA_R_32F - scale_type_map["float32"] = dt.CUDA_R_32F - scale_type_map["float64"] = dt.CUDA_R_64F - scale_type_map["complex32"] = dt.CUDA_C_32F - scale_type_map["complex64"] = dt.CUDA_C_32F - scale_type_map["complex128"] = dt.CUDA_C_64F + scale_type_map["float8_e4m3fn", "float8_e5m2"] = dt.CUDA_R_32F + scale_type_map["float8_e5m2", "float8_e4m3fn"] = dt.CUDA_R_32F + scale_type_map["float8_e4m3fn", "float8_e4m3fn"] = dt.CUDA_R_32F + scale_type_map["bfloat16", "bfloat16"] = dt.CUDA_R_32F + scale_type_map["float16", "float16"] = dt.CUDA_R_32F + scale_type_map["float32", "float32"] = dt.CUDA_R_32F + 
scale_type_map["float64", "float64"] = dt.CUDA_R_64F + scale_type_map["complex32", "complex32"] = dt.CUDA_C_32F + scale_type_map["complex64", "complex64"] = dt.CUDA_C_32F + scale_type_map["complex128", "complex128"] = dt.CUDA_C_64F return scale_type_map def create_compute_type_to_scale_type_map(is_complex): """ - Map the compute type to the corresponding CUDA data type that's appropriate for + Map the compute type to the corresponding CUDA data type that's appropriate for default scale. """ @@ -82,22 +88,23 @@ def create_scale_type_to_compute_type_map(): def create_compute_type_map(): """ - Map the data type name to the corresponding CUDA data type that's appropriate for + Map the data type name to the corresponding CUDA data type that's appropriate for default scale. """ ct = cublas.ComputeType compute_type_map = dict() - # compute_type_map['float8'] = ct.COMPUTE_32F - # both CUDA_R_8F_E4M3 and CUDA_R_8F_E5M2 -> CUBLAS_COMPUTE_32F - compute_type_map["bfloat16"] = ct.COMPUTE_32F - compute_type_map["float16"] = ct.COMPUTE_32F - compute_type_map["float32"] = ct.COMPUTE_32F - compute_type_map["float64"] = ct.COMPUTE_64F - compute_type_map["complex32"] = ct.COMPUTE_32F - compute_type_map["complex64"] = ct.COMPUTE_32F - compute_type_map["complex128"] = ct.COMPUTE_64F + compute_type_map["float8_e4m3fn", "float8_e5m2"] = ct.COMPUTE_32F + compute_type_map["float8_e5m2", "float8_e4m3fn"] = ct.COMPUTE_32F + compute_type_map["float8_e4m3fn", "float8_e4m3fn"] = ct.COMPUTE_32F + compute_type_map["bfloat16", "bfloat16"] = ct.COMPUTE_32F + compute_type_map["float16", "float16"] = ct.COMPUTE_32F + compute_type_map["float32", "float32"] = ct.COMPUTE_32F + compute_type_map["float64", "float64"] = ct.COMPUTE_64F + compute_type_map["complex32", "complex32"] = ct.COMPUTE_32F + compute_type_map["complex64", "complex64"] = ct.COMPUTE_32F + compute_type_map["complex128", "complex128"] = ct.COMPUTE_64F return compute_type_map @@ -107,10 +114,22 @@ def create_compute_type_map(): 
cublas.ComputeType.COMPUTE_64F: ("float64", "complex128"), } -NAME_TO_DEFAULT_SCALE_TYPE = create_default_scale_type_map() -NAME_TO_DEFAULT_COMPUTE_TYPE = create_compute_type_map() +NAMES_TO_DEFAULT_SCALE_TYPE = create_default_scale_type_map() +NAMES_TO_DEFAULT_COMPUTE_TYPE = create_compute_type_map() COMPUTE_TYPE_TO_DEFAULT_SCALE_TYPE = { "real": create_compute_type_to_scale_type_map(is_complex=False), "complex": create_compute_type_to_scale_type_map(is_complex=True), } SCALE_TYPE_TO_DEFAULT_COMPUTE_TYPE = create_scale_type_to_compute_type_map() + +SUPPORTED_TYPES = [ + "float8_e4m3fn", + "float8_e5m2", + "bfloat16", + "float16", + "float32", + "float64", + "complex32", + "complex64", + "complex128", +] diff --git a/nvmath/linalg/_internal/utils.py b/nvmath/linalg/_internal/utils.py index fe47e29..bf0bce8 100644 --- a/nvmath/linalg/_internal/utils.py +++ b/nvmath/linalg/_internal/utils.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 diff --git a/nvmath/linalg/advanced/__init__.py b/nvmath/linalg/advanced/__init__.py index 7ba8dbe..5f9cb25 100644 --- a/nvmath/linalg/advanced/__init__.py +++ b/nvmath/linalg/advanced/__init__.py @@ -1,7 +1,8 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 -from ._algorithmmod import * -from ._configuration import * -from .matmulmod import * +from ._algorithmmod import * # noqa: F403 +from ._configuration import * # noqa: F403 +from .matmulmod import * # noqa: F403 +from . 
import helpers as helpers # noqa: F403 diff --git a/nvmath/linalg/advanced/_algorithmmod.py b/nvmath/linalg/advanced/_algorithmmod.py index c7db5db..2019d02 100644 --- a/nvmath/linalg/advanced/_algorithmmod.py +++ b/nvmath/linalg/advanced/_algorithmmod.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 @@ -10,7 +10,6 @@ import dataclasses -import numpy as np from nvmath.linalg.advanced._configuration import AlgorithmCapabilities from nvmath.linalg._internal.algo_cap_ifc import AlgoCapInterface diff --git a/nvmath/linalg/advanced/_configuration.py b/nvmath/linalg/advanced/_configuration.py index cd5298d..ef7bdc3 100644 --- a/nvmath/linalg/advanced/_configuration.py +++ b/nvmath/linalg/advanced/_configuration.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 @@ -7,7 +7,9 @@ "MatmulInnerShape", "MatmulNumericalImplFlags", "MatmulOptions", + "MatmulEpilogPreferences", "MatmulPlanPreferences", + "MatmulQuantizationScales", "MatmulReductionScheme", "matrix_qualifiers_dtype", ] @@ -22,8 +24,8 @@ from nvmath.bindings import cublas # type: ignore from nvmath.bindings import cublasLt as cublaslt # type: ignore from nvmath._internal import enum_utils +from nvmath._internal.utils import check_or_create_options from nvmath._internal.mem_limit import check_memory_str -from nvmath._internal.mem_limit import MEM_LIMIT_RE_PCT, MEM_LIMIT_RE_VAL, MEM_LIMIT_DOC from nvmath.memory import BaseCUDAMemoryManager from nvmath._utils import CudaDataType @@ -39,11 +41,28 @@ class MatmulOptions: Attributes: compute_type (nvmath.linalg.ComputeType): CUDA compute type. A suitable compute type - will be selected if not specified. + will be selected if not specified. 
scale_type (nvmath.CudaDataType): CUDA data type. A suitable data type consistent with the compute type will be selected if not specified. + result_type (nvmath.CudaDataType): CUDA data type. A requested datatype of the + result. If not specified, this type will be determined based on the input types. + Non-default result types are only supported for narrow-precision (FP8 and lower) + operations. + + result_amax (bool): If set, the absolute maximum (amax) of the result will be + returned in the auxiliary output tensor. Only supported for narrow-precision + (FP8 and lower) operations. + + block_scaling (bool): If set, block scaling (MXFP8) will be used instead of + tensor-wide scaling for FP8 operations. If the result is a narrow-precision + (FP8 and lower) data type, scales used for result quantization will be returned + in the auxiliary output tensor as ``"d_out_scale"`` in UE8M0 format. For more + information on UE8M0 format, see the documentation of + :class:`~linalg.advanced.MatmulQuantizationScales`. + This option is only supported for narrow-precision (FP8 and lower) operations. + sm_count_target (int) : The number of SMs to use for execution. The default is 0, corresponding to all available SMs. @@ -82,6 +101,9 @@ class MatmulOptions: compute_type: int | None = None scale_type: int | None = None + result_type: int | None = None + result_amax: bool = False + block_scaling: bool = False sm_count_target: int | None = 0 fast_accumulation: bool | None = False device_id: int | None = None @@ -157,29 +179,61 @@ class MatmulNumericalImplFlags(IntEnum): ALL = (1 << 64) - 1 +@dataclasses.dataclass +class MatmulEpilogPreferences: + """A data class for providing epilog options as part of ``preferences`` to the + :meth:`Matmul.plan` method and the wrapper function :func:`matmul`. + + Attributes: + aux_type (nvmath.CudaDataType): The requested datatype of the + epilog auxiliary output. If not specified, this type will be determined based on + the input types. 
Non-default auxiliary output types are only supported for + narrow-precision operations and certain epilogs. For more details on the + supported combinations, see ``CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_DATA_TYPE`` in + cuBLAS documentation. If this option is set to a narrow-precision data type, + an additional epilog input ``"aux_quantization_scale"`` needs to be specified. + + aux_amax (bool): If set, the absolute maximum (amax) of the epilog + auxiliary output will be returned in the auxiliary output tensor. + Only supported when ``aux_type`` option is set to a narrow-precision + data type. + + See Also: + :meth:`Matmul.plan`, :func:`matmul`, :class:`MatmulPlanPreferences` + """ + + aux_type: int | None = None + aux_amax: bool = False + + @dataclasses.dataclass class MatmulPlanPreferences: """A data class for providing options to the :meth:`Matmul.plan` method and the wrapper function :func:`matmul`. Attributes: - reduction_scheme_mask (object of type - :class:`linalg.advanced.MatmulReductionScheme`) : Enumerators from - :class:`linalg.advanced.MatmulReductionScheme` combined with bitwise operator - ``|``. The default is all reduction schemes. + reduction_scheme_mask (:class:`nvmath.linalg.advanced.MatmulReductionScheme`): + Enumerators from + :class:`nvmath.linalg.advanced.MatmulReductionScheme` combined with + bitwise operator ``|``. The default is all reduction schemes. max_waves_count (float) : The maximum wave count. Selecting a value greater than 0. will exclude algorithms with device utilization greater than specified. The default is 0. - numerical_impl_mask (object of type - :class:`linalg.advanced.MatmulNumericalImplFlags`) : Enumerators from - :class:`linalg.advanced.MatmulNumericalImplFlags` combined with bitwise operator - ``|``. The default is all numerical implementation flag choices. 
+ numerical_impl_mask (:class:`nvmath.linalg.advanced.MatmulNumericalImplFlags`): + Enumerators from + :class:`nvmath.linalg.advanced.MatmulNumericalImplFlags` combined with + bitwise operator ``|``. The default is all numerical implementation flag + choices. limit (int) : The number of algorithms to consider. If not specified, a suitable default will be chosen. + epilog (:class:`nvmath.linalg.advanced.MatmulEpilogPreferences`): + Epilog preferences (as an object of class + :class:`~nvmath.linalg.advanced.MatmulEpilogPreferences` or a `dict`). + See Also: :meth:`Matmul.plan`, :func:`matmul` """ @@ -188,6 +242,7 @@ class MatmulPlanPreferences: max_waves_count: float | None = 0.0 numerical_impl_mask: MatmulNumericalImplFlags | None = MatmulNumericalImplFlags.ALL limit: int = 8 + epilog: MatmulEpilogPreferences | None = None def __post_init__(self): if self.reduction_scheme_mask is None: @@ -204,6 +259,42 @@ def __post_init__(self): if self.limit is None: self.limit = MatmulPlanPreferences.limit + self.epilog = check_or_create_options(MatmulEpilogPreferences, self.epilog, "epilog preferences") + + +@dataclasses.dataclass +class MatmulQuantizationScales: + """A data class for providing quantization_scales to :class:`Matmul` constructor and the + wrapper function :func:`matmul`. + + Scales can only be set for narrow-precision (FP8 and lower) matrices. + + When ``MatmulOptions.block_scaling=False``, each scale can either be a scalar (integer + or float) or a single-element tensor of shape ``()`` or ``(1,)``. + + When ``MatmulOptions.block_scaling=True``, each scale should be a 1D ``uint8`` tensor + with layout matching the requirements of cuBLAS MXFP8 scaling tensor. Values in the + tensor will be interpreted as UE8M0 values. This means that a value :math:`x` in the + scaling tensor will cause cuBLAS to multiply the respective block by :math:`2^{x-127}`. + + Attributes: + a (float or Tensor) : Scale for matrix A. + + b (float or Tensor) : Scale for matrix B. 
+ + c (float or Tensor) : Scale for matrix C. + + d (float or Tensor) : Scale for matrix D. + + See Also: + :class:`Matmul`, :func:`matmul` + """ + + a : float | None = None + b : float | None = None + c : float | None = None + d : float | None = None + _create_options = enum_utils.create_options_class_from_enum _algo_cap_enum = cublaslt.MatmulAlgoCapAttribute diff --git a/nvmath/_version.py b/nvmath/linalg/advanced/helpers/__init__.py similarity index 72% rename from nvmath/_version.py rename to nvmath/linalg/advanced/helpers/__init__.py index 506a64f..bc833c9 100644 --- a/nvmath/_version.py +++ b/nvmath/linalg/advanced/helpers/__init__.py @@ -2,4 +2,4 @@ # # SPDX-License-Identifier: Apache-2.0 -__version__ = "0.2.1" +from . import matmul as matmul # noqa: F403 diff --git a/nvmath/linalg/advanced/helpers/matmul.py b/nvmath/linalg/advanced/helpers/matmul.py new file mode 100644 index 0000000..fe47c05 --- /dev/null +++ b/nvmath/linalg/advanced/helpers/matmul.py @@ -0,0 +1,173 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 + +from nvmath._internal.tensor_wrapper import wrap_operand +from nvmath._internal.utils import create_empty_tensor, infer_object_package, get_or_create_stream + +__all__ = ["create_mxfp8_scale", "invert_mxfp8_scale", "get_mxfp8_scale_offset", "apply_mxfp8_scale"] + + +def _validate_tensor(x, where, tensor_name="tensor", dtype=None): + """ + Validate the tensor package and dtype. + Args: + x: wrapped tensor object + where: name of the function that is performing the validation + tensor_name: name of the tensor to use in the error messages + dtype: if not None, check that the object dtype matches the specified dtype + """ + package = infer_object_package(x) + if package != "torch": + raise ValueError( + f"Only torch.Tensor is currently supported by function '{where}'; the " + f"specified {tensor_name} belongs to '{package}' package." 
+ ) + x = wrap_operand(x) + if dtype is not None and x.dtype != dtype: + raise ValueError( + f"The function '{where}' requires the specified {tensor_name} to have dtype " + f"'{dtype}', whereas it has dtype '{x.dtype}'." + ) + + +def _validate_mxfp8_scale(scale, where, x=None): + if x is not None: + _validate_tensor(x, where) + x = wrap_operand(x) + _validate_tensor(scale, where, tensor_name="scale tensor", dtype="uint8") + scale = wrap_operand(scale) + + if x is not None and scale.shape != (x.size // 32,): + raise ValueError( + f"The shape of scale {scale.shape} is not compatible with a tensor of shape {x.shape}. " + f"The expected scale shape is {(x.size // 32,)}." + ) + return scale + + +def create_mxfp8_scale(x, exponent, stream=None): + """ + Create MXFP8 block scale with the same value for the whole tensor ``x``. + + Args: + x: The tensor to create the block scale for + + exponent: An integer from [-127, 128] range. Effective scale will be ``2^exponent``. + + stream: Optional stream to create the block scale on. + Defaults to the stream of ``x``. + + Returns: + An MXFP8 block scale factors tensor to be used with MXFP8 computations. + """ + _validate_tensor(x, "create_mxfp8_scale") + x = wrap_operand(x) + + if not -127 <= exponent <= 128: + raise ValueError("The exponent should be an integer from [-127, 128] range.") + + stream_holder = get_or_create_stream(x.device_id, stream, x.name) + scale = create_empty_tensor( + x.__class__, (x.size // 32,), "uint8", device_id=x.device, stream_holder=stream_holder, verify_strides=False + ) + scale.tensor[:] = exponent + 127 + return scale.tensor + + +def invert_mxfp8_scale(scale): + """ + Compute a reciprocal of MXFP8 block scale. + + Args: + scale: MXFP8 block scale tensor. + + Returns: + An MXFP8 block scale factors tensor with reciprocals of the values in ``scale``. 
+ """ + _validate_mxfp8_scale(scale, "invert_mxfp8_scale") + + scale[scale == 255] = 254 # Prevent the overflow + return (127 + 127) - scale + + +def get_mxfp8_scale_offset(x, index): + """ + Computes the offset of MXFP8 scale used for element ``x[index]``. + + Args: + x: The tensor to which ``index`` refers. + + index: A tuple of tensor indices. This function supports broadcasting, + so the `index` can be a tuple of integers or a tuple of tensors. + + Returns: + A single integer indicating an offset to the MXFP8 block scale factor which + is applied to ``x[index]`` during scaling. + + Note: + In typical use-cases, there should be no need to manually modify MXFP8 scales. + The scales returned as ``"d_out_scale"`` by one multiplication can be directly + reused as input scales for another multiplication. + """ + + _validate_tensor(x, where="get_mxfp8_scale_offset") + x = wrap_operand(x) + ndim = len(x.shape) + if len(index) != ndim: + raise ValueError("Index length should match the number of dimensions of x.") + + if ndim == 2: + batch_offset = 0 + elif ndim > 2: + # Compute batch offset + batch_strides = x.strides[:-2] + batch_index = index[:-2] + batch_offset = sum(i * stride for i, stride in zip(batch_index, batch_strides, strict=True)) // min(batch_strides) + else: + raise ValueError(f"Got {ndim}-D tensor in `get_mxfp8_scale_offset`, but expected at least 2-D.") + major_d, minor_d = (-2, -1) if x.strides[-2] > x.strides[-1] else (-1, -2) + major, minor, minor_length = index[major_d], index[minor_d], x.shape[minor_d] + + # Compute tile offset + tile_minor = minor // 128 + tile_major = major // 128 + tile_offset = (minor_length // 128) * tile_major + tile_minor + minor = minor % 128 + major = major % 128 + + # Compute offset in the tile + minor = minor // 32 + offset = (major % 32) * 16 + (major // 32) * 4 + minor + + # Add the offsets together + tile_size = 128 * 128 // 32 + matrix_size = x.shape[-1] * x.shape[-2] // 32 + return batch_offset * matrix_size + 
tile_offset * tile_size + offset + + +def apply_mxfp8_scale(x, scale): + """ + Apply MXFP8 block scale factors to tensor ``x``. + + Args: + x: The tensor to which the scaling should be applied. + + scale: The block scale factors to apply. + + Returns: + A ``float32`` tensor with values of ``x`` with scales applied. + + Note: + This function is not intended for production usage due to its relatively low + performance and high memory consumption. Instead of applying the scales + manually using this function, use + :attr:`~nvmath.linalg.advanced.MatmulOptions.result_type` to request non-FP8 output. + """ + scale = _validate_mxfp8_scale(scale, "apply_mxfp8_scale", x=x) + import torch + + idx = get_mxfp8_scale_offset(x, torch.meshgrid(*(torch.arange(d) for d in x.shape), indexing="ij")).to(x.device) + + actual_scale = 2 ** (scale.tensor.type(torch.float32)[idx] - 127) + return x.type(torch.float32) * actual_scale diff --git a/nvmath/linalg/advanced/matmulmod.py b/nvmath/linalg/advanced/matmulmod.py index 05324bf..2e13c4b 100644 --- a/nvmath/linalg/advanced/matmulmod.py +++ b/nvmath/linalg/advanced/matmulmod.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. 
# # SPDX-License-Identifier: Apache-2.0 @@ -9,7 +9,6 @@ from dataclasses import dataclass import functools import logging -import math import operator from collections.abc import Sequence @@ -33,17 +32,16 @@ from nvmath.linalg._internal import matmul_desc_ifc, matmul_pref_ifc, matrix_layout_ifc from nvmath.linalg._internal.typemaps import ( - NAME_TO_DEFAULT_SCALE_TYPE, - NAME_TO_DEFAULT_COMPUTE_TYPE, + NAMES_TO_DEFAULT_SCALE_TYPE, + NAMES_TO_DEFAULT_COMPUTE_TYPE, COMPUTE_TYPE_TO_DEFAULT_SCALE_TYPE, SCALE_TYPE_TO_DEFAULT_COMPUTE_TYPE, + SUPPORTED_TYPES, ) from nvmath.linalg._internal.utils import ( axis_order_in_memory, calculate_strides, check_batch_tileable, - create_handle, - destroy_handle, get_handle, pointer_aligned_to, ) @@ -268,7 +266,7 @@ def get_mm_traits(a_layout, b_layout, c_layout, logger): ) raise ValueError(message) logger.debug( - f"The batch layout for A corresponding to shape = {a_batch_shape} and strides = {a_batch_strides} IS " "tileable." + f"The batch layout for A corresponding to shape = {a_batch_shape} and strides = {a_batch_strides} IS tileable." ) batch_shape = a_batch_shape batch_axis_order = a_batch_axis_order = axis_order_in_memory(a_batch_strides) @@ -281,7 +279,7 @@ def get_mm_traits(a_layout, b_layout, c_layout, logger): ) raise ValueError(message) logger.debug( - f"The batch layout for B corresponding to shape = {b_batch_shape} and strides = {b_batch_strides} IS " "tileable." + f"The batch layout for B corresponding to shape = {b_batch_shape} and strides = {b_batch_strides} IS tileable." ) batch_shape = b_batch_shape batch_axis_order = b_batch_axis_order = axis_order_in_memory(b_batch_strides) @@ -462,17 +460,10 @@ def get_result_traits(mm_traits, epilog_ordering, logger): supported types are :class:`numpy.ndarray`, :class:`cupy.ndarray`, and :class:`torch.Tensor`.""".replace("\n", " "), # "c_admonitions": """ - .. note:: - The broadcasting behavior of a 1-D (vector) `c` deviates from the - equivalent NumPy expression. 
With nvmath-python, `c` is internally - promoted to shape (M, 1) in order to broadcast with ``a @ b``; this matches the - behavior of cuBLASLt. With NumPy, a 1-D `c` behaves as if it has shape - (1, N) in the expression ``a @ b + c``. - - .. deprecated:: 0.2.1 - In order to avoid broadcasting behavior ambiguity, nvmath-python will no longer - accept a 1-D (vector) `c` starting in version 0.3.0. Use a singleton - dimension to convert your input array to 2-D. + .. versionchanged:: 0.3.0 + In order to avoid broadcasting behavior ambiguity, nvmath-python no longer + accepts a 1-D (vector) `c`. Use a singleton dimension to convert your input + array to 2-D. """, # "alpha": """\ @@ -482,6 +473,13 @@ def get_result_traits(mm_traits, epilog_ordering, logger): "beta": """\ The scale factor for the matrix addition term as a real or complex number. A value for `beta` must be provided if operand `c` is specified.""".replace("\n", " "), + # + "quantization_scales": """\ +Specify scale factors for the matrix multiplication as a :class:`~nvmath.linalg.advanced.MatmulQuantizationScales` +object. Alternatively, a `dict` containing the parameters for the +:class:`~nvmath.linalg.advanced.MatmulQuantizationScales` +constructor can also be provided. +Allowed and required only for narrow-precision (FP8 and lower) operations.""".replace("\n", " "), # "algorithms": """\ A sequence of :class:`Algorithm` objects that can be directly provided to bypass planning. The algorithm objects must be @@ -520,9 +518,45 @@ def get_result_traits(mm_traits, epilog_ordering, logger): "result": """\ The result of the specified matrix multiplication (epilog applied), which remains on the same device and belong to the same package as the input operands. 
If an epilog (like :attr:`nvmath.linalg.advanced.MatmulEpilog.RELU_AUX`) that -results in extra output is used, a tuple is returned with the first element being the matrix multiplication result -(epilog applied) and the second element being the auxiliary output provided by the selected epilog as a -`dict`.""".replace("\n", " "), +results in extra output is used, or an extra output is requested (for example by setting +:attr:`~nvmath.linalg.advanced.MatmulOptions.result_amax` option in ``options`` argument), +a tuple is returned with the first element being the matrix multiplication result (epilog applied) and the second element +being the auxiliary output provided as a `dict`. """.replace("\n", " "), + # + "narrow_precision": """\ + Matrix multiplication with narrow-precision operands is supported, in both FP8 and MXFP8 formats. + + .. note:: + + Narrow-precision matrix multiplication in nvmath-python requires **CUDA Toolkit 12.8 or newer**. + **FP8 requires a device with compute capability 8.9 or higher** (Ada, Hopper, Blackwell or newer architecture). + **MXFP8 requires a device with compute capability 10.0 or higher** (Blackwell or newer architecture). + Please refer to the `compute capability table `_ + to check the compute capability of your device. + + For FP8 operations: + + * For each operand a scaling factor needs to be specified via ``quantization_scales`` argument. + * Maximum absolute value of the result (amax) can be requested via + :attr:`~nvmath.linalg.advanced.MatmulOptions.result_amax` option in ``options`` argument. + * Custom result type (both FP8 and non-FP8) can be requested via + :attr:`~nvmath.linalg.advanced.MatmulOptions.result_type` option in ``options`` argument. + + For MXFP8 operations: + + * To enable MXFP8 operations, :attr:`~nvmath.linalg.advanced.MatmulOptions.block_scaling` option + must be set to ``True``. + * Block scaling factors need to be specified via ``quantization_scales`` argument. 
+ * Utilities in :mod:`nvmath.linalg.advanced.helpers.matmul` can be used to create and modify + block scaling factors. + * When MXFP8 is used and the result type is a narrow-precision data type, the auxiliary output + ``"d_out_scale"`` will be returned in the auxiliary output tensor. It will contain the scales + that were used for the result quantization. + + Please refer to the examples and narrow-precision operations tutorial for more details. + For more details on the FP8 and MXFP8 formats in cuBLAS, + see the `cublasLtMatmul documentation `_. +""".strip(), # "semantics": """\ The semantics of the matrix multiplication follows :func:`numpy.matmul` semantics, with some restrictions on @@ -535,10 +569,10 @@ def get_result_traits(mm_traits, epilog_ordering, logger): multiplication, the appended ``1`` is removed from the result's dimensions. * If `a` or `b` is N-D (N > 2), then the operand is treated as a batch of matrices. If both `a` and `b` are N-D, their batch dimensions must match. If exactly one of `a` or `b` is N-D, the other operand is broadcast. - * The operand for the matrix addition `c` may be a vector of length M, a matrix of shape (M, 1) or (M, N), or - batched versions of the latter (..., M, 1) or (..., M, N). Here M and N are the dimensions of the result of - the matrix multiplication. If a vector is provided or N = 1, the columns of `c` are broadcast for the - addition. If batch dimensions are not present, `c` is broadcast across batches as needed. + * The operand for the matrix addition `c` may be a matrix of shape (M, 1) or (M, N), or the batched versions + (..., M, 1) or (..., M, N). Here M and N are the dimensions of the result of the matrix multiplication. If N = 1, the + columns of `c` are broadcast for the addition; the rows of `c` are never broadcast. If batch dimensions are not + present, `c` is broadcast across batches as needed. * Similarly, when operating on a batch, auxiliary outputs are 3-D for all epilogs. 
Therefore, epilogs that return 1-D vectors of length N in non-batched mode return 3-D matrices of size (batch, N, 1) in batched mode. """.strip(), @@ -612,9 +646,14 @@ class Matmul: stream: {stream} + quantization_scales: {quantization_scales} + Semantics: {semantics} + Narrow-precision support: + {narrow_precision} + See Also: :meth:`autotune`, :meth:`plan`, :meth:`reset_operands`, :meth:`execute` @@ -697,16 +736,35 @@ class Matmul: directory. """ - def __init__(self, a, b, /, c=None, *, alpha=None, beta=None, qualifiers=None, options=None, stream=None): + def __init__( + self, + a, + b, + /, + c=None, + *, + alpha=None, + beta=None, + qualifiers=None, + quantization_scales=None, + options=None, + stream=None, + ): options = utils.check_or_create_options(_configuration.MatmulOptions, options, "Matrix multiplication options") self.options = options self.logger = options.logger if options.logger is not None else logging.getLogger() + def check_dtype(dtype, operand_name): + if dtype not in SUPPORTED_TYPES: + raise ValueError(f"The dtype of operand {operand_name} ({dtype}) is not supported.") + # The matrix multiplication has two required operands 'a' and 'b', and one optional # operand 'c'. a = tensor_wrapper.wrap_operand(a) b = tensor_wrapper.wrap_operand(b) + check_dtype(a.dtype, "A") + check_dtype(b.dtype, "B") self.logger.info("= SPECIFICATION PHASE =") self.logger.info(f"The data type of operand A is '{a.dtype}', and that of operand B is '{b.dtype}'.") @@ -714,14 +772,23 @@ def __init__(self, a, b, /, c=None, *, alpha=None, beta=None, qualifiers=None, o if c is not None: self.num_operands = 3 c = tensor_wrapper.wrap_operand(c) + if len(c.shape) < 2: + raise ValueError( + "In order to avoid broadcasting behavior ambiguity, `c` must be at least 2-D. " + "Use a singleton dimension to convert your input array to 2-D." 
+ ) + check_dtype(c.dtype, "C") self.logger.info(f"The data type of operand C is {c.dtype}.") if c is not None and beta is None: raise ValueError("A value for beta must be provided if operand C is provided.") - if a.dtype != b.dtype: - raise ValueError(f"The dtype of operands A {a.dtype} and B {b.dtype} must be the same.") - self.ab_dtype_name = a.dtype + if (a.dtype, b.dtype) not in NAMES_TO_DEFAULT_SCALE_TYPE: + raise ValueError(f"Unsupported combination of dtypes for operands A {a.dtype} and B {b.dtype}.") + + # Currently, a.dtype != b.dtype is only supported for FP8 (different FP8 kinds are + # allowed), so we assume that A and B have equal width. + self.input_type_width = typemaps.NAME_TO_DATA_WIDTH[a.dtype] assert self.num_operands == 2 or self.num_operands == 3, "Internal Error." @@ -745,8 +812,7 @@ def __init__(self, a, b, /, c=None, *, alpha=None, beta=None, qualifiers=None, o self.memory_space = "cpu" self.device_id = options.device_id self.logger.info( - f"The input operands' memory space is {self.memory_space}, and the execution space is on device " - f"{self.device_id}." + f"The input operands' memory space is {self.memory_space}, and the execution space is on device {self.device_id}." ) # Allocate device memory (in stream context) if needed. @@ -808,40 +874,52 @@ def __init__(self, a, b, /, c=None, *, alpha=None, beta=None, qualifiers=None, o # Determine the data types for a and b. self.a_dtype = typemaps.NAME_TO_DATA_TYPE[a.dtype] self.b_dtype = typemaps.NAME_TO_DATA_TYPE[b.dtype] + self.a_dtype_name = a.dtype + self.b_dtype_name = b.dtype + + self.is_complex = "complex" in self.a_dtype_name or "complex" in self.b_dtype_name + + # Determine the data types for c and d. 
+ self.d_dtype = options.result_type + if self.num_operands == 3: + self.c_dtype = typemaps.NAME_TO_DATA_TYPE[c.dtype] + if self.d_dtype is None: + self.d_dtype = self.c_dtype + elif self.num_operands == 2: + if self.d_dtype is None: + self.d_dtype = self.a_dtype + if self.d_dtype in (CudaDataType.CUDA_R_8F_E5M2, CudaDataType.CUDA_R_8F_E4M3): + self.c_dtype = CudaDataType.CUDA_R_16F + else: + self.c_dtype = self.d_dtype + self.c_dtype_name = typemaps.DATA_TYPE_TO_NAME[self.c_dtype] + self.d_dtype_name = typemaps.DATA_TYPE_TO_NAME[self.d_dtype] + self.c_dtype_width = typemaps.NAME_TO_DATA_WIDTH[self.c_dtype_name] + self.d_dtype_width = typemaps.NAME_TO_DATA_WIDTH[self.d_dtype_name] - # Determine the data type for c (if not provided) and d. The two must match. - if self.num_operands == 2: - self.d_dtype_name, self.d_dtype = self.ab_dtype_name, typemaps.NAME_TO_DATA_TYPE[self.ab_dtype_name] - self.c_dtype = self.d_dtype - self.c_dtype_name = self.d_dtype_name - else: - self.c_dtype_name = c.dtype - self.c_dtype = typemaps.NAME_TO_DATA_TYPE[self.c_dtype_name] - self.d_dtype = self.c_dtype - self.d_dtype_name = typemaps.DATA_TYPE_TO_NAME[self.d_dtype] self.logger.info(f"The data type for the result D is '{self.d_dtype_name}'.") def assert_valid_compute_type(compute_type): - if compute_type not in cublas.ComputeType: - message = f"Unsupported compute type. The compute type '{compute_type}' is currently not supported." + if compute_type not in COMPUTE_TYPE_TO_DEFAULT_SCALE_TYPE["real"]: + message = f"Unsupported compute type. The compute type '{repr(compute_type)}' is currently not supported." raise ValueError(message) # Determine the scale type. 
if options.scale_type is None: if options.compute_type is not None: assert_valid_compute_type(options.compute_type) - if "complex" in self.ab_dtype_name: + if self.is_complex: scale_type_map = COMPUTE_TYPE_TO_DEFAULT_SCALE_TYPE["complex"] else: scale_type_map = COMPUTE_TYPE_TO_DEFAULT_SCALE_TYPE["real"] self.scale_type = scale_type_map[options.compute_type] else: - self.scale_type = NAME_TO_DEFAULT_SCALE_TYPE[self.ab_dtype_name] + self.scale_type = NAMES_TO_DEFAULT_SCALE_TYPE[(self.a_dtype_name, self.b_dtype_name)] self.scale_type_name = typemaps.DATA_TYPE_TO_NAME[self.scale_type] else: self.scale_type = options.scale_type - if self.scale_type not in typemaps.DATA_TYPE_TO_NAME: - message = f"Unsupported scale type. The data type '{self.scale_type}' is currently not supported." + if self.scale_type not in SCALE_TYPE_TO_DEFAULT_COMPUTE_TYPE: + message = f"Unsupported scale type. The data type '{repr(self.scale_type)}' is currently not supported." raise ValueError(message) self.scale_type_name = typemaps.DATA_TYPE_TO_NAME[self.scale_type] self.logger.info(f"The scale type is '{self.scale_type_name}'.") @@ -851,38 +929,52 @@ def assert_valid_compute_type(compute_type): if options.scale_type is not None: self.compute_type = SCALE_TYPE_TO_DEFAULT_COMPUTE_TYPE[options.scale_type] else: - self.compute_type = NAME_TO_DEFAULT_COMPUTE_TYPE[self.ab_dtype_name] + self.compute_type = NAMES_TO_DEFAULT_COMPUTE_TYPE[(self.a_dtype_name, self.b_dtype_name)] else: self.compute_type = options.compute_type assert_valid_compute_type(self.compute_type) self.logger.info(f"The compute type is {self.compute_type.name}.") - def is_supported(dtype, compute_type, scale_type): + def is_supported(atype, btype, compute_type, scale_type): ct = cublas.ComputeType st = CudaDataType + abtype = atype if atype == btype else (atype, btype) if compute_type in (ct.COMPUTE_16F, ct.COMPUTE_16F_PEDANTIC): - return scale_type == st.CUDA_R_16F and dtype == "float16" - elif compute_type in (ct.COMPUTE_32F, 
ct.COMPUTE_32F_PEDANTIC): + return scale_type == st.CUDA_R_16F and abtype == "float16" + elif compute_type == ct.COMPUTE_32F_PEDANTIC: + if scale_type == st.CUDA_R_32F: + return abtype in ("float32", "bfloat16", "float16", "float8_e4m3fn", "float8_e5m2") + elif scale_type == st.CUDA_C_32F: + return abtype == "complex64" + elif compute_type == ct.COMPUTE_32F: if scale_type == st.CUDA_R_32F: - return dtype in ("float32", "bfloat16", "float16") + return abtype in ( + "float32", + "bfloat16", + "float16", + "float8_e4m3fn", + "float8_e5m2", + ("float8_e4m3fn", "float8_e5m2"), + ("float8_e5m2", "float8_e4m3fn"), + ) elif scale_type == st.CUDA_C_32F: - return dtype == "complex64" + return abtype == "complex64" elif compute_type in (ct.COMPUTE_32F_FAST_16F, ct.COMPUTE_32F_FAST_16BF, ct.COMPUTE_32F_FAST_TF32): if scale_type == st.CUDA_R_32F: - return dtype == "float32" + return abtype == "float32" if scale_type == st.CUDA_C_32F: - return dtype == "complex64" + return abtype == "complex64" elif compute_type in (ct.COMPUTE_64F, ct.COMPUTE_64F_PEDANTIC): if scale_type == st.CUDA_R_64F: - return dtype == "float64" + return abtype == "float64" if scale_type == st.CUDA_C_64F: - return dtype == "complex128" + return abtype == "complex128" return False - if not is_supported(self.ab_dtype_name, self.compute_type, self.scale_type): + if not is_supported(self.a_dtype_name, self.b_dtype_name, self.compute_type, self.scale_type): raise ValueError( f"Selected scale_type={repr(self.scale_type)} compute_type={repr(self.compute_type)} " - + f"are not supported for data type {self.ab_dtype_name}" + + f"are not supported for data types {self.a_dtype_name} (A) and {self.b_dtype_name} (B)." ) # Set alpha and beta. @@ -900,6 +992,27 @@ def is_supported(dtype, compute_type, scale_type): except (ValueError, TypeError) as e: raise ValueError(f"The value provided for beta {beta} is not convertible to dtype '{self.beta.dtype}'.") from e + # Set narrow-precision (FP8 and lower) quantization_scales. 
+ if self.input_type_width <= 8: + self.quantization_scales = self._validate_operand_scales(quantization_scales, all_required=True) + elif quantization_scales is not None: + self.logger.warning( + "Matmul: The provided scales are ignored, since they are only applicable to narrow-precision (FP8 and lower) " + "operations." + ) + + if self.options.result_amax and self.d_dtype_width > 8: + raise ValueError("result_amax=True is allowed only for narrow-precision (FP8 and lower) results") + + # Check operands alignment if needed + if self.input_type_width <= 8: + for operand, operand_name in zip(self.operands, "ABC", strict=False): + if operand.data_ptr % 16 != 0: + raise ValueError( + f"For narrow-precision (FP8 and lower) multiplication, operand {operand_name} should be aligned to 16 " + "bytes." + ) + # Capture operand extents and strides for consistency check when resetting operands. self.operand_extents = tuple(o.shape for o in self.operands) self.operand_strides = tuple(o.strides for o in self.operands) @@ -909,6 +1022,12 @@ def is_supported(dtype, compute_type, scale_type): b_layout = MatrixLayout(self.operands[1].shape, self.operands[1].strides, self.qualifiers[1]["is_conjugate"]) c_layout = MatrixLayout(self.operands[2].shape, self.operands[2].strides) if self.num_operands == 3 else None + # Enforce equal batch shape for A and B if block_scaling=True. + if self.options.block_scaling and a_layout.shape[:-2] != b_layout.shape[:-2]: + raise ValueError( + "When block_scaling=True, the batch dimensions of A and B must match (broadcasting is not supported)." + ) + # Get the operation traits. self.mm_traits = get_mm_traits(a_layout, b_layout, c_layout, self.logger) self.result_traits = None # Wait till planning to determine this based on the epilog. 
@@ -942,6 +1061,14 @@ def is_supported(dtype, compute_type, scale_type): self.mm_desc_ifc.fast_accum = options.fast_accumulation self.logger.info(f"The flag for fast accumulation mode is {options.fast_accumulation}.") + if self.input_type_width == 8 and version < 120800: + raise ValueError( + f"FP8 is not supported for cuBLASLt version {version}. cuBLASLt version 12.8 or higher is required." + ) + + # Planning preferences + self.preferences = None + # Epilog attributes. self.epilog = None @@ -960,6 +1087,9 @@ def is_supported(dtype, compute_type, scale_type): # Keep track of epilog output handlers to allocate output in execute(). self.epilog_output_handlers = [] + # Non-epilog aux outputs. Currently, only used for quantization outputs (amax etc.) + self.aux_outputs = None + # Plan attributes. self.preference_ptr = None self.a_layout_ptr, self.b_layout_ptr, self.c_layout_ptr, self.d_layout_ptr = None, None, None, None @@ -979,6 +1109,9 @@ def is_supported(dtype, compute_type, scale_type): self.workspace_stream = None self.last_compute_event = None + # Device-side array with the quantization_scales + self.quantization_scales_device = {} + self.valid_state = True self.logger.info("The Matmul operation has been created.") @@ -1018,7 +1151,7 @@ def _free_plan_resources(self, exception: Exception | None = None) -> bool: if self.b_layout_ptr is not None: cublaslt.matrix_layout_destroy(self.b_layout_ptr) self.b_layout_ptr = None - if self.num_operands == 3 and self.c_layout_ptr is not None: # Note that c layout aliases with that of d. + if self.c_layout_ptr != self.d_layout_ptr and self.c_layout_ptr is not None: cublaslt.matrix_layout_destroy(self.c_layout_ptr) self.c_layout_ptr = None if self.d_layout_ptr is not None: @@ -1079,6 +1212,7 @@ def _release_workspace_memory_perhaps_wrapper(self, exception: Exception | None This is used in @atomic. 
""" self._release_workspace_memory_perhaps(release_workspace=self.workspace_allocated_here) + self._reset_workspace_allocation_tracking() return True @utils.precondition(_check_valid_matmul) @@ -1149,6 +1283,126 @@ def applicable_algorithm_ids(self, limit=8): ) return algo_ids + def _validate_operand_scales(self, quantization_scales, all_required): + """ + Validates the user-provided quantization scales and wraps them converts them to + MatmulQuantizationScales if needed. + """ + if quantization_scales is None: + raise ValueError( + "Scales are required for narrow-precision (FP8 and lower) operations. Please set `quantization_scales` " + "argument." + ) + quantization_scales = utils.check_or_create_options( + _configuration.MatmulQuantizationScales, quantization_scales, "Scale factors" + ) + expected_scales = "AB" + if self.d_dtype_width <= 8 and not self.options.block_scaling: + expected_scales += "D" + elif quantization_scales.d is not None: + if self.options.block_scaling: + raise ValueError("Quantization scaling is not supported for D when `block_scaling` option is enabled.") + if self.d_dtype_width > 8: + raise ValueError( + "Quantization scaling is not supported for D when it is not a narrow-precision (FP8 and lower) type." + ) + if self.num_operands == 3 and self.c_dtype_width <= 8: + expected_scales += "C" + elif quantization_scales.c is not None: + raise ValueError( + "Quantization scaling is not supported for C when it is not a narrow-precision (FP8 and lower) type." 
+ ) + if all_required: + for operand in expected_scales: + if getattr(quantization_scales, operand.lower()) is None: + raise ValueError(f"Scale for {operand.upper()} is not specified") + return quantization_scales + + def _validate_epilog_aux_scale(self, aux_quantization_scale, *, required): + is_fp8_aux = ( + self.preferences.epilog.aux_type is not None + and typemaps.NAME_TO_DATA_WIDTH[typemaps.DATA_TYPE_TO_NAME[self.preferences.epilog.aux_type]] <= 8 + ) + if aux_quantization_scale is not None and not is_fp8_aux: + raise ValueError( + "Scales for epilog auxiliary output are not supported when `preferences.epilog.aux_type` is not set to a " + "narrow-precision type." + ) + elif aux_quantization_scale is None and is_fp8_aux and required: + raise ValueError( + '"aux_quantization_scale" epilog input is required when `preferences.epilog.aux_type` is not set to a ' + "narrow-precision type." + ) + + def _prepare_quantization_scale(self, scale, operand, cublas_operand, operand_size=None): + if scale is None: + return + elif isinstance(scale, int | float): + if self.options.block_scaling: + raise ValueError("A scalar tensor-wide scale factor is not allowed when block_scaling=True.") + # If it's a scalar, copy to GPU. Float32 is the only type allowed by + # cublasLtMatmulScale_t for tensor-wide scaling. 
+ self.logger.debug(f"Scale for {operand.upper()} will be copied to device {self.device_id}.") + self.quantization_scales_device[operand] = tensor_wrapper.wrap_operand(cp.asarray([scale], dtype="float32")) + else: + if utils.infer_object_package(scale) != self.package: + raise TypeError("The quantization scaling tensors must belong to the same package as the operands.") + self.quantization_scales_device[operand] = tensor_wrapper.wrap_operand(scale) + device_id = self.quantization_scales_device[operand].device_id + if device_id is not None and self.device_id != device_id: + raise ValueError(f"The scales must be on the same device ({device_id}) as the operands ({self.device_id}).") + if self.quantization_scales_device[operand].device in (None, "cpu"): + # If it's on CPU, copy to GPU + self.logger.debug(f"Scale for {operand.upper()} will be copied to device {self.device_id}.") + self.quantization_scales_device[operand] = tensor_wrapper.wrap_operand( + self.quantization_scales_device[operand].to(self.device_id) + ) + if not self.options.block_scaling: + if self.quantization_scales_device[operand].shape not in ((1,), ()): + raise ValueError( + f"The provided {operand.upper()} scale tensor has to be of shape (1,) or (). " + f"Got {self.quantization_scales_device[operand].shape} instead." + ) + if self.quantization_scales_device[operand].dtype != "float32": + raise ValueError( + f"The provided {operand.upper()} scale tensor has to be float32 type. " + f"Got {self.quantization_scales_device[operand].dtype} instead." + ) + elif self.input_type_width == 8: + if operand_size is None: + raise ValueError(f"Block scaling is not supported for {operand.upper()} scale.") + expected_shape = (operand_size // 32,) + if self.quantization_scales_device[operand].shape != expected_shape: + raise ValueError( + f"Scales for {operand.upper()} should have shape {expected_shape}. " + f"Got {self.quantization_scales_device[operand].shape}." 
+ ) + if self.quantization_scales_device[operand].dtype != "uint8": + raise ValueError(f"Block scales for {operand.upper()} should be uint8 tensor.") + else: + raise ValueError("block_scaling == True is not supported for non-FP8 types.") + setattr(self.mm_desc_ifc, f"{cublas_operand}_scale_pointer", self.quantization_scales_device[operand].data_ptr) + if self.options.block_scaling: + self.logger.debug(f"Using VEC32_UE8M0 scale mode for operand {operand.upper()}.") + setattr(self.mm_desc_ifc, f"{cublas_operand}_scale_mode", cublaslt.MatmulMatrixScale.VEC32_UE8M0) + else: + self.logger.debug(f"Using SCALAR_32F scale mode for operand {operand.upper()}.") + setattr(self.mm_desc_ifc, f"{cublas_operand}_scale_mode", cublaslt.MatmulMatrixScale.SCALAR_32F) + + def _prepare_operand_quantization_scales(self, scales): + """ + Copies the scales to the GPU and updates the pointers in mm_desc_ifc. + """ + for operand in "abcd": + scale = getattr(scales, operand) + if self.options.block_scaling and operand == "a": + operand_size = self.operands[0].size + elif self.options.block_scaling and operand == "b": + operand_size = self.operands[1].size + else: + operand_size = None + self._prepare_quantization_scale(scale, operand, cublas_operand=operand, operand_size=operand_size) + @utils.precondition(_check_valid_matmul) @utils.atomic(_free_plan_resources, method=True) def plan( @@ -1246,6 +1500,11 @@ def plan( self.epilog_input_name_to_handler = dict() # Clear input name to handler map as well, self.epilog_inputs_traits = dict() # ... and the input traits as well. 
+ preferences = utils.check_or_create_options( + _configuration.MatmulPlanPreferences, preferences, "Matrix multiplication plan preferences" + ) + self.preferences = preferences + mm_traits = self.mm_traits stream_holder = utils.get_or_create_stream(self.device_id, stream, self.package) @@ -1257,7 +1516,7 @@ def plan( if epilog is None and epilog_inputs is not None: self.logger.warning( - f"Matmul: The provided epilog inputs {epilog_inputs.keys()} are ignored since an epilog is not " "specified." + f"Matmul: The provided epilog inputs {epilog_inputs.keys()} are ignored since an epilog is not specified." ) self.epilog = epilog @@ -1294,11 +1553,47 @@ def plan( ): msg = f"The epilog {epilog.name} requires input matrix 'c' to be F-contiguous (column-major)." raise ValueError(msg) + if ( + version < 120804 + # A has one row + and self.mm_traits.M == 1 + # C is broadcast + and self.mm_traits.c_layout_traits is not None + and self.mm_traits.c_layout_traits.ld == 0 + # Using both an bias epilog and C + and self.epilog & _configuration.MatmulEpilog.BIAS > 0 + ): + message = ( + "When matrix 'a' has one row, " + "simultaneously broadcasting matrix 'c' and using a BIAS epilog requires cublaslt >= 120804; " + f"You have version {version}. Update to CUDA Toolkit >= 12.8.1." + ) + raise ValueError(message) + + # Take a copy of the user-provided inputs. + if epilog_inputs is not None: + epilog_inputs = epilog_inputs.copy() + else: + epilog_inputs = {} + + # Get the dtype of auxiliary buffer + aux_dtype_name = ( + typemaps.DATA_TYPE_TO_NAME[self.preferences.epilog.aux_type] + if self.preferences.epilog.aux_type is not None + else None + ) + + # Extract aux quantization scale from the inputs. 
+ aux_quantization_scale = ( + epilog_inputs.pop("aux_quantization_scale") if "aux_quantization_scale" in epilog_inputs else None + ) + self._validate_epilog_aux_scale(aux_quantization_scale, required=True) + self._prepare_quantization_scale(aux_quantization_scale, "epilog_aux", cublas_operand="epilogue_aux") epilog_input_handler_types = EPILOG_INPUT_HANDLERS_MAP[epilog] if epilog_input_handler_types: epilog_input_handlers = [ - handler_type(self.logger, mm_traits, epilog, self.d_dtype_name) + handler_type(self.logger, mm_traits, epilog, self.c_dtype_name, self.d_dtype_name, aux_dtype_name) for handler_type in epilog_input_handler_types ] @@ -1311,19 +1606,13 @@ def plan( required_epilog_input_names = {h.name for h in epilog_input_handlers} self.logger.info(f"The epilog requires the following additional inputs: {required_epilog_input_names}.") - if required_epilog_input_names and epilog_inputs is None: - raise ValueError( - f"The epilog {epilog.name} requires the following input tensors: {required_epilog_input_names}." - ) - if required_epilog_input_names != set(epilog_inputs.keys()): raise ValueError( f"The epilog {epilog.name} requires the following input tensors: " f"{required_epilog_input_names}. The provided tensor names are: {epilog_inputs.keys()}" ) - # Wrap epilog inputs. Take a copy of the user-provided dict. - epilog_inputs = epilog_inputs.copy() + # Wrap epilog inputs. for name in epilog_inputs: epilog_inputs[name] = tensor_wrapper.wrap_operand(epilog_inputs[name]) @@ -1340,7 +1629,7 @@ def plan( device_id = utils.get_operands_device_id(list(epilog_inputs.values())) if device_id is not None and self.device_id != device_id: raise ValueError( - f"The epilog inputs must be on the same device ({device_id}) as the operands " f"({self.device_id})." + f"The epilog inputs must be on the same device ({device_id}) as the operands ({self.device_id})." ) # Move epilog inputs to the GPU, if needed. 
@@ -1377,10 +1666,9 @@ def plan( epilog_output_handler_types = EPILOG_OUTPUT_HANDLERS_MAP[epilog] if epilog_output_handler_types: self.epilog_output_handlers = epilog_output_handlers = [ - handler_type(self.logger, mm_traits, epilog, self.d_dtype_name) + handler_type(self.logger, mm_traits, epilog, self.c_dtype_name, self.d_dtype_name, aux_dtype_name) for handler_type in epilog_output_handler_types ] - # Check if the epilog requires a specific result layout, and if the # requirement is consistent for all the handlers. epilog_output_handlers_ordering = {h.order for h in epilog_output_handlers} @@ -1407,19 +1695,23 @@ def plan( f"{self.result_traits.d_layout_traits.batch_offset}." ) - preferences = utils.check_or_create_options( - _configuration.MatmulPlanPreferences, preferences, "Matrix multiplication plan preferences" - ) - # Internally transpose operand A if required (conjugate flag) and create layout. transpose = False - if mm_traits.a_layout_traits.is_conjugate and "complex" in self.ab_dtype_name: + if mm_traits.a_layout_traits.is_conjugate and self.is_complex: self.mm_desc_ifc.transa = cublas.Operation.C transpose = True self.logger.debug( "To conjugate A, the operand A will be internally transposed and the matrix multiplication will be " "performed with OP_C for operand A." ) + if self.input_type_width <= 8: + # narrow-precision (FP8 and lower) data types are only supported for transa=OP_T + self.mm_desc_ifc.transa = cublas.Operation.T + transpose = True + self.logger.debug( + "For narrow-precision (FP8 and lower) multiplication, the operand A will be internally transposed and the " + "matrix multiplication will be performed with OP_T for operand A." 
+ ) m, n, ld, a_order = mm_traits.a_layout_traits.get_mm_layout(transpose=transpose) self.a_layout_ptr = cublaslt.matrix_layout_create(self.a_dtype, rows=m, cols=n, ld=ld) self.logger.debug(f"Layout for A: rows = {m}, cols = {n}, ld = {ld}.") @@ -1427,7 +1719,7 @@ def plan( # Internally transpose operand B if required (conjugate flag, or epilog is BGRADB) # and create layout. transpose = False - if mm_traits.b_layout_traits.is_conjugate and "complex" in self.ab_dtype_name: + if mm_traits.b_layout_traits.is_conjugate and self.is_complex: self.mm_desc_ifc.transb = cublas.Operation.C transpose = True self.logger.debug( @@ -1465,7 +1757,18 @@ def plan( layout_d_ifc.strided_batch_offset = result_traits.d_layout_traits.batch_offset if self.num_operands == 2: - self.c_layout_ptr = self.d_layout_ptr + if self.c_dtype == self.d_dtype: + # If C and D have equal types, reuse the layout. + self.c_layout_ptr = self.d_layout_ptr + else: + # Otherwise, create a D-like layout, but with different type. + self.c_layout_ptr = cublaslt.matrix_layout_create( + self.c_dtype, rows=mm_traits.M, cols=mm_traits.N, ld=result_traits.d_layout_traits.ld + ) + layout_c_ifc = matrix_layout_ifc.MatrixLayoutInterface(self.c_layout_ptr) + layout_c_ifc.order = result_traits.d_layout_traits.order + layout_c_ifc.batch_count = mm_traits.batch_count + layout_c_ifc.strided_batch_offset = result_traits.d_layout_traits.batch_offset else: self.c_layout_ptr = cublaslt.matrix_layout_create( self.c_dtype, rows=mm_traits.M, cols=mm_traits.N, ld=mm_traits.c_layout_traits.ld @@ -1475,6 +1778,35 @@ def plan( layout_c_ifc.batch_count = mm_traits.batch_count layout_c_ifc.strided_batch_offset = mm_traits.c_layout_traits.batch_offset + if ( + self.input_type_width == 8 + and self.options.block_scaling + and (mm_traits.M % 128 != 0 or mm_traits.N % 128 != 0 or mm_traits.K % 128 != 0) + ): + raise ValueError( + f"M={mm_traits.M} N={mm_traits.N} K={mm_traits.K} must be divisible by 128 when block_scaling=True." 
+ ) + + if self.input_type_width == 8 and (mm_traits.M % 16 != 0 or mm_traits.N % 16 != 0 or mm_traits.K % 16 != 0): + raise ValueError(f"M={mm_traits.M} N={mm_traits.N} K={mm_traits.K} must be divisible by 16 for FP8 operations") + + if self.options.block_scaling and self.d_dtype_width == 8: + self.mm_desc_ifc.alpha_vector_batch_stride = 1 # Workaround for library caching issue + + # cublasLtMatmulAlgoGetHeuristic requires the scale pointer to be set. + self.aux_outputs = { + "d_out_scale": utils.create_empty_tensor( + self.result_class, + ((mm_traits.M * mm_traits.N) // 32 * self.mm_traits.batch_count), + "uint8", + self.device_id, + stream_holder, + verify_strides=False, + ) + } + self.mm_desc_ifc.d_out_scale_pointer = self.aux_outputs["d_out_scale"].data_ptr + self.mm_desc_ifc.d_out_scale_mode = cublaslt.MatmulMatrixScale.VEC32_UE8M0 + limit = preferences.limit if algorithms is None: num_algorithms = np.empty((1,), dtype=np.int32) @@ -1493,6 +1825,9 @@ def plan( cublaslt.matmul_preference_destroy(self.preference_ptr) self.preference_ptr = cublaslt.matmul_preference_create() + if self.input_type_width <= 8: + self._prepare_operand_quantization_scales(self.quantization_scales) + if algorithms is None: # Set preferences. preference_ifc = matmul_pref_ifc.MatmulPreferenceInterface(self.preference_ptr) @@ -1667,7 +2002,18 @@ def _check_and_set_operand( return @utils.precondition(_check_valid_matmul) - def reset_operands(self, a=None, b=None, c=None, *, alpha=None, beta=None, epilog_inputs=None, stream=None): + def reset_operands( + self, + a=None, + b=None, + c=None, + *, + alpha=None, + beta=None, + quantization_scales=None, + epilog_inputs=None, + stream=None, + ): """ Reset the operands held by this :class:`Matmul` instance. 
@@ -1705,6 +2051,8 @@ def reset_operands(self, a=None, b=None, c=None, *, alpha=None, beta=None, epilo stream: {stream} + quantization_scales: {quantization_scales} + Examples: >>> import cupy as cp @@ -1810,6 +2158,25 @@ def reset_operands(self, a=None, b=None, c=None, *, alpha=None, beta=None, epilo f"The value provided for beta {beta} is not convertible to dtype '{self.beta.dtype}'." ) from e + # Update quantization_scales. + if quantization_scales is not None: + quantization_scales = self._validate_operand_scales(quantization_scales, all_required=False) + if quantization_scales.a is not None: + self.quantization_scales.a = quantization_scales.a + if quantization_scales.b is not None: + self.quantization_scales.b = quantization_scales.b + if quantization_scales.c is not None: + self.quantization_scales.c = quantization_scales.c + if quantization_scales.d is not None: + self.quantization_scales.d = quantization_scales.d + self._prepare_operand_quantization_scales(self.quantization_scales) + + if epilog_inputs is not None and "aux_quantization_scale" in epilog_inputs: + epilog_inputs = epilog_inputs.copy() + aux_quantization_scale = epilog_inputs.pop("aux_quantization_scale") + self._validate_epilog_aux_scale(aux_quantization_scale, required=False) + self._prepare_quantization_scale(aux_quantization_scale, "epilog_aux", cublas_operand="epilogue_aux") + stream_holder = utils.get_or_create_stream(self.device_id, stream, self.package) # Reset the provided operands. 
@@ -1822,7 +2189,7 @@ def reset_operands(self, a=None, b=None, c=None, *, alpha=None, beta=None, epilo self.mm_desc_ifc, stream_holder, operand_index=index, - dtype=self.ab_dtype_name, + dtype=self.a_dtype_name, extents=self.operand_extents[index], strides=self.operand_strides[index], ) @@ -1836,7 +2203,7 @@ def reset_operands(self, a=None, b=None, c=None, *, alpha=None, beta=None, epilo self.mm_desc_ifc, stream_holder, operand_index=index, - dtype=self.ab_dtype_name, + dtype=self.b_dtype_name, extents=self.operand_extents[index], strides=self.operand_strides[index], ) @@ -2022,7 +2389,7 @@ def execute_matmul(algorithm_ptr): self.logger.info(f"The best performance remains at {formatters.FLOPSStr(orig_flop_rate, 'FLOP/s')}.") end = timer() - self.logger.info(f"The autotuning took {(end - start) * 1000.:.3f} ms to complete.") + self.logger.info(f"The autotuning took {(end - start) * 1000.0:.3f} ms to complete.") @utils.precondition(_check_valid_matmul) @utils.precondition(_check_planned, "Execution") @@ -2074,6 +2441,18 @@ def execute(self, *, algorithm=None, release_workspace=False, stream=None): ) if log_debug: self.logger.debug(f"The auxiliary output tensor '{name}' has been created.") + if self.preferences.epilog.aux_amax: + if "float8" not in dtype_name: + raise ValueError("epilog.aux_amax=True is not supported when epilog output type is not FP8.") + self.epilog_outputs[f"{name}_amax"] = utils.create_empty_tensor( + self.result_class, + (1,), + "float32", # This is the only type allowed by cuBLAS for AMAX. + self.device_id, + stream_holder, + verify_strides=False, + ) + self.mm_desc_ifc.epilogue_aux_amax_pointer = self.epilog_outputs[f"{name}_amax"].data_ptr # Update the data pointer in the MM descriptor. 
handler.update_ptr(self.mm_desc_ifc, aux.data_ptr) @@ -2097,6 +2476,30 @@ def execute(self, *, algorithm=None, release_workspace=False, stream=None): if log_debug: self.logger.debug("The output (empty) tensor has been created.") + self.aux_outputs = dict() + + if self.options.result_amax: + self.aux_outputs["result_amax"] = utils.create_empty_tensor( + self.result_class, + (1,), + "float32", # This is the only type allowed by cuBLAS for AMAX. + self.device_id, + stream_holder, + verify_strides=False, + ) + self.mm_desc_ifc.amax_d_pointer = self.aux_outputs["result_amax"].data_ptr + + if self.options.block_scaling and self.d_dtype_width == 8: + self.aux_outputs["d_out_scale"] = utils.create_empty_tensor( + self.result_class, + (self.mm_traits.batch_count * self.result_traits.result_shape[-1] * self.result_traits.result_shape[-2] // 32), + "uint8", + self.device_id, + stream_holder, + verify_strides=False, + ) + self.mm_desc_ifc.d_out_scale_pointer = self.aux_outputs["d_out_scale"].data_ptr + # Select the first (best) algorithm if one is not provided. if algorithm is None: algorithm_struct = self.algorithms_buffer[0]["algo"] @@ -2150,17 +2553,19 @@ def execute(self, *, algorithm=None, release_workspace=False, stream=None): self._release_workspace_memory_perhaps(True) # Return the result and auxiliary outputs, if present. + all_outputs = self.epilog_outputs | self.aux_outputs if self.memory_space == "cpu": out = self.result.to("cpu", stream_holder=stream_holder) # Copy auxiliary output to CPU. - aux = {name: self.epilog_outputs[name].to("cpu", stream_holder=stream_holder) for name in self.epilog_outputs} + aux = {name: all_outputs[name].to("cpu", stream_holder=stream_holder) for name in all_outputs} else: out = self.result.tensor # Return the unwrapped epilog output tensor(s). 
- aux = {name: self.epilog_outputs[name].tensor for name in self.epilog_outputs} + aux = {name: all_outputs[name].tensor for name in all_outputs} # Release internal reference to the result to permit recycling of memory. self.result = None + self.aux_outputs = dict() self.epilog_outputs = dict() self._reset_workspace_allocation_tracking() @@ -2215,6 +2620,7 @@ def matmul( epilog=None, epilog_inputs=None, qualifiers=None, + quantization_scales=None, options=None, preferences=None, algorithm=None, @@ -2271,12 +2677,17 @@ def matmul( stream: {stream} + quantization_scales: {quantization_scales} + Returns: {result} Semantics: {semantics} + Narrow-precision support: + {narrow_precision} + See Also: :class:`Matmul`, :class:`MatmulOptions`, :class:`MatmulEpilog`, :class:`MatmulPlanPreferences` @@ -2359,7 +2770,17 @@ def matmul( else: algorithms = [algorithm] # The type of algorithm should be algorithm.Algorithm and will be checked in plan() - with Matmul(a, b, c=c, alpha=alpha, beta=beta, qualifiers=qualifiers, options=options, stream=stream) as mm: + with Matmul( + a, + b, + c=c, + alpha=alpha, + beta=beta, + qualifiers=qualifiers, + options=options, + stream=stream, + quantization_scales=quantization_scales, + ) as mm: mm.plan(preferences=preferences, epilog=epilog, epilog_inputs=epilog_inputs, stream=stream, algorithms=algorithms) r = mm.execute(stream=stream) diff --git a/nvmath/memory.py b/nvmath/memory.py index 2b4f576..7ce947b 100644 --- a/nvmath/memory.py +++ b/nvmath/memory.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 diff --git a/pyproject.toml b/pyproject.toml index a2c64e1..84c385e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. 
ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 @@ -15,8 +15,8 @@ backend-path = ["builder"] [project] name = "nvmath-python" +version = "0.3.0" dynamic = [ - "version", "readme", "dependencies", "optional-dependencies" @@ -51,7 +51,6 @@ classifiers = [ ] [tool.setuptools.dynamic] -version = { attr = "nvmath._version.__version__" } readme = { file = ["DESCRIPTION.rst"], content-type = "text/x-rst" } dependencies = {file = ["requirements/pip/nvmath-python.txt"] } @@ -106,8 +105,6 @@ ignore = [ "I001", "SIM105", "SIM108", - # Ignore module-import-not-at-top-of-file - "E402", # Ignore multiple-statements-on-one-line-colon "E701", # Ignore multiple-statements-on-one-line-semicolon @@ -118,12 +115,6 @@ ignore = [ "E731", # Ignore ambiguous-variable-name "E741", - # Ignore unused-import - "F401", - # Ignore undefined-local-with-import-star - "F403", - # Ignore undefined-local-with-import-star-usage - "F405", ] fixable = ["ALL"] diff --git a/requirements/README.md b/requirements/README.md index ff4ef5b..0851556 100644 --- a/requirements/README.md +++ b/requirements/README.md @@ -70,3 +70,4 @@ requirements are included by the top-level requirements sets. 
| requirements/pip/tests.txt | Test dependencies | | requirements/pip/torch-cu11.txt | Enable torch use in tests and examples via wheels for CUDA-11.8 | | requirements/pip/torch-cu12.txt | Enable torch use in tests and examples via wheels for CUDA-12.1 | +| requirements/pip/torch-cu12-nightly.txt | Enable torch nightly + CTK-12.8 wheels | diff --git a/requirements/pip/docs.txt b/requirements/pip/docs.txt index d26766f..1ec3a4a 100644 --- a/requirements/pip/docs.txt +++ b/requirements/pip/docs.txt @@ -1,8 +1,12 @@ breathe enum-tools grip +jupyter myst-parser +nbsphinx +nbsphinx-link nvidia-sphinx-theme +pandoc sphinx sphinx-favicon sphinx-toolbox diff --git a/requirements/pip/torch-cu12-nightly.txt b/requirements/pip/torch-cu12-nightly.txt new file mode 100644 index 0000000..227fee8 --- /dev/null +++ b/requirements/pip/torch-cu12-nightly.txt @@ -0,0 +1,5 @@ +# pytorch >=2.3 to ensure numpy 1/2 compatibility +# torch wheels depend on nvidia wheels; do not add if testing system ctk +--pre +--index https://download.pytorch.org/whl/nightly/cu128 +torch>=2.6; platform_system!="Windows" diff --git a/setup.py b/setup.py index c38e6d9..22a8b7b 100644 --- a/setup.py +++ b/setup.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 @@ -21,7 +21,7 @@ # flow, so we need to hack sys.path ourselves... 
source_root = os.path.abspath(os.path.dirname(__file__)) sys.path.append(os.path.join(source_root, "builder")) -import utils # this is builder.utils +import utils # type: ignore # this is builder.utils # noqa: E402 # List the main modules, and infer the auxiliary modules automatically @@ -129,10 +129,6 @@ def calculate_modules(module): setup( ext_modules=cythonize(ext_modules, verbose=True, language_level=3, compiler_directives=compiler_directives), packages=find_packages(include=["nvmath", "nvmath.*"]), - package_data=dict.fromkeys( - find_packages(include=["nvmath.*"]), - ["*.pxd", "*.pyx", "*.py"], - ), zip_safe=False, cmdclass=cmdclass, ) diff --git a/tests/conftest.py b/tests/conftest.py index 35d4348..4898dc5 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 @@ -8,7 +8,6 @@ from collections.abc import Iterable import datetime -import os import hypothesis diff --git a/tests/docstring_tests/test_docstrings.py b/tests/docstring_tests/test_docstrings.py index 19ccb2e..725b837 100644 --- a/tests/docstring_tests/test_docstrings.py +++ b/tests/docstring_tests/test_docstrings.py @@ -1,3 +1,7 @@ +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 + import contextlib import os import sphinx.cmd.build diff --git a/tests/example_tests/device_tests/test_device_samples.py b/tests/example_tests/device_tests/test_device_samples.py index 5d45955..8c4b078 100644 --- a/tests/example_tests/device_tests/test_device_samples.py +++ b/tests/example_tests/device_tests/test_device_samples.py @@ -1,14 +1,12 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. 
# # SPDX-License-Identifier: Apache-2.0 import glob import os -import re import pytest -from nvmath import bindings from ..test_utils import run_sample diff --git a/tests/example_tests/fft_tests/test_fft_samples.py b/tests/example_tests/fft_tests/test_fft_samples.py index ba36d1a..c56d475 100644 --- a/tests/example_tests/fft_tests/test_fft_samples.py +++ b/tests/example_tests/fft_tests/test_fft_samples.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 @@ -36,7 +36,7 @@ def _has_numba(): try: - import numba + import numba # noqa: F401 return True except ModuleNotFoundError: diff --git a/tests/example_tests/matmul_tests/test_advanced_matmul_samples.py b/tests/example_tests/matmul_tests/test_advanced_matmul_samples.py index 1f29823..cf0f1f2 100644 --- a/tests/example_tests/matmul_tests/test_advanced_matmul_samples.py +++ b/tests/example_tests/matmul_tests/test_advanced_matmul_samples.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. 
# # SPDX-License-Identifier: Apache-2.0 @@ -9,7 +9,7 @@ import pytest try: - import cupy + import cupy # noqa: F401 except ModuleNotFoundError: pytest.skip("cupy required for matmul tests", allow_module_level=True) @@ -36,9 +36,40 @@ "example13_epilog_stateful_reset.py": 11501, "example14_autotune.py": 11501, "example16_reuse_algorithms.py": 11501, + "example17_fp8.py": 120800, + "example18_fp8_types.py": 120800, + "example19_fp8_reset.py": 120800, + "example20_fp8_inplace_scale_change.py": 120800, + "example21_fp8_amax.py": 120800, + "example22_fp8_delayed_scaling.py": 120800, + "example23_fp8_epilog.py": 120800, + "example24_fp8_epilog_aux.py": 120800, + "example25_mxfp8.py": 120800, + "example26_mxfp8_d_out.py": 120800, + "example27_mxfp8_chaining.py": 120800, + "example28_mxfp8_epilog.py": 120800, + "example29_mxfp8_layout.py": 120800, +} + +min_cc = { + "example17_fp8.py": (8, 9), + "example18_fp8_types.py": (8, 9), + "example19_fp8_reset.py": (8, 9), + "example20_fp8_inplace_scale_change.py": (8, 9), + "example21_fp8_amax.py": (8, 9), + "example22_fp8_delayed_scaling.py": (8, 9), + "example23_fp8_epilog.py": (8, 9), + "example24_fp8_epilog_aux.py": (8, 9), + "example25_mxfp8.py": (10, 0), + "example26_mxfp8_d_out.py": (10, 0), + "example27_mxfp8_chaining.py": (10, 0), + "example28_mxfp8_epilog.py": (10, 0), + "example29_mxfp8_layout.py": (10, 0), } cublas_version = bindings.cublasLt.get_version() +device_properties = cupy.cuda.runtime.getDeviceProperties(cupy.cuda.runtime.getDevice()) +cc = (device_properties["major"], device_properties["minor"]) @pytest.mark.parametrize("sample", sample_files) @@ -48,4 +79,7 @@ def test_sample(self, sample): required_cublas_version = min_cublas_version.get(filename, 0) if cublas_version < required_cublas_version: pytest.skip(f"cublas version {cublas_version} lower than required ({required_cublas_version})") + required_cc = min_cc.get(filename, (0, 0)) + if cc < required_cc: + pytest.skip(f"compute capability {cc} lower than 
required {required_cc}") run_sample(samples_path, sample) diff --git a/tests/example_tests/test_utils.py b/tests/example_tests/test_utils.py index 136772b..48e9bf0 100644 --- a/tests/example_tests/test_utils.py +++ b/tests/example_tests/test_utils.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 diff --git a/tests/nvmath_tests/device/cpp_conv.py b/tests/nvmath_tests/device/cpp_conv.py index 5104865..150f909 100644 --- a/tests/nvmath_tests/device/cpp_conv.py +++ b/tests/nvmath_tests/device/cpp_conv.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 diff --git a/tests/nvmath_tests/device/cpp_gemm_batched.py b/tests/nvmath_tests/device/cpp_gemm_batched.py index 0c81600..1eeb015 100644 --- a/tests/nvmath_tests/device/cpp_gemm_batched.py +++ b/tests/nvmath_tests/device/cpp_gemm_batched.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 diff --git a/tests/nvmath_tests/device/cpp_gemm_loop.py b/tests/nvmath_tests/device/cpp_gemm_loop.py index e5bb6c4..3f19de8 100644 --- a/tests/nvmath_tests/device/cpp_gemm_loop.py +++ b/tests/nvmath_tests/device/cpp_gemm_loop.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. 
# # SPDX-License-Identifier: Apache-2.0 diff --git a/tests/nvmath_tests/device/curand/compiled_apis.py b/tests/nvmath_tests/device/curand/compiled_apis.py index 17397e4..a46840c 100644 --- a/tests/nvmath_tests/device/curand/compiled_apis.py +++ b/tests/nvmath_tests/device/curand/compiled_apis.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 diff --git a/tests/nvmath_tests/device/curand/distributions.py b/tests/nvmath_tests/device/curand/distributions.py index 3e919ea..d5558f6 100644 --- a/tests/nvmath_tests/device/curand/distributions.py +++ b/tests/nvmath_tests/device/curand/distributions.py @@ -1,12 +1,14 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 +from collections.abc import Callable + import numpy as np -import scipy.stats as stats import nvmath.device.random as R -from collections.abc import Callable -from .generators import * +import scipy.stats as stats + +from . import generators class Distribution: @@ -22,7 +24,7 @@ def ppf(self): """ raise NotImplementedError - def curand_variants(self) -> dict[tuple[str, int, Generator], Callable]: + def curand_variants(self) -> dict[tuple[str, int, generators.Generator], Callable]: """ A dictionary mapping (dtype, group size, generator) to curand distribution function. 
""" @@ -72,22 +74,22 @@ def cdf(self, x): def curand_variants(self): return { - ("float", 1, XorwowGenerator()): R.uniform, - ("float", 1, MrgGenerator()): R.uniform, - ("float", 1, PhiloxGenerator()): R.uniform, - ("float", 1, Sobol32Generator()): R.uniform, - ("float", 1, ScrambledSobol32Generator()): R.uniform, - ("float", 1, Sobol64Generator()): R.uniform, - ("float", 1, ScrambledSobol64Generator()): R.uniform, - ("double", 1, XorwowGenerator()): R.uniform_double, - ("double", 1, MrgGenerator()): R.uniform_double, - ("double", 1, PhiloxGenerator()): R.uniform_double, - ("double", 1, Sobol32Generator()): R.uniform_double, - ("double", 1, ScrambledSobol32Generator()): R.uniform_double, - ("double", 1, Sobol64Generator()): R.uniform_double, - ("double", 1, ScrambledSobol64Generator()): R.uniform_double, - ("double", 2, PhiloxGenerator()): R.uniform2_double, - ("float", 4, PhiloxGenerator()): R.uniform4, + ("float", 1, generators.XorwowGenerator()): R.uniform, + ("float", 1, generators.MrgGenerator()): R.uniform, + ("float", 1, generators.PhiloxGenerator()): R.uniform, + ("float", 1, generators.Sobol32Generator()): R.uniform, + ("float", 1, generators.ScrambledSobol32Generator()): R.uniform, + ("float", 1, generators.Sobol64Generator()): R.uniform, + ("float", 1, generators.ScrambledSobol64Generator()): R.uniform, + ("double", 1, generators.XorwowGenerator()): R.uniform_double, + ("double", 1, generators.MrgGenerator()): R.uniform_double, + ("double", 1, generators.PhiloxGenerator()): R.uniform_double, + ("double", 1, generators.Sobol32Generator()): R.uniform_double, + ("double", 1, generators.ScrambledSobol32Generator()): R.uniform_double, + ("double", 1, generators.Sobol64Generator()): R.uniform_double, + ("double", 1, generators.ScrambledSobol64Generator()): R.uniform_double, + ("double", 2, generators.PhiloxGenerator()): R.uniform2_double, + ("float", 4, generators.PhiloxGenerator()): R.uniform4, } def curand(self, dtype_name, group_size): @@ -107,27 +109,27 @@ 
def cdf(self, x): def curand_variants(self): return { - ("float", 1, XorwowGenerator()): R.normal, - ("float", 1, MrgGenerator()): R.normal, - ("float", 1, PhiloxGenerator()): R.normal, - ("float", 1, Sobol32Generator()): R.normal, - ("float", 1, ScrambledSobol32Generator()): R.normal, - ("float", 1, Sobol64Generator()): R.normal, - ("float", 1, ScrambledSobol64Generator()): R.normal, - ("double", 1, XorwowGenerator()): R.normal_double, - ("double", 1, MrgGenerator()): R.normal_double, - ("double", 1, PhiloxGenerator()): R.normal_double, - ("double", 1, Sobol32Generator()): R.normal_double, - ("double", 1, ScrambledSobol32Generator()): R.normal_double, - ("double", 1, Sobol64Generator()): R.normal_double, - ("double", 1, ScrambledSobol64Generator()): R.normal_double, - ("float", 2, XorwowGenerator()): R.normal2, - ("float", 2, MrgGenerator()): R.normal2, - ("float", 2, PhiloxGenerator()): R.normal2, - ("double", 2, XorwowGenerator()): R.normal2_double, - ("double", 2, MrgGenerator()): R.normal2_double, - ("double", 2, PhiloxGenerator()): R.normal2_double, - ("float", 4, PhiloxGenerator()): R.normal4, + ("float", 1, generators.XorwowGenerator()): R.normal, + ("float", 1, generators.MrgGenerator()): R.normal, + ("float", 1, generators.PhiloxGenerator()): R.normal, + ("float", 1, generators.Sobol32Generator()): R.normal, + ("float", 1, generators.ScrambledSobol32Generator()): R.normal, + ("float", 1, generators.Sobol64Generator()): R.normal, + ("float", 1, generators.ScrambledSobol64Generator()): R.normal, + ("double", 1, generators.XorwowGenerator()): R.normal_double, + ("double", 1, generators.MrgGenerator()): R.normal_double, + ("double", 1, generators.PhiloxGenerator()): R.normal_double, + ("double", 1, generators.Sobol32Generator()): R.normal_double, + ("double", 1, generators.ScrambledSobol32Generator()): R.normal_double, + ("double", 1, generators.Sobol64Generator()): R.normal_double, + ("double", 1, generators.ScrambledSobol64Generator()): R.normal_double, + 
("float", 2, generators.XorwowGenerator()): R.normal2, + ("float", 2, generators.MrgGenerator()): R.normal2, + ("float", 2, generators.PhiloxGenerator()): R.normal2, + ("double", 2, generators.XorwowGenerator()): R.normal2_double, + ("double", 2, generators.MrgGenerator()): R.normal2_double, + ("double", 2, generators.PhiloxGenerator()): R.normal2_double, + ("float", 4, generators.PhiloxGenerator()): R.normal4, } def curand(self, dtype_name, group_size): @@ -150,27 +152,27 @@ def cdf(self, x): def curand_variants(self): return { - ("float", 1, XorwowGenerator()): R.log_normal, - ("float", 1, MrgGenerator()): R.log_normal, - ("float", 1, PhiloxGenerator()): R.log_normal, - ("float", 1, Sobol32Generator()): R.log_normal, - ("float", 1, ScrambledSobol32Generator()): R.log_normal, - ("float", 1, Sobol64Generator()): R.log_normal, - ("float", 1, ScrambledSobol64Generator()): R.log_normal, - ("double", 1, XorwowGenerator()): R.log_normal_double, - ("double", 1, MrgGenerator()): R.log_normal_double, - ("double", 1, PhiloxGenerator()): R.log_normal_double, - ("double", 1, Sobol32Generator()): R.log_normal_double, - ("double", 1, ScrambledSobol32Generator()): R.log_normal_double, - ("double", 1, Sobol64Generator()): R.log_normal_double, - ("double", 1, ScrambledSobol64Generator()): R.log_normal_double, - ("float", 2, XorwowGenerator()): R.log_normal2, - ("float", 2, MrgGenerator()): R.log_normal2, - ("float", 2, PhiloxGenerator()): R.log_normal2, - ("double", 2, XorwowGenerator()): R.log_normal2_double, - ("double", 2, MrgGenerator()): R.log_normal2_double, - ("double", 2, PhiloxGenerator()): R.log_normal2_double, - ("float", 4, PhiloxGenerator()): R.log_normal4, + ("float", 1, generators.XorwowGenerator()): R.log_normal, + ("float", 1, generators.MrgGenerator()): R.log_normal, + ("float", 1, generators.PhiloxGenerator()): R.log_normal, + ("float", 1, generators.Sobol32Generator()): R.log_normal, + ("float", 1, generators.ScrambledSobol32Generator()): R.log_normal, + 
("float", 1, generators.Sobol64Generator()): R.log_normal, + ("float", 1, generators.ScrambledSobol64Generator()): R.log_normal, + ("double", 1, generators.XorwowGenerator()): R.log_normal_double, + ("double", 1, generators.MrgGenerator()): R.log_normal_double, + ("double", 1, generators.PhiloxGenerator()): R.log_normal_double, + ("double", 1, generators.Sobol32Generator()): R.log_normal_double, + ("double", 1, generators.ScrambledSobol32Generator()): R.log_normal_double, + ("double", 1, generators.Sobol64Generator()): R.log_normal_double, + ("double", 1, generators.ScrambledSobol64Generator()): R.log_normal_double, + ("float", 2, generators.XorwowGenerator()): R.log_normal2, + ("float", 2, generators.MrgGenerator()): R.log_normal2, + ("float", 2, generators.PhiloxGenerator()): R.log_normal2, + ("double", 2, generators.XorwowGenerator()): R.log_normal2_double, + ("double", 2, generators.MrgGenerator()): R.log_normal2_double, + ("double", 2, generators.PhiloxGenerator()): R.log_normal2_double, + ("float", 4, generators.PhiloxGenerator()): R.log_normal4, } def curand(self, dtype_name, group_size): @@ -199,14 +201,14 @@ def ppf(self, x): def curand_variants(self): return { - ("uint32", 1, XorwowGenerator()): R.poisson, - ("uint32", 1, MrgGenerator()): R.poisson, - ("uint32", 1, PhiloxGenerator()): R.poisson, - ("uint32", 1, Sobol32Generator()): R.poisson, - ("uint32", 1, ScrambledSobol32Generator()): R.poisson, - ("uint32", 1, Sobol64Generator()): R.poisson, - ("uint32", 1, ScrambledSobol64Generator()): R.poisson, - ("uint32", 4, PhiloxGenerator()): R.poisson4, + ("uint32", 1, generators.XorwowGenerator()): R.poisson, + ("uint32", 1, generators.MrgGenerator()): R.poisson, + ("uint32", 1, generators.PhiloxGenerator()): R.poisson, + ("uint32", 1, generators.Sobol32Generator()): R.poisson, + ("uint32", 1, generators.ScrambledSobol32Generator()): R.poisson, + ("uint32", 1, generators.Sobol64Generator()): R.poisson, + ("uint32", 1, generators.ScrambledSobol64Generator()): 
R.poisson, + ("uint32", 4, generators.PhiloxGenerator()): R.poisson4, } def curand(self, dtype_name, group_size): diff --git a/tests/nvmath_tests/device/curand/generators.py b/tests/nvmath_tests/device/curand/generators.py index 5957345..27eb86c 100644 --- a/tests/nvmath_tests/device/curand/generators.py +++ b/tests/nvmath_tests/device/curand/generators.py @@ -1,10 +1,9 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 from numba import cuda, uint32, uint64 import nvmath.device.random as R -from functools import cache from .compiled_apis import compiled_apis import cffi diff --git a/tests/nvmath_tests/device/curand/test_random.py b/tests/nvmath_tests/device/curand/test_random.py index 52fa09b..dce8f8f 100644 --- a/tests/nvmath_tests/device/curand/test_random.py +++ b/tests/nvmath_tests/device/curand/test_random.py @@ -1,14 +1,21 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 import numpy as np import scipy.stats as stats import pytest -import nvmath.device.random as R -from .generators import * -from .distributions import * -from .utils import * + +from . import distributions +from . import generators +from .utils import ( + all_supported_configs, + generate_random_numbers, + per_thread_skipahead, + per_thread_skipahead_sequence, + prepare_states, + prepare_states_and_generate, +) """ This set of tests checks random device APIs. 
@@ -25,13 +32,13 @@ @pytest.mark.parametrize( "distribution,dtype_name,group_size,generator", all_supported_configs( - Poisson(1), - Poisson(10), - Poisson(15), - LogNormal(), - LogNormal(-2, 0.5), - Uniform(), - Normal(), + distributions.Poisson(1), + distributions.Poisson(10), + distributions.Poisson(15), + distributions.LogNormal(), + distributions.LogNormal(-2, 0.5), + distributions.Uniform(), + distributions.Normal(), ), ) def test_distribution(distribution, dtype_name, generator, nsamples, threads, blocks, group_size): @@ -81,7 +88,9 @@ def test(x): @pytest.mark.parametrize("threads,blocks", ((38, 2), (1, 1))) @pytest.mark.parametrize( "distribution,dtype_name,group_size,generator", - all_supported_configs(Poisson(12), LogNormal(2, 1.1), Uniform(), Normal()), + all_supported_configs( + distributions.Poisson(12), distributions.LogNormal(2, 1.1), distributions.Uniform(), distributions.Normal() + ), ) def test_seeds(distribution, dtype_name, generator, nsamples, threads, blocks, group_size): """ @@ -122,7 +131,7 @@ def generate_with_seed(seed): ) @pytest.mark.parametrize( "generator", - [pytest.param(g(), id=g().name()) for g in GENERATORS if g().supports_skipahead()], + [pytest.param(g(), id=g().name()) for g in generators.GENERATORS if g().supports_skipahead()], ) def test_skipahead(generator, threads, blocks): """ @@ -139,7 +148,7 @@ def test_skipahead(generator, threads, blocks): def gen(states, n): return generate_random_numbers( states=states, - distribution=Uniform(), + distribution=distributions.Uniform(), dtype_name="float", nsamples=n, threads=threads, @@ -195,7 +204,7 @@ def gen_all(): ) @pytest.mark.parametrize( "generator", - [pytest.param(g(), id=g().name()) for g in GENERATORS if g().supports_skipahead_subsequence()], + [pytest.param(g(), id=g().name()) for g in generators.GENERATORS if g().supports_skipahead_subsequence()], ) def test_skipahead_sequence(generator, threads, blocks): """ @@ -211,7 +220,7 @@ def test_skipahead_sequence(generator, 
threads, blocks): def gen(states): return generate_random_numbers( states=states, - distribution=Uniform(), + distribution=distributions.Uniform(), dtype_name="double", nsamples=1, threads=threads, diff --git a/tests/nvmath_tests/device/curand/utils.py b/tests/nvmath_tests/device/curand/utils.py index a0e2549..696a585 100644 --- a/tests/nvmath_tests/device/curand/utils.py +++ b/tests/nvmath_tests/device/curand/utils.py @@ -1,12 +1,14 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 +from functools import cache +from numba import cuda import numpy as np -import pytest import nvmath.device.random as R -from functools import cache -from .generators import * +import pytest + +from . import generators from .compiled_apis import compiled_apis NP_DTYPES = { @@ -48,7 +50,7 @@ def random_kernel(states, nsamples, result, curand_distribution_args): def prepare_states( *, - generator: Generator, + generator: generators.Generator, seed, threads, blocks, @@ -140,7 +142,7 @@ def per_thread_skipahead_sequence( def prepare_states_and_generate( *, - generator: Generator, + generator: generators.Generator, seed, threads, blocks, diff --git a/tests/nvmath_tests/device/helpers.py b/tests/nvmath_tests/device/helpers.py index dabcbe3..87defdc 100644 --- a/tests/nvmath_tests/device/helpers.py +++ b/tests/nvmath_tests/device/helpers.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. 
# # SPDX-License-Identifier: Apache-2.0 @@ -36,6 +36,10 @@ def set_device(): CHECK_CUDART(err) err, prop = cudart.cudaGetDeviceProperties(0) CHECK_CUDART(err) + # TODO: dx does not support platforms > arch90 for now and version is capped + # at 9.0 + if (prop.major, prop.minor) > (9, 0): + return (9, 0) return (prop.major, prop.minor) diff --git a/tests/nvmath_tests/device/helpers_cpp.py b/tests/nvmath_tests/device/helpers_cpp.py index a8ec85b..c09af46 100644 --- a/tests/nvmath_tests/device/helpers_cpp.py +++ b/tests/nvmath_tests/device/helpers_cpp.py @@ -1,14 +1,13 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 -from cuda import cuda -from cuda import nvrtc +from cuda import cudart, nvrtc, cuda from nvmath.device.common_mathdx import CUDA_HOME as _CUDA_HOME from nvmath.device.common_mathdx import MATHDX_HOME as _MATHDX_HOME -from .helpers import CHECK_CUDA, CHECK_NVRTC, make_args, get_unsigned +from .helpers import CHECK_CUDA, CHECK_CUDART, CHECK_NVRTC, make_args, get_unsigned def run_and_time(kernel, grid_dim, block_dim, shared_memory_size, ncycles, *args): @@ -62,6 +61,12 @@ def run_and_time(kernel, grid_dim, block_dim, shared_memory_size, ncycles, *args def compile_cpp_kernel(cpp, sm, mangled): print(f"compile_cpp_kernel CUDA_HOME = {_CUDA_HOME}, _MATHDX_HOME = {_MATHDX_HOME}") + # TODO: dx does not support platforms > arch90 for now and version is capped + # at 9.0, but we want to compile program against actual architecture. 
+ err, prop = cudart.cudaGetDeviceProperties(0) + CHECK_CUDART(err) + sm = (prop.major, prop.minor) + opts = ( [b"--std=c++17", b"--device-as-default-execution-space", b"-DCUFFTDX_DETAIL_USE_CUDA_STL=1"] + [bytes(f"--include-path={h}/include", encoding="ascii") for h in _CUDA_HOME] diff --git a/tests/nvmath_tests/device/helpers_numba.py b/tests/nvmath_tests/device/helpers_numba.py index 4f6f017..bf5b744 100644 --- a/tests/nvmath_tests/device/helpers_numba.py +++ b/tests/nvmath_tests/device/helpers_numba.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 diff --git a/tests/nvmath_tests/device/numba_conv.py b/tests/nvmath_tests/device/numba_conv.py index 9e8ec73..c6d264f 100644 --- a/tests/nvmath_tests/device/numba_conv.py +++ b/tests/nvmath_tests/device/numba_conv.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 diff --git a/tests/nvmath_tests/device/numba_gemm_batched.py b/tests/nvmath_tests/device/numba_gemm_batched.py index fbff205..6696da5 100644 --- a/tests/nvmath_tests/device/numba_gemm_batched.py +++ b/tests/nvmath_tests/device/numba_gemm_batched.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. 
# # SPDX-License-Identifier: Apache-2.0 @@ -8,7 +8,6 @@ import cupy from numba import cuda from .helpers_numba import run_and_time, shared_load_3d, shared_store_3d -import time class NumbaGemmBatched: diff --git a/tests/nvmath_tests/device/numba_gemm_loop.py b/tests/nvmath_tests/device/numba_gemm_loop.py index 24ad7ed..17f35fd 100644 --- a/tests/nvmath_tests/device/numba_gemm_loop.py +++ b/tests/nvmath_tests/device/numba_gemm_loop.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 diff --git a/tests/nvmath_tests/device/test_cublasdx_generic.py b/tests/nvmath_tests/device/test_cublasdx_generic.py index 0b3124c..437092a 100644 --- a/tests/nvmath_tests/device/test_cublasdx_generic.py +++ b/tests/nvmath_tests/device/test_cublasdx_generic.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 diff --git a/tests/nvmath_tests/device/test_cublasdx_numba.py b/tests/nvmath_tests/device/test_cublasdx_numba.py index 44eb543..62a2fdd 100644 --- a/tests/nvmath_tests/device/test_cublasdx_numba.py +++ b/tests/nvmath_tests/device/test_cublasdx_numba.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. 
# # SPDX-License-Identifier: Apache-2.0 @@ -239,6 +239,7 @@ def test_matmul(shape, block_size, block_dim, data_type, trans, precision, np_ty assert MM.block_dim == Dim3(*block_dim) assert MM.max_threads_per_block <= 1024 assert MM.code_type.kind == "lto" + assert MM.code_type.cc.major == SM[0] assert MM.code_type.cc.minor == SM[1] diff --git a/tests/nvmath_tests/device/test_cublasdx_numba_perf.py b/tests/nvmath_tests/device/test_cublasdx_numba_perf.py index b163783..0f84e0f 100644 --- a/tests/nvmath_tests/device/test_cublasdx_numba_perf.py +++ b/tests/nvmath_tests/device/test_cublasdx_numba_perf.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 diff --git a/tests/nvmath_tests/device/test_cufftdx_generic.py b/tests/nvmath_tests/device/test_cufftdx_generic.py index 4f32eb9..5330980 100644 --- a/tests/nvmath_tests/device/test_cufftdx_generic.py +++ b/tests/nvmath_tests/device/test_cufftdx_generic.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 diff --git a/tests/nvmath_tests/device/test_cufftdx_numba.py b/tests/nvmath_tests/device/test_cufftdx_numba.py index 5941e40..38c6e49 100644 --- a/tests/nvmath_tests/device/test_cufftdx_numba.py +++ b/tests/nvmath_tests/device/test_cufftdx_numba.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. 
# # SPDX-License-Identifier: Apache-2.0 diff --git a/tests/nvmath_tests/device/test_cufftdx_numba_perf.py b/tests/nvmath_tests/device/test_cufftdx_numba_perf.py index b059e7f..56137a7 100644 --- a/tests/nvmath_tests/device/test_cufftdx_numba_perf.py +++ b/tests/nvmath_tests/device/test_cufftdx_numba_perf.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 diff --git a/tests/nvmath_tests/device/test_vector_types_numba.py b/tests/nvmath_tests/device/test_vector_types_numba.py index 22d88db..2365c16 100644 --- a/tests/nvmath_tests/device/test_vector_types_numba.py +++ b/tests/nvmath_tests/device/test_vector_types_numba.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 diff --git a/tests/nvmath_tests/fft/test_default_backend.py b/tests/nvmath_tests/fft/test_default_backend.py index 6db05e6..6da51fe 100644 --- a/tests/nvmath_tests/fft/test_default_backend.py +++ b/tests/nvmath_tests/fft/test_default_backend.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 diff --git a/tests/nvmath_tests/fft/test_fft_with_hypothesis.py b/tests/nvmath_tests/fft/test_fft_with_hypothesis.py index 5dc0e4f..85837c8 100644 --- a/tests/nvmath_tests/fft/test_fft_with_hypothesis.py +++ b/tests/nvmath_tests/fft/test_fft_with_hypothesis.py @@ -1,10 +1,14 @@ +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. 
+# +# SPDX-License-Identifier: Apache-2.0 + import itertools import cupy as cp import numpy as np import scipy.fft -from hypothesis import given, reproduce_failure, strategies as st +from hypothesis import given, strategies as st from hypothesis.extra.numpy import arrays, array_shapes import nvmath diff --git a/tests/nvmath_tests/fft/test_lto_callbacks.py b/tests/nvmath_tests/fft/test_lto_callbacks.py index 93877e9..f58a92d 100644 --- a/tests/nvmath_tests/fft/test_lto_callbacks.py +++ b/tests/nvmath_tests/fft/test_lto_callbacks.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 @@ -52,7 +52,8 @@ get_custom_stream, get_primes_up_to, init_assert_exec_backend_specified, - fx_last_operand_layout, + # pytest fixture is used but not detected by linter because of strange syntax + fx_last_operand_layout, # noqa: F401 ) from .utils.check_helpers import ( add_in_place, @@ -150,7 +151,7 @@ def allow_to_fail_compund_shape(e, shape, axes): def _has_numba(): try: - import numba + import numba # noqa: F401 return True except ModuleNotFoundError: @@ -399,6 +400,28 @@ def epilog_cb(data_out, offset, value, filter_data, unused): epilog_ltoir = nvmath.fft.compile_epilog(epilog_cb, epilog_dtype.name, epilog_dtype.name) cb_kwargs["epilog"] = {"ltoir": epilog_ltoir} scaling *= 5 + # Test create_key() function + try: + key1 = nvmath.fft.FFT.create_key( + signal, + axes=axes, + prolog=cb_kwargs["prolog"] if fft_callbacks.has_prolog() else None, + epilog=cb_kwargs["epilog"] if fft_callbacks.has_epilog() else None, + ) + assert key1 is not None + key2 = nvmath.fft.FFT.create_key( + signal, + axes=axes, + prolog=nvmath.fft.DeviceCallable(**cb_kwargs["prolog"]) if fft_callbacks.has_prolog() else None, + epilog=nvmath.fft.DeviceCallable(**cb_kwargs["epilog"]) if fft_callbacks.has_epilog() else None, + ) + assert key1 == key2 + 
except RuntimeError as e: + if "The FFT CPU execution is not available" in str(e) and mem_backend == MemBackend.cpu: + # Skip this check since create_key() function needs CPU FFT lib availability + pass + else: + raise ref = get_fft_ref(get_scaled(signal, scaling), axes=axes) @@ -1423,8 +1446,10 @@ def test_operand_and_filter_shapes_fft_ifft( "dtype_1", "shape_0", "axes_0", + "shape_kind_0", "shape_1", "axes_1", + "shape_kind_1", "callbacks_0", "callbacks_1", ), @@ -1437,16 +1462,39 @@ def test_operand_and_filter_shapes_fft_ifft( rng.choice(lto_callback_supperted_types), repr(shape_0), repr(axes_0), + shape_kind_0, repr(shape_1), repr(axes_1), + shape_kind_1, callbacks_0, rng.choice(list(LtoCallback)), ) for dtype_0 in lto_callback_supperted_types - for shape_0, axes_0, shape_1, axes_1 in [ - ((4200, 13), (0,), (7, 4199), (1,)), # 2*2*2*3*5*5*7, 13*17*19 - ((420, 512, 3), (0, 1), (5, 4, 4307), (1, 2)), # 4307=59*73 - ((2, 16, 16, 5), (0, 1, 2), (3, 9, 49, 25), (1, 2, 3)), + for shape_0, axes_0, shape_kind_0, shape_1, axes_1, shape_kind_1, in [ + ( + (4200, 13), + (0,), + ShapeKind.pow2357, + (7, 4199), + (1,), + ShapeKind.random, + ), # 2*2*2*3*5*5*7, 13*17*19 + ( + (420, 512, 3), + (0, 1), + ShapeKind.pow2357, + (5, 4, 4307), + (1, 2), + ShapeKind.random, + ), # 4307=59*73 + ( + (2, 16, 16, 5), + (0, 1, 2), + ShapeKind.pow2, + (3, 9, 49, 25), + (1, 2, 3), + ShapeKind.pow2357, + ), ] for framework in Framework.enabled() if ExecBackend.cufft in supported_backends.exec @@ -1463,8 +1511,10 @@ def test_two_plans_different_cbs( dtype_1, shape_0, axes_0, + shape_kind_0, shape_1, axes_1, + shape_kind_1, callbacks_0, callbacks_1, ): @@ -1579,8 +1629,8 @@ def epilog_cb_1(data_out, offset, value, filter_data, unused): assert_array_type(fft_0_out, framework, mem_backend, get_fft_dtype(dtype_0)) assert_array_type(fft_1_out, framework, mem_backend, get_fft_dtype(dtype_1)) - assert_norm_close(fft_0_out, ref_0, **get_tolerance(signal_0)) - assert_norm_close(fft_1_out, ref_1, 
**get_tolerance(signal_1)) + assert_norm_close(fft_0_out, ref_0, **get_tolerance(signal_0, shape_kind=shape_kind_0)) + assert_norm_close(fft_1_out, ref_1, **get_tolerance(signal_1, shape_kind=shape_kind_1)) @skip_if_lto_unssuported diff --git a/tests/nvmath_tests/fft/test_perf.py b/tests/nvmath_tests/fft/test_perf.py index 8a45ef7..cf97661 100644 --- a/tests/nvmath_tests/fft/test_perf.py +++ b/tests/nvmath_tests/fft/test_perf.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 diff --git a/tests/nvmath_tests/fft/test_perf_2d.py b/tests/nvmath_tests/fft/test_perf_2d.py index 9af56a2..cd5fb5c 100644 --- a/tests/nvmath_tests/fft/test_perf_2d.py +++ b/tests/nvmath_tests/fft/test_perf_2d.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 diff --git a/tests/nvmath_tests/fft/test_perf_4-5d.py b/tests/nvmath_tests/fft/test_perf_4-5d.py index d06524f..5ad6f4f 100644 --- a/tests/nvmath_tests/fft/test_perf_4-5d.py +++ b/tests/nvmath_tests/fft/test_perf_4-5d.py @@ -1,8 +1,7 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. 
# # SPDX-License-Identifier: Apache-2.0 -import nvmath import numpy as np import itertools import functools @@ -14,7 +13,7 @@ if ExecBackend.cufft in supported_backends.exec: import cupy - from nvmath_tests.helpers import time_cupy, random_complex, print_aligned_table, fft_perf_GFlops + from nvmath_tests.helpers import time_cupy, print_aligned_table, fft_perf_GFlops def test_fft(): try: diff --git a/tests/nvmath_tests/fft/test_stateful.py b/tests/nvmath_tests/fft/test_stateful.py index 9f57e88..19a5f35 100644 --- a/tests/nvmath_tests/fft/test_stateful.py +++ b/tests/nvmath_tests/fft/test_stateful.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 @@ -1383,7 +1383,7 @@ def decreasingly_aligned_signals(): "fft_type": fft_type.value, "inplace": inplace, "result_layout": layout.value, - "last_axis_size": "odd", + "last_axis_parity": "odd", }, axes=axes, ) as fft: diff --git a/tests/nvmath_tests/fft/test_stateless_1d.py b/tests/nvmath_tests/fft/test_stateless_1d.py index bdbca3c..cffd81f 100644 --- a/tests/nvmath_tests/fft/test_stateless_1d.py +++ b/tests/nvmath_tests/fft/test_stateless_1d.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. 
# # SPDX-License-Identifier: Apache-2.0 @@ -54,6 +54,7 @@ get_random_1d_shape, get_random_input_data, get_custom_stream, + get_stream_pointer, init_assert_exec_backend_specified, ) from .utils.check_helpers import ( @@ -648,9 +649,9 @@ def test_fft_array_device_id(monkeypatch, framework, exec_backend, mem_backend, @pytest.mark.parametrize( - ("framework", "exec_backend", "mem_backend", "dtype"), + ("framework", "exec_backend", "mem_backend", "dtype", "use_stream_ptr"), [ - (framework, exec_backend, mem_backend, dtype) + (framework, exec_backend, mem_backend, dtype, use_stream_ptr) for framework in Framework.enabled() for exec_backend in [ExecBackend.cufft] if exec_backend in supported_backends.exec @@ -658,9 +659,10 @@ def test_fft_array_device_id(monkeypatch, framework, exec_backend, mem_backend, if mem_backend in supported_backends.framework_mem[framework] for dtype in framework_exec_type_support[framework][exec_backend] if not is_half(dtype) # for the fft size, the halfs lack precision + for use_stream_ptr in (True, False) ], ) -def test_fft_custom_stream(framework, exec_backend, mem_backend, dtype): +def test_fft_custom_stream(framework, exec_backend, mem_backend, dtype, use_stream_ptr): stream = get_custom_stream(framework) shape = 1024 * 1024 @@ -670,14 +672,18 @@ def test_fft_custom_stream(framework, exec_backend, mem_backend, dtype): fft_ref = fft_ref * 42 fft_fn = nvmath.fft.fft if is_complex(dtype) else nvmath.fft.rfft - fft = fft_fn( - signal, - execution=exec_backend.nvname, - options={ - "blocking": "auto", - }, - stream=stream, - ) + try: + fft = fft_fn( + signal, + execution=exec_backend.nvname, + options={ + "blocking": "auto", + }, + stream=get_stream_pointer(stream) if use_stream_ptr else stream, + ) + except TypeError as e: + assert "A stream object must be provided for PyTorch operands" in str(e) and framework == Framework.torch + return with use_stream(stream): # The stateless API synchronizes on plan creation, diff --git 
a/tests/nvmath_tests/fft/test_stateless_nd.py b/tests/nvmath_tests/fft/test_stateless_nd.py index dc76440..0f8c34c 100644 --- a/tests/nvmath_tests/fft/test_stateless_nd.py +++ b/tests/nvmath_tests/fft/test_stateless_nd.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 diff --git a/tests/nvmath_tests/fft/utils/axes_utils.py b/tests/nvmath_tests/fft/utils/axes_utils.py index 1630e80..f874df3 100644 --- a/tests/nvmath_tests/fft/utils/axes_utils.py +++ b/tests/nvmath_tests/fft/utils/axes_utils.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 diff --git a/tests/nvmath_tests/fft/utils/check_helpers.py b/tests/nvmath_tests/fft/utils/check_helpers.py index f6e7021..c9afe69 100644 --- a/tests/nvmath_tests/fft/utils/check_helpers.py +++ b/tests/nvmath_tests/fft/utils/check_helpers.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 diff --git a/tests/nvmath_tests/fft/utils/common_axes.py b/tests/nvmath_tests/fft/utils/common_axes.py index 883b860..9b54a09 100644 --- a/tests/nvmath_tests/fft/utils/common_axes.py +++ b/tests/nvmath_tests/fft/utils/common_axes.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. 
# # SPDX-License-Identifier: Apache-2.0 diff --git a/tests/nvmath_tests/fft/utils/input_fixtures.py b/tests/nvmath_tests/fft/utils/input_fixtures.py index bbb90d2..4a88e5d 100644 --- a/tests/nvmath_tests/fft/utils/input_fixtures.py +++ b/tests/nvmath_tests/fft/utils/input_fixtures.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 @@ -137,6 +137,16 @@ def get_custom_stream(framework: Framework, device_id=None): raise ValueError(f"Unknown GPU framework {framework}") +def get_stream_pointer(stream) -> int: + package = stream.__class__.__module__.split(".")[0] + if package == "cupy": + return stream.ptr + elif package == "torch": + return stream.cuda_stream + else: + raise ValueError(f"Unknown GPU framework {package}") + + def init_assert_exec_backend_specified(): import pytest import nvmath diff --git a/tests/nvmath_tests/fft/utils/support_matrix.py b/tests/nvmath_tests/fft/utils/support_matrix.py index 73d356b..1c18e76 100644 --- a/tests/nvmath_tests/fft/utils/support_matrix.py +++ b/tests/nvmath_tests/fft/utils/support_matrix.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 diff --git a/tests/nvmath_tests/helpers.py b/tests/nvmath_tests/helpers.py index 3ab23d1..86f46c0 100644 --- a/tests/nvmath_tests/helpers.py +++ b/tests/nvmath_tests/helpers.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. 
# # SPDX-License-Identifier: Apache-2.0 diff --git a/tests/nvmath_tests/linalg/advanced/matmul/fp8_utils.py b/tests/nvmath_tests/linalg/advanced/matmul/fp8_utils.py new file mode 100644 index 0000000..55915ad --- /dev/null +++ b/tests/nvmath_tests/linalg/advanced/matmul/fp8_utils.py @@ -0,0 +1,264 @@ +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 + +try: + import torch +except ImportError: + torch = None +import pytest +import numpy as np +from .utils import sample_matrix, assert_tensors_equal, to_numpy +from nvmath._internal.utils import check_or_create_options +from nvmath.linalg.advanced import matmul +from nvmath.linalg.advanced import _configuration + +if torch is None: + pytest.skip("Torch is required for FP8 tests", allow_module_level=True) + + +class Fp8Helper: + """ + A helper class to compare quantized results. + """ + + def __init__(self, exponent_bits, significand_bits): + self.exponent_bits, self.significand_bits = exponent_bits, significand_bits + + def is_normal(e, m): + if (exponent_bits, significand_bits) == (4, 3): + return e > 0 and (e, m) != (2**exponent_bits - 1, 2**significand_bits - 1) + elif (exponent_bits, significand_bits) == (5, 2): + return e > 0 and e != 2**exponent_bits - 1 + else: + raise RuntimeError(f"is_normal not implemented for E{exponent_bits}M{significand_bits}") + + # Compute all values of FP8 number. 
+ exponent_bias = 2 ** (exponent_bits - 1) - 1 + normal_values = [ + sign * 2 ** (e - exponent_bias) * (1 + m / 2**significand_bits) + for e in range(2**exponent_bits) + for m in range(2**significand_bits) + for sign in (-1, 1) + if is_normal(e, m) + ] + subnormal_values = [ + sign * 2 ** (1 - exponent_bias) * (0 + m / 2**significand_bits) + for m in range(1, 2**significand_bits) + for sign in (-1, 1) + ] + values = subnormal_values + normal_values + [0.0] + assert len(set(values)) == len(values) + self.values = np.asarray(sorted(values)) + + # For each value, calculate the range it covers. + middles = (self.values[1:] + self.values[:-1]) / 2 + self.lranges = np.append(np.asarray([-np.inf]), middles) + self.rranges = np.append(middles, np.asarray([np.inf])) + + def range(self, value): + """ + Finds a representable value closest to `value` and returns its range. + """ + + i = np.abs(self.values - value).argmin() + return self.lranges[i], self.rranges[i] + + def absdiff(self, quantized, expected): + """ + Returns absolute difference between the ranges of quantized numbers and the expected + values. + """ + l, r = np.vectorize(self.range)(quantized) + diff = np.minimum(abs(l - expected), abs(r - expected)) + diff[(l <= expected) & (r >= expected)] = 0.0 + return diff + + def allclose(self, quantized, expected, atol=1e-2, rtol=1e-2, return_info=False): + """ + Checks if quantized values are close enough to the expected ones. 
+ """ + quantized = to_numpy(quantized.type(torch.float64)) + expected = to_numpy(expected.type(torch.float64)) + ok = np.all(self.absdiff(quantized, expected) <= atol + rtol * np.abs(expected)) + if not return_info: + return ok + else: + aerr = self.absdiff(quantized, expected) + rerr = aerr / (np.abs(expected) + 0.000001) + return ok, { + "aerr": np.max(aerr), + "atol": atol, + "rerr": np.max(rerr), + "rtol": rtol, + } + + +fp8helpers = { + "float8_e4m3fn": Fp8Helper(4, 3), + "float8_e5m2": Fp8Helper(5, 2), +} + + +def choose_scales( + a, b, c, atype, btype, ctype, dtype, alpha=1.0, beta=1.0, allowed_in_range=(0.5, 2), allowed_out_range=(1, 100) +): + """ + Chooses reasonable scales for each of the operands. Tries to fit (absolute values of) + a, b and c into `allowed_in_range`, and (absolute values of) d into `allowed_out_range`. + However, some noise is added at the end, so this is not a hard guarantee. + """ + a = a.type(torch.float32) + amax = a.abs().max().item() + ascale = np.random.uniform(*allowed_in_range) / amax if amax > 0 else 1 + a *= ascale + + b = b.type(torch.float32) + bmax = b.abs().max().item() + bscale = np.random.uniform(*allowed_in_range) / bmax if bmax > 0 else 1 + b *= bscale + + if c is not None: + c = c.type(torch.float32) + cmax = c.abs().max().item() + cscale = np.random.uniform(*allowed_in_range) / cmax if cmax > 0 else 1 + c *= cscale + else: + cscale = None + + if c is not None: + d = alpha * a @ b + beta * c + else: + d = alpha * a @ b + + dmax = d.abs().max().item() + dscale = np.random.uniform(*allowed_out_range) / dmax if dmax > 0 else 1 + + # Add some noise + ascale *= np.random.uniform(0.95, 1.05) + bscale *= np.random.uniform(0.95, 1.05) + if cscale is not None: + cscale *= np.random.uniform(0.95, 1.05) + dscale *= np.random.uniform(0.95, 1.05) + + # Flip the signs randomly + ascale *= np.random.choice((-1, 1)) + bscale *= np.random.choice((-1, 1)) + if cscale is not None: + cscale *= np.random.choice((-1, 1)) + dscale *= 
np.random.choice((-1, 1)) + + result_type = dtype or ctype or atype + if "float8" not in result_type: + dscale = None # Not supported, would raise an error + + if ctype and "float8" not in ctype: + cscale = None # Not supported, would raise an error + + return {"a": ascale, "b": bscale, "c": cscale, "d": dscale} + + +def simple_scales(atype, btype, ctype, dtype): + ascale = 1.2 + bscale = 3.4 + cscale = 0.56 + dscale = 0.78 + result_type = dtype or ctype or atype + if "float8" not in result_type: + dscale = None # Not supported, would raise an error + if not ctype or "float8" not in ctype: + cscale = None # Not supported, would raise an error + return {"a": ascale, "b": bscale, "c": cscale, "d": dscale} + + +def fp8_matmul_reference( + a, b, c=None, *args, epilog_inputs=None, quantization_scales=None, options=None, preferences=None, **kwargs +): + """ + Computes FP8-like matmul, but with higher precision. + """ + quantization_scales = check_or_create_options( + _configuration.MatmulQuantizationScales, quantization_scales, "Matmul quantization_scales" + ) + options = check_or_create_options(_configuration.MatmulOptions, options, "Matmul options") + preferences = check_or_create_options(_configuration.MatmulPlanPreferences, preferences, "Matmul preferences") + options.result_type = None + options.result_amax = False + preferences.epilog.aux_type = None + epilog_aux_amax = preferences.epilog.aux_amax + preferences.epilog.aux_amax = False + + if epilog_inputs is None: + epilog_inputs = {} + epilog_inputs = epilog_inputs.copy() + aux_scale = epilog_inputs.pop("aux_quantization_scale", None) + + for key in ("bias", "gelu_aux"): + if epilog_inputs and key in epilog_inputs: + epilog_inputs[key] = epilog_inputs[key].type(torch.float32) + + a_scale = quantization_scales.a if quantization_scales.a is not None else 1 + b_scale = quantization_scales.b if quantization_scales.b is not None else 1 + c_scale = quantization_scales.c if quantization_scales.c is not None else 1 + 
d_scale = quantization_scales.d if quantization_scales.d is not None else 1 + + ascaled = a.type(torch.float32) * a_scale + bscaled = b.type(torch.float32) * b_scale + cscaled = None if c is None else c_scale * c.type(torch.float32) + result = matmul(ascaled, bscaled, cscaled, *args, options=options, epilog_inputs=epilog_inputs, **kwargs) + if isinstance(result, tuple): + d, aux = result + d *= d_scale + assert len(aux) == 1 + key = list(aux.keys())[0] + if epilog_aux_amax: + aux[f"{key}_amax"] = max(key) + if aux_scale is not None: + aux[key] *= aux_scale + else: + result *= d_scale + + return result + + +def assert_fp8_equal(result, reference, atol=1e-2, rtol=1e-2): + """ + Checks if the result is close enough to the reference. For quantized results, uses + Fp8Helper. + """ + result_type = str(result.dtype).split(".")[-1] + if "float8" in result_type: + ok, info = fp8helpers[result_type].allclose(result, reference, atol=1e-1, rtol=5e-2, return_info=True) + if not ok: + print(f"Absolute error: {info['aerr']} (tolerance {info['atol']})") + print(f"Relative error: {info['rerr']} (tolerance {info['rtol']})") + print("Result:") + print(result) + print("Reference:") + print(reference) + assert ok + else: + assert_tensors_equal(result, reference, atol=atol, rtol=rtol) + + +def generate_inputs(m, n, k, atype, btype, ctype, *, c_transposed=False, min=0, max=5, use_cuda): + """ + Generates matmul inputs of given shapes and types. 
+ """ + + a = sample_matrix("torch", atype, (m, k), use_cuda=use_cuda, min=min, max=max) + b = sample_matrix("torch", btype, (n, k), use_cuda=use_cuda, min=min, max=max).T + + if ctype is not None: + if not c_transposed: + c = sample_matrix("torch", ctype, (m, n), use_cuda=use_cuda, min=min, max=max) + else: + c = sample_matrix("torch", ctype, (n, m), use_cuda=use_cuda, min=min, max=max).T + beta = np.random.uniform(-2, 2) + else: + c = None + beta = None + + alpha = np.random.uniform(-2, 2) + + return a, b, c, alpha, beta diff --git a/tests/nvmath_tests/linalg/advanced/matmul/test_epilog.py b/tests/nvmath_tests/linalg/advanced/matmul/test_epilog.py index ba9fe94..f1fceda 100644 --- a/tests/nvmath_tests/linalg/advanced/matmul/test_epilog.py +++ b/tests/nvmath_tests/linalg/advanced/matmul/test_epilog.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 @@ -6,15 +6,25 @@ This set of tests verifies the correctness of the epilog handling. 
""" +import numpy as np +import pytest + from nvmath.linalg.advanced import matmul, Matmul, MatmulEpilog as Epilog from nvmath.bindings import cublasLt as cublaslt -import pytest -import random -from .utils import * -import numpy as np + +from .utils import ( + compare_tensors, + get_absolute_tolerance, + get_framework, + sample_float_tensor, + sample_matrix, + assert_tensors_equal, + skip_if_cublas_before, + to_numpy, +) try: - from cupy import tanh, sqrt, pi, cosh + import cupy except ModuleNotFoundError: pytest.skip("cupy required for matmul tests", allow_module_level=True) @@ -28,7 +38,7 @@ def relu(x): def gelu(x): - return 0.5 * x * (1 + tanh(sqrt(2 / pi) * (x + 0.0447115 * x**3))) + return 0.5 * x * (1 + cupy.tanh(cupy.sqrt(2 / cupy.pi) * (x + 0.0447115 * x**3))) simple_epilogs = ( @@ -55,7 +65,7 @@ def verify_relu_bitmask(x, bitmask): n, m = x.shape for i in range(n): for j in range(m): - if abs(x[i][j]) <= get_tolerance(x): + if abs(x[i][j]) <= get_absolute_tolerance(x): # This value is dangerously close to 0 and the bitmask might # be incorrect due to precision issues. 
continue @@ -136,9 +146,9 @@ def dgelu(x): """ Derivative of (tanh-approximated) gelu from the values returned by GELU_AUX """ - sech = lambda x: 1 / cosh(x) + sech = lambda x: 1 / cupy.cosh(x) return ( - 0.5 * tanh(0.0356774 * x**3 + 0.797885 * x) + 0.5 * cupy.tanh(0.0356774 * x**3 + 0.797885 * x) + (0.0535161 * x**3 + 0.398942 * x) * (sech(0.0356774 * x**3 + 0.797885 * x)) ** 2 + 0.5 ) @@ -483,3 +493,63 @@ def test_renamed_epilog_inputs(): epilog_inputs={"not_a_bias": cupy.full((4, 1), np.float32(0.8))}, ), ) + +@pytest.mark.parametrize("epilog", epilogs_with_bias) +@pytest.mark.parametrize("test_case", [ + { + "name": "mismatch_batch_size", + "a_shape": (2, 4, 5), + "b_shape": (2, 5, 8), + "bias_shape": (3, 4, 1), + "error_pattern": "batch dimensions of the bias.*must match", + "make_bias": lambda shape: cupy.full(shape, np.float32(0.8)), + "min_cublas_version": 11703 + }, + { + "name": "mismatched_batch_axis_order", + "a_shape": (2, 3, 4, 5), + "b_shape": (2, 3, 5, 8), + "bias_shape": (2, 3, 4, 1), + "error_pattern": "batch axis order of the bias.*must match", + "make_bias": lambda shape: cupy.lib.stride_tricks.as_strided( + cupy.full((2, 3, 4, 1), np.float32(0.8)), + shape=shape, + strides=(4, 12, 1, 4) + ), + "min_cublas_version": 11703 + }, + { + "name": "non_tileable_batch", + "a_shape": (2, 3, 4, 5), + "b_shape": (2, 3, 5, 8), + "bias_shape": (2, 3, 4, 1), + "error_pattern": "not supported because it is not tileable", + "make_bias": lambda shape: cupy.lib.stride_tricks.as_strided( + cupy.full(shape, np.float32(0.8)), + shape=shape, + strides=(16, 4, 1, 4) + ), + "min_cublas_version": 11703 + }, + { + "name": "invalid_stride", + "a_shape": (4, 5), + "b_shape": (5, 8), + "bias_shape": (4, 2), + "error_pattern": "stride of the bias.*must be 1", + "make_bias": lambda shape: cupy.lib.stride_tricks.as_strided( + cupy.full(shape, np.float32(0.8)), + shape=(4, 1), + strides=(2, 1) + ), + "min_cublas_version": 11501 + } +]) +def test_invalid_bias(epilog, 
test_case): + skip_if_cublas_before(test_case["min_cublas_version"]) + + a = sample_float_tensor(test_case["a_shape"]) + b = sample_float_tensor(test_case["b_shape"]) + bias = test_case["make_bias"](test_case["bias_shape"]) + with pytest.raises(ValueError, match=test_case["error_pattern"]): + matmul(a, b, epilog=epilog, epilog_inputs={"bias": bias}) \ No newline at end of file diff --git a/tests/nvmath_tests/linalg/advanced/matmul/test_fp8.py b/tests/nvmath_tests/linalg/advanced/matmul/test_fp8.py new file mode 100644 index 0000000..47e22a9 --- /dev/null +++ b/tests/nvmath_tests/linalg/advanced/matmul/test_fp8.py @@ -0,0 +1,647 @@ +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 + + +try: + import torch +except ImportError: + torch = None +import pytest +from .utils import sample_matrix, assert_tensors_equal, matmul_with_random_autotune +from .fp8_utils import choose_scales, generate_inputs, assert_fp8_equal, fp8_matmul_reference +from nvmath.linalg.advanced import Matmul, matmul, MatmulQuantizationScales +from nvmath._internal.typemaps import NAME_TO_DATA_TYPE +from nvmath.bindings import cublasLt as cublaslt + +if torch is None: + pytest.skip("Torch is required for FP8 tests", allow_module_level=True) + +if (torch.cuda.get_device_properties(0).major, torch.cuda.get_device_properties(0).minor) < (8, 9): + pytest.skip("CC>=8.9 is required for FP8 tests", allow_module_level=True) + +if cublaslt.get_version() < 120800: + pytest.skip("cuBLAS 120800 is required for FP8 tests", allow_module_level=True) + +SUPPORTED_TYPE_COMBINATIONS = [ + # No type specification for D. 
+ ("float8_e4m3fn", "float8_e4m3fn", "float16", None), + ("float8_e4m3fn", "float8_e4m3fn", "bfloat16", None), + ("float8_e4m3fn", "float8_e4m3fn", "float32", None), + ("float8_e4m3fn", "float8_e5m2", "float32", None), + ("float8_e4m3fn", "float8_e5m2", "float16", None), + ("float8_e4m3fn", "float8_e5m2", "bfloat16", None), + # No type specification for D. No C. + ("float8_e4m3fn", "float8_e4m3fn", None, None), + ("float8_e4m3fn", "float8_e5m2", None, None), + ("float8_e5m2", "float8_e4m3fn", None, None), + # Explicit type specification for A, B, C, D. + ("float8_e4m3fn", "float8_e4m3fn", "float16", "float16"), + ("float8_e4m3fn", "float8_e4m3fn", "float16", "float8_e4m3fn"), + ("float8_e4m3fn", "float8_e4m3fn", "bfloat16", "bfloat16"), + ("float8_e4m3fn", "float8_e4m3fn", "bfloat16", "float8_e4m3fn"), + ("float8_e4m3fn", "float8_e4m3fn", "float32", "float32"), + ("float8_e4m3fn", "float8_e5m2", "bfloat16", "bfloat16"), + ("float8_e4m3fn", "float8_e5m2", "bfloat16", "float8_e5m2"), + ("float8_e4m3fn", "float8_e5m2", "bfloat16", "float8_e4m3fn"), + ("float8_e4m3fn", "float8_e5m2", "float16", "float16"), + ("float8_e4m3fn", "float8_e5m2", "float16", "float8_e5m2"), + ("float8_e4m3fn", "float8_e5m2", "float16", "float8_e4m3fn"), + ("float8_e5m2", "float8_e4m3fn", "bfloat16", "bfloat16"), + ("float8_e5m2", "float8_e4m3fn", "bfloat16", "float8_e5m2"), + ("float8_e5m2", "float8_e4m3fn", "bfloat16", "float8_e4m3fn"), + ("float8_e5m2", "float8_e4m3fn", "float16", "float16"), + ("float8_e5m2", "float8_e4m3fn", "float16", "float8_e5m2"), + ("float8_e5m2", "float8_e4m3fn", "float16", "float8_e4m3fn"), + # Explicit type specification for A, B, D. No C. 
+ ("float8_e4m3fn", "float8_e4m3fn", None, "float16"), + ("float8_e4m3fn", "float8_e4m3fn", None, "bfloat16"), + ("float8_e4m3fn", "float8_e4m3fn", None, "float32"), + ("float8_e4m3fn", "float8_e5m2", None, "bfloat16"), + ("float8_e4m3fn", "float8_e5m2", None, "float8_e5m2"), + ("float8_e4m3fn", "float8_e5m2", None, "float16"), + ("float8_e5m2", "float8_e4m3fn", None, "bfloat16"), + ("float8_e5m2", "float8_e4m3fn", None, "float8_e5m2"), + ("float8_e5m2", "float8_e4m3fn", None, "float16"), + ("float8_e5m2", "float8_e4m3fn", None, "float8_e4m3fn"), +] + +if cublaslt.get_version() < 120600: + SUPPORTED_TYPE_COMBINATIONS = [(a, b, c, d) for (a, b, c, d) in SUPPORTED_TYPE_COMBINATIONS if a == b] + + +def expected_result_type(atype, btype, ctype, dtype): + return dtype or ctype or atype + + +SUPPORTED_TYPE_COMBINATIONS_WITH_FP8_D = [ + (a, b, c, d) for (a, b, c, d) in SUPPORTED_TYPE_COMBINATIONS if "float8" in expected_result_type(a, b, c, d) +] + +SUPPORTED_TYPE_COMBINATIONS_WITH_NON_FP8_D = [ + t for t in SUPPORTED_TYPE_COMBINATIONS if t not in SUPPORTED_TYPE_COMBINATIONS_WITH_FP8_D +] + + +@pytest.mark.parametrize("atype,btype,ctype,dtype", SUPPORTED_TYPE_COMBINATIONS) +@pytest.mark.parametrize( + "m,n,k", + ( + (16, 16, 16), + (32, 16, 16), + (16, 32, 16), + (16, 16, 32), + (64, 32, 16), + (16, 96, 32), + (64, 16, 32), + (64, 96, 16), + ), +) +@pytest.mark.parametrize(("use_cuda"), (True, False)) +def test_stateful(m, n, k, atype, btype, ctype, dtype, use_cuda): + """ + General test of FP8 multiplication. 
+ """ + a, b, c, alpha, beta = generate_inputs(m, n, k, atype, btype, ctype, use_cuda=use_cuda) + quantization_scales = choose_scales(a, b, c, atype, btype, ctype, dtype, alpha=alpha, beta=beta) + options = {"result_type": NAME_TO_DATA_TYPE[dtype] if dtype else None} + + mm = Matmul(a, b, c, alpha=alpha, beta=beta, quantization_scales=quantization_scales, options=options) + mm.plan() + result = mm.execute() + + assert str(result.dtype).split(".")[-1] == expected_result_type(atype, btype, ctype, dtype) + + reference = fp8_matmul_reference(a, b, c, alpha=alpha, beta=beta, quantization_scales=quantization_scales, options=options) + assert_fp8_equal(result, reference) + + +@pytest.mark.parametrize("atype,btype,ctype,dtype", SUPPORTED_TYPE_COMBINATIONS) +@pytest.mark.parametrize( + "m,n,k", + ((16, 16, 16),), +) +@pytest.mark.parametrize("amax", (True, False)) +@pytest.mark.parametrize(("use_cuda"), (True, False)) +def test_autotuning(m, n, k, atype, btype, ctype, dtype, amax, use_cuda): + """ + Tests if autotuning works with FP8 multiplication. 
+ """ + a, b, c, alpha, beta = generate_inputs(m, n, k, atype, btype, ctype, use_cuda=use_cuda) + quantization_scales = choose_scales(a, b, c, atype, btype, ctype, dtype, alpha=alpha, beta=beta) + result_type = expected_result_type(atype, btype, ctype, dtype) + options = {"result_type": NAME_TO_DATA_TYPE[dtype] if dtype else None, "result_amax": amax and "float8" in result_type} + mm = Matmul(a, b, c, alpha=alpha, beta=beta, quantization_scales=quantization_scales, options=options) + mm.plan() + mm.autotune() + result = mm.execute() + if isinstance(result, tuple): + result = result[0] + reference = fp8_matmul_reference(a, b, c, alpha=alpha, beta=beta, quantization_scales=quantization_scales, options=options) + assert_fp8_equal(result, reference) + + +@pytest.mark.parametrize("atype,btype,ctype,dtype", SUPPORTED_TYPE_COMBINATIONS) +@pytest.mark.parametrize( + "m,n,k", + ((96, 128, 16),), +) +@pytest.mark.parametrize(("use_cuda"), (True, False)) +def test_stateless(m, n, k, atype, btype, ctype, dtype, use_cuda): + """ + Tests if stateless `matmul` supports quantization_scales and options. 
+ """ + a, b, c, alpha, beta = generate_inputs(m, n, k, atype, btype, ctype, use_cuda=use_cuda) + quantization_scales = choose_scales(a, b, c, atype, btype, ctype, dtype, alpha=alpha, beta=beta) + options = {"result_type": NAME_TO_DATA_TYPE[dtype] if dtype else None} + result = matmul(a, b, c, alpha=alpha, beta=beta, quantization_scales=quantization_scales, options=options) + assert str(result.dtype).split(".")[-1] == expected_result_type(atype, btype, ctype, dtype) + + reference = fp8_matmul_reference(a, b, c, alpha=alpha, beta=beta, quantization_scales=quantization_scales, options=options) + + assert_fp8_equal(result, reference) + + +@pytest.mark.parametrize("atype,btype,ctype,dtype", SUPPORTED_TYPE_COMBINATIONS) +@pytest.mark.parametrize( + "a_batch,b_batch,c_batch,d_batch", + ( + ((), (), (), ()), + ((3,), (3,), (3,), (3,)), + ((8,), (8,), (8,), (8,)), + ((5,), (), (5,), (5,)), + ((), (2,), (2,), (2,)), + ((2, 3), (2, 3), (2, 3), (2, 3)), + ), +) +@pytest.mark.parametrize( + "m,n,k", + ((16, 16, 16),), +) +@pytest.mark.parametrize(("use_cuda"), (True,)) +def test_batching(m, n, k, atype, btype, ctype, dtype, a_batch, b_batch, c_batch, d_batch, use_cuda): + """ + Tests if batching works with FP8. 
+ """ + + def sample_batch(batch_shape, matrix_shape, type, transposed=False): + shape = (*batch_shape, *matrix_shape) + if transposed: + shape = (*shape[:-2], shape[-1], shape[-2]) + x = sample_matrix("torch", type, shape, use_cuda=use_cuda, min=0, max=2) + return x.swapaxes(-1, -2) if transposed else x + + a = sample_batch(a_batch, (m, k), atype, transposed=False) + b = sample_batch(b_batch, (k, n), btype, transposed=True) + + if ctype is not None: + c = sample_batch(c_batch, (m, n), ctype, transposed=False) + beta = 0.12 + else: + c = None + beta = None + + alpha = 0.32 + + options = {"result_type": NAME_TO_DATA_TYPE[dtype] if dtype else None} + quantization_scales = choose_scales(a, b, c, atype, btype, ctype, dtype, alpha=alpha, beta=beta) + result = matmul_with_random_autotune( + a, b, c, alpha=alpha, beta=beta, quantization_scales=quantization_scales, options=options + ) + + expected_result_shape = (*d_batch, m, n) + assert result.shape == expected_result_shape + + reference = fp8_matmul_reference(a, b, c, alpha=alpha, beta=beta, quantization_scales=quantization_scales, options=options) + assert_fp8_equal(result, reference) + + +@pytest.mark.parametrize("atype,btype,ctype,dtype", SUPPORTED_TYPE_COMBINATIONS) +@pytest.mark.parametrize( + "m,n,k", + ((64, 32, 16),), +) +@pytest.mark.parametrize("a_scale_kind", ("scalar", "gpu", "cpu")) +@pytest.mark.parametrize("b_scale_kind", ("scalar", "gpu", "cpu")) +@pytest.mark.parametrize("c_scale_kind", ("scalar", "gpu", "cpu")) +@pytest.mark.parametrize("d_scale_kind", ("scalar", "gpu", "cpu")) +@pytest.mark.parametrize(("use_cuda"), (True, False)) +def test_tensor_scales(m, n, k, atype, btype, ctype, dtype, a_scale_kind, b_scale_kind, c_scale_kind, d_scale_kind, use_cuda): + """ + Test scales provided as scalars/GPU tensors/CPU tensors + """ + a, b, c, alpha, beta = generate_inputs(m, n, k, atype, btype, ctype, use_cuda=use_cuda) + scalar_scales = choose_scales(a, b, c, atype, btype, ctype, dtype, alpha=alpha, 
beta=beta) + + def prepare_scales(scales): + """ + Change some of the scales into tensors + """ + + def wrap_scale(x, kind): + if kind == "scalar" or x is None: + return x + tensor = torch.as_tensor(x, dtype=torch.float32) + return tensor.cuda() if kind == "gpu" else tensor + + return { + "a": wrap_scale(scales["a"], a_scale_kind), + "b": wrap_scale(scales["b"], b_scale_kind), + "c": wrap_scale(scales["c"], c_scale_kind), + "d": wrap_scale(scales["d"], d_scale_kind), + } + + options = {"result_type": NAME_TO_DATA_TYPE[dtype] if dtype else None} + + scales = prepare_scales(scalar_scales) + mm = Matmul(a, b, c, alpha=alpha, beta=beta, quantization_scales=scales, options=options) + mm.plan() + result = mm.execute() + reference = fp8_matmul_reference(a, b, c, alpha=alpha, beta=beta, quantization_scales=scalar_scales, options=options) + assert_fp8_equal(result, reference) + + # In-place modification of GPU scales + if a_scale_kind == "gpu": + scalar_scales["a"] *= 0.5 + scales["a"].copy_(scalar_scales["a"]) + if b_scale_kind == "gpu": + scalar_scales["b"] *= -1 + scales["b"].copy_(scalar_scales["b"]) + if c_scale_kind == "gpu" and scalar_scales["c"] is not None: + scalar_scales["c"] *= -1 + scales["c"].copy_(scalar_scales["c"]) + if d_scale_kind == "gpu" and scalar_scales["d"] is not None: + scalar_scales["d"] *= 0.5 + scales["d"].copy_(scalar_scales["d"]) + result = mm.execute() + reference = fp8_matmul_reference(a, b, c, alpha=alpha, beta=beta, quantization_scales=scalar_scales, options=options) + assert_fp8_equal(result, reference) + + # Reset of the scales + new_a, new_b, new_c, new_alpha, new_beta = generate_inputs(m, n, k, atype, btype, ctype, use_cuda=use_cuda) + new_scalar_scales = choose_scales(new_a, new_b, new_c, atype, btype, ctype, dtype, alpha=new_alpha, beta=new_beta) + new_scales = prepare_scales(new_scalar_scales) + mm.reset_operands(a=new_a, b=new_b, c=new_c, quantization_scales=new_scales, alpha=new_alpha, beta=new_beta) + result = mm.execute() + 
reference = fp8_matmul_reference( + new_a, new_b, new_c, alpha=new_alpha, beta=new_beta, quantization_scales=new_scalar_scales, options=options + ) + assert_fp8_equal(result, reference) + + +@pytest.mark.parametrize("atype,btype,ctype,dtype", SUPPORTED_TYPE_COMBINATIONS_WITH_FP8_D) +@pytest.mark.parametrize( + "m,n,k", + ( + (16, 16, 16), + (96, 64, 16), + ), +) +@pytest.mark.parametrize(("stateless"), (True, False)) +@pytest.mark.parametrize(("use_cuda"), (True, False)) +def test_amax(m, n, k, atype, btype, ctype, dtype, stateless, use_cuda): + """ + Test if amax is computed properly. + """ + a, b, c, alpha, beta = generate_inputs(m, n, k, atype, btype, ctype, use_cuda=use_cuda) + scales = choose_scales(a, b, c, atype, btype, ctype, dtype, alpha=alpha, beta=beta) + options = {"result_type": NAME_TO_DATA_TYPE[dtype] if dtype else None, "result_amax": True} + if not stateless: + with Matmul(a, b, c, alpha=alpha, beta=beta, quantization_scales=scales, options=options) as mm: + mm.plan() + result, aux = mm.execute() + else: + result, aux = matmul_with_random_autotune(a, b, c, alpha=alpha, beta=beta, quantization_scales=scales, options=options) + assert len(aux) == 1 + amax = aux["result_amax"] + reference = fp8_matmul_reference(a, b, c, alpha=alpha, beta=beta, options=options, quantization_scales=scales) + assert_fp8_equal(result, reference) + not_scaled_reference = fp8_matmul_reference( + a, + b, + c, + alpha=alpha, + beta=beta, + options=options, + quantization_scales={k: v if k != "d" else 1.0 for k, v in scales.items()}, + ) + assert_tensors_equal(amax, (not_scaled_reference.abs().max()), atol=0.01, rtol=1e-3) + + +@pytest.mark.parametrize("atype,btype,ctype,dtype", SUPPORTED_TYPE_COMBINATIONS) +@pytest.mark.parametrize( + "reset_a,reset_b,reset_c", + ((a, b, c) for a in (True, False) for b in (True, False) for c in (True, False) if (a, b, c) != (False, False, False)), +) +@pytest.mark.parametrize("reset_alpha", (True, False)) 
+@pytest.mark.parametrize("reset_beta", (True, False)) +@pytest.mark.parametrize("m,n,k", ((16, 16, 16),)) +@pytest.mark.parametrize(("use_cuda"), (True, False)) +def test_reset_operands(m, n, k, atype, btype, ctype, dtype, reset_a, reset_b, reset_c, reset_alpha, reset_beta, use_cuda): + """ + Tests if reset_operands works with FP8 matmuls without resetting the scales. + """ + if reset_c and ctype is None: + pytest.skip("Can't reset C because C is not specified") + a, b, c, alpha, beta = generate_inputs(m, n, k, atype, btype, ctype, use_cuda=use_cuda) + quantization_scales = choose_scales(a, b, c, atype, btype, ctype, dtype, alpha=alpha, beta=beta) + + options = {"result_type": NAME_TO_DATA_TYPE[dtype] if dtype else None} + mm = Matmul(a, b, c, alpha=alpha, beta=beta, quantization_scales=quantization_scales, options=options) + mm.plan() + result1 = mm.execute() + reference1 = fp8_matmul_reference(a, b, c, alpha=alpha, beta=beta, quantization_scales=quantization_scales, options=options) + assert_fp8_equal(result1, reference1) + + new_a, new_b, new_c, new_alpha, new_beta = generate_inputs(m, n, k, atype, btype, ctype, use_cuda=use_cuda) + + reset_kwargs = {} + if reset_a: + reset_kwargs["a"] = a = new_a + if reset_b: + reset_kwargs["b"] = b = new_b + if reset_c: + reset_kwargs["c"] = c = new_c + if reset_alpha: + reset_kwargs["alpha"] = alpha = new_alpha + if reset_beta: + reset_kwargs["beta"] = beta = new_beta + + mm.reset_operands(**reset_kwargs) + result2 = mm.execute() + reference2 = fp8_matmul_reference(a, b, c, alpha=alpha, beta=beta, quantization_scales=quantization_scales, options=options) + assert_fp8_equal(result2, reference2) + + +@pytest.mark.parametrize("atype,btype,ctype,dtype", SUPPORTED_TYPE_COMBINATIONS) +@pytest.mark.parametrize("reset_a_scale", (True, False)) +@pytest.mark.parametrize("reset_b_scale", (True, False)) +@pytest.mark.parametrize("reset_c_scale", (True, False)) +@pytest.mark.parametrize("reset_d_scale", (True, False)) 
+@pytest.mark.parametrize("m,n,k", ((16, 16, 16),)) +@pytest.mark.parametrize("use_cuda", (True,)) +def test_reset_quantization_scales( + m, n, k, atype, btype, ctype, dtype, reset_a_scale, reset_b_scale, reset_c_scale, reset_d_scale, use_cuda +): + """ + Tests if reset_operands allows resetting (some or all) quantization_scales. + """ + a, b, c, alpha, beta = generate_inputs(m, n, k, atype, btype, ctype, use_cuda=use_cuda) + quantization_scales = choose_scales(a, b, c, atype, btype, ctype, dtype, alpha=alpha, beta=beta) + + options = {"result_type": NAME_TO_DATA_TYPE[dtype] if dtype else None} + mm = Matmul(a, b, c, alpha=alpha, beta=beta, quantization_scales=quantization_scales, options=options) + mm.plan() + result1 = mm.execute() + reference1 = fp8_matmul_reference(a, b, c, alpha=alpha, beta=beta, quantization_scales=quantization_scales, options=options) + assert_fp8_equal(result1, reference1) + + new_a, new_b, new_c, new_alpha, new_beta = generate_inputs(m, n, k, atype, btype, ctype, use_cuda=use_cuda) + new_quantization_scales = choose_scales(new_a, new_b, new_c, atype, btype, ctype, dtype, alpha=new_alpha, beta=new_beta) + + reset_kwargs = {"a": new_a} + + reset_quantization_scales = {} + if reset_a_scale: + reset_quantization_scales["a"] = quantization_scales["a"] = new_quantization_scales["a"] + if reset_b_scale: + reset_quantization_scales["b"] = quantization_scales["b"] = new_quantization_scales["b"] + if reset_c_scale: + reset_quantization_scales["c"] = quantization_scales["c"] = new_quantization_scales["c"] + if reset_d_scale: + reset_quantization_scales["d"] = quantization_scales["d"] = new_quantization_scales["d"] + if reset_quantization_scales: + reset_kwargs["quantization_scales"] = reset_quantization_scales + + mm.reset_operands(**reset_kwargs) + result2 = mm.execute() + reference2 = fp8_matmul_reference( + new_a, b, c, alpha=alpha, beta=beta, quantization_scales=quantization_scales, options=options + ) + assert_fp8_equal(result2, reference2) + + 
+@pytest.mark.parametrize("atype,btype,ctype,dtype", SUPPORTED_TYPE_COMBINATIONS[0:1]) +@pytest.mark.parametrize("m,n,k", ((16, 16, 16),)) +@pytest.mark.parametrize("use_cuda", (True,)) +def test_quantization_scales_as_object(m, n, k, atype, btype, ctype, dtype, use_cuda): + """ + Tests if passing the scales as an instance of MatmulQuantizationScales works. + """ + a, b, c, alpha, beta = generate_inputs(m, n, k, atype, btype, ctype, use_cuda=use_cuda) + dict_scales = choose_scales(a, b, c, atype, btype, ctype, dtype, alpha=alpha, beta=beta) + quantization_scales = MatmulQuantizationScales( + a=dict_scales["a"], b=dict_scales["b"], c=dict_scales["c"], d=dict_scales["d"] + ) + result = matmul(a, b, c, alpha=alpha, beta=beta, quantization_scales=quantization_scales) + assert str(result.dtype).split(".")[-1] == expected_result_type(atype, btype, ctype, dtype) + + reference = fp8_matmul_reference(a, b, c, alpha=alpha, beta=beta, quantization_scales=quantization_scales) + + assert_fp8_equal(result, reference) + + +############### +@pytest.mark.parametrize("atype,btype,ctype,dtype", SUPPORTED_TYPE_COMBINATIONS) +@pytest.mark.parametrize("a_scale", (True, False)) +@pytest.mark.parametrize("b_scale", (True, False)) +@pytest.mark.parametrize("c_scale", (True, False)) +@pytest.mark.parametrize("d_scale", (True, False)) +@pytest.mark.parametrize("use_cuda", (True,)) +def test_validation_required_quantization_scales(atype, btype, ctype, dtype, a_scale, b_scale, c_scale, d_scale, use_cuda): + """ + Tests if unspecified quantization_scales trigger an error. 
+ """ + a, b, c, alpha, beta = generate_inputs(16, 16, 16, atype, btype, ctype, use_cuda=use_cuda) + all_quantization_scales = choose_scales(a, b, c, atype, btype, ctype, dtype, alpha=alpha, beta=beta) + + quantization_scales = {} + if a_scale: + quantization_scales["a"] = all_quantization_scales["a"] + if b_scale: + quantization_scales["b"] = all_quantization_scales["b"] + if c_scale: + quantization_scales["c"] = all_quantization_scales["c"] + if d_scale: + quantization_scales["d"] = all_quantization_scales["d"] + + quantization_scales_ok = all(x in quantization_scales or all_quantization_scales[x] is None for x in "abcd") + + options = {"result_type": NAME_TO_DATA_TYPE[dtype] if dtype else None} + + with pytest.raises(ValueError, match=r"Scales are required for narrow-precision \(FP8 and lower\) operations"): + matmul(a, b, c, alpha=alpha, beta=beta, options=options) + + if quantization_scales_ok: + result = matmul(a, b, c, alpha=alpha, beta=beta, quantization_scales=quantization_scales, options=options) + reference = fp8_matmul_reference( + a, b, c, alpha=alpha, beta=beta, quantization_scales=quantization_scales, options=options + ) + assert_fp8_equal(result, reference) + else: + with pytest.raises(ValueError): + matmul(a, b, c, alpha=alpha, beta=beta, quantization_scales=quantization_scales, options=options) + + +@pytest.mark.parametrize("atype,btype,ctype,dtype", SUPPORTED_TYPE_COMBINATIONS[1:2]) +@pytest.mark.parametrize("m,n,k", ((16, 16, 16),)) +@pytest.mark.parametrize("use_cuda", (True,)) +def test_validation_invalid_quantization_scales_type(m, n, k, atype, btype, ctype, dtype, use_cuda): + """ + Tests what happens when an invalid type is provided for `quantization_scales`. 
+ """ + a, b, c, alpha, beta = generate_inputs(m, n, k, atype, btype, ctype, use_cuda=use_cuda) + with pytest.raises( + TypeError, match="Scale factors must be provided as an object of type MatmulQuantizationScales or as a dict" + ): + matmul(a, b, quantization_scales="oh no!") + + +@pytest.mark.parametrize( + "atype,btype, ctype", + ( + ("float8_e4m3fn", "float8_e5m2", "float16"), + ("float8_e5m2", "float8_e4m3fn", None), + ("float8_e5m2", "float8_e4m3fn", "float32"), + ), +) +@pytest.mark.parametrize("m,n,k", ((16, 16, 16),)) +def test_validation_unsupported_different_ab_types(m, n, k, atype, btype, ctype): + version = cublaslt.get_version() + if version >= 120600: + pytest.skip(f"Different A and B types are supported for cuBLAS {version}") + a, b, c, alpha, beta = generate_inputs(m, n, k, atype, btype, ctype, use_cuda=True) + with pytest.raises( + ValueError, match=f"FP8 multiplication of {atype} and {btype} is not supported in cuBLASLt version {version}" + ): + matmul(a, b, quantization_scales={"a": 1, "b": 1, "c": 1, "d": 1}) + + +@pytest.mark.parametrize("atype,btype,ctype,dtype", SUPPORTED_TYPE_COMBINATIONS) +@pytest.mark.parametrize( + "m,n,k", + ( + (16, 16, 8), + (32, 8, 16), + (8, 16, 16), + (32, 32, 12), + (32, 36, 16), + (4, 48, 32), + (32, 96, 17), + (80, 11, 64), + (19, 96, 128), + (33, 44, 55), + ), +) +def test_validation_invalid_sizes(m, n, k, atype, btype, ctype, dtype): + """ + Tests if invalid size raises an error. 
+ """ + a, b, c, alpha, beta = generate_inputs(m, n, k, atype, btype, ctype, use_cuda=True) + quantization_scales = choose_scales(a, b, c, atype, btype, ctype, dtype, alpha=alpha, beta=beta) + options = {"result_type": NAME_TO_DATA_TYPE[dtype] if dtype else None} + + with pytest.raises(ValueError, match="must be divisible by 16"): + matmul(a, b, c, alpha=alpha, beta=beta, quantization_scales=quantization_scales, options=options) + + +@pytest.mark.parametrize("atype,btype,ctype,dtype", SUPPORTED_TYPE_COMBINATIONS) +@pytest.mark.parametrize( + "m,n,k", + ((64, 64, 64),), +) +@pytest.mark.parametrize("misaligned", ("a", "b")) +@pytest.mark.parametrize("offset", (1, 2, 4, 8, 12)) +def test_validation_misaligned(m, n, k, atype, btype, ctype, dtype, misaligned, offset): + """ + Tests if invalid alignment raises an error or returns a correct result. + """ + a, b, c, alpha, beta = generate_inputs(m, n, k, atype, btype, ctype, use_cuda=True) + + def gen_misaligned(shape, type): + assert len(shape) == 2 + aligned = torch.rand(shape[0] * shape[1] + offset).type(getattr(torch, type)).cuda() + return aligned[offset:].reshape(shape) + + if misaligned == "a": + a = gen_misaligned((m, k), atype) + if misaligned == "b": + b = gen_misaligned((n, k), btype).T + + quantization_scales = choose_scales(a, b, c, atype, btype, ctype, dtype, alpha=alpha, beta=beta) + options = {"result_type": NAME_TO_DATA_TYPE[dtype] if dtype else None} + + with pytest.raises(ValueError, match="should be aligned to 16 bytes"): + matmul(a, b, c, alpha=alpha, beta=beta, quantization_scales=quantization_scales, options=options) + + +@pytest.mark.parametrize( + "atype,btype,ctype,dtype", + [ + (a, b, c, d) + for (a, b, c, d) in SUPPORTED_TYPE_COMBINATIONS + if "float8" not in expected_result_type(a, b, c, d) or (c and "float8" not in c) + ], +) +@pytest.mark.parametrize( + "m,n,k", + ((16, 16, 16),), +) +def test_validation_non_fp8_scale(m, n, k, atype, btype, ctype, dtype): + """ + Tests if scales are 
prohibited for non-FP8 tensors + """ + a, b, c, alpha, beta = generate_inputs(m, n, k, atype, btype, ctype, use_cuda=True) + quantization_scales = {"a": 1, "b": 1, "c": 1, "d": 1} + options = {"result_type": NAME_TO_DATA_TYPE[dtype] if dtype else None} + with pytest.raises( + ValueError, + match=r"Quantization scaling is not supported for . when it is not a narrow-precision \(FP8 and lower\) type", + ): + matmul(a, b, c, alpha=alpha, beta=beta, quantization_scales=quantization_scales, options=options) + + +@pytest.mark.parametrize( + "atype,btype,ctype,dtype", + [ + (a, b, c, d) + for (a, b, c, d) in SUPPORTED_TYPE_COMBINATIONS + if "float8" not in expected_result_type(a, b, c, d) or (c and "float8" not in c) + ], +) +@pytest.mark.parametrize( + "m,n,k", + ((16, 16, 16),), +) +def test_validation_non_fp8_scale_reset(m, n, k, atype, btype, ctype, dtype): + """ + Tests if attempt to reset the scale for non-FP8 tensor raises an error. + """ + a, b, c, alpha, beta = generate_inputs(m, n, k, atype, btype, ctype, use_cuda=True) + quantization_scales = choose_scales(a, b, c, atype, btype, ctype, dtype, alpha=alpha, beta=beta) + options = {"result_type": NAME_TO_DATA_TYPE[dtype] if dtype else None} + with Matmul(a, b, c, alpha=alpha, beta=beta, quantization_scales=quantization_scales, options=options) as mm: + mm.plan() + with pytest.raises( + ValueError, + match=r"Quantization scaling is not supported for . when it is not a narrow-precision \(FP8 and lower\) type", + ): + mm.reset_operands(a=a, quantization_scales={"a": 1, "b": 1, "c": 1, "d": 1}) + + +@pytest.mark.parametrize("atype,btype,ctype,dtype", SUPPORTED_TYPE_COMBINATIONS_WITH_NON_FP8_D) +@pytest.mark.parametrize( + "m,n,k", + ((16, 16, 16),), +) +@pytest.mark.parametrize(("stateless"), (True, False)) +@pytest.mark.parametrize(("use_cuda"), (True, False)) +def test_validation_non_fp8_amax(m, n, k, atype, btype, ctype, dtype, stateless, use_cuda): + """ + Test if amax is not supported for non-FP8 D. 
+ """ + a, b, c, alpha, beta = generate_inputs(m, n, k, atype, btype, ctype, use_cuda=use_cuda) + scales = choose_scales(a, b, c, atype, btype, ctype, dtype, alpha=alpha, beta=beta) + options = {"result_type": NAME_TO_DATA_TYPE[dtype] if dtype else None, "result_amax": True} + with pytest.raises(ValueError, match=r"result_amax=True is allowed only for narrow-precision \(FP8 and lower\) results"): + matmul(a, b, c, alpha=alpha, beta=beta, quantization_scales=scales, options=options) diff --git a/tests/nvmath_tests/linalg/advanced/matmul/test_fp8_epilogs.py b/tests/nvmath_tests/linalg/advanced/matmul/test_fp8_epilogs.py new file mode 100644 index 0000000..5b7b177 --- /dev/null +++ b/tests/nvmath_tests/linalg/advanced/matmul/test_fp8_epilogs.py @@ -0,0 +1,310 @@ +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 + + +try: + import torch +except ImportError: + torch = None +import pytest +from .utils import sample_matrix, allow_cublas_unsupported, matmul_with_random_autotune +from .fp8_utils import assert_fp8_equal, fp8_matmul_reference, simple_scales, generate_inputs, choose_scales +from nvmath.linalg.advanced import Matmul, MatmulEpilog as Epilog +from nvmath._internal.typemaps import NAME_TO_DATA_TYPE +from nvmath.bindings import cublasLt as cublaslt +from .test_fp8 import SUPPORTED_TYPE_COMBINATIONS, expected_result_type +from contextlib import nullcontext + +if torch is None: + pytest.skip("Torch is required for FP8 tests", allow_module_level=True) + +COMPUTE_CAPABILITY = (torch.cuda.get_device_properties(0).major, torch.cuda.get_device_properties(0).minor) + +if COMPUTE_CAPABILITY < (8, 9): + pytest.skip("CC>=8.9 is required for FP8 tests", allow_module_level=True) + +if cublaslt.get_version() < 120800: + pytest.skip("cuBLAS 120800 is required for FP8 tests", allow_module_level=True) + + +def unpack_bitmask(bitmask, shape): + if len(bitmask.shape) > 2: + return 
torch.stack([unpack_bitmask(bitmask[i], shape) for i in range(bitmask.shape[0])]) + result = torch.zeros(shape) + n, m = shape + for i in range(n): + for j in range(m): + result[i][j] = bool(int(bitmask[i // 8][j].item()) & (1 << i % 8)) + return result + + +@pytest.mark.parametrize( + "m,n,k", + ( + (16, 16, 16), + (80, 96, 16), + ), +) +@pytest.mark.parametrize( + "a_batch,b_batch,c_batch,d_batch", + ( + ((), (), (), ()), + ((2, 3), (2, 3), (2, 3), (2, 3)), + ), +) +@pytest.mark.parametrize(("use_cuda"), (True, False)) +@pytest.mark.parametrize("atype,btype,ctype,dtype", SUPPORTED_TYPE_COMBINATIONS) +@pytest.mark.parametrize( + "epilog_name,order,epilog_aux_type,epilog_aux_amax", + ( + ("BIAS", "col", None, False), + ("RELU", "col", None, False), + ("RELU", "row", None, False), + ("RELU_AUX", "col", None, False), + ("RELU_BIAS", "col", None, False), + ("RELU_AUX_BIAS", "col", None, False), + ("GELU", "col", None, False), + ("GELU_AUX", "col", None, False), + ("GELU_BIAS", "col", None, False), + ("GELU_AUX_BIAS", "col", None, False), + ("BGRADA", "col", None, False), + ("BGRADB", "col", None, False), + ("DRELU", "col", None, False), + ("DRELU_BGRAD", "col", None, False), + ("DGELU", "col", None, False), + ("DGELU_BGRAD", "col", None, False), + ("GELU_AUX", "col", "float8_e4m3fn", True), + ("GELU_AUX", "col", "float8_e4m3fn", False), + ), +) +def test_epilogs( + m, + n, + k, + atype, + btype, + ctype, + dtype, + epilog_name, + order, + a_batch, + b_batch, + c_batch, + d_batch, + epilog_aux_type, + epilog_aux_amax, + use_cuda, +): + epilog = getattr(Epilog, epilog_name) + + result_type = expected_result_type(atype, btype, ctype, dtype) + inferred_ctype = ctype or ("float16" if "float8" in result_type else result_type) + + # Handle gaps in cuBLAS support + allow_not_supported = False + allow_not_supported |= (inferred_ctype, result_type) == ("float32", "float32") and "GELU" in epilog_name + allow_not_supported |= epilog_name in ("DGELU", "GELU_AUX") and 
inferred_ctype == "float16" and result_type == "float16" + allow_not_supported |= epilog_name in ("DRELU", "DGELU") and "float16" in inferred_ctype and "float8" in result_type + allow_not_supported |= "BGRAD" in epilog_name + allow_not_supported |= epilog_name.startswith("RELU_AUX") and atype != btype + allow_not_supported |= epilog_name == "RELU_AUX_BIAS" and inferred_ctype == "float32" + allow_not_supported |= epilog_name in ("GELU", "GELU_BIAS") and "float8" in result_type + allow_not_supported |= epilog_aux_type is not None and not ( + atype == "float8_e4m3fn" and btype == "float8_e4m3fn" and ctype == "float16" and epilog_name == "GELU_AUX" + ) + if COMPUTE_CAPABILITY <= (8, 9): + allow_not_supported |= "AUX" in epilog_name and "float8" in result_type and d_batch != () + allow_not_supported |= epilog_name.startswith("GELU_AUX") and atype != btype + + def sample_batch(batch_shape, matrix_shape, type, transposed=False): + shape = (*batch_shape, *matrix_shape) + if transposed: + shape = (*shape[:-2], shape[-1], shape[-2]) + x = sample_matrix("torch", type, shape, use_cuda=use_cuda, min=-0.2, max=1) + return x.swapaxes(-1, -2) if transposed else x + + a = sample_batch(a_batch, (m, k), atype, transposed=False) + b = sample_batch(b_batch, (k, n), btype, transposed=True) + alpha, beta = 0.8, None + if ctype is not None: + c = sample_batch(c_batch, (m, n), ctype, transposed=(order == "col")) + beta = 0.12 + else: + c = None + beta = None + + quantization_scales = simple_scales(atype, btype, ctype, dtype) + options = {"result_type": NAME_TO_DATA_TYPE[dtype] if dtype else None} + preferences = { + "epilog": { + "aux_type": NAME_TO_DATA_TYPE[epilog_aux_type] if epilog_aux_type else None, + "aux_amax": epilog_aux_amax, + } + } + + # Prepare epilog inputs if needed + inputs = {} + if epilog_aux_type and "float8" in epilog_aux_type: + inputs["aux_quantization_scale"] = 0.45 + if "BIAS" in epilog_name: + bias_type = "float16" if inferred_ctype == "float16" else "bfloat16" + 
bias = sample_matrix("torch", bias_type, (m,), use_cuda=use_cuda, min=0, max=1) + inputs["bias"] = bias + if "DRELU" in epilog_name: + round_16 = lambda x: (x + 15) // 16 * 16 + inputs["relu_aux"] = torch.randint(low=0, high=256, size=(n, round_16(m // 16))).type(torch.uint8).T + if "DGELU" in epilog_name: + if order == "col": + inputs["gelu_aux"] = sample_matrix("torch", result_type, (n, m), use_cuda=use_cuda, min=-5, max=5).T + else: + inputs["gelu_aux"] = sample_matrix("torch", result_type, (m, n), use_cuda=use_cuda, min=-5, max=5) + + # Run matmul. Allow cuBLAS NOT_SUPPORTED error for certain configurations (see above) + def unpack_matmul(result): + return result if isinstance(result, tuple) else (result, {}) + + with ( + nullcontext() + if not allow_not_supported + else allow_cublas_unsupported( + message=f"FP8 epilog not supported by cuBLAS: {epilog_name} for A:{atype} B:{btype} C:{ctype} D:{dtype}", + allow_invalid_value=True, + ) + ): + result, aux = unpack_matmul( + matmul_with_random_autotune( + a, + b, + c, + alpha=alpha, + beta=beta, + epilog=epilog, + quantization_scales=quantization_scales, + options=options, + preferences=preferences, + epilog_inputs=inputs, + ) + ) + + assert result.shape == (*d_batch, m, n) + + # Compute the reference and compare + reference, reference_aux = unpack_matmul( + fp8_matmul_reference( + a, + b, + c, + alpha=alpha, + beta=beta, + epilog=epilog, + quantization_scales=quantization_scales, + options=options, + preferences=preferences, + epilog_inputs=inputs, + ) + ) + if "GELU" in epilog_name and result_type not in ("float16", "float32"): + assert_fp8_equal(result, reference, atol=1e-1, rtol=1e-1) + else: + assert_fp8_equal(result, reference) + + # Compare auxiliary outputs + assert set(aux.keys()) == set(reference_aux.keys()) + for key in aux: + if key == "relu_aux": + x = unpack_bitmask(aux[key], (m, n)) + y = unpack_bitmask(reference_aux[key], (m, n)) + assert torch.mean((x == y).type(torch.float32)) > 0.99 + elif key == 
"gelu_aux": + assert_fp8_equal(aux[key], reference_aux[key]) + if epilog_aux_type is not None: + assert str(aux[key].dtype).split(".")[-1] == epilog_aux_type + elif key == "gelu_aux_amax": + assert torch.allclose( + aux["gelu_aux_amax"], + (aux["gelu_aux"].type(torch.float32) / inputs.get("aux_quantization_scale", 1)).abs().max(), + atol=1e-1, + rtol=1e-1, + ) + elif key == "drelu_bgrad" or key == "dgelu_bgrad": + assert_fp8_equal(aux[key], reference.sum(axis=-1, keepdims=(d_batch != ())), atol=1e-1, rtol=1e-1) + if epilog_aux_type is not None: + assert str(aux[key].dtype).split(".")[-1] == epilog_aux_type + else: + raise RuntimeError(f"Test for {key} not implemented") + + +@pytest.mark.parametrize( + "m,n,k", + ((80, 96, 16),), +) +@pytest.mark.parametrize(("use_cuda"), (True, False)) +@pytest.mark.parametrize("atype,btype,ctype,dtype", (("float8_e4m3fn", "float8_e4m3fn", None, None),)) +@pytest.mark.parametrize( + "epilog_name,epilog_aux_type", + (("GELU_AUX", "float8_e4m3fn"),), +) +def test_epilog_aux_scale_reset( + m, + n, + k, + atype, + btype, + ctype, + dtype, + epilog_name, + epilog_aux_type, + use_cuda, +): + """ + Test if reset_operands can reset epilog aux scales. 
+ """ + epilog = getattr(Epilog, epilog_name) + + a, b, c, alpha, beta = generate_inputs(m, n, k, atype, btype, ctype, use_cuda=use_cuda) + quantization_scales = choose_scales(a, b, c, atype, btype, ctype, dtype, alpha=alpha, beta=beta) + options = { + "result_type": NAME_TO_DATA_TYPE[dtype] if dtype else None, + } + preferences = { + "epilog": { + "aux_type": NAME_TO_DATA_TYPE[epilog_aux_type] if epilog_aux_type else None, + } + } + inputs = {"aux_quantization_scale": 10} + mm = Matmul(a, b, c, alpha=alpha, beta=beta, quantization_scales=quantization_scales, options=options) + mm.plan(epilog=epilog, epilog_inputs=inputs, preferences=preferences) + result, aux = mm.execute() + reference, reference_aux = fp8_matmul_reference( + a, + b, + c, + alpha=alpha, + beta=beta, + quantization_scales=quantization_scales, + options=options, + preferences=preferences, + epilog_inputs=inputs, + epilog=epilog, + ) + assert_fp8_equal(result, reference) + assert_fp8_equal(aux["gelu_aux"], reference_aux["gelu_aux"]) + + inputs2 = {"aux_quantization_scale": -0.1} + mm.reset_operands(a=a, epilog_inputs=inputs2) + result2, aux2 = mm.execute() + reference2, reference_aux2 = fp8_matmul_reference( + a, + b, + c, + alpha=alpha, + beta=beta, + quantization_scales=quantization_scales, + options=options, + preferences=preferences, + epilog_inputs=inputs2, + epilog=epilog, + ) + assert_fp8_equal(result2, reference2) + assert_fp8_equal(aux2["gelu_aux"], reference_aux2["gelu_aux"]) diff --git a/tests/nvmath_tests/linalg/advanced/matmul/test_fp8_utils.py b/tests/nvmath_tests/linalg/advanced/matmul/test_fp8_utils.py new file mode 100644 index 0000000..44a1c4a --- /dev/null +++ b/tests/nvmath_tests/linalg/advanced/matmul/test_fp8_utils.py @@ -0,0 +1,88 @@ +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. 
+# +# SPDX-License-Identifier: Apache-2.0 + + +try: + import torch +except ImportError: + torch = None +import pytest +from .fp8_utils import fp8helpers +import numpy as np + +if torch is None: + pytest.skip("Torch is required for FP8 tests", allow_module_level=True) + + +def fp8_values(dtype, finite=False): + values = list(torch.frombuffer(bytes(range(256)), dtype=getattr(torch, dtype))) + values = [x.item() for x in values if not finite or torch.isfinite(x.type(torch.float32))] + return values + + +@pytest.mark.parametrize( + "format,left,value,right", + ( + ("float8_e4m3fn", 0.00390625, 0.005859375, 0.0078125), + ("float8_e4m3fn", 0.0, 0.001953125, 0.00390625), + ("float8_e4m3fn", -0.00390625, -0.001953125, 0.0), + ("float8_e4m3fn", -288.0, -256.0, -240.0), + ("float8_e4m3fn", 1.25, 1.375, 1.5), + ("float8_e4m3fn", 0.6875, 0.75, 0.8125), + ("float8_e4m3fn", -448.0, -416.0, -384.0), + ("float8_e4m3fn", -np.inf, -448.0, -416.0), + ("float8_e4m3fn", 416.0, 448.0, np.inf), + ("float8_e4m3fn", -0.001953125, 0.0, 0.001953125), + ("float8_e5m2", 3072.0, 3584.0, 4096.0), + ("float8_e5m2", -2048.0, -1792.0, -1536.0), + ("float8_e5m2", 0.875, 1.0, 1.25), + ("float8_e5m2", -0.625, -0.5, -0.4375), + ("float8_e5m2", -57344.0, -49152.0, -40960.0), + ("float8_e5m2", 40960.0, 49152.0, 57344.0), + ("float8_e5m2", -np.inf, -57344.0, -49152.0), + ("float8_e5m2", 49152.0, 57344.0, np.inf), + ("float8_e5m2", 1.52587890625e-05, 3.0517578125e-05, 4.57763671875e-05), + ("float8_e5m2", -4.57763671875e-05, -3.0517578125e-05, -1.52587890625e-05), + ("float8_e5m2", -1.52587890625e-05, 0.0, 1.52587890625e-05), + ), +) +def test_fp8_helper(format, left, value, right): + helper = fp8helpers[format] + isclose = lambda x, y: np.allclose(x, y, atol=0, rtol=1e-5) + + # Check ranges + expected_range_left, expected_range_right = (value + left) / 2, (value + right) / 2 + range_left, range_right = helper.range(value) + assert isclose(expected_range_left, range_left) + assert 
isclose(expected_range_right, range_right) + + # Check if ranges work for non-exact match + if np.isfinite(left): + range_left, range_right = helper.range(value * 0.8 + left * 0.2) + assert isclose(expected_range_left, range_left) + assert isclose(expected_range_right, range_right) + if np.isfinite(right): + range_left, range_right = helper.range(value * 0.7 + right * 0.3) + assert isclose(expected_range_left, range_left) + assert isclose(expected_range_right, range_right) + + # Check if absdiff works + scalar_absdiff = lambda x, y: helper.absdiff(np.asarray([x]), np.asarray([y])) + assert isclose(scalar_absdiff(value, value), 0) + + if np.isfinite(right): + assert isclose(scalar_absdiff(value, right), abs(right - range_right)) + assert isclose(scalar_absdiff(value, right + 1), abs(right - range_right) + 1) + assert isclose(scalar_absdiff(value, value * 0.9 + right * 0.1), 0) + assert isclose(scalar_absdiff(value, value * 0.5 + right * 0.5), 0) + if np.isfinite(left): + assert isclose(scalar_absdiff(value, value * 0.9 + left * 0.1), 0) + assert isclose(scalar_absdiff(value, value * 0.5 + left * 0.5), 0) + assert isclose(scalar_absdiff(value, left - 1), abs(left - range_left) + 1) + assert isclose(scalar_absdiff(value, left), abs(left - range_left)) + + if not np.isfinite(right): + assert isclose(scalar_absdiff(value, value + 1234567), 0) + if not np.isfinite(left): + assert isclose(scalar_absdiff(value, value - 7654321), 0) diff --git a/tests/nvmath_tests/linalg/advanced/matmul/test_ifc.py b/tests/nvmath_tests/linalg/advanced/matmul/test_ifc.py index 16e66b3..5f9d02a 100644 --- a/tests/nvmath_tests/linalg/advanced/matmul/test_ifc.py +++ b/tests/nvmath_tests/linalg/advanced/matmul/test_ifc.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. 
# # SPDX-License-Identifier: Apache-2.0 @@ -10,11 +10,15 @@ from nvmath.linalg.advanced import Matmul from nvmath.linalg._internal.matmul_desc_ifc import MatmulDescInterface +from nvmath.linalg._internal.matrix_layout_ifc import MatrixLayoutInterface +from nvmath.linalg._internal.matmul_pref_ifc import MatmulPreferenceInterface + +from nvmath._internal import typemaps import pytest try: - import cupy + import cupy # noqa: F401 except ModuleNotFoundError: pytest.skip("cupy required for matmul tests", allow_module_level=True) @@ -23,7 +27,28 @@ def test_matmul_desc_ifc(): """ Test MatmulDescInterface.__getattr__ (not used anywhere yet) """ - mm = Matmul(np.zeros((1, 1)), np.zeros((1, 1))) - desc = MatmulDescInterface(mm.mm_desc) - desc.epilog = 123 - assert desc.epilog == 123 + a = np.zeros((1, 1)) + with Matmul(a, a) as mm: + desc = MatmulDescInterface(mm.mm_desc) + desc.epilogue = 123 + assert desc.epilogue == 123 + +def test_matrix_layout_ifc(): + ''' + Test MatrixLayoutInterface.__getattr__ + ''' + a = np.zeros((1, 1), dtype=np.float32) + with Matmul(a, a) as mm: + algorithms = mm.plan() + layout_a_ifc = MatrixLayoutInterface(mm.a_layout_ptr) + assert typemaps.DATA_TYPE_TO_NAME[layout_a_ifc.type] == "float32" + +def test_matmul_pref_ifc(): + ''' + Test MatmulPreferenceInterface.__getattr__ + ''' + a = np.zeros((1, 1)) + with Matmul(a, a) as mm: + algorithms = mm.plan() + pref_ifc = MatmulPreferenceInterface(mm.preference_ptr) + assert pref_ifc.max_workspace_bytes == mm.memory_limit \ No newline at end of file diff --git a/tests/nvmath_tests/linalg/advanced/matmul/test_input.py b/tests/nvmath_tests/linalg/advanced/matmul/test_input.py index ac58736..083b13f 100644 --- a/tests/nvmath_tests/linalg/advanced/matmul/test_input.py +++ b/tests/nvmath_tests/linalg/advanced/matmul/test_input.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. 
# # SPDX-License-Identifier: Apache-2.0 @@ -7,8 +7,11 @@ """ from nvmath.linalg.advanced import matmul, matrix_qualifiers_dtype +import numpy as np import pytest -from .utils import * +from nvmath.bindings import cublasLt as cublaslt + +from .utils import compare_tensors, random_torch_complex, sample_matrix, assert_tensors_equal, to_numpy, get_framework @pytest.mark.parametrize("framework", ("torch", "numpy/cupy")) @@ -140,7 +143,7 @@ def sample_batch(batch_shape): assert_tensors_equal(result, a @ b + c) -@pytest.mark.parametrize("c_desc", (None, "M", "M1", "MN")) +@pytest.mark.parametrize("c_desc", (None, "M1", "MN")) @pytest.mark.parametrize("b_desc", ("K", "KN")) @pytest.mark.parametrize("a_desc", ("K", "MK")) @pytest.mark.parametrize("a_t", (True, False)) @@ -384,7 +387,7 @@ def test_dtype_mismatch(framework, a_dtype, b_dtype, c_dtype): c = sample_matrix(framework, c_dtype, (2, 2), True) except NotImplementedError: pytest.skip("Unable to generate matrix of this dtype") - with pytest.raises(ValueError, match=r"The dtype of operands .* must be the same"): + with pytest.raises(ValueError, match=r"Unsupported combination of dtypes"): matmul(a, b, c, beta=1) @@ -421,6 +424,30 @@ def test_unsupported_type(): """ Tests if a proper error is reported for an unsupported data type. """ - a = b = c = np.asarray(["hello"]) - with pytest.raises(ValueError, match=r"Unsupported dtype."): + a = b = c = np.zeros((2, 2), dtype=np.int64) + with pytest.raises(ValueError, match=r"^The dtype of operand.*not supported"): matmul(a, b, c, beta=1) + + +def test_unsupported_float8(): + """ + Tests if proper error is reported when FP8 is not supported. + """ + try: + import torch + except: + pytest.skip("Torch is required for FP8 support test.") + + if not hasattr(torch, "float8_e4m3fn"): + # Old torch versions don't support float8_e4m3fn at all. 
+ pytest.skip("torch.float8_e4m3fn is required for FP8 support test.") + + a = torch.zeros((16, 16)).type(torch.float8_e4m3fn).cuda() + b = torch.zeros((16, 16)).type(torch.float8_e4m3fn).cuda() + + if cublaslt.get_version() < 120800: + with pytest.raises(ValueError, match=r"FP8 is not supported.*cuBLASLt version 12\.8 or higher is required"): + matmul(a, b, quantization_scales={"a": 1, "b": 1, "d": 1}) + elif (torch.cuda.get_device_properties(0).major, torch.cuda.get_device_properties(0).minor) < (8, 9): + with pytest.raises(cublaslt.cuBLASLtError): + matmul(a, b, quantization_scales={"a": 1, "b": 1, "d": 1}) diff --git a/tests/nvmath_tests/linalg/advanced/matmul/test_matmul_with_hypothesis.py b/tests/nvmath_tests/linalg/advanced/matmul/test_matmul_with_hypothesis.py index 63adf01..0052d5d 100644 --- a/tests/nvmath_tests/linalg/advanced/matmul/test_matmul_with_hypothesis.py +++ b/tests/nvmath_tests/linalg/advanced/matmul/test_matmul_with_hypothesis.py @@ -1,9 +1,13 @@ +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. 
+# +# SPDX-License-Identifier: Apache-2.0 + import collections import logging import re import typing -from hypothesis import given, settings, reproduce_failure, assume +from hypothesis import given, assume from hypothesis.extra.numpy import arrays, from_dtype from hypothesis.strategies import ( one_of, @@ -11,7 +15,6 @@ none, floats, integers, - complex_numbers, sampled_from, fixed_dictionaries, composite, @@ -24,15 +27,21 @@ pytest.skip("cupy is required for matmul tests", allow_module_level=True) import numpy as np -from nvmath._utils import CudaDataType +import nvmath.linalg +from nvmath import CudaDataType from nvmath.bindings.cublasLt import cuBLASLtError, ReductionScheme -from nvmath.linalg._internal.typemaps import NAME_TO_DEFAULT_COMPUTE_TYPE, CUBLAS_COMPUTE_TYPE_TO_NAME +from nvmath.linalg._internal.typemaps import ( + NAMES_TO_DEFAULT_COMPUTE_TYPE, + CUBLAS_COMPUTE_TYPE_TO_NAME, + SCALE_TYPE_TO_DEFAULT_COMPUTE_TYPE, + COMPUTE_TYPE_TO_DEFAULT_SCALE_TYPE, +) from nvmath.linalg.advanced import MatmulEpilog, MatmulNumericalImplFlags, MatmulPlanPreferences, matmul from nvmath.linalg.advanced.matmulmod import EPILOG_INPUT_HANDLERS_MAP, EPILOG_MINIMUM_VERSIONS_MAP from nvmath.memory import _RawCUDAMemoryManager, BaseCUDAMemoryManager, _CupyCUDAMemoryManager from nvmath_tests.helpers import nvmath_seed -from nvmath_tests.linalg.advanced.matmul.utils import get_tolerance +from .utils import get_absolute_tolerance MatmulEpilog_BIAS_list = [ MatmulEpilog.BIAS, @@ -97,7 +106,7 @@ def compare_result(ref, res): ref, equal_nan=True, rtol=(1e-02 if res.dtype == np.float16 else 2e-05), - atol=2 * get_tolerance(ref), + atol=2 * get_absolute_tolerance(ref), ) @@ -122,7 +131,7 @@ def drelu(x, bitmask): def verify_result(a, b, c, result_c, alpha, beta, epilog, epilog_inputs): - possible_dtype = CUBLAS_COMPUTE_TYPE_TO_NAME[NAME_TO_DEFAULT_COMPUTE_TYPE[str(a.dtype)]] + possible_dtype = CUBLAS_COMPUTE_TYPE_TO_NAME[NAMES_TO_DEFAULT_COMPUTE_TYPE[(str(a.dtype), str(b.dtype))]] 
compute_dtype = possible_dtype[1] if np.iscomplexobj(a) else possible_dtype[0] added_singleton_dimensions: list[int] = [] @@ -219,6 +228,10 @@ def verify_result(a, b, c, result_c, alpha, beta, epilog, epilog_inputs): ) +def notNone(x): + return x is not None + + @composite def matrix_multiply_arrays(draw): m = draw(one_of(none(), problem_size_mnk)) @@ -247,11 +260,14 @@ def matrix_multiply_arrays(draw): none(), arrays( dtype=ab_type, - shape=(m_for_c,) - if n is None - else ( - m_for_c, - draw(sampled_from([1, n])), + shape=tuple( + filter( + notNone, + ( + m_for_c, + draw(sampled_from([1, n])), + ), + ) ), elements=element_properties, ), @@ -276,6 +292,13 @@ def matrix_multiply_arrays(draw): return MatmulInputs(a=a, b=b, c=c, m=m, n=n, k=k, ab_type=ab_type, bias=bias, beta=beta, alpha=alpha, epilogs=epilogs) +@composite +def preference_object_strategy(draw): + limit = draw(integers(min_value=1, max_value=8)) + reduction_scheme_mask = draw(one_of(sampled_from(ReductionScheme))) + return MatmulPlanPreferences(reduction_scheme_mask=reduction_scheme_mask, limit=limit) + + @nvmath_seed() @given( input_arrays=matrix_multiply_arrays(), @@ -286,7 +309,8 @@ def matrix_multiply_arrays(draw): { "blocking": sampled_from(options_blocking_values), "allocator": sampled_from(options_allocator_values), - "scale_type": one_of(none()), + "scale_type": one_of(none(), sampled_from(CudaDataType)), + # "compute_type": one_of(none(), sampled_from(nvmath.linalg.ComputeType)), } ), ), @@ -298,27 +322,28 @@ def matrix_multiply_arrays(draw): "max_waves_count": one_of(floats(min_value=0, max_value=100, width=32)), } ), + preference_object_strategy(), ), ) def test_matmul(input_arrays, order, options, preferences): """Call nvmath.linalg.advanced.matmul() with valid inputs.""" - try: - a, b, c, m, n, k, ab_type, bias, beta, alpha, epilogs = input_arrays - epilog, epilog1 = epilogs - - d_a = cp.asarray(a, order=order) - d_b = cp.asarray(b, order=order) - c_order = "F" if epilog is not None and 
epilog in [MatmulEpilog.BGRADB, MatmulEpilog.BGRADA] else order - d_c = ( - # FIXME: c must be F ordered when using BGRAD[A,B] - None if c is None else cp.asarray(c, order=c_order) - ) + a, b, c, m, n, k, ab_type, bias, beta, alpha, epilogs = input_arrays + epilog, epilog1 = epilogs + + d_a = cp.asarray(a, order=order) + d_b = cp.asarray(b, order=order) + c_order = "F" if epilog is not None and epilog in [MatmulEpilog.BGRADB, MatmulEpilog.BGRADA] else order + d_c = ( + # FIXME: c must be F ordered when using BGRAD[A,B] + None if c is None else cp.asarray(c, order=c_order) + ) - epilog_inputs = None if epilog is None else {} + epilog_inputs = None if epilog is None else {} - if epilog is not None and epilog in MatmulEpilog_BIAS_list: - epilog_inputs["bias"] = cp.asarray(bias, order=order) + if epilog is not None and epilog in MatmulEpilog_BIAS_list: + epilog_inputs["bias"] = cp.asarray(bias, order=order) + try: result_c = matmul( d_a, d_b, @@ -359,7 +384,17 @@ def test_matmul(input_arrays, order, options, preferences): raise e except ValueError as e: # FIXME: Check for CUDA toolkit version 11 - if re.search("K=1 is not supported for (BGRAD(A|B)|D(R|G)ELU) epilog", str(e)) or "requires cublaslt >=" in str(e): + if ( + re.search("K=1 is not supported for (BGRAD(A|B)|D(R|G)ELU) epilog", str(e)) + or "requires cublaslt >=" in str(e) + or ("`c` must be at least 2-D." in str(e) and c is not None and len(c.shape) < 2) + or ("Unsupported scale type." in str(e) and options["scale_type"] not in SCALE_TYPE_TO_DEFAULT_COMPUTE_TYPE) + or ( + "Unsupported compute type." 
in str(e) + and options["compute_type"] not in COMPUTE_TYPE_TO_DEFAULT_SCALE_TYPE["real"] + ) + or re.search("Selected scale_type=(.*) compute_type=(.*) are not supported for data types", str(e)) + ): pass else: raise e @@ -431,17 +466,17 @@ def generate_alpha_beta(value_type, value): def test_matmul_negative(a, b, c, alpha_value, beta_value, epilog, epilog_inputs, options, preferences): """Call nvmath.linalg.advanced.matmul() with invalid inputs; catch expected exceptions.""" - try: - if c is not None and ((a.dtype != c.dtype) or (a.shape[0] != c.shape[0]) or (c.shape[1] != b.shape[1])): - return + if c is not None and ((a.dtype != c.dtype) or (a.shape[0] != c.shape[0]) or (c.shape[1] != b.shape[1])): + return - d_a = cp.asarray(a, order="F") - d_b = cp.asarray(b, order="F") - d_c = cp.asarray(c, order="F") if c is not None else None + d_a = cp.asarray(a, order="F") + d_b = cp.asarray(b, order="F") + d_c = cp.asarray(c, order="F") if c is not None else None - alpha = generate_alpha_beta(a.dtype, alpha_value) - beta = generate_alpha_beta(a.dtype, beta_value) + alpha = generate_alpha_beta(a.dtype, alpha_value) + beta = generate_alpha_beta(a.dtype, beta_value) + try: result_c = matmul( d_a, d_b, @@ -466,7 +501,7 @@ def test_matmul_negative(a, b, c, alpha_value, beta_value, epilog, epilog_inputs except ValueError as e: if "A value for beta must be provided if operand C is provided." in str(e): assert (beta is None) and (c is not None) - elif f"The dtype of operands A {a.dtype} and B {b.dtype} must be the same." 
in str(e): + elif f"Unsupported combination of dtypes for operands A {a.dtype} and B {b.dtype}" in str(e): assert a.dtype != b.dtype elif ( f"The 'K' extent must match for the operands: K={a.shape[1]} in operand A is not equal to K={b.shape[0]} " @@ -479,10 +514,7 @@ def test_matmul_negative(a, b, c, alpha_value, beta_value, epilog, epilog_inputs assert c.shape[0] != a.shape[0] or c.shape[1] != b.shape[1] elif re.search(re.compile(r"The epilog \w+ requires the following input tensors: \{\'\w+\'\}\."), str(e)): assert epilog is not None - if "The provided tensor names are:" in str(e): - assert epilog_inputs is not None - else: - assert epilog_inputs is None + assert epilog_inputs is None or epilog_inputs == {} elif "The value specified for blocking must be either True or 'auto'." in str(e): assert options["blocking"] not in (True, "auto") elif "is not a valid CudaDataType" in str(e): @@ -496,7 +528,19 @@ def test_matmul_negative(a, b, c, alpha_value, beta_value, epilog, epilog_inputs elif "requires cublaslt >=" in str(e): from nvmath.bindings import cublasLt - assert cublasLt.get_version() < EPILOG_MINIMUM_VERSIONS_MAP[epilog]["cublaslt"] + assert cublasLt.get_version() < EPILOG_MINIMUM_VERSIONS_MAP[epilog]["cublaslt"] or ( + a.shape[-2] == 1 and c is not None and c.shape[-1] == 1 and epilog & MatmulEpilog.BIAS > 0 + ) + elif re.search("K=1 is not supported for (BGRAD(A|B)|D(R|G)ELU) epilog", str(e)): + assert a.shape[1] == 1 and b.shape[0] == 1 + assert epilog in [ + MatmulEpilog.BGRADA, + MatmulEpilog.BGRADB, + MatmulEpilog.DGELU_BGRAD, + MatmulEpilog.DGELU, + MatmulEpilog.DRELU_BGRAD, + MatmulEpilog.DRELU, + ] else: raise e except TypeError as e: diff --git a/tests/nvmath_tests/linalg/advanced/matmul/test_mxfp8.py b/tests/nvmath_tests/linalg/advanced/matmul/test_mxfp8.py new file mode 100644 index 0000000..85bdac9 --- /dev/null +++ b/tests/nvmath_tests/linalg/advanced/matmul/test_mxfp8.py @@ -0,0 +1,788 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. 
ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 + + +try: + import torch +except ImportError: + torch = None +import pytest +from .utils import sample_matrix +from .fp8_utils import assert_fp8_equal +from nvmath.linalg.advanced import Matmul, matmul, MatmulEpilog as Epilog +from nvmath.linalg.advanced.helpers import matmul as matmul_helpers +from nvmath._internal.typemaps import NAME_TO_DATA_TYPE +from nvmath.bindings import cublasLt as cublaslt +from nvmath._internal.utils import check_or_create_options +from nvmath.linalg.advanced import _configuration +from contextlib import nullcontext +from .utils import allow_cublas_unsupported + +if torch is None: + pytest.skip("Torch is required for MXFP8 tests", allow_module_level=True) + +if torch.cuda.get_device_properties(0).major < 10: + pytest.skip("CC>=10.0 is required for MXFP8 tests", allow_module_level=True) + +if cublaslt.get_version() < 120800: + pytest.skip("cuBLAS 12.8 is required for MXFP8 tests", allow_module_level=True) + + +def unpack_matmul(result): + """ + Helper function which unpacks the result of `matmul` into D, d_out (or None), epilog aux + """ + if isinstance(result, tuple): + d, aux = result + d_out = aux.pop("d_out_scale", None) + return d, d_out, aux + else: + return result, None, {} + + +def expected_result_type(atype, btype, ctype, dtype): + """ + Result type of FP8 matmul. ctype=None means no C. dtype=None means no explicit + specification of the result type. 
+ """ + return dtype or ctype or atype + + +def generate_mxfp8_scales(x, scale_range, *, use_cuda, validate_shape=True): + """ + Generates UE8M0 scales for x, randomly chosen from [2^scale_range[0], 2^scale_range[1]] + """ + low = scale_range[0] + 127 + high = scale_range[1] + 127 + assert low >= 0 + assert high <= 255 + if validate_shape: + assert all(s % 128 == 0 for s in x.shape[-2:]) + num_scales = x.nelement() // 32 + s = torch.randint(low=low, high=high + 1, size=(num_scales,)).to(torch.uint8) + if use_cuda: + s = s.cuda() + return s + + +def expand_mxfp8_scales(x, scales): + """ + Expands block UE8M0 scales tensor for `x` into a float32 tensor with actual scale + factors. + """ + idx = matmul_helpers.get_mxfp8_scale_offset(x, torch.meshgrid(*(torch.arange(d) for d in x.shape), indexing="ij")) + if scales.is_cuda: + idx = idx.cuda() + return 2 ** (scales.type(torch.float32)[idx] - 127) + + +def generate_simple_inputs(m, n, k, atype, btype, ctype, *, c_transposed=False, use_cuda): + """ + Generates matmul inputs of given shapes and types. 
+ """ + + def random_choice(choices, shape=()): + choices = torch.as_tensor(choices) + idx = torch.randint(low=0, high=len(choices), size=shape) + return choices[idx] + + def random_sign(): + return random_choice([-1, 1]).item() + + # Use non-symmetric distributions to reduce the risk of catastrophic cancellations + a = random_choice([-1, 0, 0.5, 1, 1.5], shape=(m, k)).type(getattr(torch, atype)) + b = random_choice([-1, 0, 0.5, 1, 1.5], shape=(n, k)).type(getattr(torch, btype)).T + + alpha = random_sign() * 2 ** torch.randint(low=-10, high=-8, size=()).item() + + if ctype is not None: + if c_transposed: + c = random_choice([-0.25, 0, 0.5, 1], shape=(n, m)).type(getattr(torch, ctype)).T + else: + c = random_choice([-0.25, 0, 0.5, 1], shape=(m, n)).type(getattr(torch, ctype)) + beta = random_sign() * alpha * torch.rand(size=()).item() + else: + c = None + beta = None + if use_cuda: + a = a.cuda() + b = b.cuda() + if c is not None: + c = c.cuda() + return a, b, c, alpha, beta + + +def mxfp8_matmul_reference( + a, b, c=None, *args, d_out=None, quantization_scales=None, epilog_inputs=None, options=None, **kwargs +): + """ + Computes MXFP8-like matmul, but with higher precision. 
+ """ + scales = check_or_create_options(_configuration.MatmulQuantizationScales, quantization_scales, "Matmul scales") + options = check_or_create_options(_configuration.MatmulOptions, options, "Matmul options") + options.result_type = None + + a_scale = expand_mxfp8_scales(a, scales.a) + b_scale = expand_mxfp8_scales(b, scales.b) + ascaled = a.type(torch.float32) * a_scale + bscaled = b.type(torch.float32) * b_scale + + for key in ("bias", "gelu_aux"): + if epilog_inputs and key in epilog_inputs: + epilog_inputs[key] = epilog_inputs[key].type(torch.float32) + + d = matmul( + ascaled, + bscaled, + c.type(torch.float32) if c is not None else None, + *args, + quantization_scales=None, + epilog_inputs=epilog_inputs, + options=options, + **kwargs, + ) + if d_out is not None: + d_scale = expand_mxfp8_scales(d, d_out) + d /= d_scale + return d + + +SUPPORTED_TYPE_COMBINATIONS = ( + ("float8_e4m3fn", "float8_e4m3fn", None, None), + ("float8_e4m3fn", "float8_e5m2", None, None), + ("float8_e5m2", "float8_e4m3fn", None, None), + ("float8_e4m3fn", "float8_e4m3fn", "float32", "float32"), + ("float8_e4m3fn", "float8_e5m2", "float32", "float32"), + ("float8_e5m2", "float8_e4m3fn", "float32", "float32"), + ("float8_e4m3fn", "float8_e4m3fn", "float32", None), + ("float8_e4m3fn", "float8_e5m2", "float32", None), + ("float8_e5m2", "float8_e4m3fn", "float32", None), + ("float8_e4m3fn", "float8_e4m3fn", "float16", "float16"), + ("float8_e4m3fn", "float8_e5m2", "float16", "float16"), + ("float8_e5m2", "float8_e4m3fn", "float16", "float16"), + ("float8_e4m3fn", "float8_e4m3fn", "float16", None), + ("float8_e4m3fn", "float8_e5m2", "float16", None), + ("float8_e5m2", "float8_e4m3fn", "float16", None), + ("float8_e4m3fn", "float8_e4m3fn", "bfloat16", "bfloat16"), + ("float8_e4m3fn", "float8_e5m2", "bfloat16", "bfloat16"), + ("float8_e5m2", "float8_e4m3fn", "bfloat16", "bfloat16"), + ("float8_e4m3fn", "float8_e4m3fn", "bfloat16", None), + ("float8_e4m3fn", "float8_e5m2", "bfloat16", None), 
+ ("float8_e5m2", "float8_e4m3fn", "bfloat16", None), + ("float8_e4m3fn", "float8_e4m3fn", "float16", "float8_e4m3fn"), + ("float8_e4m3fn", "float8_e5m2", "float16", "float8_e4m3fn"), + ("float8_e5m2", "float8_e4m3fn", "float16", "float8_e4m3fn"), + ("float8_e4m3fn", "float8_e4m3fn", "bfloat16", "float8_e4m3fn"), + ("float8_e4m3fn", "float8_e5m2", "bfloat16", "float8_e4m3fn"), + ("float8_e5m2", "float8_e4m3fn", "bfloat16", "float8_e4m3fn"), +) + + +@pytest.mark.parametrize("atype,btype,ctype,dtype", SUPPORTED_TYPE_COMBINATIONS) +@pytest.mark.parametrize( + "m,n,k", + ( + (128, 128, 128), + (2 * 128, 4 * 128, 2 * 128), + (7 * 128, 5 * 128, 3 * 128), + ), +) +@pytest.mark.parametrize("a_scale_range", ((-5, 5),)) +@pytest.mark.parametrize("b_scale_range", ((-5, 5),)) +@pytest.mark.parametrize(("use_cuda"), (True, False)) +def test_mxfp8(m, n, k, atype, btype, ctype, dtype, a_scale_range, b_scale_range, use_cuda): + """ + Basic MXFP8 test. + """ + a, b, c, alpha, beta = generate_simple_inputs(m, n, k, atype, btype, ctype, use_cuda=use_cuda) + + result_type = expected_result_type(atype, btype, ctype, dtype) + ascales = generate_mxfp8_scales(a, a_scale_range, use_cuda=use_cuda) + bscales = generate_mxfp8_scales(b, b_scale_range, use_cuda=use_cuda) + + scales = {"a": ascales, "b": bscales} + options = {"result_type": NAME_TO_DATA_TYPE[dtype] if dtype else None, "block_scaling": True} + result, d_out, _ = unpack_matmul(matmul(a, b, c=c, alpha=alpha, beta=beta, quantization_scales=scales, options=options)) + + reference = mxfp8_matmul_reference( + a, b, d_out=d_out, c=c, alpha=alpha, beta=beta, quantization_scales=scales, options=options + ) + assert_fp8_equal(result, reference) + assert str(result.dtype).split(".")[-1] == result_type + + +@pytest.mark.parametrize("atype,btype,ctype,dtype", SUPPORTED_TYPE_COMBINATIONS) +@pytest.mark.parametrize("a_scale_range", ((-5, 5),)) +@pytest.mark.parametrize("b_scale_range", ((-5, 5),)) +@pytest.mark.parametrize( + 
"a_batch,b_batch,c_batch,d_batch", + ( + ((), (), (), ()), + ((3,), (3,), (3,), (3,)), + ((8,), (8,), (8,), (8,)), + ((2, 3), (2, 3), (2, 3), (2, 3)), + ), +) +@pytest.mark.parametrize( + "m,n,k", + ((128, 128, 128),), +) +@pytest.mark.parametrize(("use_cuda"), (True,)) +def test_batching( + m, n, k, atype, btype, ctype, dtype, a_scale_range, b_scale_range, a_batch, b_batch, c_batch, d_batch, use_cuda +): + """ + Tests if batching works with MXFP8. + """ + + def sample_batch(batch_shape, matrix_shape, type, transposed=False): + shape = (*batch_shape, *matrix_shape) + if transposed: + shape = (*shape[:-2], shape[-1], shape[-2]) + x = sample_matrix("torch", type, shape, use_cuda=use_cuda, min=0, max=2) + return x.swapaxes(-1, -2) if transposed else x + + a = sample_batch(a_batch, (m, k), atype, transposed=False) + b = sample_batch(b_batch, (k, n), btype, transposed=True) + + ascales = generate_mxfp8_scales(a, a_scale_range, use_cuda=use_cuda) + bscales = generate_mxfp8_scales(b, b_scale_range, use_cuda=use_cuda) + + if ctype is not None: + c = sample_batch(c_batch, (m, n), ctype, transposed=False) + beta = 0.12 + else: + c = None + beta = None + + alpha = 0.32 + + options = {"result_type": NAME_TO_DATA_TYPE[dtype] if dtype else None, "block_scaling": True} + scales = {"a": ascales, "b": bscales} + result, d_out, _ = unpack_matmul(matmul(a, b, c, alpha=alpha, beta=beta, quantization_scales=scales, options=options)) + reference = mxfp8_matmul_reference( + a, b, c=c, alpha=alpha, d_out=d_out, beta=beta, quantization_scales=scales, options=options + ) + expected_result_shape = (*d_batch, m, n) + assert result.shape == expected_result_shape + + reference = mxfp8_matmul_reference( + a, b, c, d_out=d_out, alpha=alpha, beta=beta, quantization_scales=scales, options=options + ) + assert_fp8_equal(result, reference) + + +@pytest.mark.parametrize("atype,btype,ctype,dtype", SUPPORTED_TYPE_COMBINATIONS) +@pytest.mark.parametrize( + "m,n,k", + ((2 * 128, 4 * 128, 3 * 128),), +) 
+@pytest.mark.parametrize("a_scale_range", ((-3, 3),)) +@pytest.mark.parametrize("b_scale_range", ((-3, 3),)) +@pytest.mark.parametrize(("use_cuda"), (True, False)) +def test_reset(m, n, k, atype, btype, ctype, dtype, a_scale_range, b_scale_range, use_cuda): + """ + Tests if in-place change of A/B scales and reset_operands works. + """ + a, b, c, alpha, beta = generate_simple_inputs(m, n, k, atype, btype, ctype, use_cuda=use_cuda) + + ascales = generate_mxfp8_scales(a, a_scale_range, use_cuda=use_cuda) + bscales = generate_mxfp8_scales(b, b_scale_range, use_cuda=use_cuda) + + scales = {"a": ascales, "b": bscales} + options = {"result_type": NAME_TO_DATA_TYPE[dtype] if dtype else None, "block_scaling": True} + + with Matmul(a, b, c=c, alpha=alpha, beta=beta, quantization_scales=scales, options=options) as mm: + mm.plan() + + # Check initial result + result, d_out, _ = unpack_matmul(mm.execute()) + reference = mxfp8_matmul_reference( + a, b, c=c, alpha=alpha, d_out=d_out, beta=beta, quantization_scales=scales, options=options + ) + assert_fp8_equal(result, reference) + + # Change A and B scales in place + if use_cuda: + ascales[: len(ascales) // 2] *= -1 + bscales[len(bscales) // 2 :] = bscales[0] + result = mm.execute() + result, d_out, _ = unpack_matmul(mm.execute()) + reference = mxfp8_matmul_reference( + a, b, c=c, alpha=alpha, d_out=d_out, beta=beta, quantization_scales=scales, options=options + ) + assert_fp8_equal(result, reference) + + # Reset A scale, keep B scale + ascales2 = generate_mxfp8_scales(a, a_scale_range, use_cuda=use_cuda) + scales2 = scales.copy() + scales2["a"] = ascales2 + mm.reset_operands(a=a, b=b, quantization_scales={"a": ascales2}) + result, d_out, _ = unpack_matmul(mm.execute()) + reference = mxfp8_matmul_reference( + a, b, c=c, alpha=alpha, d_out=d_out, beta=beta, quantization_scales=scales2, options=options + ) + assert_fp8_equal(result, reference) + + # Reset A scale and B scale + ascales3 = generate_mxfp8_scales(a, a_scale_range, 
use_cuda=use_cuda) + bscales3 = generate_mxfp8_scales(b, b_scale_range, use_cuda=use_cuda) + scales3 = scales2.copy() + scales3["a"] = ascales3 + scales3["b"] = bscales3 + mm.reset_operands(a=a, b=b, quantization_scales={"a": ascales3, "b": bscales3}) + result, d_out, _ = unpack_matmul(mm.execute()) + reference = mxfp8_matmul_reference( + a, b, c=c, alpha=alpha, d_out=d_out, beta=beta, quantization_scales=scales3, options=options + ) + assert_fp8_equal(result, reference) + + +def unpack_bitmask(bitmask, shape): + """ + Utility function unpacking ReLU aux bitmask. + """ + if len(bitmask.shape) > 2: + return torch.stack([unpack_bitmask(bitmask[i], shape) for i in range(bitmask.shape[0])]) + result = torch.zeros(shape) + n, m = shape + for i in range(n): + for j in range(m): + result[i][j] = bool(int(bitmask[i // 8][j].item()) & (1 << i % 8)) + return result + + +@pytest.mark.parametrize( + "m,n,k", + ( + (128, 128, 128), + (4 * 128, 3 * 128, 2 * 128), + ), +) +@pytest.mark.parametrize(("use_cuda"), (True, False)) +@pytest.mark.parametrize("atype,btype,ctype,dtype", SUPPORTED_TYPE_COMBINATIONS) +@pytest.mark.parametrize( + "epilog_name,order", + ( + ("RELU", "col"), + ("RELU", "row"), + ("GELU", "col"), + ("BGRADA", "col"), + ("BGRADB", "col"), + ("DRELU", "col"), + ("DRELU_BGRAD", "col"), + ("DGELU", "col"), + ("DGELU_BGRAD", "col"), + ("RELU_BIAS", "col"), + ("BIAS", "col"), + ("GELU_BIAS", "col"), + ("RELU_AUX", "col"), + ("RELU_AUX_BIAS", "col"), + ("GELU_AUX", "col"), + ("GELU_AUX_BIAS", "col"), + ), +) +@pytest.mark.parametrize( + "a_batch,b_batch,c_batch,d_batch", + ( + ((), (), (), ()), + ((2,), (2,), (2,), (2,)), + ((2, 3), (2, 3), (2, 3), (2, 3)), + ), +) +@pytest.mark.parametrize("a_scale_range", ((-2, 2),)) +@pytest.mark.parametrize("b_scale_range", ((-3, 3),)) +def test_epilogs( + m, + n, + k, + atype, + btype, + ctype, + dtype, + a_scale_range, + b_scale_range, + epilog_name, + a_batch, + b_batch, + c_batch, + d_batch, + order, + use_cuda, +): + """ + 
Tests epilogs with MXFP8. + """ + epilog = getattr(Epilog, epilog_name) + + result_type = expected_result_type(atype, btype, ctype, dtype) + inferred_ctype = ctype or ("float16" if "float8" in result_type else result_type) + + # Currently, those are not supported by cuBLAS, so we allow them to fail with + # "NOT_SUPPORTED". + allow_not_supported = False + allow_not_supported |= "BGRAD" in epilog_name + allow_not_supported |= "DRELU" in epilog_name + allow_not_supported |= "DGELU" in epilog_name + allow_not_supported |= "AUX" in epilog_name + + def sample_batch(batch_shape, matrix_shape, type, transposed=False): + shape = (*batch_shape, *matrix_shape) + if transposed: + shape = (*shape[:-2], shape[-1], shape[-2]) + x = sample_matrix("torch", type, shape, use_cuda=use_cuda, min=-0.2, max=1) + return x.swapaxes(-1, -2) if transposed else x + + a = sample_batch(a_batch, (m, k), atype, transposed=False) + b = sample_batch(b_batch, (k, n), btype, transposed=True) + alpha, beta = 0.12, None + if ctype is not None: + c = sample_batch(c_batch, (m, n), ctype, transposed=(order == "col")) + beta = 0.34 + else: + c = None + beta = None + + ascales = generate_mxfp8_scales(a, a_scale_range, use_cuda=use_cuda) + bscales = generate_mxfp8_scales(b, b_scale_range, use_cuda=use_cuda) + + inputs = {} + if "BIAS" in epilog_name: + bias_type = "float16" if inferred_ctype == "float16" else "bfloat16" + bias = sample_matrix("torch", bias_type, (m,), use_cuda=use_cuda, min=0, max=1) + inputs["bias"] = bias + if "DRELU" in epilog_name: + round_16 = lambda x: (x + 15) // 16 * 16 + inputs["relu_aux"] = torch.randint(low=0, high=256, size=(n, round_16(m // 8))).type(torch.uint8).T + if "DGELU" in epilog_name: + if order == "col": + inputs["gelu_aux"] = sample_matrix("torch", result_type, (n, m), use_cuda=use_cuda, min=-5, max=5).T + else: + inputs["gelu_aux"] = sample_matrix("torch", result_type, (m, n), use_cuda=use_cuda, min=-5, max=5) + + scales = {"a": ascales, "b": bscales} + options = 
{"result_type": NAME_TO_DATA_TYPE[dtype] if dtype else None, "block_scaling": True} + + # Run matmul. Allow cuBLAS NOT_SUPPORTED error for certain configurations (see above) + with ( + nullcontext() + if not allow_not_supported + else allow_cublas_unsupported( + message=f"MXFP8 epilog not supported by cuBLAS: {epilog_name} for A:{atype} B:{btype} C:{ctype} D:{dtype}", + allow_invalid_value=True, + ) + ): + result, d_out, aux = unpack_matmul( + matmul( + a, + b, + c, + alpha=alpha, + beta=beta, + epilog=epilog, + quantization_scales=scales, + options=options, + epilog_inputs=inputs, + ) + ) + + # Compute the reference and compare + reference = mxfp8_matmul_reference( + a, + b, + c, + d_out=d_out, + alpha=alpha, + beta=beta, + epilog=epilog, + quantization_scales=scales, + options=options, + epilog_inputs=inputs, + ) + + if isinstance(reference, tuple): + reference, reference_aux = reference + else: + reference_aux = {} + + if "GELU" in epilog_name and result_type not in ("float16", "float32"): + assert_fp8_equal(result, reference, atol=1e-1, rtol=1e-1) + else: + assert_fp8_equal(result, reference) + + # Compare auxiliary outputs + assert aux.keys() == reference_aux.keys() + for key in aux: + if key == "relu_aux": + x = unpack_bitmask(aux[key], (m, n)) + y = unpack_bitmask(reference_aux[key], (m, n)) + assert torch.mean((x == y).type(torch.float32)) > 0.99 + elif key == "gelu_aux": + assert_fp8_equal(aux[key], reference_aux[key]) + elif key == "drelu_bgrad" or key == "dgelu_bgrad": + assert_fp8_equal(aux[key], reference.sum(axis=1), atol=1e-1, rtol=1e-1) + else: + raise RuntimeError(f"Test for {key} not implemented") + + +def test_helpers(): + """ + Tests MXFP8 helpers. 
+ """ + x = torch.ones((1024, 3 * 1024), dtype=torch.float8_e4m3fn) + scales = matmul_helpers.create_mxfp8_scale(x, 3) + y = matmul_helpers.apply_mxfp8_scale(x, scales) + assert_fp8_equal(y, x.type(torch.float32) * 8) + z = matmul_helpers.apply_mxfp8_scale(y, matmul_helpers.invert_mxfp8_scale(scales)) + assert_fp8_equal(z, x.type(torch.float32)) + + +@pytest.mark.parametrize("M,N", ((1024, 3 * 1024), (128, 128), (5 * 1024, 256))) +@pytest.mark.parametrize("nsamples", (1, 7, 100)) +@pytest.mark.parametrize("input_format", ("vectors", "ints")) +def test_indexing_helpers(M, N, nsamples, input_format): + """ + Tests indexing helpers. + """ + tensor = torch.zeros((M, N), dtype=torch.float8_e4m3fn) + xs, ys = (torch.randint(size=(nsamples,), low=0, high=d, dtype=torch.int32) for d in (M, N)) + + full = matmul_helpers.get_mxfp8_scale_offset(tensor, torch.meshgrid(*(torch.arange(d) for d in (M, N)), indexing="ij")) + reference = full[xs, ys] + + if input_format == "vectors": + result = matmul_helpers.get_mxfp8_scale_offset(tensor, (xs, ys)) + assert torch.all(result == reference) + elif input_format == "ints": + result = [matmul_helpers.get_mxfp8_scale_offset(tensor, (x, y)) for x, y in zip(xs, ys, strict=True)] + assert all(res == ref for res, ref in zip(result, reference, strict=True)) + else: + raise RuntimeError + + +@pytest.mark.parametrize("order", ("t", "b", "tb", "bt", "tbt", "btb")) +def test_mxfp8_and_fp8(order): + """ + Test if MXFP8 and FP8 work together + """ + m, n, k = 256, 256, 256 + + a = torch.zeros(m, k, device="cuda", dtype=torch.float8_e4m3fn) + b = torch.zeros(n, k, device="cuda", dtype=torch.float8_e4m3fn).T + + for kind in order: + if kind == "t": + # Tensor-wide scaling + matmul(a, b, quantization_scales={"a": 1, "b": 1, "d": 1}) + else: + # Block scaling + matmul( + a, + b, + options={"block_scaling": True}, + quantization_scales={ + "a": matmul_helpers.create_mxfp8_scale(a, 0), + "b": matmul_helpers.create_mxfp8_scale(b, 0), + }, + ) + + 
+@pytest.mark.parametrize( + "m,n,k", + ((128, 128, 128),), +) +@pytest.mark.parametrize(("use_cuda"), (True, False)) +@pytest.mark.parametrize("atype,btype,ctype,dtype", SUPPORTED_TYPE_COMBINATIONS) +def test_validation_d_scale(m, n, k, atype, btype, ctype, dtype, use_cuda): + """ + Test if an error is raised if D scale is provided. + """ + a, b, c, alpha, beta = generate_simple_inputs(m, n, k, atype, btype, ctype, use_cuda=use_cuda) + + ascales = generate_mxfp8_scales(a, (-10, 10), use_cuda=use_cuda) + bscales = generate_mxfp8_scales(b, (-10, 10), use_cuda=use_cuda) + scales = {"a": ascales, "b": bscales, "d": torch.zeros(m, n).type(torch.uint8).cuda()} + options = {"result_type": NAME_TO_DATA_TYPE[dtype] if dtype else None, "block_scaling": True} + with pytest.raises(ValueError, match="Quantization scaling is not supported for D when `block_scaling` option is enabled."): + matmul(a, b, c=c, alpha=alpha, beta=beta, quantization_scales=scales, options=options) + + +@pytest.mark.parametrize( + "m,n,k", + ((128, 128, 128),), +) +@pytest.mark.parametrize(("use_cuda"), (True, False)) +@pytest.mark.parametrize("atype,btype,ctype,dtype", SUPPORTED_TYPE_COMBINATIONS) +@pytest.mark.parametrize("scales_provided", ("", "a", "b")) +def test_validation_all_scales_required(m, n, k, atype, btype, ctype, dtype, scales_provided, use_cuda): + """ + Test if an error is raised if not all scales are provided. 
+ """ + a, b, c, alpha, beta = generate_simple_inputs(m, n, k, atype, btype, ctype, use_cuda=use_cuda) + + result_type = expected_result_type(atype, btype, ctype, dtype) + ascales = generate_mxfp8_scales(a, (-10, 10), use_cuda=use_cuda) + bscales = generate_mxfp8_scales(b, (-10, 10), use_cuda=use_cuda) + scales = {} + if "a" in scales_provided: + scales["a"] = ascales + if "b" in scales_provided: + scales["b"] = bscales + + options = {"result_type": NAME_TO_DATA_TYPE[dtype] if dtype else None, "block_scaling": True} + + ok = "a" in scales_provided and "b" in scales_provided and ("d" in scales_provided or "float8" not in result_type) + + if not ok: + with pytest.raises(ValueError, match=r"Scale for . is not specified"): + matmul(a, b, c=c, alpha=alpha, beta=beta, quantization_scales=scales, options=options) + + +@pytest.mark.parametrize("atype,btype,ctype,dtype", SUPPORTED_TYPE_COMBINATIONS) +@pytest.mark.parametrize("a_scale_range", ((-5, 5),)) +@pytest.mark.parametrize("b_scale_range", ((-5, 5),)) +@pytest.mark.parametrize( + "a_batch,b_batch", + ( + ((3,), ()), + ((), (3,)), + ((2,), (3,)), + ((2, 2), (1, 4)), + ), +) +@pytest.mark.parametrize( + "m,n,k", + ((128, 128, 128),), +) +@pytest.mark.parametrize(("use_cuda"), (True, False)) +def test_validation_ab_batches_different( + m, n, k, atype, btype, ctype, dtype, a_scale_range, b_scale_range, a_batch, b_batch, use_cuda +): + """ + Tests if MXFP8 raises an error when batch sizes are different. 
+ """ + + def sample_batch(batch_shape, matrix_shape, type, transposed=False): + shape = (*batch_shape, *matrix_shape) + if transposed: + shape = (*shape[:-2], shape[-1], shape[-2]) + x = sample_matrix("torch", type, shape, use_cuda=use_cuda, min=0, max=2) + return x.swapaxes(-1, -2) if transposed else x + + a = sample_batch(a_batch, (m, k), atype, transposed=False) + b = sample_batch(b_batch, (k, n), btype, transposed=True) + + ascales = generate_mxfp8_scales(a, a_scale_range, use_cuda=use_cuda) + bscales = generate_mxfp8_scales(b, b_scale_range, use_cuda=use_cuda) + + options = {"result_type": NAME_TO_DATA_TYPE[dtype] if dtype else None, "block_scaling": True} + scales = {"a": ascales, "b": bscales} + with pytest.raises( + ValueError, + match=r"When block_scaling=True, the batch dimensions of A and B must match \(broadcasting is not supported\).", + ): + matmul(a, b, quantization_scales=scales, options=options) + + +@pytest.mark.parametrize("atype,btype,ctype,dtype", SUPPORTED_TYPE_COMBINATIONS) +@pytest.mark.parametrize( + "m,n,k", + ( + (128 + 1, 128, 128), + (128, 128 + 1, 128), + (128, 128, 128 + 1), + (128 + 64, 128, 128), + (128, 128 + 64, 128), + (128, 128, 128 + 64), + ), +) +@pytest.mark.parametrize("a_scale_range", ((-5, 5),)) +@pytest.mark.parametrize("b_scale_range", ((-5, 5),)) +@pytest.mark.parametrize(("use_cuda"), (True, False)) +def test_validation_shapes(m, n, k, atype, btype, ctype, dtype, a_scale_range, b_scale_range, use_cuda): + """ + Tests if an error is raised when M, N, K are not divisible by 128. 
+ """ + a, b, c, alpha, beta = generate_simple_inputs(m, n, k, atype, btype, ctype, use_cuda=use_cuda) + + ascales = generate_mxfp8_scales(a, a_scale_range, use_cuda=use_cuda, validate_shape=False) + bscales = generate_mxfp8_scales(b, b_scale_range, use_cuda=use_cuda, validate_shape=False) + + scales = {"a": ascales, "b": bscales} + options = {"result_type": NAME_TO_DATA_TYPE[dtype] if dtype else None, "block_scaling": True} + with pytest.raises(ValueError, match=f"M={m} N={n} K={k} must be divisible by 128 when block_scaling=True"): + matmul(a, b, c=c, alpha=alpha, beta=beta, quantization_scales=scales, options=options) + + +@pytest.mark.parametrize("atype,btype,ctype,dtype", SUPPORTED_TYPE_COMBINATIONS) +@pytest.mark.parametrize( + "m,n,k", + ( + (128, 128, 128), + (128, 256, 512), + (512, 512, 128), + ), +) +@pytest.mark.parametrize(("use_cuda"), (True, False)) +@pytest.mark.parametrize( + "a_err,b_err", + ( + ( + (1, 0), + (0, 1), + (1, 1), + (32, 32), + (-1, -1), + (-1, 0), + (0, -1), + ) + ), +) +def test_validation_scales_shapes(m, n, k, atype, btype, ctype, dtype, a_err, b_err, use_cuda): + """ + Tests if an error is raised when scale shapes don't match input shapes. 
+ """ + a, b, c, alpha, beta = generate_simple_inputs(m, n, k, atype, btype, ctype, use_cuda=use_cuda) + + ascales = torch.zeros(size=(a.nelement() // 32 + a_err,), dtype=torch.uint8) + bscales = torch.zeros(size=(b.nelement() // 32 + b_err,), dtype=torch.uint8) + scales = {"a": ascales, "b": bscales} + options = {"result_type": NAME_TO_DATA_TYPE[dtype] if dtype else None, "block_scaling": True} + + with pytest.raises(ValueError, match=r"Scales for (A|B) should have shape .* Got .*"): + matmul(a, b, c=c, alpha=alpha, beta=beta, quantization_scales=scales, options=options) + + +@pytest.mark.parametrize("atype,btype,ctype,dtype", SUPPORTED_TYPE_COMBINATIONS[:3]) +@pytest.mark.parametrize(("use_cuda"), (True, False)) +def test_validation_scalar_scales(atype, btype, ctype, dtype, use_cuda): + """ + Tests if scalar scales are disallowed when block_scaling=True. + """ + a, b, c, alpha, beta = generate_simple_inputs(128, 128, 128, atype, btype, ctype, use_cuda=use_cuda) + scales = {"a": 1, "b": 1} + options = {"result_type": NAME_TO_DATA_TYPE[dtype] if dtype else None, "block_scaling": True} + with pytest.raises(ValueError, match="A scalar tensor-wide scale factor is not allowed when block_scaling=True."): + matmul(a, b, c=c, alpha=alpha, beta=beta, quantization_scales=scales, options=options) + + +@pytest.mark.parametrize("atype,btype,ctype,dtype", SUPPORTED_TYPE_COMBINATIONS[:3]) +@pytest.mark.parametrize( + "m,n,k", + ((128, 128, 128),), +) +@pytest.mark.parametrize(("scale_dtype"), (torch.int8, torch.float32)) +def test_validation_scales_dtype(m, n, k, atype, btype, ctype, dtype, scale_dtype): + """ + Tests if scales of invalid type are rejected. 
+ """ + a, b, c, alpha, beta = generate_simple_inputs(m, n, k, atype, btype, ctype, use_cuda=False) + + ascales = torch.zeros(size=(a.nelement() // 32,), dtype=scale_dtype) + bscales = torch.zeros(size=(b.nelement() // 32,), dtype=scale_dtype) + scales = {"a": ascales, "b": bscales} + options = {"result_type": NAME_TO_DATA_TYPE[dtype] if dtype else None, "block_scaling": True} + + with pytest.raises(ValueError, match="Block scales for (A|B) should be uint8 tensor"): + matmul(a, b, c=c, alpha=alpha, beta=beta, quantization_scales=scales, options=options) diff --git a/tests/nvmath_tests/linalg/advanced/matmul/test_options.py b/tests/nvmath_tests/linalg/advanced/matmul/test_options.py index 1411949..ef44e32 100644 --- a/tests/nvmath_tests/linalg/advanced/matmul/test_options.py +++ b/tests/nvmath_tests/linalg/advanced/matmul/test_options.py @@ -1,13 +1,14 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. 
# # SPDX-License-Identifier: Apache-2.0 -import nvmath from nvmath.bindings import cublas from nvmath.linalg.advanced import matmul, Matmul, MatmulOptions -from .utils import * -import pytest import logging +import nvmath +import pytest + +from .utils import assert_tensors_equal, sample_matrix, is_torch_available try: import cupy_backends.cuda @@ -25,12 +26,12 @@ """ -def check_matmul_with_options(size, options, use_cuda=False, dtype="float32", atol=None): +def check_matmul_with_options(size, options, use_cuda=False, dtype="float32", atol=None, rtol=None): a = b = sample_matrix("numpy/cupy" if dtype != "bfloat16" else "torch", dtype, (size, size), use_cuda) is_complex = "_C_" in str(options.scale_type) or (options.compute_type is None and "complex" in dtype) alpha = 0.42 + 0.24j if is_complex else 0.42 result = matmul(a, b, alpha=alpha, options=options) - assert_tensors_equal(result, alpha * (a @ b), atol=atol) + assert_tensors_equal(result, alpha * (a @ b), atol=atol, rtol=rtol) return result @@ -102,6 +103,7 @@ def test_compute_scale_type(dtype, compute_type, scale_type): dtype=dtype, use_cuda=True, atol=0.1, + rtol=None, ) diff --git a/tests/nvmath_tests/linalg/advanced/matmul/test_perf.py b/tests/nvmath_tests/linalg/advanced/matmul/test_perf.py index 50518b1..b2a763c 100644 --- a/tests/nvmath_tests/linalg/advanced/matmul/test_perf.py +++ b/tests/nvmath_tests/linalg/advanced/matmul/test_perf.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. 
# # SPDX-License-Identifier: Apache-2.0 diff --git a/tests/nvmath_tests/linalg/advanced/matmul/test_planning.py b/tests/nvmath_tests/linalg/advanced/matmul/test_planning.py index e0c07a3..9c06d59 100644 --- a/tests/nvmath_tests/linalg/advanced/matmul/test_planning.py +++ b/tests/nvmath_tests/linalg/advanced/matmul/test_planning.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 @@ -6,12 +6,17 @@ This set of tests checks basic properties of separated planning. """ -import re -import nvmath -from nvmath.linalg.advanced import Matmul, MatmulPlanPreferences from nvmath.bindings import cublasLt as cublaslt +from nvmath.linalg.advanced import Matmul, MatmulPlanPreferences +import numpy as np import pytest -from .utils import * + +from .utils import sample_matrix, allow_cublas_unsupported, assert_tensors_equal + +try: + import cupy +except ModuleNotFoundError: + pytest.skip("cupy required for matmul tests", allow_module_level=True) @pytest.mark.parametrize("framework", ("numpy/cupy", "torch")) @@ -176,3 +181,74 @@ def test_algorithm_not_planned(framework, use_cuda): match=r"Algorithm passed to execute\(\) has to be included in the plan\(\) algorithms", ): mm2.execute(algorithm=algos[0]) + +def test_algorithm_ids(): + a = cupy.zeros((10, 10)) + b = cupy.zeros((10, 10)) + with Matmul(a, b) as mm: + assert len(mm.applicable_algorithm_ids(limit=4)) <= 4 + +def test_algo_attributes(): + ''' + Test Algorithm class setter/property + ''' + m, n, k = 24, 24, 24 + a = cupy.random.rand(m, k) + b = cupy.random.rand(k, n) + + with Matmul(a, b) as mm: + algos = mm.plan() + best = algos[0] + + # An attribute may not be supported in all cuBLASLt versions (INVALID_VALUE). + + message = "The attribute '{attr}' is not supported in this version." 
+ with allow_cublas_unsupported( + allow_invalid_value=True, + message=message.format(attr='stages') + ): + if best.capabilities.stages_ids: + best.stages = best.capabilities.stages_ids[-1] + assert best.stages == best.capabilities.stages_ids[-1] + + with allow_cublas_unsupported( + allow_invalid_value=True, + message=message.format(attr='split_k') + ): + best.split_k = 4 + assert best.split_k == 4 + + with allow_cublas_unsupported( + allow_invalid_value=True, + message=message.format(attr='reduction_scheme') + ): + best.reduction_scheme = best.capabilities.reduction_scheme_mask + assert best.reduction_scheme == best.capabilities.reduction_scheme_mask + + with allow_cublas_unsupported( + allow_invalid_value=True, + message=message.format(attr='cta_swizzling') + ): + best.cta_swizzling = True + assert best.cta_swizzling == True + + with allow_cublas_unsupported( + allow_invalid_value=True, + message=message.format(attr='custom_option') + ): + best.custom_option = 1 + assert best.custom_option == 1 + + with allow_cublas_unsupported( + allow_invalid_value=True, + message=message.format(attr='inner_shape') + ): + best.inner_shape = cublaslt.MatmulInnerShape.MMA884 + assert best.inner_shape == cublaslt.MatmulInnerShape.MMA884 + + with allow_cublas_unsupported( + allow_invalid_value=True, + message=message.format(attr='cluster_shape') + ): + best.cluster_shape = (1,1,1) + assert best.cluster_shape == (1,1,1) diff --git a/tests/nvmath_tests/linalg/advanced/matmul/test_reset.py b/tests/nvmath_tests/linalg/advanced/matmul/test_reset.py index edd479a..c1645a7 100644 --- a/tests/nvmath_tests/linalg/advanced/matmul/test_reset.py +++ b/tests/nvmath_tests/linalg/advanced/matmul/test_reset.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. 
# # SPDX-License-Identifier: Apache-2.0 @@ -8,7 +8,8 @@ import nvmath import pytest -from .utils import * + +from .utils import assert_tensors_equal, random_torch_complex, sample_matrix, skip_if_cublas_before @pytest.mark.parametrize("framework", ("numpy/cupy", "torch")) diff --git a/tests/nvmath_tests/linalg/advanced/matmul/utils.py b/tests/nvmath_tests/linalg/advanced/matmul/utils.py index ca7db5e..1ee0374 100644 --- a/tests/nvmath_tests/linalg/advanced/matmul/utils.py +++ b/tests/nvmath_tests/linalg/advanced/matmul/utils.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 @@ -17,10 +17,11 @@ import numpy as np import nvmath +import datetime import re -def sample_matrix(framework, dtype, shape, use_cuda): +def sample_matrix(framework, dtype, shape, use_cuda, min=-5, max=5): """ Generates a sample matrix with random contents. 
""" @@ -32,7 +33,7 @@ def sample_matrix(framework, dtype, shape, use_cuda): if torch is None: pytest.skip("pytorch not present") dtype = getattr(torch, dtype) - r = (10 * torch.rand(shape) - 5).type(dtype) + r = ((max - min) * torch.rand(shape) + min).type(dtype) return r.cuda() if use_cuda else r elif framework == "cupy": if not use_cuda: @@ -83,26 +84,38 @@ def get_framework(tensor): raise AssertionError() -def get_tolerance(value): +def get_machine_eps(value): eps = np.finfo(to_numpy(value).dtype).eps if torch is not None and value.dtype == torch.bfloat16: eps = 2**-6 - return eps**0.5 + return eps -def compare_tensors(result, reference, atol=None): +def get_absolute_tolerance(value): + return get_machine_eps(value) ** 0.5 + + +def get_relative_tolerance(value): + return max(1e-5, get_machine_eps(value) ** 0.5) + + +def compare_tensors(result, reference, atol=None, rtol=None): if atol is None: - atol = get_tolerance(result) - return np.allclose(to_numpy(result), to_numpy(reference), atol=atol) + atol = get_absolute_tolerance(result) + if rtol is None: + rtol = get_relative_tolerance(result) + return np.allclose(to_numpy(result), to_numpy(reference), atol=atol, rtol=rtol) -def assert_tensors_equal(result, reference, atol=None): +def assert_tensors_equal(result, reference, atol=None, rtol=None): """ Checks if result is close to the provided numpy reference. """ assert result is not reference, "same object passed as `result` and `reference`!" 
- ok = compare_tensors(result, reference, atol=atol) + ok = compare_tensors(result, reference, atol=atol, rtol=rtol) if not ok: + print(f"Absdiff: {np.max(np.abs(to_numpy(result) - to_numpy(reference)))}") + print(f"Reldiff: {np.max(np.abs(to_numpy(result) - to_numpy(reference)) / (np.abs(to_numpy(reference)) + 0.000001) )}") print("Result:\n", result) print("Reference:\n", reference) assert ok @@ -128,12 +141,30 @@ def skip_if_cublas_before(version, message="Unsupported cublas version."): return False +# Seed from the date ordinal: changes daily, identical across processes (hash() of a date is salted per process). +matmul_with_random_autotune_rng = np.random.default_rng(seed=datetime.date.today().toordinal()) + + +def matmul_with_random_autotune(*args, p=0.25, **kwargs): + """ + Executes matmul, using autotuning with probability p. + """ + constructor_kwargs = ("c", "alpha", "beta", "qualifiers", "options", "stream", "quantization_scales") + plan_kwargs = ("preferences", "epilog", "epilog_inputs", "algorithms", "stream") + execute_kwargs = ("stream",) + mm = nvmath.linalg.advanced.Matmul(*args, **{k: kwargs[k] for k in constructor_kwargs if k in kwargs}) + mm.plan(**{k: kwargs[k] for k in plan_kwargs if k in kwargs}) + if matmul_with_random_autotune_rng.random() < p: + mm.autotune() + return mm.execute(**{k: kwargs[k] for k in execute_kwargs if k in kwargs}) + + class allow_cublas_unsupported: def __init__(self, *, allow_invalid_value=True, unsupported_before=None, message="Unsupported cublas version."): if allow_invalid_value: - self.regex = r"\(CUBLAS_STATUS_(NOT_SUPPORTED|INVALID_VALUE)\)" + self.regex = r"\(?(CUBLAS_STATUS_)?(NOT_SUPPORTED|INVALID_VALUE)\)?" else: - self.regex = r"\(CUBLAS_STATUS_NOT_SUPPORTED\)" + self.regex = r"\(?(CUBLAS_STATUS_)?NOT_SUPPORTED\)?" self.unsupported_before = unsupported_before self.message = message