cuda::std::complex specializations for half and bfloat #1140

Merged: 37 commits, merged Mar 12, 2024. Showing changes from 22 commits.

Commits
e578bfc
Complex for small float types.
griwes Nov 21, 2023
4e81a2a
Add a missing bfloat include to promote.h.
griwes Nov 22, 2023
cc0eeda
Only include under cudacc, constexpr fixes.
griwes Nov 22, 2023
b9b831f
Add tests and special cases for cmath functions for half/bfloat.
griwes Dec 19, 2023
5c2b197
Add an opt-out from including bf16, and respect CUB's opt-out.
griwes Jan 27, 2024
b139717
Detect existence of both fp headers, fix C++11.
griwes Jan 27, 2024
a126862
Silence unused function warnings from cuda_bf16.h in clang.
griwes Jan 27, 2024
205db7e
Rework the detection logic for FP16 and BF16.
griwes Jan 30, 2024
a7e1ec9
Test fixes.
griwes Feb 7, 2024
8121bba
Address review comments.
griwes Feb 7, 2024
d33debb
Use structs instead of extended lambdas in the float tests.
griwes Feb 7, 2024
67cdec9
Use the correct type to silence an msvc warning this time.
griwes Feb 7, 2024
40df11e
Also enable 16-bit complex with clang cuda (with CUDA 12.2+).
griwes Feb 9, 2024
ec0870e
Address the remaining review comment.
griwes Feb 13, 2024
340cce5
Documentation updates.
griwes Feb 13, 2024
f1d6771
Address review comments from miscco
miscco Feb 22, 2024
15ec0fa
Merge branch 'main' into pr/griwes/1140
miscco Feb 22, 2024
b47fa9e
Fix some compilers
miscco Feb 22, 2024
3796172
namespaces...
miscco Feb 23, 2024
1ab3940
Actually make the cmath subheaders work
miscco Feb 23, 2024
0073d2d
Do not mess up namespaces around includes
miscco Feb 23, 2024
a6e2a06
Use proper qualification
miscco Feb 23, 2024
53acac9
Add a reference to the hisinf NVCC bug.
griwes Feb 27, 2024
aae4fa5
Remove no longer needed #ifs.
griwes Feb 27, 2024
3c6f0f9
Merge remote-tracking branch 'origin/main' into feature/small-complex
griwes Feb 27, 2024
ec05b1c
Update the docs to mention the 2.4.0 version
griwes Feb 29, 2024
0a7b850
Update libcudacxx/docs/standard_api/numerics_library/complex.md
miscco Mar 11, 2024
df9a93c
Merge branch 'main' into feature/small-complex
miscco Mar 11, 2024
81172ac
Fix half and bfloat in ptx header
miscco Mar 11, 2024
8d7742f
Merge branch 'main' into pr/griwes/1140
miscco Mar 11, 2024
07c2ab6
Actually define the half / bfloat constructors from float / double
miscco Mar 12, 2024
74c86e8
Add fallbacks for trigonometric functions for half / float
miscco Mar 12, 2024
b0c5a10
Actually reorg the whole half / bfloat organization
miscco Mar 12, 2024
7283372
Merge branch 'main' into feature/small-complex
miscco Mar 12, 2024
fafff52
Add `inline` to the trigonometric specializations
miscco Mar 12, 2024
1b1e449
Add missing host device
miscco Mar 12, 2024
4b01e5b
Drop long double
miscco Mar 12, 2024
13 changes: 13 additions & 0 deletions libcudacxx/docs/standard_api/numerics_library/complex.md
@@ -19,9 +19,22 @@ User-defined floating-point literals must be specified in terms of

## Customizations

### Handling of infinities

Our implementation recovers infinite values during multiplication and division by default. This canonicalization adds significant runtime overhead, so we allow disabling it when it is not desired.

Defining `LIBCUDACXX_ENABLE_SIMPLIFIED_COMPLEX_OPERATIONS` disables canonicalization for both multiplication *and* division.

Defining `LIBCUDACXX_ENABLE_SIMPLIFIED_COMPLEX_MULTIPLICATION` or `LIBCUDACXX_ENABLE_SIMPLIFIED_COMPLEX_DIVISION` disables canonicalization for multiplication or division individually.

### Support for half and bfloat16 (since libcu++ 2.4.0)

Our implementation includes support for the `__half` type from `<cuda_fp16.h>`, when the CUDA toolkit version is at
least 12.2. This is detected automatically when compiling through NVCC. If you are compiling a host-only translation
unit directly with the host compiler, you must define the macro `LIBCUDACXX_ENABLE_HOST_NVFP16` prior to including any
libcu++ headers, and you must ensure that the `<cuda_fp16.h>` header that's found by the compiler comes from a CUDA
toolkit version 12.2 or higher.

Our implementation includes support for the `__nv_bfloat16` type from `<cuda_bf16.h>`, when the conditions for the
support of `__half` are fulfilled, and when `CUB_DISABLE_BF16_SUPPORT` is **not** defined.
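For a host-only translation unit compiled directly with the host compiler, the opt-in might look like the following sketch. It assumes a CUDA 12.2+ toolkit's headers are on the include path and is therefore not buildable without one; the macro ordering is the essential point:

```cpp
// Must be defined before any libcu++ header is included, so that the
// detection logic in <__config> sees it.
#define LIBCUDACXX_ENABLE_HOST_NVFP16

#include <cuda/std/complex>

int main()
{
    // __half comes from <cuda_fp16.h>, pulled in by libcu++.
    cuda::std::complex<__half> z{__half{1.0f}, __half{2.0f}};
    z *= z; // evaluated entirely on the host
}
```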

@@ -84,6 +84,10 @@ set(files
__cuda/barrier.h
__cuda/chrono.h
__cuda/climits_prelude.h
__cuda/cmath_nvbf16.h
__cuda/cmath_nvfp16.h
__cuda/complex_nvbf16.h
__cuda/complex_nvfp16.h
__cuda/cstddef_prelude.h
__cuda/cstdint_prelude.h
__cuda/latch.h
26 changes: 24 additions & 2 deletions libcudacxx/include/cuda/std/detail/libcxx/include/__config
@@ -86,6 +86,9 @@
#if defined(_LIBCUDACXX_CUDACC) && _LIBCUDACXX_CUDACC_VER < 1108000
#define _LIBCUDACXX_CUDACC_BELOW_11_8
#endif // defined(_LIBCUDACXX_CUDACC) && _LIBCUDACXX_CUDACC_VER < 1108000
#if defined(_LIBCUDACXX_CUDACC) && _LIBCUDACXX_CUDACC_VER < 1202000
#define _LIBCUDACXX_CUDACC_BELOW_12_2
#endif // defined(_LIBCUDACXX_CUDACC) && _LIBCUDACXX_CUDACC_VER < 1202000
#if defined(_LIBCUDACXX_CUDACC) && _LIBCUDACXX_CUDACC_VER < 1203000
#define _LIBCUDACXX_CUDACC_BELOW_12_3
#endif // defined(_LIBCUDACXX_CUDACC) && _LIBCUDACXX_CUDACC_VER < 1203000
@@ -1151,8 +1154,27 @@ typedef __char32_t char32_t;
#endif
#endif // _LIBCUDACXX_HAS_NO_LONG_DOUBLE

#ifndef _LIBCUDACXX_HAS_NO_ATTRIBUTE_NO_UNIQUE_ADDRESS
#if __has_cpp_attribute(msvc::no_unique_address)
# ifndef _LIBCUDACXX_HAS_NVFP16
# if __has_include(<cuda_fp16.h>) \
&& defined(__cuda_std__) \
&& (defined(_LIBCUDACXX_COMPILER_CLANG_CUDA) || !defined(_LIBCUDACXX_CUDACC_BELOW_12_2)) \
&& (!defined(_LIBCUDACXX_COMPILER_CLANG_CUDA) || CUDA_VERSION >= 12020) \
&& (defined(_LIBCUDACXX_CUDACC) || defined(LIBCUDACXX_ENABLE_HOST_NVFP16))
# define _LIBCUDACXX_HAS_NVFP16
# endif
# endif // !_LIBCUDACXX_HAS_NVFP16

# ifndef _LIBCUDACXX_HAS_NVBF16
# if __has_include(<cuda_bf16.h>) \
&& defined(__cuda_std__) \
&& defined(_LIBCUDACXX_HAS_NVFP16) \
&& !defined(CUB_DISABLE_BF16_SUPPORT)
# define _LIBCUDACXX_HAS_NVBF16
# endif
# endif // !_LIBCUDACXX_HAS_NVBF16

# ifndef _LIBCUDACXX_HAS_NO_ATTRIBUTE_NO_UNIQUE_ADDRESS
# if __has_cpp_attribute(msvc::no_unique_address)
// MSVC implements [[no_unique_address]] as a silent no-op currently.
// (If/when MSVC breaks its C++ ABI, it will be changed to work as intended.)
// However, MSVC implements [[msvc::no_unique_address]] which does what
@@ -0,0 +1,162 @@
// -*- C++ -*-
//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//

#ifndef _LIBCUDACXX___CUDA_CMATH_NVBF16_H
#define _LIBCUDACXX___CUDA_CMATH_NVBF16_H

#ifndef __cuda_std__
# include <config>
#endif // __cuda_std__

#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
# pragma GCC system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
# pragma clang system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
# pragma system_header
#endif // no system header

#if defined(_LIBCUDACXX_HAS_NVBF16)

_CCCL_DIAG_PUSH
_CCCL_DIAG_SUPPRESS_CLANG("-Wunused-function")
# include <cuda_bf16.h>
_CCCL_DIAG_POP

# include <nv/target>

# include "../__type_traits/integral_constant.h"
# include "../cmath"

_LIBCUDACXX_BEGIN_NAMESPACE_STD

// trigonometric and other transcendental functions
inline _LIBCUDACXX_INLINE_VISIBILITY __nv_bfloat16 sin(__nv_bfloat16 __v)
{
NV_IF_ELSE_TARGET(NV_IS_DEVICE, (return ::hsin(__v);), (return __nv_bfloat16(::sin(float(__v)));))
}

inline _LIBCUDACXX_INLINE_VISIBILITY __nv_bfloat16 sinh(__nv_bfloat16 __v)
{
return __nv_bfloat16(::sinh(float(__v)));
}

inline _LIBCUDACXX_INLINE_VISIBILITY __nv_bfloat16 cos(__nv_bfloat16 __v)
{
NV_IF_ELSE_TARGET(NV_IS_DEVICE, (return ::hcos(__v);), (return __nv_bfloat16(::cos(float(__v)));))
}

inline _LIBCUDACXX_INLINE_VISIBILITY __nv_bfloat16 cosh(__nv_bfloat16 __v)
{
return __nv_bfloat16(::cosh(float(__v)));
}

inline _LIBCUDACXX_INLINE_VISIBILITY __nv_bfloat16 exp(__nv_bfloat16 __v)
{
NV_IF_ELSE_TARGET(NV_IS_DEVICE, (return ::hexp(__v);), (return __nv_bfloat16(::exp(float(__v)));))
}

inline _LIBCUDACXX_INLINE_VISIBILITY __nv_bfloat16 hypot(__nv_bfloat16 __x, __nv_bfloat16 __y)
{
return __nv_bfloat16(::hypot(float(__x), float(__y)));
}

inline _LIBCUDACXX_INLINE_VISIBILITY __nv_bfloat16 atan2(__nv_bfloat16 __x, __nv_bfloat16 __y)
{
return __nv_bfloat16(::atan2(float(__x), float(__y)));
}

inline _LIBCUDACXX_INLINE_VISIBILITY __nv_bfloat16 log(__nv_bfloat16 __x)
{
NV_IF_ELSE_TARGET(NV_IS_DEVICE, (return ::hlog(__x);), (return __nv_bfloat16(::log(float(__x)));))
}

inline _LIBCUDACXX_INLINE_VISIBILITY __nv_bfloat16 sqrt(__nv_bfloat16 __x)
{
NV_IF_ELSE_TARGET(NV_IS_DEVICE, (return ::hsqrt(__x);), (return __nv_bfloat16(::sqrt(float(__x)));))
}

// floating point helper
inline _LIBCUDACXX_INLINE_VISIBILITY bool signbit(__nv_bfloat16 __v)
{
return ::signbit(::__bfloat162float(__v));
}

inline _LIBCUDACXX_INLINE_VISIBILITY bool __constexpr_isnan(__nv_bfloat16 __x) noexcept
{
return ::__hisnan(__x);
}

inline _LIBCUDACXX_INLINE_VISIBILITY bool isnan(__nv_bfloat16 __v)
{
return __constexpr_isnan(__v);
}

inline _LIBCUDACXX_INLINE_VISIBILITY bool __constexpr_isinf(__nv_bfloat16 __x) noexcept
{
# if _CCCL_STD_VER >= 2020
// Workaround: ::__hisinf misbehaves under C++20 with NVCC.
// XXX nvbug number pending
// Detect infinity arithmetically instead: only an infinity is itself
// non-NaN yet produces NaN under __x - __x.
return !::__hisnan(__x) && ::__hisnan(__x - __x);
# else // ^^^ C++20 ^^^ / vvv C++17 vvv
return ::__hisinf(__x) != 0;
# endif // _CCCL_STD_VER <= 2017
}

inline _LIBCUDACXX_INLINE_VISIBILITY bool isinf(__nv_bfloat16 __v)
{
return __constexpr_isinf(__v);
}

inline _LIBCUDACXX_INLINE_VISIBILITY bool __constexpr_isfinite(__nv_bfloat16 __x) noexcept
{
return !__constexpr_isnan(__x) && !__constexpr_isinf(__x);
}

inline _LIBCUDACXX_INLINE_VISIBILITY bool isfinite(__nv_bfloat16 __v)
{
return __constexpr_isfinite(__v);
}

inline _LIBCUDACXX_INLINE_VISIBILITY __nv_bfloat16 __constexpr_copysign(__nv_bfloat16 __x, __nv_bfloat16 __y) noexcept
{
return __nv_bfloat16(::copysignf(float(__x), float(__y)));
}

inline _LIBCUDACXX_INLINE_VISIBILITY __nv_bfloat16 copysign(__nv_bfloat16 __x, __nv_bfloat16 __y)
{
return __constexpr_copysign(__x, __y);
}

inline _LIBCUDACXX_INLINE_VISIBILITY __nv_bfloat16 __constexpr_fabs(__nv_bfloat16 __x) noexcept
{
return ::__habs(__x);
}

inline _LIBCUDACXX_INLINE_VISIBILITY __nv_bfloat16 fabs(__nv_bfloat16 __x)
{
return __constexpr_fabs(__x);
}

inline _LIBCUDACXX_INLINE_VISIBILITY __nv_bfloat16 abs(__nv_bfloat16 __x)
{
return __constexpr_fabs(__x);
}

inline _LIBCUDACXX_INLINE_VISIBILITY __nv_bfloat16 __constexpr_fmax(__nv_bfloat16 __x, __nv_bfloat16 __y) noexcept
{
return ::__hmax(__x, __y);
}

_LIBCUDACXX_END_NAMESPACE_STD

#endif // _LIBCUDACXX_HAS_NVBF16

#endif // _LIBCUDACXX___CUDA_CMATH_NVBF16_H