Add jobs using clang as CUDA compiler #493

Merged: 70 commits, Oct 11, 2023
Changes from 26 commits

Commits (70)
1e5c6b3
Allow setting CUDA compiler via CMAKE_CUDA_COMPILER envvar.
jrhemstad Sep 26, 2023
388af5b
Move nvcc version check to CUB script.
jrhemstad Sep 26, 2023
b625e93
Add clang-cuda job to matrix.
jrhemstad Sep 26, 2023
260805f
Add compiler field to matrix for clang-cuda.
jrhemstad Sep 26, 2023
5ac0622
Add Thrust clang-cuda job.
jrhemstad Sep 26, 2023
a428b21
Fix formatting.
jrhemstad Sep 26, 2023
3da71f4
s/need/needs/
jrhemstad Sep 26, 2023
25efa7c
Can't spell good.
jrhemstad Sep 26, 2023
ce622cf
[skip-tests] Add clang cuda job to status check job.
jrhemstad Sep 26, 2023
13c95ba
Disable other jobs for now.
jrhemstad Sep 26, 2023
800913c
Disable other jobs in status check.
jrhemstad Sep 26, 2023
5667982
Add output to compute matrix job.
jrhemstad Sep 26, 2023
8e66201
Missing quote.
jrhemstad Sep 26, 2023
8a2560b
Fix logic for enabling CUB benchmarks.
jrhemstad Sep 26, 2023
dc550e7
Fix reference to cuda version in job name.
jrhemstad Sep 26, 2023
afd8f13
make clang-cuda job matrix over libs.
jrhemstad Sep 26, 2023
abb8235
Fix build script to use matrix lib value.
jrhemstad Sep 26, 2023
873db9e
Fix job name in status check.
jrhemstad Sep 26, 2023
93a10e5
Fix formatting.
jrhemstad Sep 26, 2023
26938c2
Fix job name.
jrhemstad Sep 26, 2023
1356a4e
Generate custom matrix with cartesian product of libs.
jrhemstad Sep 26, 2023
7437913
Add hacks that allow clang-cuda to work.
wmaxey Sep 26, 2023
81c8efa
Merge branch 'main' into clang-cuda-ci
jrhemstad Sep 26, 2023
22fcb5e
Merge branch 'clang-cuda-ci' of github.com:jrhemstad/cccl into clang-…
jrhemstad Sep 26, 2023
4d4616b
Do not build RDC tests for Clang CUDA
gevtushenko Sep 26, 2023
4e204b2
Attempt to fix thrust::complex for Clang-CUDA
gevtushenko Sep 27, 2023
9acf004
Fix macro definitions that are nvcc specific
miscco Sep 27, 2023
ff7a43a
Add missing header that is otherwise coming from the cuda side
miscco Sep 27, 2023
1f3e8c4
Fix invalid initialization order in constructor
miscco Sep 27, 2023
7feab38
Fix clang-cuda being picky about attribute orders
miscco Sep 27, 2023
9f70dc0
clang-cuda requires deduction guides to be marked as `__host__ __devi…
miscco Sep 27, 2023
621ca51
Fix some warnings about unused typedefs
miscco Sep 27, 2023
5802424
Fix invalid ifdefs
miscco Sep 27, 2023
523535d
Rename shadowing typedefs
miscco Sep 27, 2023
4085dd7
Work around compiler crash
miscco Sep 27, 2023
71b37bf
Ignore unused private member warning
miscco Sep 27, 2023
0c8b4a0
Remove non compiling code
miscco Sep 27, 2023
afa9004
Add missing includes
miscco Sep 27, 2023
69fe6ed
Fix tests
miscco Sep 27, 2023
c0872b9
We need to force include `force_include.h`
miscco Sep 27, 2023
11dd488
Avoid signed / unsigned warnings
miscco Sep 27, 2023
0be8b9d
Properly escape inside asm
miscco Sep 27, 2023
c5e2d23
Consider clang-cuda for platform-test
miscco Sep 27, 2023
5583d28
clang can handle subsumption
miscco Sep 27, 2023
0a2bd94
Avoid more signed / unsigned issues
miscco Sep 27, 2023
d071520
Escape all the things
miscco Sep 27, 2023
d634859
Silence more warnings
miscco Sep 27, 2023
4223917
Remove trailing commata
miscco Sep 27, 2023
8f4beea
Fix two tests that should fail to compile
miscco Sep 27, 2023
44febea
Fix pipeline divergent threads
miscco Sep 27, 2023
91145f4
Disable two tests that rely on managed variables
miscco Sep 27, 2023
0e2cb57
Disable two tests that rely on managed variables
miscco Sep 27, 2023
3eb8e8b
Merge branch 'clang-cuda-ci' of github.com:jrhemstad/cccl into pr/jrh…
miscco Sep 27, 2023
374ecb2
Fix one more test for SM_80
miscco Sep 27, 2023
0685884
Disable test that fails during runtime with an invalid launch parameter
miscco Sep 27, 2023
b228f26
Re-enable other jobs.
jrhemstad Sep 28, 2023
8980e65
Merge branch 'clang-cuda-ci' of github.com:jrhemstad/cccl into clang-…
jrhemstad Sep 28, 2023
4be1ed2
Re-enable other jobs in status check.
jrhemstad Sep 28, 2023
c8154ee
Update clang-cuda job names.
jrhemstad Sep 28, 2023
7501721
Try not to add invalid flag to clang
miscco Sep 29, 2023
fa10123
try to fix `is_nothrow_invocable` test
miscco Sep 29, 2023
16b6e58
Mark is_swappable test as potentially passing
miscco Sep 29, 2023
34e270a
Make MSVC pass
miscco Sep 29, 2023
894c986
Unfail test that seems to pass
miscco Sep 29, 2023
b95c9e6
Fix test for nvrtc
miscco Sep 30, 2023
42519b6
Fix fail test
miscco Sep 30, 2023
995fea5
Address review comments
miscco Oct 10, 2023
60682bf
Do not pass warnings flags similar to nvcc for clang-cuda
miscco Oct 10, 2023
e1ca980
Merge branch 'main' into clang-cuda-ci
jrhemstad Oct 10, 2023
46f0941
Merge branch 'main' into pr/jrhemstad/493
miscco Oct 11, 2023
6 changes: 6 additions & 0 deletions .github/actions/compute-matrix/compute-matrix.sh
@@ -12,6 +12,10 @@ explode_std_versions() {
jq -cr 'map(. as $o | {std: $o.std[]} + del($o.std))'
}

explode_libs() {
jq -cr 'map(. as $o | {lib: $o.lib[]} + del($o.lib))'
}

extract_matrix() {
local file="$1"
local type="$2"
@@ -23,6 +27,8 @@ extract_matrix() {
write_output "HOST_COMPILERS" "$(echo "$nvcc_full_matrix" | jq -cr '[.[] | .compiler.name] | unique')"
write_output "PER_CUDA_COMPILER_MATRIX" "$(echo "$nvcc_full_matrix" | jq -cr ' group_by(.cuda + .compiler.name) | map({(.[0].cuda + "-" + .[0].compiler.name): .}) | add')"
write_output "NVRTC_MATRIX" "$(echo "$matrix" | jq '.nvrtc' | explode_std_versions)"
local clang_cuda_matrix="$(echo "$matrix" | jq -cr '.["clang-cuda"]' | explode_std_versions | explode_libs)"
write_output "CLANG_CUDA_MATRIX" "$clang_cuda_matrix"
}

main() {
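
For illustration, the explode helpers expand one matrix entry whose std and lib fields are lists into one entry per combination. Below is a minimal standalone sketch of the lib explosion, using an equivalent jq filter rather than the script's exact one, and a made-up input entry:

echo '[{"lib": ["thrust", "cub", "libcudacxx"], "cuda": "12.2", "std": 17}]' \
  | jq -c 'map(.lib[] as $l | . + {lib: $l})'
# Produces one entry per library:
# [{"lib":"thrust","cuda":"12.2","std":17},{"lib":"cub","cuda":"12.2","std":17},{"lib":"libcudacxx","cuda":"12.2","std":17}]
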
158 changes: 88 additions & 70 deletions .github/workflows/pr.yml
@@ -42,6 +42,7 @@ jobs:
HOST_COMPILERS: ${{steps.set-outputs.outputs.HOST_COMPILERS}}
PER_CUDA_COMPILER_MATRIX: ${{steps.set-outputs.outputs.PER_CUDA_COMPILER_MATRIX}}
NVRTC_MATRIX: ${{steps.set-outputs.outputs.NVRTC_MATRIX}}
CLANG_CUDA_MATRIX: ${{steps.set-outputs.outputs.CLANG_CUDA_MATRIX}}
steps:
- name: Checkout repo
uses: actions/checkout@v3
@@ -50,91 +51,108 @@
run: |
.github/actions/compute-matrix/compute-matrix.sh ci/matrix.yaml pull_request

nvrtc:
name: NVRTC CUDA${{matrix.cuda}} C++${{matrix.std}}
needs: compute-matrix
if: ${{ !contains(github.event.head_commit.message, 'skip-tests') }}
uses: ./.github/workflows/run-as-coder.yml
strategy:
fail-fast: false
matrix:
include: ${{ fromJSON(needs.compute-matrix.outputs.NVRTC_MATRIX) }}
with:
name: NVRTC CUDA${{matrix.cuda}} C++${{matrix.std}}
runner: linux-${{matrix.cpu}}-gpu-v100-latest-1
image: rapidsai/devcontainers:${{needs.compute-matrix.outputs.DEVCONTAINER_VERSION}}-cpp-gcc12-cuda${{matrix.cuda}}-${{matrix.os}}
command: |
./ci/nvrtc_libcudacxx.sh g++ ${{matrix.std}} ${{matrix.gpu_build_archs}}
#nvrtc:
# name: NVRTC CUDA${{matrix.cuda}} C++${{matrix.std}}
# needs: compute-matrix
# if: ${{ !contains(github.event.head_commit.message, 'skip-tests') }}
# uses: ./.github/workflows/run-as-coder.yml
# strategy:
# fail-fast: false
# matrix:
# include: ${{ fromJSON(needs.compute-matrix.outputs.NVRTC_MATRIX) }}
# with:
# name: NVRTC CUDA${{matrix.cuda}} C++${{matrix.std}}
# runner: linux-${{matrix.cpu}}-gpu-v100-latest-1
# image: rapidsai/devcontainers:${{needs.compute-matrix.outputs.DEVCONTAINER_VERSION}}-cpp-gcc12-cuda${{matrix.cuda}}-${{matrix.os}}
# command: |
# ./ci/nvrtc_libcudacxx.sh g++ ${{matrix.std}} ${{matrix.gpu_build_archs}}

thrust:
name: Thrust CUDA${{ matrix.cuda_version }} ${{ matrix.compiler }}
needs: compute-matrix
uses: ./.github/workflows/dispatch-build-and-test.yml
strategy:
fail-fast: false
matrix:
cuda_version: ${{ fromJSON(needs.compute-matrix.outputs.CUDA_VERSIONS) }}
compiler: ${{ fromJSON(needs.compute-matrix.outputs.HOST_COMPILERS) }}
with:
project_name: "thrust"
per_cuda_compiler_matrix: ${{ toJSON(fromJSON(needs.compute-matrix.outputs.PER_CUDA_COMPILER_MATRIX)[ format('{0}-{1}', matrix.cuda_version, matrix.compiler) ]) }}
devcontainer_version: ${{ needs.compute-matrix.outputs.DEVCONTAINER_VERSION }}
is_windows: ${{ contains(matrix.compiler, 'cl') }}
#thrust:
# name: Thrust CUDA${{ matrix.cuda_version }} ${{ matrix.compiler }}
# needs: compute-matrix
# uses: ./.github/workflows/dispatch-build-and-test.yml
# strategy:
# fail-fast: false
# matrix:
# cuda_version: ${{ fromJSON(needs.compute-matrix.outputs.CUDA_VERSIONS) }}
# compiler: ${{ fromJSON(needs.compute-matrix.outputs.HOST_COMPILERS) }}
# with:
# project_name: "thrust"
# per_cuda_compiler_matrix: ${{ toJSON(fromJSON(needs.compute-matrix.outputs.PER_CUDA_COMPILER_MATRIX)[ format('{0}-{1}', matrix.cuda_version, matrix.compiler) ]) }}
# devcontainer_version: ${{ needs.compute-matrix.outputs.DEVCONTAINER_VERSION }}
# is_windows: ${{ contains(matrix.compiler, 'cl') }}

cub:
name: CUB CUDA${{ matrix.cuda_version }} ${{ matrix.compiler }}
needs: compute-matrix
uses: ./.github/workflows/dispatch-build-and-test.yml
strategy:
fail-fast: false
matrix:
cuda_version: ${{ fromJSON(needs.compute-matrix.outputs.CUDA_VERSIONS) }}
compiler: ${{ fromJSON(needs.compute-matrix.outputs.HOST_COMPILERS) }}
with:
project_name: "cub"
per_cuda_compiler_matrix: ${{ toJSON(fromJSON(needs.compute-matrix.outputs.PER_CUDA_COMPILER_MATRIX)[ format('{0}-{1}', matrix.cuda_version, matrix.compiler) ]) }}
devcontainer_version: ${{ needs.compute-matrix.outputs.DEVCONTAINER_VERSION }}
is_windows: ${{ contains(matrix.compiler, 'cl') }}
#cub:
# name: CUB CUDA${{ matrix.cuda_version }} ${{ matrix.compiler }}
# needs: compute-matrix
# uses: ./.github/workflows/dispatch-build-and-test.yml
# strategy:
# fail-fast: false
# matrix:
# cuda_version: ${{ fromJSON(needs.compute-matrix.outputs.CUDA_VERSIONS) }}
# compiler: ${{ fromJSON(needs.compute-matrix.outputs.HOST_COMPILERS) }}
# with:
# project_name: "cub"
# per_cuda_compiler_matrix: ${{ toJSON(fromJSON(needs.compute-matrix.outputs.PER_CUDA_COMPILER_MATRIX)[ format('{0}-{1}', matrix.cuda_version, matrix.compiler) ]) }}
# devcontainer_version: ${{ needs.compute-matrix.outputs.DEVCONTAINER_VERSION }}
# is_windows: ${{ contains(matrix.compiler, 'cl') }}

libcudacxx:
name: libcudacxx CUDA${{ matrix.cuda_version }} ${{ matrix.compiler }}
needs: compute-matrix
uses: ./.github/workflows/dispatch-build-and-test.yml
strategy:
fail-fast: false
matrix:
cuda_version: ${{ fromJSON(needs.compute-matrix.outputs.CUDA_VERSIONS) }}
compiler: ${{ fromJSON(needs.compute-matrix.outputs.HOST_COMPILERS) }}
with:
project_name: "libcudacxx"
per_cuda_compiler_matrix: ${{ toJSON(fromJSON(needs.compute-matrix.outputs.PER_CUDA_COMPILER_MATRIX)[ format('{0}-{1}', matrix.cuda_version, matrix.compiler) ]) }}
devcontainer_version: ${{ needs.compute-matrix.outputs.DEVCONTAINER_VERSION }}
is_windows: ${{ contains(matrix.compiler, 'cl') }}
#libcudacxx:
# name: libcudacxx CUDA${{ matrix.cuda_version }} ${{ matrix.compiler }}
# needs: compute-matrix
# uses: ./.github/workflows/dispatch-build-and-test.yml
# strategy:
# fail-fast: false
# matrix:
# cuda_version: ${{ fromJSON(needs.compute-matrix.outputs.CUDA_VERSIONS) }}
# compiler: ${{ fromJSON(needs.compute-matrix.outputs.HOST_COMPILERS) }}
# with:
# project_name: "libcudacxx"
# per_cuda_compiler_matrix: ${{ toJSON(fromJSON(needs.compute-matrix.outputs.PER_CUDA_COMPILER_MATRIX)[ format('{0}-{1}', matrix.cuda_version, matrix.compiler) ]) }}
# devcontainer_version: ${{ needs.compute-matrix.outputs.DEVCONTAINER_VERSION }}
# is_windows: ${{ contains(matrix.compiler, 'cl') }}

examples:
name: CCCL Examples
clang-cuda:
name: ${{matrix.lib}} CTK${{matrix.cuda}} clang-cuda ${{matrix.compiler.version}}
needs: compute-matrix
uses: ./.github/workflows/build-examples.yml
strategy:
fail-fast: false
matrix:
cuda_version: ${{ fromJSON(needs.compute-matrix.outputs.CUDA_VERSIONS) }}
compiler: ${{ fromJSON(needs.compute-matrix.outputs.HOST_COMPILERS) }}
include: ${{ fromJSON(needs.compute-matrix.outputs.CLANG_CUDA_MATRIX) }}
uses: ./.github/workflows/run-as-coder.yml
with:
per_cuda_compiler_matrix: ${{ toJSON(fromJSON(needs.compute-matrix.outputs.PER_CUDA_COMPILER_MATRIX)[ format('{0}-{1}', matrix.cuda_version, matrix.compiler) ]) }}
devcontainer_version: ${{ needs.compute-matrix.outputs.DEVCONTAINER_VERSION }}
is_windows: ${{ contains(matrix.compiler, 'cl') }}
name: ${{matrix.lib}} CTK${{matrix.cuda}} clang-cuda ${{matrix.compiler.version}}
runner: linux-${{matrix.cpu}}-cpu16
image: rapidsai/devcontainers:${{needs.compute-matrix.outputs.DEVCONTAINER_VERSION}}-cpp-${{matrix.compiler.name}}${{matrix.compiler.version}}-cuda${{matrix.cuda}}-${{matrix.os}}
command: |
CMAKE_CUDA_COMPILER="${{matrix.compiler.exe}}" ./ci/build_${{matrix.lib}}.sh ${{matrix.compiler.exe}} ${{matrix.std}} ${{matrix.gpu_build_archs}}

# examples:
# name: CCCL Examples
# needs: compute-matrix
# if: ${{ !contains(github.event.head_commit.message, 'skip-tests') }}
# uses: ./.github/workflows/build-examples.yml
# strategy:
# fail-fast: false
# matrix:
# cuda_version: ${{ fromJSON(needs.compute-matrix.outputs.CUDA_VERSIONS) }}
# compiler: ${{ fromJSON(needs.compute-matrix.outputs.HOST_COMPILERS) }}
# with:
# per_cuda_compiler_matrix: ${{ toJSON(fromJSON(needs.compute-matrix.outputs.PER_CUDA_COMPILER_MATRIX)[ format('{0}-{1}', matrix.cuda_version, matrix.compiler) ]) }}
# devcontainer_version: ${{ needs.compute-matrix.outputs.DEVCONTAINER_VERSION }}
# is_windows: ${{ contains(matrix.compiler, 'cl') }}

# This job is the final job that runs after all other jobs and is used for branch protection status checks.
# See: https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/collaborating-on-repositories-with-code-quality-features/about-status-checks
ci:
runs-on: ubuntu-latest
name: CI
needs:
- cub
- libcudacxx
- nvrtc
- thrust
- examples
- clang-cuda
#- cub
#- libcudacxx
#- nvrtc
#- thrust
#- examples
steps:
- run: echo "CI success"
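
For one concrete entry of CLANG_CUDA_MATRIX (lib=thrust, std=17, gpu_build_archs=70, compiler.exe=clang++, matching ci/matrix.yaml below), the templated command in the clang-cuda job above expands roughly to:

CMAKE_CUDA_COMPILER="clang++" ./ci/build_thrust.sh clang++ 17 70
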
7 changes: 3 additions & 4 deletions ci/build_common.sh
@@ -6,12 +6,13 @@ set -eo pipefail
cd "$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )";

# Script defaults
CUDA_COMPILER=nvcc
CUDA_COMPILER=${CMAKE_CUDA_COMPILER:-nvcc}

# Check if the correct number of arguments has been provided
function usage {
echo "Usage: $0 [OPTIONS] <HOST_COMPILER> <CXX_STANDARD> <GPU_ARCHS>"
echo "The PARALLEL_LEVEL environment variable controls the amount of build parallelism. Default is the number of cores."
echo "The CMAKE_CUDA_COMPILER environment variable can be used to control the CUDA compiler. The -nvcc flag takes precedence."
echo "Example: PARALLEL_LEVEL=8 $0 g++-8 14 \"70\" "
echo "Example: $0 clang++-8 17 \"70;75;80-virtual\" "
echo "Possible options: "
@@ -54,9 +55,7 @@ readonly CXX_STANDARD=$2

# Replace spaces, commas and semicolons with semicolons for CMake list
readonly GPU_ARCHS=$(echo $3 | tr ' ,' ';')

readonly PARALLEL_LEVEL=${PARALLEL_LEVEL:=$(nproc)}
readonly NVCC_VERSION=$($CUDA_COMPILER --version | grep release | awk '{print $6}' | cut -c2-)

if [ -z ${DEVCONTAINER_NAME+x} ]; then
BUILD_DIR=../build/local
@@ -83,7 +82,7 @@ COMMON_CMAKE_OPTIONS="
echo "========================================"
echo "Begin build"
echo "pwd=$(pwd)"
echo "NVCC_VERSION=$NVCC_VERSION"
echo "CUDA_COMPILER=$CUDA_COMPILER"
echo "HOST_COMPILER=$HOST_COMPILER"
echo "CXX_STANDARD=$CXX_STANDARD"
echo "GPU_ARCHS=$GPU_ARCHS"
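
The new default is plain bash parameter expansion: use CMAKE_CUDA_COMPILER if it is set in the environment, otherwise fall back to nvcc. A minimal illustration (clang++-16 is only an example value):

unset CMAKE_CUDA_COMPILER
echo "${CMAKE_CUDA_COMPILER:-nvcc}"       # prints: nvcc
export CMAKE_CUDA_COMPILER=clang++-16
echo "${CMAKE_CUDA_COMPILER:-nvcc}"       # prints: clang++-16
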
18 changes: 13 additions & 5 deletions ci/build_cub.sh
@@ -2,7 +2,6 @@

source "$(dirname "$0")/build_common.sh"


# CUB benchmarks require at least CUDA nvcc 11.5 for int128
# Returns "true" if the first version is greater than or equal to the second
version_compare() {
@@ -12,12 +11,20 @@ version_compare() {
echo "false"
fi
}
readonly ENABLE_CUB_BENCHMARKS=${ENABLE_CUB_BENCHMARKS:=$(version_compare $NVCC_VERSION 11.5)}

if [[ $ENABLE_CUB_BENCHMARKS == "true" ]]; then
echo "CUDA version is $NVCC_VERSION. Building CUB benchmarks."
ENABLE_CUB_BENCHMARKS="false"
ENABLE_CUB_RDC="false"
if [[ "$CUDA_COMPILER" == *nvcc* ]]; then
ENABLE_CUB_RDC="true"
NVCC_VERSION=$($CUDA_COMPILER --version | grep release | awk '{print $6}' | cut -c2-)
if [[ $(version_compare $NVCC_VERSION 11.5) == "true" ]]; then
ENABLE_CUB_BENCHMARKS="true"
echo "nvcc version is $NVCC_VERSION. Building CUB benchmarks."
else
echo "nvcc version is $NVCC_VERSION. Not building CUB benchmarks because nvcc version is less than 11.5."
fi
else
echo "CUDA version is $NVCC_VERSION. Not building CUB benchmarks because CUDA version is less than 11.5."
echo "nvcc version is not determined (likely using a non-NVCC compiler). Not building CUB benchmarks."
fi

CMAKE_OPTIONS="
@@ -32,6 +39,7 @@ CMAKE_OPTIONS="
-DTHRUST_IGNORE_DEPRECATED_CPP_DIALECT=ON \
-DCUB_IGNORE_DEPRECATED_CPP_DIALECT=ON \
-DCUB_ENABLE_BENCHMARKS="$ENABLE_CUB_BENCHMARKS"\
-DCUB_ENABLE_RDC_TESTS="$ENABLE_CUB_RDC" \
"

configure_and_build "CUB" "$CMAKE_OPTIONS"
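
The body of version_compare is only partly visible in the hunk above. A common bash implementation of the documented "first version is greater than or equal to the second" check, given here as an assumed sketch rather than the script's exact code, relies on version-aware sort:

version_compare() {
    # Prints "true" if $1 >= $2 (compared as versions), otherwise "false".
    if [ "$(printf '%s\n' "$2" "$1" | sort -V | head -n1)" = "$2" ]; then
        echo "true"
    else
        echo "false"
    fi
}
version_compare 12.2 11.5   # true  -> CUB benchmarks enabled
version_compare 11.4 11.5   # false -> CUB benchmarks skipped
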
2 changes: 2 additions & 0 deletions ci/matrix.yaml
@@ -51,3 +51,5 @@ pull_request:
- {cuda: *cuda_newest, os: 'windows2022', cpu: 'amd64', compiler: {name: 'cl', version: '14.36', exe: 'cl++'}, gpu_build_archs: '70', std: [14, 17, 20], jobs: ['build']}
nvrtc:
- {cuda: *cuda_newest, os: 'ubuntu22.04', cpu: 'amd64', gpu_build_archs: '70', std: [11, 14, 17, 20]}
clang-cuda:
- {lib: ['thrust', 'cub', 'libcudacxx'], cuda: *cuda_newest, os: 'ubuntu22.04', cpu: 'amd64', compiler: {name: 'llvm', version: '16', exe: 'clang++'}, gpu_build_archs: '70', std: [17, 20]}
12 changes: 12 additions & 0 deletions libcudacxx/.upstream-tests/test/CMakeLists.txt
@@ -50,6 +50,18 @@ if (${CMAKE_CXX_COMPILER_ID} STREQUAL "IntelLLVM")
--compiler-options=-fno-fast-math")
endif()

if (${CMAKE_CUDA_COMPILER_ID} STREQUAL "Clang")
string(APPEND LIBCUDACXX_TEST_COMPILER_FLAGS
" -Xclang -fcuda-allow-variadic-functions"
" -Xclang -Wno-unused-parameter"
" -Wno-unknown-cuda-version")

find_package(CUDAToolkit)

string(APPEND LIBCUDACXX_TEST_LINKER_FLAGS
" -L${CUDAToolkit_LIBRARY_DIR} -lcuda -lcudart")
endif()

if (${CMAKE_CUDA_COMPILER_ID} STREQUAL "NVIDIA")
set(LIBCUDACXX_TEST_COMPILER_FLAGS
"${LIBCUDACXX_TEST_COMPILER_FLAGS} \
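
Outside of CMake, the compile and link flags appended above correspond roughly to a clang-cuda command line like the following sketch (test.cu, sm_70, and the /usr/local/cuda install path are placeholders):

clang++ -x cuda test.cu \
    --cuda-gpu-arch=sm_70 \
    -Xclang -fcuda-allow-variadic-functions \
    -Wno-unknown-cuda-version \
    -L/usr/local/cuda/lib64 -lcuda -lcudart \
    -o test
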
Expand Up @@ -16,6 +16,10 @@

#include "concurrent_agents.h"

#if defined(__clang__) && defined(__CUDA__)
# include <new>
#endif

#ifdef _LIBCUDACXX_COMPILER_NVRTC
#define LAMBDA [=]
#else
Expand Down
3 changes: 3 additions & 0 deletions libcudacxx/.upstream-tests/utils/libcudacxx/compiler.py
@@ -147,6 +147,9 @@ def _initTypeAndVersion(self):
if self.type == 'nvcc':
# Treat C++ as CUDA when the compiler is NVCC.
self.source_lang = 'cu'
elif self.type == 'clang':
# Treat C++ as clang-cuda when the compiler is Clang.
self.source_lang = 'cu'

def _basicCmd(self, source_files, out, mode=CM_Default, flags=[],
input_is_cxx=False):
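
Treating C++ sources as CUDA means the lit harness drives clang the same way it already drives nvcc: test sources are compiled in CUDA mode. A hedged command-line equivalent (test.cpp and sm_70 are placeholders, and a CUDA toolkit is assumed to be installed):

# nvcc: force the CUDA language for a .cpp input
nvcc -x cu -c test.cpp -o test.o
# clang: same idea, with clang's spelling of the language and an explicit GPU arch
clang++ -x cuda --cuda-gpu-arch=sm_70 -c test.cpp -o test.o
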
38 changes: 22 additions & 16 deletions libcudacxx/.upstream-tests/utils/libcudacxx/test/config.py
@@ -628,12 +628,17 @@ def configure_compile_flags(self):
self.config.available_features.add("nvrtc")
if self.cxx.type == 'nvcc':
self.cxx.compile_flags += ['--extended-lambda']
real_arch_format = '-gencode=arch=compute_{0},code=sm_{0}'
virt_arch_format = '-gencode=arch=compute_{0},code=compute_{0}'
if self.cxx.type == 'clang':
real_arch_format = '--cuda-gpu-arch=sm_{0}'
virt_arch_format = '--cuda-gpu-arch=compute_{0}'
pre_sm_32 = True
pre_sm_60 = True
pre_sm_70 = True
pre_sm_80 = True
pre_sm_90 = True
if compute_archs and self.cxx.type == 'nvcc':
if compute_archs and (self.cxx.type == 'nvcc' or self.cxx.type == 'clang'):
pre_sm_32 = False
pre_sm_60 = False
pre_sm_70 = False
@@ -654,10 +659,9 @@ def configure_compile_flags(self):
if arch < 70: pre_sm_70 = True
if arch < 80: pre_sm_80 = True
if arch < 90: pre_sm_90 = True
arch_flag = real_arch_format.format(arch)
if mode.count("virtual"):
arch_flag = '-gencode=arch=compute_{0},code=compute_{0}'.format(arch)
else:
arch_flag = '-gencode=arch=compute_{0},code=sm_{0}'.format(arch)
arch_flag = virt_arch_format.format(arch)
self.cxx.compile_flags += [arch_flag]
if pre_sm_32:
self.config.available_features.add("pre-sm-32")
@@ -820,8 +824,9 @@ def configure_compile_flags_header_includes(self):
and self.cxx_stdlib_under_test != 'libc++'):
self.lit_config.note('using the system cxx headers')
return
if self.cxx.type != 'nvcc' and self.cxx.type != 'nvhpc':
self.cxx.compile_flags += ['-nostdinc++']
# I don't think this is required, since removing it helps clang-cuda compile and libcudacxx only supports building in CUDA modes?
# if self.cxx.type != 'nvcc' and self.cxx.type != 'pgi':
Review comment (Collaborator), suggested change: replace
# if self.cxx.type != 'nvcc' and self.cxx.type != 'pgi':
with
# if self.cxx.type != 'nvcc' and self.cxx.type != 'nvhpc':

# self.cxx.compile_flags += ['-nostdinc++']
if cxx_headers is None:
cxx_headers = os.path.join(self.libcudacxx_src_root, 'include')
if not os.path.isdir(cxx_headers):
@@ -1063,16 +1068,17 @@ def configure_link_flags_cxx_library(self):
self.cxx.link_flags += ['-lc++experimental']
if self.link_shared:
self.cxx.link_flags += ['-lc++']
elif self.cxx.type != 'nvcc' and self.cxx.type != 'nvhpc':
cxx_library_root = self.get_lit_conf('cxx_library_root')
if cxx_library_root:
libname = self.make_static_lib_name('c++')
abs_path = os.path.join(cxx_library_root, libname)
assert os.path.exists(abs_path) and \
"static libc++ library does not exist"
self.cxx.link_flags += [abs_path]
else:
self.cxx.link_flags += ['-lc++']
# Device code does not have binary components, don't link libc++
# elif self.cxx.type != 'nvcc' and self.cxx.type != 'pgi':
Review comment (Collaborator): how did this change from nvhpc to pgi?
Suggested change: replace
# elif self.cxx.type != 'nvcc' and self.cxx.type != 'pgi':
with
# elif self.cxx.type != 'nvcc' and self.cxx.type != 'nvhpc':

Review comment (Collaborator): maybe just remove completely

# cxx_library_root = self.get_lit_conf('cxx_library_root')
# if cxx_library_root:
# libname = self.make_static_lib_name('c++')
# abs_path = os.path.join(cxx_library_root, libname)
# assert os.path.exists(abs_path) and \
# "static libc++ library does not exist"
# self.cxx.link_flags += [abs_path]
# else:
# self.cxx.link_flags += ['-lc++']

def configure_link_flags_abi_library(self):
cxx_abi = self.get_lit_conf('cxx_abi', 'libcxxabi')
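
The arch-flag change in configure_compile_flags above selects a per-compiler spelling for "real" (SASS) and "virtual" (PTX-only) builds of each requested architecture. A small bash mirror of that selection, for illustration only, with arch 70 as the example:

cxx_type="clang"   # or "nvcc"
arch=70
if [ "$cxx_type" = "clang" ]; then
    real_flag="--cuda-gpu-arch=sm_${arch}"
    virt_flag="--cuda-gpu-arch=compute_${arch}"
else
    real_flag="-gencode=arch=compute_${arch},code=sm_${arch}"
    virt_flag="-gencode=arch=compute_${arch},code=compute_${arch}"
fi
echo "real: $real_flag"
echo "virtual: $virt_flag"
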