Add cp_async_bulk_tensor tests
ahendriksen committed Sep 1, 2023
1 parent 81fa05f commit 2543821
Showing 7 changed files with 857 additions and 0 deletions.
@@ -0,0 +1,208 @@
//===----------------------------------------------------------------------===//
//
// Part of libcu++, the C++ Standard Library for your entire system,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//
//
// UNSUPPORTED: libcpp-has-no-threads
// UNSUPPORTED: pre-sm-90

// <cuda/barrier>

#include <cuda/barrier>
#include <cuda/std/utility> // cuda::std::move
#include "test_macros.h" // TEST_NV_DIAG_SUPPRESS

// NVRTC does not support cuda.h (it pulls in stdlib.h, which NVRTC does not provide).
#ifndef __CUDACC_RTC__
#include <cstdio>
#include <cudaTypedefs.h> // PFN_cuTensorMapEncodeTiled, CUtensorMap
#endif

// Suppress warning about barrier in shared memory
TEST_NV_DIAG_SUPPRESS(static_var_with_dynamic_init)

using barrier = cuda::barrier<cuda::thread_scope_block>;
namespace cde = cuda::device::experimental;

constexpr size_t GMEM_WIDTH = 1024; // Width of tensor (in # elements)
constexpr size_t GMEM_HEIGHT = 1024; // Height of tensor (in # elements)
constexpr size_t gmem_len = GMEM_WIDTH * GMEM_HEIGHT;

constexpr int SMEM_WIDTH = 32; // Width of shared memory buffer (in # elements)
constexpr int SMEM_HEIGHT = 8; // Height of shared memory buffer (in # elements)

// Shared memory buffer of SMEM_HEIGHT * SMEM_WIDTH ints (8 * 32 * 4 bytes = 1 KiB).
static constexpr int buf_len = SMEM_HEIGHT * SMEM_WIDTH;
__device__ int gmem_tensor[gmem_len];


// We need a type with a known size. Under NVRTC, cuda.h cannot be included, so
// the definition of CUtensorMap is unavailable (only the forward declaration in
// <cuda/barrier> is). We therefore use this stand-in type and reinterpret_cast
// it to CUtensorMap* in the kernel.
struct fake_cutensormap {
alignas(64) uint64_t opaque[16];
};
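// Filled in from the host with cudaMemcpyToSymbol before the kernel runs (see main below).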
__constant__ fake_cutensormap global_fake_tensor_map;

__device__ void test(int base_i, int base_j)
{
CUtensorMap *global_tensor_map = reinterpret_cast<CUtensorMap*>(&global_fake_tensor_map);

// SETUP: fill global memory buffer
for (int i = threadIdx.x; i < gmem_len; i += blockDim.x) {
gmem_tensor[i] = i;
}
// Ensure that writes to global memory are visible to others, including
// those in the async proxy.
__threadfence();
__syncthreads();

// TEST: Add i to buffer[i]
__shared__ alignas(128) int smem_buffer[buf_len];
__shared__ barrier bar;
if (threadIdx.x == 0) { init(&bar, blockDim.x); }
__syncthreads();

// Load data:
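// Only thread 0 issues the bulk tensor copy: it arrives on the barrier and, via
// arrive_tx, registers the number of bytes the copy is expected to deliver. The
// copy itself updates the barrier's transaction count as data lands in shared
// memory. All other threads simply arrive; bar.wait() then blocks until every
// thread has arrived and all expected bytes have been delivered.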
uint64_t token;
if (threadIdx.x == 0) {
// Fastest moving coordinate first.
cde::cp_async_bulk_tensor_2d_global_to_shared(smem_buffer, global_tensor_map, base_j, base_i, bar);
token = bar.arrive_tx(1, sizeof(smem_buffer));
} else {
token = bar.arrive();
}
bar.wait(cuda::std::move(token));

// Check smem
for (int i = 0; i < SMEM_HEIGHT; ++i) {
for (int j = 0; j < SMEM_WIDTH; ++j) {
int gmem_lin_idx = (base_i + i) * GMEM_WIDTH + base_j + j;
int smem_lin_idx = i * SMEM_WIDTH + j;

if (smem_buffer[smem_lin_idx] != gmem_lin_idx) {
#ifndef __CUDACC_RTC__
printf("Failed at smem (%d, %d). Got %d. Expected %d.\n", i, j, smem_buffer[smem_lin_idx], gmem_lin_idx);
#endif
__trap();
}
}
}

__syncthreads();

// Update smem
for (int i = threadIdx.x; i < buf_len; i += blockDim.x) {
smem_buffer[i] = 2 * smem_buffer[i] + 1;
}
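// Order the ordinary (generic-proxy) writes above with respect to the async
// proxy, so the bulk copy below does not read stale shared-memory data.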
cde::fence_proxy_async_shared_cta();
__syncthreads();

// Write back to global memory:
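// The shared->global copy is asynchronous: commit_group() closes the group of
// outstanding bulk copies, and wait_group_read<0>() blocks until all committed
// groups have finished reading from shared memory, so smem_buffer may be reused.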
if (threadIdx.x == 0) {
cde::cp_async_bulk_tensor_2d_shared_to_global(global_tensor_map, base_j, base_i, smem_buffer);
cde::cp_async_bulk_commit_group();
cde::cp_async_bulk_wait_group_read<0>();
}
__threadfence();
__syncthreads();

// TEAR-DOWN: check that global memory is correct

for (int i = 0; i < SMEM_HEIGHT; ++i) {
for (int j = 0; j < SMEM_WIDTH; ++j) {
int gmem_lin_idx = (base_i + i) * GMEM_WIDTH + base_j + j;

if (gmem_tensor[gmem_lin_idx] != 2 * gmem_lin_idx + 1) {
__trap();
}
}
}
__syncthreads();
}

#ifndef __CUDACC_RTC__
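// cuTensorMapEncodeTiled is a CUDA driver API (CUDA 12+); look its entry point
// up at run time through the runtime's cudaGetDriverEntryPoint, presumably so
// the test does not have to link against the driver library directly.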
PFN_cuTensorMapEncodeTiled get_cuTensorMapEncodeTiled() {
void* driver_ptr = nullptr;
cudaDriverEntryPointQueryResult driver_status;
auto code = cudaGetDriverEntryPoint("cuTensorMapEncodeTiled", &driver_ptr, cudaEnableDefault, &driver_status);
if (code != cudaSuccess) {
exit(1);
}
return reinterpret_cast<PFN_cuTensorMapEncodeTiled>(driver_ptr);
}
#endif

int main(int, char**)
{
NV_IF_TARGET(NV_IS_HOST,(
//Required by concurrent_agents_launch to know how many we're launching
cuda_thread_count = 512;

int * tensor_ptr = nullptr;
auto code = cudaGetSymbolAddress((void**)&tensor_ptr, gmem_tensor);
if (code != cudaSuccess) {
std::printf("getsymboladdress failed.");
exit(1);
}

// https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TENSOR__MEMORY.html
CUtensorMap local_tensor_map{};
// rank is the number of dimensions of the array.
constexpr uint32_t rank = 2;
uint64_t size[rank] = {GMEM_WIDTH, GMEM_HEIGHT};
// The stride is the number of bytes to traverse from the first element of one row to the next.
// It must be a multiple of 16.
uint64_t stride[rank - 1] = {GMEM_WIDTH * sizeof(int)};
// The box_size is the size of the shared memory buffer that is used as the
// destination of a TMA transfer.
uint32_t box_size[rank] = {SMEM_WIDTH, SMEM_HEIGHT};
// The distance between elements in units of sizeof(element). A stride of 2
// can be used to load only the real component of a complex-valued tensor, for instance.
uint32_t elem_stride[rank] = {1, 1};

// Get a function pointer to the cuTensorMapEncodeTiled driver API.
auto cuTensorMapEncodeTiled = get_cuTensorMapEncodeTiled();

// Create the tensor descriptor.
CUresult res = cuTensorMapEncodeTiled(
&local_tensor_map, // CUtensorMap *tensorMap,
CUtensorMapDataType::CU_TENSOR_MAP_DATA_TYPE_INT32,
rank, // cuuint32_t tensorRank,
tensor_ptr, // void *globalAddress,
size, // const cuuint64_t *globalDim,
stride, // const cuuint64_t *globalStrides,
box_size, // const cuuint32_t *boxDim,
elem_stride, // const cuuint32_t *elementStrides,
CUtensorMapInterleave::CU_TENSOR_MAP_INTERLEAVE_NONE,
CUtensorMapSwizzle::CU_TENSOR_MAP_SWIZZLE_NONE,
CUtensorMapL2promotion::CU_TENSOR_MAP_L2_PROMOTION_NONE,
CUtensorMapFloatOOBfill::CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE
);

if (res != CUDA_SUCCESS) {
std::printf("tensormap creation failed.");
exit(1);
}
code = cudaMemcpyToSymbol(global_fake_tensor_map, &local_tensor_map, sizeof(CUtensorMap));
if (code != cudaSuccess) {
std::printf("memcpytosymbol failed.");
exit(1);
}
));

NV_DISPATCH_TARGET(
NV_IS_DEVICE, (
test(0, 0);
test(4, 0);
test(4, 4);
)
);
return 0;
}
@@ -0,0 +1,71 @@
//===----------------------------------------------------------------------===//
//
// Part of libcu++, the C++ Standard Library for your entire system,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//
//
// UNSUPPORTED: libcpp-has-no-threads
// UNSUPPORTED: pre-sm-90

// <cuda/barrier>

#include "cp_async_bulk_tensor_generic.h"

// Define the sizes of the contiguous tensors in global and shared memory.
//
// Note that the first dimension is the one with stride 1. This one must be a
// multiple of 4 to ensure that each new dimension starts at a 16-byte aligned
// offset.
//
// We have a separate variable for host and device because a constexpr
// std::initializer_list cannot be shared between host and device as some of its
// member functions take a const reference, which is unsupported by nvcc.
constexpr std::initializer_list<int> GMEM_DIMS {256};
__device__ constexpr std::initializer_list<int> GMEM_DIMS_DEV{256};
constexpr std::initializer_list<int> SMEM_DIMS {32};
__device__ constexpr std::initializer_list<int> SMEM_DIMS_DEV{32};

__device__ constexpr std::initializer_list<int> TEST_SMEM_COORDS[] = {
{0},
{4},
{8}
};

constexpr size_t gmem_len = tensor_len(GMEM_DIMS);
constexpr size_t smem_len = tensor_len(SMEM_DIMS);

__device__ int gmem_tensor[gmem_len];

int main(int, char**)
{
NV_DISPATCH_TARGET(
NV_IS_HOST, (
//Required by concurrent_agents_launch to know how many we're launching
cuda_thread_count = 512;

// Get pointer to gmem_tensor to create tensor map.
int * tensor_ptr = nullptr;
auto code = cudaGetSymbolAddress((void**)&tensor_ptr, gmem_tensor);
if (code != cudaSuccess) {
exit(1);
}
// Create tensor map
CUtensorMap local_tensor_map = map_encode(tensor_ptr, GMEM_DIMS, SMEM_DIMS);

// Copy it to device
code = cudaMemcpyToSymbol(global_fake_tensor_map, &local_tensor_map, sizeof(CUtensorMap));
if (code != cudaSuccess) {
exit(1);
}),
NV_IS_DEVICE, (
for (auto smem_coord : TEST_SMEM_COORDS) {
test<smem_len>(smem_coord, SMEM_DIMS_DEV, GMEM_DIMS_DEV, gmem_tensor, gmem_len);
}
)
);
return 0;
}
@@ -0,0 +1,72 @@
//===----------------------------------------------------------------------===//
//
// Part of libcu++, the C++ Standard Library for your entire system,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//
//
// UNSUPPORTED: libcpp-has-no-threads
// UNSUPPORTED: pre-sm-90

// <cuda/barrier>

#include "cp_async_bulk_tensor_generic.h"

// Define the sizes of the contiguous tensors in global and shared memory.
//
// Note that the first dimension is the one with stride 1. This one must be a
// multiple of 4 to ensure that each new dimension starts at a 16-byte aligned
// offset.
//
// We have a separate variable for host and device because a constexpr
// std::initializer_list cannot be shared between host and device as some of its
// member functions take a const reference, which is unsupported by nvcc.
constexpr std::initializer_list<int> GMEM_DIMS {8, 11};
__device__ constexpr std::initializer_list<int> GMEM_DIMS_DEV{8, 11};
constexpr std::initializer_list<int> SMEM_DIMS {4, 2};
__device__ constexpr std::initializer_list<int> SMEM_DIMS_DEV{4, 2};

__device__ constexpr std::initializer_list<int> TEST_SMEM_COORDS[] = {
{0, 0},
{4, 1},
{4, 5},
{0, 5},
};

constexpr size_t gmem_len = tensor_len(GMEM_DIMS);
constexpr size_t smem_len = tensor_len(SMEM_DIMS);

__device__ int gmem_tensor[gmem_len];

int main(int, char**)
{
NV_DISPATCH_TARGET(
NV_IS_HOST, (
//Required by concurrent_agents_launch to know how many we're launching
cuda_thread_count = 512;

// Get pointer to gmem_tensor to create tensor map.
int * tensor_ptr = nullptr;
auto code = cudaGetSymbolAddress((void**)&tensor_ptr, gmem_tensor);
if (code != cudaSuccess) {
exit(1);
}
// Create tensor map
CUtensorMap local_tensor_map = map_encode(tensor_ptr, GMEM_DIMS, SMEM_DIMS);

// Copy it to device
code = cudaMemcpyToSymbol(global_fake_tensor_map, &local_tensor_map, sizeof(CUtensorMap));
if (code != cudaSuccess) {
exit(1);
}),
NV_IS_DEVICE, (
for (auto smem_coord : TEST_SMEM_COORDS) {
test<smem_len>(smem_coord, SMEM_DIMS_DEV, GMEM_DIMS_DEV, gmem_tensor, gmem_len);
}
)
);
return 0;
}
