Add cp_async_bulk_tensor tests
ahendriksen committed Sep 1, 2023
1 parent 81fa05f commit 2543821
Showing 7 changed files with 857 additions and 0 deletions.
@@ -0,0 +1,208 @@
//===----------------------------------------------------------------------===//
//
// Part of libcu++, the C++ Standard Library for your entire system,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//
//
// UNSUPPORTED: libcpp-has-no-threads
// UNSUPPORTED: pre-sm-90

// <cuda/barrier>

#include <cuda/barrier>
#include <cuda/std/utility> // cuda::std::move
#include "test_macros.h" // TEST_NV_DIAG_SUPPRESS

// NVRTC does not support cuda.h (it pulls in stdlib.h, which NVRTC does not provide).
#ifndef __CUDACC_RTC__
#include <cstdio>
#include <cudaTypedefs.h> // PFN_cuTensorMapEncodeTiled, CUtensorMap
#endif

// Suppress warning about barrier in shared memory
TEST_NV_DIAG_SUPPRESS(static_var_with_dynamic_init)

using barrier = cuda::barrier<cuda::thread_scope_block>;
namespace cde = cuda::device::experimental;

constexpr size_t GMEM_WIDTH = 1024; // Width of tensor (in # elements)
constexpr size_t GMEM_HEIGHT = 1024; // Height of tensor (in # elements)
constexpr size_t gmem_len = GMEM_WIDTH * GMEM_HEIGHT;

constexpr int SMEM_WIDTH = 32; // Width of shared memory buffer (in # elements)
constexpr int SMEM_HEIGHT = 8; // Height of shared memory buffer (in # elements)

// Shared memory buffer of SMEM_HEIGHT * SMEM_WIDTH ints (8 * 32 * 4 bytes = 1 KiB).
static constexpr int buf_len = SMEM_HEIGHT * SMEM_WIDTH;
__device__ int gmem_tensor[gmem_len];


// We need a type with a known size. Under NVRTC, cuda.h cannot be included, so
// the definition of CUtensorMap is unavailable (only the forward declaration in
// <cuda/barrier> is). We therefore use this stand-in type and reinterpret_cast
// it to CUtensorMap* in the kernel.
struct fake_cutensormap {
alignas(64) uint64_t opaque[16];
};
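// Filled in from the host with cudaMemcpyToSymbol before the kernel runs (see main below).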
__constant__ fake_cutensormap global_fake_tensor_map;

__device__ void test(int base_i, int base_j)
{
CUtensorMap *global_tensor_map = reinterpret_cast<CUtensorMap*>(&global_fake_tensor_map);

// SETUP: fill global memory buffer
for (int i = threadIdx.x; i < gmem_len; i += blockDim.x) {
gmem_tensor[i] = i;
}
// Ensure that writes to global memory are visible to others, including
// those in the async proxy.
__threadfence();
__syncthreads();

// TEST: Add i to buffer[i]
__shared__ alignas(128) int smem_buffer[buf_len];
__shared__ barrier bar;
if (threadIdx.x == 0) { init(&bar, blockDim.x); }
__syncthreads();

// Load data:
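// Only thread 0 issues the bulk tensor copy: it arrives on the barrier and, via
// arrive_tx, registers the number of bytes the copy is expected to deliver. The
// copy itself updates the barrier's transaction count as data lands in shared
// memory. All other threads simply arrive; bar.wait() then blocks until every
// thread has arrived and all expected bytes have been delivered.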
uint64_t token;
if (threadIdx.x == 0) {
// Fastest moving coordinate first.
cde::cp_async_bulk_tensor_2d_global_to_shared(smem_buffer, global_tensor_map, base_j, base_i, bar);
token = bar.arrive_tx(1, sizeof(smem_buffer));
} else {
token = bar.arrive();
}
bar.wait(cuda::std::move(token));

// Check smem
for (int i = 0; i < SMEM_HEIGHT; ++i) {
for (int j = 0; j < SMEM_WIDTH; ++j) {
int gmem_lin_idx = (base_i + i) * GMEM_WIDTH + base_j + j;
int smem_lin_idx = i * SMEM_WIDTH + j;

if (smem_buffer[smem_lin_idx] != gmem_lin_idx) {
#ifndef __CUDACC_RTC__
printf("Failed at smem (%d, %d). Got %d. Expected %d.\n", i, j, smem_buffer[smem_lin_idx], gmem_lin_idx);
#endif
__trap();
}
}
}

__syncthreads();

// Update smem
for (int i = threadIdx.x; i < buf_len; i += blockDim.x) {
smem_buffer[i] = 2 * smem_buffer[i] + 1;
}
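// Order the ordinary (generic-proxy) writes above with respect to the async
// proxy, so the bulk copy below does not read stale shared-memory data.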
cde::fence_proxy_async_shared_cta();
__syncthreads();

// Write back to global memory:
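// The shared->global copy is asynchronous: commit_group() closes the group of
// outstanding bulk copies, and wait_group_read<0>() blocks until all committed
// groups have finished reading from shared memory, so smem_buffer may be reused.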
if (threadIdx.x == 0) {
cde::cp_async_bulk_tensor_2d_shared_to_global(global_tensor_map, base_j, base_i, smem_buffer);
cde::cp_async_bulk_commit_group();
cde::cp_async_bulk_wait_group_read<0>();
}
__threadfence();
__syncthreads();

// TEAR-DOWN: check that global memory is correct

for (int i = 0; i < SMEM_HEIGHT; ++i) {
for (int j = 0; j < SMEM_WIDTH; ++j) {
int gmem_lin_idx = (base_i + i) * GMEM_WIDTH + base_j + j;

if (gmem_tensor[gmem_lin_idx] != 2 * gmem_lin_idx + 1) {
__trap();
}
}
}
__syncthreads();
}

#ifndef __CUDACC_RTC__
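// cuTensorMapEncodeTiled is a CUDA driver API (CUDA 12+); look its entry point
// up at run time through the runtime's cudaGetDriverEntryPoint, presumably so
// the test does not have to link against the driver library directly.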
PFN_cuTensorMapEncodeTiled get_cuTensorMapEncodeTiled() {
void* driver_ptr = nullptr;
cudaDriverEntryPointQueryResult driver_status;
auto code = cudaGetDriverEntryPoint("cuTensorMapEncodeTiled", &driver_ptr, cudaEnableDefault, &driver_status);
if (code != cudaSuccess) {
exit(1);
}
return reinterpret_cast<PFN_cuTensorMapEncodeTiled>(driver_ptr);
}
#endif

int main(int, char**)
{
NV_IF_TARGET(NV_IS_HOST,(
//Required by concurrent_agents_launch to know how many we're launching
cuda_thread_count = 512;

int * tensor_ptr = nullptr;
auto code = cudaGetSymbolAddress((void**)&tensor_ptr, gmem_tensor);
if (code != cudaSuccess) {
std::printf("getsymboladdress failed.");
exit(1);
}

// https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TENSOR__MEMORY.html
CUtensorMap local_tensor_map{};
// rank is the number of dimensions of the array.
constexpr uint32_t rank = 2;
uint64_t size[rank] = {GMEM_WIDTH, GMEM_HEIGHT};
// The stride is the number of bytes to traverse from the first element of one row to the next.
// It must be a multiple of 16.
uint64_t stride[rank - 1] = {GMEM_WIDTH * sizeof(int)};
// The box_size is the size of the shared memory buffer that is used as the
// destination of a TMA transfer.
uint32_t box_size[rank] = {SMEM_WIDTH, SMEM_HEIGHT};
// The distance between elements in units of sizeof(element). A stride of 2
// can be used to load only the real component of a complex-valued tensor, for instance.
uint32_t elem_stride[rank] = {1, 1};

// Get a function pointer to the cuTensorMapEncodeTiled driver API.
auto cuTensorMapEncodeTiled = get_cuTensorMapEncodeTiled();

// Create the tensor descriptor.
CUresult res = cuTensorMapEncodeTiled(
&local_tensor_map, // CUtensorMap *tensorMap,
CUtensorMapDataType::CU_TENSOR_MAP_DATA_TYPE_INT32,
rank, // cuuint32_t tensorRank,
tensor_ptr, // void *globalAddress,
size, // const cuuint64_t *globalDim,
stride, // const cuuint64_t *globalStrides,
box_size, // const cuuint32_t *boxDim,
elem_stride, // const cuuint32_t *elementStrides,
CUtensorMapInterleave::CU_TENSOR_MAP_INTERLEAVE_NONE,
CUtensorMapSwizzle::CU_TENSOR_MAP_SWIZZLE_NONE,
CUtensorMapL2promotion::CU_TENSOR_MAP_L2_PROMOTION_NONE,
CUtensorMapFloatOOBfill::CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE
);

if (res != CUDA_SUCCESS) {
std::printf("tensormap creation failed.");
exit(1);
}
code = cudaMemcpyToSymbol(global_fake_tensor_map, &local_tensor_map, sizeof(CUtensorMap));
if (code != cudaSuccess) {
std::printf("memcpytosymbol failed.");
exit(1);
}
));

NV_DISPATCH_TARGET(
NV_IS_DEVICE, (
test(0, 0);
test(4, 0);
test(4, 4);
)
);
return 0;
}
@@ -0,0 +1,71 @@
//===----------------------------------------------------------------------===//
//
// Part of libcu++, the C++ Standard Library for your entire system,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//
//
// UNSUPPORTED: libcpp-has-no-threads
// UNSUPPORTED: pre-sm-90

// <cuda/barrier>

#include "cp_async_bulk_tensor_generic.h"

// Define the sizes of the contiguous tensors in global and shared memory.
//
// Note that the first dimension is the one with stride 1. This one must be a
// multiple of 4 to ensure that each new dimension starts at a 16-byte aligned
// offset.
//
// We have a separate variable for host and device because a constexpr
// std::initializer_list cannot be shared between host and device as some of its
// member functions take a const reference, which is unsupported by nvcc.
constexpr std::initializer_list<int> GMEM_DIMS {256};
__device__ constexpr std::initializer_list<int> GMEM_DIMS_DEV{256};
constexpr std::initializer_list<int> SMEM_DIMS {32};
__device__ constexpr std::initializer_list<int> SMEM_DIMS_DEV{32};

__device__ constexpr std::initializer_list<int> TEST_SMEM_COORDS[] = {
{0},
{4},
{8}
};

constexpr size_t gmem_len = tensor_len(GMEM_DIMS);
constexpr size_t smem_len = tensor_len(SMEM_DIMS);

__device__ int gmem_tensor[gmem_len];

int main(int, char**)
{
NV_DISPATCH_TARGET(
NV_IS_HOST, (
//Required by concurrent_agents_launch to know how many we're launching
cuda_thread_count = 512;

// Get pointer to gmem_tensor to create tensor map.
int * tensor_ptr = nullptr;
auto code = cudaGetSymbolAddress((void**)&tensor_ptr, gmem_tensor);
if (code != cudaSuccess) {
exit(1);
}
// Create tensor map
CUtensorMap local_tensor_map = map_encode(tensor_ptr, GMEM_DIMS, SMEM_DIMS);

// Copy it to device
code = cudaMemcpyToSymbol(global_fake_tensor_map, &local_tensor_map, sizeof(CUtensorMap));
if (code != cudaSuccess) {
exit(1);
}),
NV_IS_DEVICE, (
for (auto smem_coord : TEST_SMEM_COORDS) {
test<smem_len>(smem_coord, SMEM_DIMS_DEV, GMEM_DIMS_DEV, gmem_tensor, gmem_len);
}
)
);
return 0;
}
@@ -0,0 +1,72 @@
//===----------------------------------------------------------------------===//
//
// Part of libcu++, the C++ Standard Library for your entire system,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//
//
// UNSUPPORTED: libcpp-has-no-threads
// UNSUPPORTED: pre-sm-90

// <cuda/barrier>

#include "cp_async_bulk_tensor_generic.h"

// Define the sizes of the contiguous tensors in global and shared memory.
//
// Note that the first dimension is the one with stride 1. This one must be a
// multiple of 4 to ensure that each new dimension starts at a 16-byte aligned
// offset.
//
// We have a separate variable for host and device because a constexpr
// std::initializer_list cannot be shared between host and device as some of its
// member functions take a const reference, which is unsupported by nvcc.
constexpr std::initializer_list<int> GMEM_DIMS {8, 11};
__device__ constexpr std::initializer_list<int> GMEM_DIMS_DEV{8, 11};
constexpr std::initializer_list<int> SMEM_DIMS {4, 2};
__device__ constexpr std::initializer_list<int> SMEM_DIMS_DEV{4, 2};

__device__ constexpr std::initializer_list<int> TEST_SMEM_COORDS[] = {
{0, 0},
{4, 1},
{4, 5},
{0, 5},
};

constexpr size_t gmem_len = tensor_len(GMEM_DIMS);
constexpr size_t smem_len = tensor_len(SMEM_DIMS);

__device__ int gmem_tensor[gmem_len];

int main(int, char**)
{
NV_DISPATCH_TARGET(
NV_IS_HOST, (
//Required by concurrent_agents_launch to know how many we're launching
cuda_thread_count = 512;

// Get pointer to gmem_tensor to create tensor map.
int * tensor_ptr = nullptr;
auto code = cudaGetSymbolAddress((void**)&tensor_ptr, gmem_tensor);
if (code != cudaSuccess) {
exit(1);
}
// Create tensor map
CUtensorMap local_tensor_map = map_encode(tensor_ptr, GMEM_DIMS, SMEM_DIMS);

// Copy it to device
code = cudaMemcpyToSymbol(global_fake_tensor_map, &local_tensor_map, sizeof(CUtensorMap));
if (code != cudaSuccess) {
exit(1);
}),
NV_IS_DEVICE, (
for (auto smem_coord : TEST_SMEM_COORDS) {
test<smem_len>(smem_coord, SMEM_DIMS_DEV, GMEM_DIMS_DEV, gmem_tensor, gmem_len);
}
)
);
return 0;
}
