Commit 2543821 (parent: 81fa05f). This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Showing 7 changed files with 857 additions and 0 deletions.
There are no files selected for viewing
208 changes: 208 additions & 0 deletions
208
libcudacxx/.upstream-tests/test/cuda/barrier/cp_async_bulk_tensor.pass.cpp
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,208 @@ | ||
//===----------------------------------------------------------------------===// | ||
// | ||
// Part of libcu++, the C++ Standard Library for your entire system, | ||
// under the Apache License v2.0 with LLVM Exceptions. | ||
// See https://llvm.org/LICENSE.txt for license information. | ||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | ||
// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. | ||
// | ||
//===----------------------------------------------------------------------===// | ||
// | ||
// UNSUPPORTED: libcpp-has-no-threads | ||
// UNSUPPORTED: pre-sm-90 | ||
|
||
// <cuda/barrier> | ||
|
||
#include <cuda/barrier> | ||
#include <cuda/std/utility> // cuda::std::move | ||
#include "test_macros.h" // TEST_NV_DIAG_SUPPRESS | ||
|
||
// NVRTC does not support cuda.h (due to import of stdlib.h) | ||
#ifndef __CUDACC_RTC__ | ||
#include <cstdio> | ||
#include <cudaTypedefs.h> // PFN_cuTensorMapEncodeTiled, CUtensorMap | ||
#endif | ||
|
||
// Suppress warning about barrier in shared memory
TEST_NV_DIAG_SUPPRESS(static_var_with_dynamic_init)

using barrier = cuda::barrier<cuda::thread_scope_block>;
namespace cde = cuda::device::experimental;

constexpr size_t GMEM_WIDTH = 1024;   // Width of tensor (in # elements)
constexpr size_t GMEM_HEIGHT = 1024;  // Height of tensor (in # elements)
constexpr size_t gmem_len = GMEM_WIDTH * GMEM_HEIGHT;

constexpr int SMEM_WIDTH = 32;   // Width of shared memory buffer (in # elements)
constexpr int SMEM_HEIGHT = 8;   // Height of shared memory buffer (in # elements)

// Number of elements in the shared-memory tile (8 * 32 ints = 1 KiB;
// NOTE(review): the "24KB" figure below does not match this — confirm intent).
// Use 24KB of shared memory space.
static constexpr int buf_len = SMEM_HEIGHT * SMEM_WIDTH;

// Global-memory tensor that the TMA transfers read from and write back to.
__device__ int gmem_tensor[gmem_len];

// We need a type with a size. On NVRTC, cuda.h cannot be imported, so we don't
// have access to the definition of CUTensorMap (only to the declaration of CUtensorMap inside
// cuda/barrier). So we use this type instead and reinterpret_cast in the
// kernel. 128-byte size and 64-byte alignment match the driver's CUtensorMap.
struct fake_cutensormap {
    alignas(64) uint64_t opaque[16];
};
// Filled from the host with a real CUtensorMap via cudaMemcpyToSymbol.
__constant__ fake_cutensormap global_fake_tensor_map;
|
||
// Device-side test body: loads a SMEM_HEIGHT x SMEM_WIDTH tile of gmem_tensor
// (whose top-left corner is at row base_i, column base_j) into shared memory
// via a bulk tensor (TMA) copy, verifies it, updates it, writes it back, and
// verifies global memory. Requires SM90+. Expects all threads of the block to
// call it together (uses __syncthreads and a block-scope barrier).
__device__ void test(int base_i, int base_j)
{
    // The fake tensor map was populated on the host with a real CUtensorMap;
    // reinterpret it back to the driver type for the TMA calls.
    CUtensorMap *global_tensor_map = reinterpret_cast<CUtensorMap*>(&global_fake_tensor_map);

    // SETUP: fill global memory buffer with its own linear index.
    for (int i = threadIdx.x; i < gmem_len; i += blockDim.x) {
        gmem_tensor[i] = i;
    }
    // Ensure that writes to global memory are visible to others, including
    // those in the async proxy.
    __threadfence();
    __syncthreads();

    // TEST: Add i to buffer[i]
    __shared__ alignas(128) int smem_buffer[buf_len];
    __shared__ barrier bar;
    if (threadIdx.x == 0) { init(&bar, blockDim.x); }
    __syncthreads();

    // Load data: one thread issues the bulk copy and accounts for the expected
    // transaction bytes; all others just arrive.
    uint64_t token;
    if (threadIdx.x == 0) {
        // Fastest moving coordinate first.
        cde::cp_async_bulk_tensor_2d_global_to_shared(smem_buffer, global_tensor_map, base_j, base_i, bar);
        token = bar.arrive_tx(1, sizeof(smem_buffer));
    } else {
        token = bar.arrive();
    }
    bar.wait(cuda::std::move(token));

    // Check smem. BUGFIX: the inner loop previously ran to SMEM_HEIGHT (8),
    // so only the first 8 of SMEM_WIDTH (32) columns per row were verified.
    for (int i = 0; i < SMEM_HEIGHT; ++i) {
        for (int j = 0; j < SMEM_WIDTH; ++j) {
            int gmem_lin_idx = (base_i + i) * GMEM_WIDTH + base_j + j;
            int smem_lin_idx = i * SMEM_WIDTH + j;

            if (smem_buffer[smem_lin_idx] != gmem_lin_idx) {
#ifndef __CUDACC_RTC__
                printf("Failed at smem (%d, %d). Got %d. Expected %d.\n", i, j, smem_buffer[smem_lin_idx], gmem_lin_idx);
#endif
                __trap();
            }
        }
    }

    __syncthreads();

    // Update smem
    for (int i = threadIdx.x; i < buf_len; i += blockDim.x) {
        smem_buffer[i] = 2 * smem_buffer[i] + 1;
    }
    // Make the generic-proxy smem writes visible to the async proxy before
    // the bulk store reads them.
    cde::fence_proxy_async_shared_cta();
    __syncthreads();

    // Write back to global memory:
    if (threadIdx.x == 0) {
        cde::cp_async_bulk_tensor_2d_shared_to_global(global_tensor_map, base_j, base_i, smem_buffer);
        cde::cp_async_bulk_commit_group();
        cde::cp_async_bulk_wait_group_read<0>();
    }
    __threadfence();
    __syncthreads();

    // TEAR-DOWN: check that global memory is correct.
    // BUGFIX: same inner-loop bound fix as above (SMEM_WIDTH, not SMEM_HEIGHT).
    for (int i = 0; i < SMEM_HEIGHT; ++i) {
        for (int j = 0; j < SMEM_WIDTH; ++j) {
            int gmem_lin_idx = (base_i + i) * GMEM_WIDTH + base_j + j;

            if (gmem_tensor[gmem_lin_idx] != 2 * gmem_lin_idx + 1) {
                __trap();
            }
        }
    }
    __syncthreads();
}
|
||
#ifndef __CUDACC_RTC__
// Resolves the cuTensorMapEncodeTiled driver entry point at runtime via the
// CUDA runtime's driver-API interop. Exits the process on failure (e.g. a
// driver too old to expose TMA), printing a diagnostic first — previously the
// failure path exited silently, unlike every other error path in this test.
PFN_cuTensorMapEncodeTiled get_cuTensorMapEncodeTiled() {
  void* driver_ptr = nullptr;
  cudaDriverEntryPointQueryResult driver_status;
  auto code = cudaGetDriverEntryPoint("cuTensorMapEncodeTiled", &driver_ptr, cudaEnableDefault, &driver_status);
  if (code != cudaSuccess || driver_ptr == nullptr) {
    std::printf("cudaGetDriverEntryPoint(cuTensorMapEncodeTiled) failed.");
    exit(1);
  }
  return reinterpret_cast<PFN_cuTensorMapEncodeTiled>(driver_ptr);
}
#endif
|
||
int main(int, char**)
{
  NV_IF_TARGET(NV_IS_HOST,(
    // concurrent_agents_launch reads this to know how many threads to launch.
    cuda_thread_count = 512;

    // Resolve the device address of gmem_tensor for the tensor descriptor.
    int * tensor_ptr = nullptr;
    auto status = cudaGetSymbolAddress((void**)&tensor_ptr, gmem_tensor);
    if (cudaSuccess != status) {
      std::printf("getsymboladdress failed.");
      exit(1);
    }

    // Tensor-map parameters; see
    // https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TENSOR__MEMORY.html
    CUtensorMap tmap{};
    // rank: number of dimensions of the array.
    constexpr uint32_t rank = 2;
    uint64_t size[rank] = {GMEM_WIDTH, GMEM_HEIGHT};
    // stride: bytes from the first element of one row to the next;
    // must be a multiple of 16.
    uint64_t stride[rank - 1] = {GMEM_WIDTH * sizeof(int)};
    // box_size: extent of the shared-memory destination of a TMA transfer.
    uint32_t box_size[rank] = {SMEM_WIDTH, SMEM_HEIGHT};
    // elem_stride: distance between elements in units of sizeof(element);
    // e.g. 2 would load only the real half of a complex-valued tensor.
    uint32_t elem_stride[rank] = {1, 1};

    // Look up the cuTensorMapEncodeTiled driver API and build the descriptor.
    auto encode = get_cuTensorMapEncodeTiled();
    CUresult encode_result = encode(
        &tmap,                 // CUtensorMap *tensorMap,
        CUtensorMapDataType::CU_TENSOR_MAP_DATA_TYPE_INT32,
        rank,                  // cuuint32_t tensorRank,
        tensor_ptr,            // void *globalAddress,
        size,                  // const cuuint64_t *globalDim,
        stride,                // const cuuint64_t *globalStrides,
        box_size,              // const cuuint32_t *boxDim,
        elem_stride,           // const cuuint32_t *elementStrides,
        CUtensorMapInterleave::CU_TENSOR_MAP_INTERLEAVE_NONE,
        CUtensorMapSwizzle::CU_TENSOR_MAP_SWIZZLE_NONE,
        CUtensorMapL2promotion::CU_TENSOR_MAP_L2_PROMOTION_NONE,
        CUtensorMapFloatOOBfill::CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE
    );

    if (CUDA_SUCCESS != encode_result) {
      std::printf("tensormap creation failed.");
      exit(1);
    }
    // Ship the descriptor into the device-side __constant__ slot.
    status = cudaMemcpyToSymbol(global_fake_tensor_map, &tmap, sizeof(CUtensorMap));
    if (cudaSuccess != status) {
      std::printf("memcpytosymbol failed.");
      exit(1);
    }
  ));

  NV_DISPATCH_TARGET(
    NV_IS_DEVICE, (
      test(0, 0);
      test(4, 0);
      test(4, 4);
    )
  );
  return 0;
}
71 changes: 71 additions & 0 deletions
71
libcudacxx/.upstream-tests/test/cuda/barrier/cp_async_bulk_tensor_1d.pass.cpp
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,71 @@ | ||
//===----------------------------------------------------------------------===// | ||
// | ||
// Part of libcu++, the C++ Standard Library for your entire system, | ||
// under the Apache License v2.0 with LLVM Exceptions. | ||
// See https://llvm.org/LICENSE.txt for license information. | ||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | ||
// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. | ||
// | ||
//===----------------------------------------------------------------------===// | ||
// | ||
// UNSUPPORTED: libcpp-has-no-threads | ||
// UNSUPPORTED: pre-sm-90 | ||
|
||
// <cuda/barrier> | ||
|
||
#include "cp_async_bulk_tensor_generic.h" | ||
|
||
// Define the size of contiguous tensor in global and shared memory.
//
// Note that the first dimension is the one with stride 1. This one must be a
// multiple of 4 to ensure that each new dimension starts at a 16-byte aligned
// offset.
//
// We have a separate variable for host and device because a constexpr
// std::initializer_list cannot be shared between host and device as some of its
// member functions take a const reference, which is unsupported by nvcc.
constexpr std::initializer_list<int> GMEM_DIMS {256};
__device__ constexpr std::initializer_list<int> GMEM_DIMS_DEV{256};
constexpr std::initializer_list<int> SMEM_DIMS {32};
__device__ constexpr std::initializer_list<int> SMEM_DIMS_DEV{32};

// Starting coordinates (within the global tensor) of each shared-memory tile
// exercised by the device-side test loop.
__device__ constexpr std::initializer_list<int> TEST_SMEM_COORDS[] = {
    {0},
    {4},
    {8}
};

// Total element counts; tensor_len comes from cp_async_bulk_tensor_generic.h.
constexpr size_t gmem_len = tensor_len(GMEM_DIMS);
constexpr size_t smem_len = tensor_len(SMEM_DIMS);

// Global-memory tensor the TMA transfers read from and write back to.
__device__ int gmem_tensor[gmem_len];
|
||
int main(int, char**)
{
  NV_DISPATCH_TARGET(
    NV_IS_HOST, (
      // concurrent_agents_launch reads this to know how many threads to launch.
      cuda_thread_count = 512;

      // Resolve the device address of gmem_tensor for the tensor map.
      int * tensor_ptr = nullptr;
      auto status = cudaGetSymbolAddress((void**)&tensor_ptr, gmem_tensor);
      if (cudaSuccess != status) {
        exit(1);
      }
      // Encode the tensor descriptor for the global tensor / smem box.
      CUtensorMap tmap = map_encode(tensor_ptr, GMEM_DIMS, SMEM_DIMS);

      // Ship the descriptor into the device-side __constant__ slot.
      status = cudaMemcpyToSymbol(global_fake_tensor_map, &tmap, sizeof(CUtensorMap));
      if (cudaSuccess != status) {
        exit(1);
      }),
    NV_IS_DEVICE, (
      // Exercise every tile coordinate defined above.
      for (auto coords : TEST_SMEM_COORDS) {
        test<smem_len>(coords, SMEM_DIMS_DEV, GMEM_DIMS_DEV, gmem_tensor, gmem_len);
      }
    )
  );
  return 0;
}
72 changes: 72 additions & 0 deletions
72
libcudacxx/.upstream-tests/test/cuda/barrier/cp_async_bulk_tensor_2d.pass.cpp
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,72 @@ | ||
//===----------------------------------------------------------------------===// | ||
// | ||
// Part of libcu++, the C++ Standard Library for your entire system, | ||
// under the Apache License v2.0 with LLVM Exceptions. | ||
// See https://llvm.org/LICENSE.txt for license information. | ||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | ||
// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. | ||
// | ||
//===----------------------------------------------------------------------===// | ||
// | ||
// UNSUPPORTED: libcpp-has-no-threads | ||
// UNSUPPORTED: pre-sm-90 | ||
|
||
// <cuda/barrier> | ||
|
||
#include "cp_async_bulk_tensor_generic.h" | ||
|
||
// Define the size of contiguous tensor in global and shared memory.
//
// Note that the first dimension is the one with stride 1. This one must be a
// multiple of 4 to ensure that each new dimension starts at a 16-byte aligned
// offset.
//
// We have a separate variable for host and device because a constexpr
// std::initializer_list cannot be shared between host and device as some of its
// member functions take a const reference, which is unsupported by nvcc.
constexpr std::initializer_list<int> GMEM_DIMS {8, 11};
__device__ constexpr std::initializer_list<int> GMEM_DIMS_DEV{8, 11};
constexpr std::initializer_list<int> SMEM_DIMS {4, 2};
__device__ constexpr std::initializer_list<int> SMEM_DIMS_DEV{4, 2};

// Starting coordinates (within the global tensor) of each shared-memory tile
// exercised by the device-side test loop. Fastest-moving dimension first.
__device__ constexpr std::initializer_list<int> TEST_SMEM_COORDS[] = {
    {0, 0},
    {4, 1},
    {4, 5},
    {0, 5},
};

// Total element counts; tensor_len comes from cp_async_bulk_tensor_generic.h.
constexpr size_t gmem_len = tensor_len(GMEM_DIMS);
constexpr size_t smem_len = tensor_len(SMEM_DIMS);

// Global-memory tensor the TMA transfers read from and write back to.
__device__ int gmem_tensor[gmem_len];
|
||
int main(int, char**)
{
  NV_DISPATCH_TARGET(
    NV_IS_HOST, (
      // concurrent_agents_launch reads this to know how many threads to launch.
      cuda_thread_count = 512;

      // Resolve the device address of gmem_tensor for the tensor map.
      int * tensor_ptr = nullptr;
      auto status = cudaGetSymbolAddress((void**)&tensor_ptr, gmem_tensor);
      if (cudaSuccess != status) {
        exit(1);
      }
      // Encode the tensor descriptor for the global tensor / smem box.
      CUtensorMap tmap = map_encode(tensor_ptr, GMEM_DIMS, SMEM_DIMS);

      // Ship the descriptor into the device-side __constant__ slot.
      status = cudaMemcpyToSymbol(global_fake_tensor_map, &tmap, sizeof(CUtensorMap));
      if (cudaSuccess != status) {
        exit(1);
      }),
    NV_IS_DEVICE, (
      // Exercise every tile coordinate defined above.
      for (auto coords : TEST_SMEM_COORDS) {
        test<smem_len>(coords, SMEM_DIMS_DEV, GMEM_DIMS_DEV, gmem_tensor, gmem_len);
      }
    )
  );
  return 0;
}
Oops, something went wrong.