//===----------------------------------------------------------------------===//
//
// Part of libcu++, the C++ Standard Library for your entire system,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//
//
// UNSUPPORTED: libcpp-has-no-threads
// UNSUPPORTED: pre-sm-90
// <cuda/barrier>
#include <cstdio>

#include "cp_async_bulk_tensor_generic.h"
// Define the size of contiguous tensor in global and shared memory.
//
// Note that the first dimension is the one with stride 1. This one must be a
// multiple of 4 to ensure that each new dimension starts at a 16-byte aligned
// offset.
//
// We have a separate variable for host and device because a constexpr
// std::initializer_list cannot be shared between host and device as some of its
// member functions take a const reference, which is unsupported by nvcc.
constexpr std::initializer_list<int> GMEM_DIMS {8, 11};
__device__ constexpr std::initializer_list<int> GMEM_DIMS_DEV{8, 11};
constexpr std::initializer_list<int> SMEM_DIMS {4, 2};
__device__ constexpr std::initializer_list<int> SMEM_DIMS_DEV{4, 2};
// Tile coordinates at which the device side exercises the bulk tensor copy.
// Presumably ordered like the dims above (first entry is the stride-1
// dimension); each SMEM_DIMS-sized box placed at one of these offsets stays
// inside GMEM_DIMS, covering both the origin and edge-adjacent tiles —
// NOTE(review): confirm the coordinate convention against
// cp_async_bulk_tensor_generic.h.
__device__ constexpr std::initializer_list<int> TEST_SMEM_COORDS[] = {
    {0, 0},
    {4, 1},
    {4, 5},
    {0, 5},
};
// Total element counts derived from the dimension lists above
// (tensor_len comes from cp_async_bulk_tensor_generic.h).
constexpr size_t gmem_len = tensor_len(GMEM_DIMS);
constexpr size_t smem_len = tensor_len(SMEM_DIMS);
// Global-memory tensor backing the test; its device address is looked up on
// the host in main() to build the tensor map.
__device__ int gmem_tensor[gmem_len];
// Test driver. The host branch prepares the CUtensorMap describing
// gmem_tensor and copies it to the device; the device branch (launched by the
// test harness with cuda_thread_count threads, requires SM90+) runs the bulk
// tensor copy at each coordinate in TEST_SMEM_COORDS.
int main(int, char**)
{
  NV_DISPATCH_TARGET(
    NV_IS_HOST, (
      // Required by concurrent_agents_launch to know how many we're launching
      cuda_thread_count = 512;

      // The tensor map must encode the *device* address of gmem_tensor, so
      // look it up through the runtime rather than taking &gmem_tensor.
      int * tensor_ptr = nullptr;
      auto code = cudaGetSymbolAddress((void**)&tensor_ptr, gmem_tensor);
      if (code != cudaSuccess) {
        // Report the failure instead of exiting silently; the non-zero exit
        // code alone gives no clue which call failed or why.
        std::fprintf(stderr, "cudaGetSymbolAddress failed: %s\n", cudaGetErrorString(code));
        exit(1);
      }

      // Create tensor map describing GMEM_DIMS with SMEM_DIMS-sized boxes.
      CUtensorMap local_tensor_map = map_encode(tensor_ptr, GMEM_DIMS, SMEM_DIMS);

      // Copy it to device so kernels can reference it.
      code = cudaMemcpyToSymbol(global_fake_tensor_map, &local_tensor_map, sizeof(CUtensorMap));
      if (code != cudaSuccess) {
        std::fprintf(stderr, "cudaMemcpyToSymbol failed: %s\n", cudaGetErrorString(code));
        exit(1);
      }),
    NV_IS_DEVICE, (
      // Exercise the copy at every test coordinate, including edge tiles.
      for (auto smem_coord : TEST_SMEM_COORDS) {
        test<smem_len>(smem_coord, SMEM_DIMS_DEV, GMEM_DIMS_DEV, gmem_tensor, gmem_len);
      }
    )
  );
  return 0;
}