-
Notifications
You must be signed in to change notification settings - Fork 22
/
forall.hpp
128 lines (104 loc) · 3.15 KB
/
forall.hpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
//////////////////////////////////////////////////////////////////////////////
// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC and CHAI
// project contributors. See the CHAI LICENSE file for details.
//
// SPDX-License-Identifier: BSD-3-Clause
//////////////////////////////////////////////////////////////////////////////
#ifndef CHAI_forall_HPP
#define CHAI_forall_HPP
#include "chai/ArrayManager.hpp"
#include "chai/ExecutionSpaces.hpp"
#include "chai/config.hpp"
#if defined(CHAI_ENABLE_UM)
#if !defined(CHAI_THIN_GPU_ALLOCATE)
#include <cuda_runtime_api.h>
#endif
#endif
// Tag type used to dispatch forall() to the serial CPU implementation.
struct sequential {
};
#if defined(CHAI_ENABLE_CUDA) || defined(CHAI_ENABLE_HIP)
// Tag type dispatching forall() to the blocking GPU path (the overload
// synchronizes with the device after the kernel launch).
struct gpu {
};
// Tag type dispatching forall() to the asynchronous GPU path (no device
// synchronization after launch; the caller must synchronize before reading
// results on the host).
struct gpu_async {};
#endif
/*
 * \brief Serial CPU loop: invokes `body(i)` once for every index i in
 *        the half-open range [begin, end).
 *
 * An empty or inverted range (begin >= end) performs no calls.
 */
template <typename LOOP_BODY>
void forall_kernel_cpu(int begin, int end, LOOP_BODY body)
{
  int i = begin;
  while (i < end) {
    body(i);
    ++i;
  }
}
/*
 * \brief Run forall kernel on CPU.
 *
 * Sets the CHAI execution space to CPU for the duration of the loop —
 * presumably so CHAI arrays captured by `body` are made valid on the host
 * (confirm against ArrayManager semantics) — then resets the space to NONE.
 */
template <typename LOOP_BODY>
void forall(sequential, int begin, int end, LOOP_BODY body)
{
chai::ArrayManager* rm = chai::ArrayManager::getInstance();
#if defined(CHAI_ENABLE_UM)
#if !defined(CHAI_THIN_GPU_ALLOCATE)
// Unified-memory builds: wait for outstanding device work before the host
// touches data. NOTE(review): the cudaError_t return value is ignored here,
// so an earlier asynchronous failure would pass silently.
cudaDeviceSynchronize();
#endif
#endif
rm->setExecutionSpace(chai::CPU);
forall_kernel_cpu(begin, end, body);
rm->setExecutionSpace(chai::NONE);
}
#if defined(CHAI_ENABLE_CUDA) || defined(CHAI_ENABLE_HIP)
/*
 * \brief GPU kernel applying `body` to each index in [start, start + length).
 *
 * Expects a 1-D grid/block launch; each thread handles exactly one index.
 * Threads whose flat id falls at or past `length` do nothing, guarding the
 * ragged final block when `length` is not a multiple of the block size.
 */
template <typename LOOP_BODY>
__global__ void forall_kernel_gpu(int start, int length, LOOP_BODY body)
{
  const int offset = blockIdx.x * blockDim.x + threadIdx.x;
  if (offset < length) {
    body(start + offset);
  }
}
/*
 * \brief Run forall kernel on GPU without synchronizing afterwards.
 *
 * Sets the CHAI execution space to GPU around the launch, then resets it to
 * NONE. The launch is asynchronous: control returns to the host immediately,
 * so the caller must synchronize with the device before reading results.
 */
template <typename LOOP_BODY>
void forall(gpu_async, int begin, int end, LOOP_BODY&& body)
{
chai::ArrayManager* rm = chai::ArrayManager::getInstance();
rm->setExecutionSpace(chai::GPU);
#if defined(CHAI_ENABLE_CUDA)
// 32 = CUDA warp size.
size_t blockSize = 32;
#elif defined(CHAI_ENABLE_HIP)
// 64 = AMD wavefront size.
size_t blockSize = 64;
#endif
// Ceiling division so the tail of the range is covered; the kernel
// bounds-checks indices at or past (end - begin).
size_t gridSize = (end - begin + blockSize - 1) / blockSize;
#if defined(CHAI_ENABLE_GPU_SIMULATION_MODE)
// Simulation mode: run the body serially on the host instead of launching.
forall_kernel_cpu(begin, end, body);
#elif defined(CHAI_ENABLE_CUDA)
forall_kernel_gpu<<<gridSize, blockSize>>>(begin, end - begin, body);
#elif defined(CHAI_ENABLE_HIP)
hipLaunchKernelGGL(forall_kernel_gpu, dim3(gridSize), dim3(blockSize), 0, 0,
begin, end - begin, body);
#endif
// NOTE(review): no cudaGetLastError()/hipGetLastError() after the launch —
// a bad launch configuration would be dropped silently; consider checking.
rm->setExecutionSpace(chai::NONE);
}
/*
 * \brief Run forall kernel on GPU.
 *
 * Blocking variant: synchronizes with the device after the kernel launch,
 * so results are visible to the host when this function returns. Sets the
 * CHAI execution space to GPU around the launch, then resets it to NONE.
 */
template <typename LOOP_BODY>
void forall(gpu, int begin, int end, LOOP_BODY&& body)
{
chai::ArrayManager* rm = chai::ArrayManager::getInstance();
rm->setExecutionSpace(chai::GPU);
#if defined(CHAI_ENABLE_CUDA)
// 32 = CUDA warp size.
size_t blockSize = 32;
#elif defined(CHAI_ENABLE_HIP)
// 64 = AMD wavefront size.
size_t blockSize = 64;
#endif
// Ceiling division so the tail of the range is covered; the kernel
// bounds-checks indices at or past (end - begin).
size_t gridSize = (end - begin + blockSize - 1) / blockSize;
#if defined(CHAI_ENABLE_GPU_SIMULATION_MODE)
// Simulation mode: run the body serially on the host instead of launching.
forall_kernel_cpu(begin, end, body);
#elif defined(CHAI_ENABLE_CUDA)
forall_kernel_gpu<<<gridSize, blockSize>>>(begin, end - begin, body);
// NOTE(review): sync return value is unchecked, so in-kernel faults and
// launch-configuration errors surfaced here would pass silently.
cudaDeviceSynchronize();
#elif defined(CHAI_ENABLE_HIP)
hipLaunchKernelGGL(forall_kernel_gpu, dim3(gridSize), dim3(blockSize), 0, 0,
begin, end - begin, body);
// NOTE(review): sync return value is unchecked (see CUDA branch above).
hipDeviceSynchronize();
#endif
rm->setExecutionSpace(chai::NONE);
}
#endif
#endif // CHAI_forall_HPP