Generalize GPU indexing to add global indexing #1334

Merged: 55 commits (Jun 23, 2023)
Commits (55)
43d2211
Add HipDimIdxT and simplify HipDimHelper
MrBurmark Sep 22, 2022
848ad4f
Add Hip Indexing policies and generic statements
MrBurmark Sep 22, 2022
1fe262b
Add hip global indexing
MrBurmark Dec 6, 2022
53f4ec1
add more hip testing
MrBurmark Dec 6, 2022
d3bab9e
transform cuda for policies
MrBurmark Dec 6, 2022
2c56cd3
Add more cuda testing
MrBurmark Dec 6, 2022
d0d6f6a
Work on implementing for_all with IndexMappers
MrBurmark Dec 14, 2022
3722f9c
Add non-static members to index mappers
MrBurmark Dec 28, 2022
a100ccb
Move occupancy related functions into MemUtils
MrBurmark Dec 28, 2022
4acd5b6
Add occupancy calculator classes
MrBurmark Dec 28, 2022
b24ac95
Use LaunchConfig in hip_exec in HipKernelExt
MrBurmark Dec 28, 2022
4025847
Change type_traits impl
MrBurmark Dec 28, 2022
5b99a84
Use HipForall policies with hip_exec
MrBurmark Dec 28, 2022
1881767
Use HipKernel policies with kernel and launch
MrBurmark Dec 28, 2022
69f04a3
Rework policies to be more general
MrBurmark May 19, 2023
1c6b458
Split forone into policies and exec impl
MrBurmark May 23, 2023
449b5b6
Add hip tests for global indexer
MrBurmark May 23, 2023
1af5b20
Make abstracted memory routines
MrBurmark May 24, 2023
6cdac0e
Use resources to allocate
MrBurmark May 24, 2023
0f76d80
Move global indexing unit test
MrBurmark May 24, 2023
4f1cdbc
Merge branch 'develop' into feature/burmark1/KernelGpuGlobalIndexing
rhornung67 May 24, 2023
57eb6a6
Collapse policy lists for unit tests
MrBurmark May 25, 2023
a5a4b2d
fixup cmake from previous change
MrBurmark May 25, 2023
c574f15
edits for consistency with cuda files
MrBurmark May 26, 2023
1296e4b
Fix hip occupancy calc with older rocm versions
MrBurmark May 26, 2023
d2c3133
rename hip explicit policies to avoid confusion with cuda
MrBurmark May 26, 2023
fa23761
remove unused hip kernel policies
MrBurmark May 26, 2023
0fc059f
Make hip kernel launch helper consistent with cuda
MrBurmark May 26, 2023
872cb37
change base of hip kernel policies to explicit version
MrBurmark May 26, 2023
9328fef
Do not set dims at all for sequential in KernelDimensionCalculator
MrBurmark May 26, 2023
91efeee
Reorder mask kernel policy implementations
MrBurmark May 26, 2023
871d4b2
Copy hip changes to cuda
MrBurmark May 26, 2023
f6bb81a
Fix get_resource specializations
MrBurmark May 26, 2023
774dcda
fixup cuda typo in indexing test
MrBurmark May 26, 2023
a8ef589
Avoid auto in IndexGlobal to appease nvcc
MrBurmark May 26, 2023
13027e9
Merge branch 'feature/burmark1/KernelGpuGlobalIndexing' of github.com…
MrBurmark May 26, 2023
ce0d023
Merge remote-tracking branch 'origin/develop' into feature/burmark1/K…
MrBurmark May 26, 2023
fd0233f
add size policies for thread and block too
MrBurmark May 26, 2023
6258e68
start CI
MrBurmark May 30, 2023
2d53784
Use have_work more consistently
MrBurmark May 30, 2023
1ac1440
Add syncable loop policy aliases
MrBurmark May 30, 2023
266a576
Merge remote-tracking branch 'origin/develop' into feature/burmark1/K…
MrBurmark Jun 6, 2023
1a4842b
Merge remote-tracking branch 'origin/develop' into feature/burmark1/K…
MrBurmark Jun 12, 2023
bac379b
remove commented prints
MrBurmark Jun 12, 2023
f3b0517
Merge branch 'develop' into feature/burmark1/KernelGpuGlobalIndexing
rhornung67 Jun 15, 2023
144d5ae
Merge branch 'develop' into feature/burmark1/KernelGpuGlobalIndexing
rhornung67 Jun 15, 2023
fdb34eb
Merge branch 'develop' into feature/burmark1/KernelGpuGlobalIndexing
rhornung67 Jun 19, 2023
cf8ca34
Merge branch 'develop' into feature/burmark1/KernelGpuGlobalIndexing
rhornung67 Jun 19, 2023
7f04cf8
Merge remote-tracking branch 'origin/develop' into feature/burmark1/K…
MrBurmark Jun 21, 2023
d574bb9
Merge branch 'develop' into feature/burmark1/KernelGpuGlobalIndexing
rhornung67 Jun 21, 2023
23e4729
Merge branch 'develop' into feature/burmark1/KernelGpuGlobalIndexing
rhornung67 Jun 22, 2023
ad1fb89
Comment ForallDimensionCalculator
MrBurmark Jun 22, 2023
cb2d03e
Improve static_asserts in ForallDimensionCalculator
MrBurmark Jun 22, 2023
fd27566
Use named ints and fix race conditions
MrBurmark Jun 22, 2023
4132a3c
Merge branch 'develop' into feature/burmark1/KernelGpuGlobalIndexing
rhornung67 Jun 23, 2023
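
For context, the "global indexing" this PR generalizes is the standard GPU pattern of composing a single grid-wide index from the block index and the thread index within the block. Below is a minimal CUDA sketch of that pattern; it is illustrative only, and the axpy kernel is a hypothetical example, not RAJA's API.

// Illustrative sketch: 1D global indexing. Each thread derives a unique
// grid-wide (global) index from its block and thread coordinates.
__global__ void axpy(double a, const double* x, double* y, int n)
{
  int i = blockIdx.x * blockDim.x + threadIdx.x;  // global index
  if (i < n) {  // guard: the grid may span more threads than elements
    y[i] = a * x[i] + y[i];
  }
}

Judging by the commit list, the policies added here let loops map to the thread coordinate, the block coordinate, or this combined global coordinate along each dimension.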
189 changes: 187 additions & 2 deletions include/RAJA/policy/cuda/MemUtils_CUDA.hpp
@@ -26,11 +26,10 @@
#include <cassert>
#include <cstddef>
#include <cstdio>
#include <limits>
#include <type_traits>
#include <unordered_map>

#include "nvToolsExt.h"

#include "RAJA/util/basic_mempool.hpp"
#include "RAJA/util/mutex.hpp"
#include "RAJA/util/types.hpp"
@@ -40,6 +39,10 @@
#include "RAJA/policy/cuda/policy.hpp"
#include "RAJA/policy/cuda/raja_cudaerrchk.hpp"

#if defined(RAJA_ENABLE_NV_TOOLS_EXT)
#include "nvToolsExt.h"
#endif

namespace RAJA
{

@@ -291,6 +294,188 @@ cudaDeviceProp& device_prop()
  return prop;
}


struct CudaFixedMaxBlocksData
{
  int multiProcessorCount;
  int maxThreadsPerMultiProcessor;
};

RAJA_INLINE
size_t cuda_max_blocks(size_t block_size)
{
  static CudaFixedMaxBlocksData data = []() {
    cudaDeviceProp& prop = cuda::device_prop();
    return CudaFixedMaxBlocksData{prop.multiProcessorCount,
                                  prop.maxThreadsPerMultiProcessor};
  }();

  size_t max_blocks = data.multiProcessorCount *
                      (data.maxThreadsPerMultiProcessor / block_size);

  return max_blocks;
}
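
// Editorial worked example (not part of this diff): on a device with 80
// multiprocessors and 2048 max threads per multiprocessor, block_size = 256
// gives max_blocks = 80 * (2048 / 256) = 640 resident blocks.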

struct CudaOccMaxBlocksThreadsData
{
  int prev_shmem_size;
  int max_blocks;
  int max_threads;
};

template < typename RAJA_UNUSED_ARG(UniqueMarker), typename Func >
RAJA_INLINE
void cuda_occupancy_max_blocks_threads(Func&& func, int shmem_size,
                                       size_t &max_blocks, size_t &max_threads)
{
  static constexpr int uninitialized = -1;
  static thread_local CudaOccMaxBlocksThreadsData data = {
      uninitialized, uninitialized, uninitialized};

  if (data.prev_shmem_size != shmem_size) {

    cudaErrchk(cudaOccupancyMaxPotentialBlockSize(
        &data.max_blocks, &data.max_threads, func, shmem_size));

    data.prev_shmem_size = shmem_size;

  }

  max_blocks = data.max_blocks;
  max_threads = data.max_threads;

}

struct CudaOccMaxBlocksFixedThreadsData
{
  int prev_shmem_size;
  int max_blocks;
  int multiProcessorCount;
};

template < typename RAJA_UNUSED_ARG(UniqueMarker), size_t num_threads, typename Func >
RAJA_INLINE
void cuda_occupancy_max_blocks(Func&& func, int shmem_size,
                               size_t &max_blocks)
{
  static constexpr int uninitialized = -1;
  static thread_local CudaOccMaxBlocksFixedThreadsData data = {
      uninitialized, uninitialized, uninitialized};

  if (data.prev_shmem_size != shmem_size) {

    cudaErrchk(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
        &data.max_blocks, func, num_threads, shmem_size));

    if (data.multiProcessorCount == uninitialized) {

      data.multiProcessorCount = cuda::device_prop().multiProcessorCount;

    }

    data.max_blocks *= data.multiProcessorCount;

    data.prev_shmem_size = shmem_size;

  }

  max_blocks = data.max_blocks;

}

struct CudaOccMaxBlocksVariableThreadsData
{
  int prev_shmem_size;
  int prev_num_threads;
  int max_blocks;
  int multiProcessorCount;
};

template < typename RAJA_UNUSED_ARG(UniqueMarker), typename Func >
RAJA_INLINE
void cuda_occupancy_max_blocks(Func&& func, int shmem_size,
                               size_t &max_blocks, size_t num_threads)
{
  static constexpr int uninitialized = 0;
  static thread_local CudaOccMaxBlocksVariableThreadsData data = {
      uninitialized, uninitialized, uninitialized, uninitialized};

  if ( data.prev_shmem_size != shmem_size ||
       data.prev_num_threads != num_threads ) {

    int tmp_max_blocks;
    cudaErrchk(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
        &tmp_max_blocks, func, static_cast<int>(num_threads), shmem_size));
    data.max_blocks = tmp_max_blocks;

    if (data.multiProcessorCount == uninitialized) {

      data.multiProcessorCount = cuda::device_prop().multiProcessorCount;

    }

    data.max_blocks *= data.multiProcessorCount;

    data.prev_shmem_size = shmem_size;
    data.prev_num_threads = num_threads;

  }

  max_blocks = data.max_blocks;

}

struct CudaOccupancyDefaults
{
  CudaOccupancyDefaults(const void* RAJA_UNUSED_ARG(func))
  { }

  template < typename IdxT >
  inline auto get_max_grid_size(size_t RAJA_UNUSED_ARG(dynamic_shmem_size),
                                IdxT RAJA_UNUSED_ARG(block_size)) const
  {
    return std::numeric_limits<IdxT>::max();
  }

  template < typename IdxT = cuda_dim_member_t >
  inline auto get_max_block_size_and_grid_size(size_t RAJA_UNUSED_ARG(dynamic_shmem_size)) const
  {
    return std::make_pair(static_cast<IdxT>(::RAJA::policy::cuda::MAX_BLOCK_SIZE),
                          std::numeric_limits<IdxT>::max());
  }
};

template < typename UniqueMarker >
struct CudaOccupancyCalculator
{
  CudaOccupancyCalculator(const void* func)
    : m_func(func)
  { }

  template < typename IdxT >
  inline auto get_max_grid_size(size_t dynamic_shmem_size, IdxT block_size) const
  {
    size_t max_grid_size = 0;
    ::RAJA::cuda::cuda_occupancy_max_blocks<UniqueMarker>(
        m_func, dynamic_shmem_size, max_grid_size, block_size);
    return static_cast<IdxT>(max_grid_size);
  }

  template < typename IdxT = cuda_dim_member_t >
  inline auto get_max_block_size_and_grid_size(size_t dynamic_shmem_size) const
  {
    size_t max_block_size = 0;
    size_t max_grid_size = 0;
    ::RAJA::cuda::cuda_occupancy_max_blocks_threads<UniqueMarker>(
        m_func, dynamic_shmem_size, max_grid_size, max_block_size);
    return std::make_pair(static_cast<IdxT>(max_block_size),
                          static_cast<IdxT>(max_grid_size));
  }

private:
  const void* m_func;
};

} // namespace cuda

} // namespace RAJA
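
A hedged usage sketch, not code from this PR or RAJA's documented API: one way a caller might use CudaOccupancyCalculator to size a launch. The kernel my_kernel, the tag MyKernelMarker, and launch_with_occupancy are hypothetical names.

#include <algorithm>
#include <cuda_runtime.h>
#include "RAJA/policy/cuda/MemUtils_CUDA.hpp"

// Hypothetical kernel; stands in for whatever the caller launches.
__global__ void my_kernel(double* x, int n) { /* ... */ }

// Unique tag type: the occupancy helpers cache per UniqueMarker, so each
// kernel should use its own marker.
struct MyKernelMarker {};

void launch_with_occupancy(double* x, int n, cudaStream_t stream)
{
  RAJA::cuda::CudaOccupancyCalculator<MyKernelMarker> occ(
      reinterpret_cast<const void*>(&my_kernel));

  size_t dynamic_shmem = 0;

  // Query an occupancy-friendly block size and the matching grid-size cap.
  auto max_sizes = occ.get_max_block_size_and_grid_size(dynamic_shmem);
  auto block_size = max_sizes.first;

  // Use just enough blocks to cover n, capped at the occupancy limit.
  auto n_blocks = (static_cast<decltype(block_size)>(n) + block_size - 1) / block_size;
  auto grid_size = std::min(n_blocks, max_sizes.second);

  my_kernel<<<grid_size, block_size, dynamic_shmem, stream>>>(x, n);
}

CudaOccupancyDefaults exposes the same interface but skips the occupancy queries, returning the compile-time maximum block size and an effectively unbounded grid size, presumably so the two types can be used interchangeably where occupancy-based sizing is not wanted.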