Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

FEM Mass matrix element assembly kernel #330

Merged
14 commits merged on Jul 21, 2023
3 changes: 3 additions & 0 deletions src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,9 @@ blt_add_executable(
apps/LTIMES_NOVIEW.cpp
apps/LTIMES_NOVIEW-Seq.cpp
apps/LTIMES_NOVIEW-OMPTarget.cpp
apps/MASS3DEA.cpp
apps/MASS3DEA-Seq.cpp
apps/MASS3DEA-OMPTarget.cpp
apps/MASS3DPA.cpp
apps/MASS3DPA-Seq.cpp
apps/MASS3DPA-OMPTarget.cpp
Expand Down
6 changes: 6 additions & 0 deletions src/apps/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,12 @@ blt_add_library(
LTIMES_NOVIEW-Cuda.cpp
LTIMES_NOVIEW-OMP.cpp
LTIMES_NOVIEW-OMPTarget.cpp
MASS3DEA.cpp
MASS3DEA-Cuda.cpp
MASS3DEA-Hip.cpp
MASS3DEA-Seq.cpp
MASS3DEA-OMP.cpp
MASS3DEA-OMPTarget.cpp
MASS3DPA.cpp
MASS3DPA-Cuda.cpp
MASS3DPA-Hip.cpp
Expand Down
183 changes: 183 additions & 0 deletions src/apps/MASS3DEA-Cuda.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,183 @@
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
// and RAJA Performance Suite project contributors.
// See the RAJAPerf/LICENSE file for details.
//
// SPDX-License-Identifier: (BSD-3-Clause)
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//

#include "MASS3DEA.hpp"

#include "RAJA/RAJA.hpp"

#if defined(RAJA_ENABLE_CUDA)

#include "common/CudaDataUtils.hpp"

#include <iostream>

namespace rajaperf {
namespace apps {

// Element-assembly kernel for the 3D FEM mass matrix.
//
// Launch layout: one thread block per element (blockIdx.x == element id e),
// threads arranged as (MEA_D1D, MEA_D1D, MEA_D1D). The MASS3DEA_* macros
// (see MASS3DEA.hpp) encapsulate shared-memory staging and the per-element
// assembly math; GPU_FOREACH_THREAD is a block-strided loop over the given
// thread dimension, so extents larger than the block dimension are still
// covered correctly.
template < size_t block_size >
__launch_bounds__(block_size)
__global__ void Mass3DEA(const Real_ptr B, const Real_ptr D, Real_ptr M) {

  const int e = blockIdx.x;

  // Per-block shared-memory declarations (see MASS3DEA.hpp).
  MASS3DEA_0_GPU

  // Stage the basis matrix B into shared memory. The z extent of 1 limits
  // the copy to a single plane of threads (threadIdx.z == 0).
  GPU_FOREACH_THREAD(iz, z, 1) {
    GPU_FOREACH_THREAD(d, x, MEA_D1D) {
      GPU_FOREACH_THREAD(q, y, MEA_Q1D) {
        MASS3DEA_1
      }
    }
  }

  // NOTE(review): any barrier needed between the staging loop above and the
  // quadrature loop below is presumed to live inside MASS3DEA_2_GPU — verify
  // against the macro definition in MASS3DEA.hpp.
  MASS3DEA_2_GPU

  // Accumulate contributions at each quadrature point (k1, k2, k3).
  GPU_FOREACH_THREAD(k1, x, MEA_Q1D) {
    GPU_FOREACH_THREAD(k2, y, MEA_Q1D) {
      GPU_FOREACH_THREAD(k3, z, MEA_Q1D) {
        MASS3DEA_3
      }
    }
  }

  // Make all quadrature-point contributions visible before the final
  // assembly into the element matrix M.
  __syncthreads();

  // Write the assembled element matrix entries (i1, i2, i3).
  GPU_FOREACH_THREAD(i1, x, MEA_D1D) {
    GPU_FOREACH_THREAD(i2, y, MEA_D1D) {
      GPU_FOREACH_THREAD(i3, z, MEA_D1D) {
        MASS3DEA_4
      }
    }
  }

}

// Run the MASS3DEA kernel for the requested CUDA variant.
//
// Base_CUDA launches the hand-written Mass3DEA kernel with one block per
// element and (MEA_D1D)^3 threads per block. RAJA_CUDA expresses the same
// loop structure through RAJA::launch with one team per element and a
// matching thread box; GPU errors from the base launch are surfaced via
// cudaGetLastError().
template < size_t block_size >
void MASS3DEA::runCudaVariantImpl(VariantID vid) {
  const Index_type run_reps = getRunReps();

  MASS3DEA_DATA_SETUP;

  switch (vid) {

  case Base_CUDA: {

    // One block per element; a (MEA_D1D)^3 thread box. The kernel's
    // block-strided loops cover the larger MEA_Q1D extents.
    dim3 nthreads_per_block(MEA_D1D, MEA_D1D, MEA_D1D);

    startTimer();
    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {

      Mass3DEA<block_size><<<NE, nthreads_per_block>>>(B, D, M);

      cudaErrchk( cudaGetLastError() );
    }
    stopTimer();

    break;
  }

  case RAJA_CUDA: {

    constexpr bool async = true;

    using launch_policy = RAJA::LaunchPolicy<RAJA::cuda_launch_t<async, MEA_D1D*MEA_D1D*MEA_D1D>>;

    using outer_x = RAJA::LoopPolicy<RAJA::cuda_block_x_direct>;

    using inner_x = RAJA::LoopPolicy<RAJA::cuda_thread_x_loop>;

    using inner_y = RAJA::LoopPolicy<RAJA::cuda_thread_y_loop>;

    using inner_z = RAJA::LoopPolicy<RAJA::cuda_thread_z_loop>;

    startTimer();
    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {

      RAJA::launch<launch_policy>(
          RAJA::LaunchParams(RAJA::Teams(NE),
                             RAJA::Threads(MEA_D1D, MEA_D1D, MEA_D1D)),
          [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) {

            // One team per element.
            RAJA::loop<outer_x>(ctx, RAJA::RangeSegment(0, NE),
              [&](int e) {

                MASS3DEA_0

                // Stage the basis matrix; the z range of 1 restricts the
                // copy to a single plane of threads.
                RAJA::loop<inner_z>(ctx, RAJA::RangeSegment(0, 1),
                  [&](int ) {
                    RAJA::loop<inner_x>(ctx, RAJA::RangeSegment(0, MEA_D1D),
                      [&](int d) {
                        RAJA::loop<inner_y>(ctx, RAJA::RangeSegment(0, MEA_Q1D),
                          [&](int q) {
                            MASS3DEA_1
                          }
                        ); // RAJA::loop<inner_y>
                      }
                    ); // RAJA::loop<inner_x>
                  }
                ); // RAJA::loop<inner_z>

                MASS3DEA_2

                // Quadrature-point accumulation.
                RAJA::loop<inner_x>(ctx, RAJA::RangeSegment(0, MEA_Q1D),
                  [&](int k1) {
                    RAJA::loop<inner_y>(ctx, RAJA::RangeSegment(0, MEA_Q1D),
                      [&](int k2) {
                        RAJA::loop<inner_z>(ctx, RAJA::RangeSegment(0, MEA_Q1D),
                          [&](int k3) {
                            MASS3DEA_3
                          }
                        ); // RAJA::loop<inner_z>
                      }
                    ); // RAJA::loop<inner_y>
                  }
                ); // RAJA::loop<inner_x>

                // Block-wide barrier before assembling into M.
                ctx.teamSync();

                // Final write of the element matrix entries.
                RAJA::loop<inner_x>(ctx, RAJA::RangeSegment(0, MEA_D1D),
                  [&](int i1) {
                    RAJA::loop<inner_y>(ctx, RAJA::RangeSegment(0, MEA_D1D),
                      [&](int i2) {
                        RAJA::loop<inner_z>(ctx, RAJA::RangeSegment(0, MEA_D1D),
                          [&](int i3) {
                            MASS3DEA_4
                          }
                        ); // RAJA::loop<inner_z>
                      }
                    ); // RAJA::loop<inner_y>
                  }
                ); // RAJA::loop<inner_x>

              } // lambda (e)
            ); // RAJA::loop<outer_x>

          } // outer lambda (ctx)
      ); // RAJA::launch

    } // loop over kernel reps
    stopTimer();

    break;
  }

  default: {

    getCout() << "\n MASS3DEA : Unknown Cuda variant id = " << vid << std::endl;
    break;
  }
  }
}

RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(MASS3DEA, Cuda)

} // end namespace apps
} // end namespace rajaperf

#endif // RAJA_ENABLE_CUDA
185 changes: 185 additions & 0 deletions src/apps/MASS3DEA-Hip.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,185 @@
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC
// and RAJA Performance Suite project contributors.
// See the RAJAPerf/LICENSE file for details.
//
// SPDX-License-Identifier: (BSD-3-Clause)
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//

#include "MASS3DEA.hpp"

#include "RAJA/RAJA.hpp"

#if defined(RAJA_ENABLE_HIP)

#include "common/HipDataUtils.hpp"

#include <iostream>

namespace rajaperf {
namespace apps {

// Element-assembly kernel for the 3D FEM mass matrix (HIP build).
//
// Launch layout: one thread block per element (blockIdx.x == element id e),
// threads arranged as (MEA_D1D, MEA_D1D, MEA_D1D). The MASS3DEA_* macros
// (see MASS3DEA.hpp) encapsulate shared-memory staging and the per-element
// assembly math; GPU_FOREACH_THREAD is a block-strided loop over the given
// thread dimension, so extents larger than the block dimension are still
// covered correctly.
template < size_t block_size >
__launch_bounds__(block_size)
__global__ void Mass3DEA(const Real_ptr B, const Real_ptr D, Real_ptr M) {

  const int e = blockIdx.x;

  // Per-block shared-memory declarations (see MASS3DEA.hpp).
  MASS3DEA_0_GPU

  // Stage the basis matrix B into shared memory. The z extent of 1 limits
  // the copy to a single plane of threads (threadIdx.z == 0).
  GPU_FOREACH_THREAD(iz, z, 1) {
    GPU_FOREACH_THREAD(d, x, MEA_D1D) {
      GPU_FOREACH_THREAD(q, y, MEA_Q1D) {
        MASS3DEA_1
      }
    }
  }

  // NOTE(review): any barrier needed between the staging loop above and the
  // quadrature loop below is presumed to live inside MASS3DEA_2_GPU — verify
  // against the macro definition in MASS3DEA.hpp.
  MASS3DEA_2_GPU

  // Accumulate contributions at each quadrature point (k1, k2, k3).
  GPU_FOREACH_THREAD(k1, x, MEA_Q1D) {
    GPU_FOREACH_THREAD(k2, y, MEA_Q1D) {
      GPU_FOREACH_THREAD(k3, z, MEA_Q1D) {
        MASS3DEA_3
      }
    }
  }

  // Make all quadrature-point contributions visible before the final
  // assembly into the element matrix M.
  __syncthreads();

  // Write the assembled element matrix entries (i1, i2, i3).
  GPU_FOREACH_THREAD(i1, x, MEA_D1D) {
    GPU_FOREACH_THREAD(i2, y, MEA_D1D) {
      GPU_FOREACH_THREAD(i3, z, MEA_D1D) {
        MASS3DEA_4
      }
    }
  }

}

// Run the MASS3DEA kernel for the requested HIP variant.
//
// Base_HIP launches the hand-written Mass3DEA kernel via hipLaunchKernelGGL
// with one block per element and (MEA_D1D)^3 threads per block. RAJA_HIP
// expresses the same loop structure through RAJA::launch with one team per
// element and a matching thread box; GPU errors from the base launch are
// surfaced via hipGetLastError().
template < size_t block_size >
void MASS3DEA::runHipVariantImpl(VariantID vid) {
  const Index_type run_reps = getRunReps();

  MASS3DEA_DATA_SETUP;

  switch (vid) {

  case Base_HIP: {

    // One block per element; a (MEA_D1D)^3 thread box. The kernel's
    // block-strided loops cover the larger MEA_Q1D extents.
    dim3 nblocks(NE);
    dim3 nthreads_per_block(MEA_D1D, MEA_D1D, MEA_D1D);

    startTimer();
    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {

      hipLaunchKernelGGL((Mass3DEA<block_size>), dim3(nblocks), dim3(nthreads_per_block), 0, 0,
                         B, D, M);

      hipErrchk( hipGetLastError() );
    }
    stopTimer();

    break;
  }

  case RAJA_HIP: {

    constexpr bool async = true;

    using launch_policy = RAJA::LaunchPolicy<RAJA::hip_launch_t<async, MEA_D1D*MEA_D1D*MEA_D1D>>;

    using outer_x = RAJA::LoopPolicy<RAJA::hip_block_x_direct>;

    using inner_x = RAJA::LoopPolicy<RAJA::hip_thread_x_loop>;

    using inner_y = RAJA::LoopPolicy<RAJA::hip_thread_y_loop>;

    using inner_z = RAJA::LoopPolicy<RAJA::hip_thread_z_loop>;

    startTimer();
    for (RepIndex_type irep = 0; irep < run_reps; ++irep) {

      RAJA::launch<launch_policy>(
          RAJA::LaunchParams(RAJA::Teams(NE),
                             RAJA::Threads(MEA_D1D, MEA_D1D, MEA_D1D)),
          [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) {

            // One team per element.
            RAJA::loop<outer_x>(ctx, RAJA::RangeSegment(0, NE),
              [&](int e) {

                MASS3DEA_0

                // Stage the basis matrix; the z range of 1 restricts the
                // copy to a single plane of threads.
                RAJA::loop<inner_z>(ctx, RAJA::RangeSegment(0, 1),
                  [&](int ) {
                    RAJA::loop<inner_x>(ctx, RAJA::RangeSegment(0, MEA_D1D),
                      [&](int d) {
                        RAJA::loop<inner_y>(ctx, RAJA::RangeSegment(0, MEA_Q1D),
                          [&](int q) {
                            MASS3DEA_1
                          }
                        ); // RAJA::loop<inner_y>
                      }
                    ); // RAJA::loop<inner_x>
                  }
                ); // RAJA::loop<inner_z>

                MASS3DEA_2

                // Quadrature-point accumulation.
                RAJA::loop<inner_x>(ctx, RAJA::RangeSegment(0, MEA_Q1D),
                  [&](int k1) {
                    RAJA::loop<inner_y>(ctx, RAJA::RangeSegment(0, MEA_Q1D),
                      [&](int k2) {
                        RAJA::loop<inner_z>(ctx, RAJA::RangeSegment(0, MEA_Q1D),
                          [&](int k3) {
                            MASS3DEA_3
                          }
                        ); // RAJA::loop<inner_z>
                      }
                    ); // RAJA::loop<inner_y>
                  }
                ); // RAJA::loop<inner_x>

                // Block-wide barrier before assembling into M.
                ctx.teamSync();

                // Final write of the element matrix entries.
                RAJA::loop<inner_x>(ctx, RAJA::RangeSegment(0, MEA_D1D),
                  [&](int i1) {
                    RAJA::loop<inner_y>(ctx, RAJA::RangeSegment(0, MEA_D1D),
                      [&](int i2) {
                        RAJA::loop<inner_z>(ctx, RAJA::RangeSegment(0, MEA_D1D),
                          [&](int i3) {
                            MASS3DEA_4
                          }
                        ); // RAJA::loop<inner_z>
                      }
                    ); // RAJA::loop<inner_y>
                  }
                ); // RAJA::loop<inner_x>

              } // lambda (e)
            ); // RAJA::loop<outer_x>

          } // outer lambda (ctx)
      ); // RAJA::launch

    } // loop over kernel reps
    stopTimer();

    break;
  }

  default: {

    getCout() << "\n MASS3DEA : Unknown Hip variant id = " << vid << std::endl;
    break;
  }
  }
}

RAJAPERF_GPU_BLOCK_SIZE_TUNING_DEFINE_BIOLERPLATE(MASS3DEA, Hip)

} // end namespace apps
} // end namespace rajaperf

#endif // RAJA_ENABLE_HIP
Loading