Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -636,7 +636,7 @@ exit 0
target_compile_options(${a_target} PRIVATE -fopenmp)
target_link_options(${a_target} PRIVATE -fopenmp)
elseif(CMAKE_Fortran_COMPILER_ID STREQUAL "LLVMFlang")
target_compile_options(${a_target} PRIVATE -fopenmp --offload-arch=gfx90a)
target_compile_options(${a_target} PRIVATE -fopenmp --offload-arch=gfx90a -fopenmp-target-fast -fopenmp-assume-threads-oversubscription -fopenmp-assume-teams-oversubscription)
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

🧩 Analysis chain

🏁 Script executed:

#!/bin/bash
# Verify that the Fortran compiler advertises every OpenMP offload flag that the
# LLVMFlang branch of CMakeLists.txt passes to it.
#
# Environment:
#   FC  Fortran compiler to probe (defaults to `flang`).
#
# Exit status:
#   0  every flag appears in the compiler's --help output
#   1  at least one flag is missing (each missing flag is named on stderr)
#   2  the compiler binary could not be found
set -euo pipefail

fc="${FC:-flang}"

if ! command -v "$fc" >/dev/null 2>&1; then
  echo "Fortran compiler '$fc' not found; set FC to the LLVMFlang/flang binary to verify these flags." >&2
  exit 2
fi

# Capture the help text once. `|| true` keeps a non-zero exit status from
# `--help` itself from being misreported as "flags missing" under `set -e`.
help_text="$("$fc" --help 2>&1 || true)"

missing=0
for flag in \
  -fopenmp-target-fast \
  -fopenmp-assume-threads-oversubscription \
  -fopenmp-assume-teams-oversubscription \
  --offload-arch; do
  # Check each flag on its own: a single alternation pattern would succeed as
  # soon as ANY one flag is advertised. -F matches the flag as a fixed string,
  # and -e lets the pattern start with '-'.
  if ! grep -F -e "$flag" <<<"$help_text"; then
    echo "Flag '$flag' is not advertised by '$fc'." >&2
    missing=1
  fi
done

if [ "$missing" -ne 0 ]; then
  echo "One or more LLVMFlang OpenMP offload flags are not advertised by this compiler." >&2
  exit 1
fi

Repository: MFlowCode/MFC

Length of output: 152


🏁 Script executed:

# Check if CMakeLists.txt exists and get file size
wc -l CMakeLists.txt

Repository: MFlowCode/MFC

Length of output: 76


🏁 Script executed:

# Read lines 630-650 to see the context around line 639
sed -n '630,650p' CMakeLists.txt

Repository: MFlowCode/MFC

Length of output: 1484


🏁 Script executed:

# Search for GPU architecture configuration patterns in CMakeLists.txt
rg -n "offload-arch|AMDGPU|AMD_GPU|GPU_ARCH" CMakeLists.txt

Repository: MFlowCode/MFC

Length of output: 352


🏁 Script executed:

# Search for LLVMFlang or flang compiler handling.
# ripgrep uses Rust regex syntax, where alternation is a bare `|`; the escaped
# form `\|` matches a literal pipe character and therefore finds nothing here.
rg -n "LLVMFlang|flang" CMakeLists.txt

Repository: MFlowCode/MFC

Length of output: 39


🏁 Script executed:

# Search for GPU architecture configuration patterns more broadly
rg -n "GPU|ARCH|offload" CMakeLists.txt | head -30

Repository: MFlowCode/MFC

Length of output: 1453


🏁 Script executed:

# Search for MFC_* CMake options (option(...) calls and cached set(...) calls).
# ripgrep uses Rust regex syntax, where alternation is a bare `|`; the escaped
# form `\|` matches a literal pipe character and therefore finds nothing here.
rg -n "^option|^set.*CACHE" CMakeLists.txt | head -40

Repository: MFlowCode/MFC

Length of output: 39


🏁 Script executed:

# Get more context around the LLVMFlang block - lines 625-660
sed -n '625,660p' CMakeLists.txt

Repository: MFlowCode/MFC

Length of output: 2309


🏁 Script executed:

# Check if there are any other references to gfx90a or MI250X in the repo
rg -n "gfx90a|MI250X|AMD" --type-add 'cmake:*.cmake' -t cmake

Repository: MFlowCode/MFC

Length of output: 559


🏁 Script executed:

# Search for any existing CMake configuration or detection of AMD GPU architecture.
# ripgrep uses Rust regex syntax, where alternation is a bare `|`; the escaped
# form `\|` matches a literal pipe character and therefore finds nothing here.
rg -n "AMDGPU|amdgpu|ROCM|rocm|HIP" CMakeLists.txt

Repository: MFlowCode/MFC

Length of output: 39


🏁 Script executed:

# Check git history to see if these lines were modified in this PR
git log -p --follow -S "offload-arch=gfx90a" -- CMakeLists.txt | head -100

Repository: MFlowCode/MFC

Length of output: 39


🏁 Script executed:

# Get the git diff to see what actually changed in this PR
git diff HEAD~1 CMakeLists.txt | grep -A 5 -B 5 "LLVMFlang" || echo "No diff found or file unchanged"

Repository: MFlowCode/MFC

Length of output: 292


🏁 Script executed:

# Search for any mention of build type (Release/Debug) constraints on GPU flags.
# ripgrep uses Rust regex syntax, where alternation is a bare `|`; the escaped
# form `\|` matches a literal pipe character and therefore finds nothing here.
rg -n "CMAKE_BUILD_TYPE|Release|Debug" CMakeLists.txt | head -20

Repository: MFlowCode/MFC

Length of output: 39


Gate the LLVMFlang fast-offload assumptions and GPU arch.

Lines 639–640 hardcode `--offload-arch=gfx90a` for both the compile and the link step. Line 639 also unconditionally enables `-fopenmp-target-fast`, `-fopenmp-assume-threads-oversubscription`, and `-fopenmp-assume-teams-oversubscription` for every LLVMFlang build, including Debug and RelDebug. ROCm documents `-fopenmp-target-fast` as valid only when its constraints are satisfied, and as implying extra assumptions (e.g., ignoring target environment variables and applying `-O3` when no `-O*` flag is set), which can alter debugging and performance-tuning behavior. Additionally, hardcoding `--offload-arch=gfx90a` limits this path to gfx90a (MI250X-class) GPUs; the codebase's own FIXME comment at line 645 (in the GNU section) acknowledges this limitation.

Make the GPU architecture configurable through a CMake cache variable, and enable the fast-offload assumptions only for the Release build type and/or when an explicit CMake option is set.

Suggested direction
+option(MFC_AMD_OPENMP_FAST "Enable LLVMFlang AMD OpenMP fast target assumptions" ON)
+set(MFC_AMDGPU_ARCH "native" CACHE STRING "AMD GPU architecture for LLVMFlang OpenMP offload")

 elseif(CMAKE_Fortran_COMPILER_ID STREQUAL "LLVMFlang")
-    target_compile_options(${a_target} PRIVATE -fopenmp --offload-arch=gfx90a -fopenmp-target-fast -fopenmp-assume-threads-oversubscription -fopenmp-assume-teams-oversubscription)
-    target_link_options(${a_target} PRIVATE -fopenmp --offload-arch=gfx90a)
+    target_compile_options(${a_target} PRIVATE -fopenmp "--offload-arch=${MFC_AMDGPU_ARCH}")
+    target_link_options(${a_target} PRIVATE -fopenmp "--offload-arch=${MFC_AMDGPU_ARCH}")
+
+    if (MFC_AMD_OPENMP_FAST AND CMAKE_BUILD_TYPE STREQUAL "Release")
+        target_compile_options(${a_target} PRIVATE
+            -fopenmp-target-fast
+            -fopenmp-assume-threads-oversubscription
+            -fopenmp-assume-teams-oversubscription)
+    endif()
 endif()

target_link_options(${a_target} PRIVATE -fopenmp --offload-arch=gfx90a)
endif()
endif()
Expand Down
147 changes: 79 additions & 68 deletions src/simulation/m_weno.fpp
Original file line number Diff line number Diff line change
Expand Up @@ -941,7 +941,7 @@ contains
if (weno_order == 3 .or. dummy) then
#:for WENO_DIR, XYZ in [(1, 'x'), (2, 'y'), (3, 'z')]
if (weno_dir == ${WENO_DIR}$) then
$:GPU_PARALLEL_LOOP(collapse=4,private='[beta, dvd, poly, omega, alpha, tau]')
$:GPU_PARALLEL_LOOP(collapse=4,private='[beta, dvd, poly, omega, alpha, tau, q]')
do l = is3_weno%beg, is3_weno%end
do k = is2_weno%beg, is2_weno%end
do j = is1_weno%beg, is1_weno%end
Expand All @@ -962,24 +962,25 @@ contains
beta(1) = beta_coef_${XYZ}$ (j, 1, 0)*dvd(-1)*dvd(-1) + weno_eps

if (wenojs) then
alpha(0:weno_num_stencils) = d_cbL_${XYZ}$ (0:weno_num_stencils, &
& j)/(beta(0:weno_num_stencils)**2._wp)
do q = 0, weno_num_stencils
alpha(q) = d_cbL_${XYZ}$ (q, j)/(beta(q)**2._wp)
end do
else if (mapped_weno) then
alpha(0:weno_num_stencils) = d_cbL_${XYZ}$ (0:weno_num_stencils, &
& j)/(beta(0:weno_num_stencils)**2._wp)
do q = 0, weno_num_stencils
alpha(q) = d_cbL_${XYZ}$ (q, j)/(beta(q)**2._wp)
end do
omega = alpha/sum(alpha)
alpha(0:weno_num_stencils) = (d_cbL_${XYZ}$ (0:weno_num_stencils, &
& j)*(1._wp + d_cbL_${XYZ}$ (0:weno_num_stencils, &
& j) - 3._wp*omega(0:weno_num_stencils)) + omega(0:weno_num_stencils)**2._wp) &
& *(omega(0:weno_num_stencils)/(d_cbL_${XYZ}$ (0:weno_num_stencils, &
& j)**2._wp + omega(0:weno_num_stencils)*(1._wp &
& - 2._wp*d_cbL_${XYZ}$ (0:weno_num_stencils,j))))
do q = 0, weno_num_stencils
alpha(q) = (d_cbL_${XYZ}$ (q, j)*(1._wp + d_cbL_${XYZ}$ (q, &
& j) - 3._wp*omega(q)) + omega(q)**2._wp)*(omega(q)/(d_cbL_${XYZ}$ (q, &
& j)**2._wp + omega(q)*(1._wp - 2._wp*d_cbL_${XYZ}$ (q, j))))
end do
else if (wenoz) then
! Borges, et al. (2008)

tau = abs(beta(1) - beta(0))
alpha(0:weno_num_stencils) = d_cbL_${XYZ}$ (0:weno_num_stencils, &
& j)*(1._wp + tau/beta(0:weno_num_stencils))
do q = 0, weno_num_stencils
alpha(q) = d_cbL_${XYZ}$ (q, j)*(1._wp + tau/beta(q))
end do
end if

omega = alpha/sum(alpha)
Expand All @@ -992,21 +993,23 @@ contains
poly(1) = v_rs_ws_${XYZ}$ (j, k, l, i) + poly_coef_cbR_${XYZ}$ (j, 1, 0)*dvd(-1)

if (wenojs) then
alpha(0:weno_num_stencils) = d_cbR_${XYZ}$ (0:weno_num_stencils, &
& j)/(beta(0:weno_num_stencils)**2._wp)
do q = 0, weno_num_stencils
alpha(q) = d_cbR_${XYZ}$ (q, j)/(beta(q)**2._wp)
end do
else if (mapped_weno) then
alpha(0:weno_num_stencils) = d_cbR_${XYZ}$ (0:weno_num_stencils, &
& j)/(beta(0:weno_num_stencils)**2._wp)
do q = 0, weno_num_stencils
alpha(q) = d_cbR_${XYZ}$ (q, j)/(beta(q)**2._wp)
end do
omega = alpha/sum(alpha)
alpha(0:weno_num_stencils) = (d_cbR_${XYZ}$ (0:weno_num_stencils, &
& j)*(1._wp + d_cbR_${XYZ}$ (0:weno_num_stencils, &
& j) - 3._wp*omega(0:weno_num_stencils)) + omega(0:weno_num_stencils)**2._wp) &
& *(omega(0:weno_num_stencils)/(d_cbR_${XYZ}$ (0:weno_num_stencils, &
& j)**2._wp + omega(0:weno_num_stencils)*(1._wp &
& - 2._wp*d_cbR_${XYZ}$ (0:weno_num_stencils,j))))
do q = 0, weno_num_stencils
alpha(q) = (d_cbR_${XYZ}$ (q, j)*(1._wp + d_cbR_${XYZ}$ (q, &
& j) - 3._wp*omega(q)) + omega(q)**2._wp)*(omega(q)/(d_cbR_${XYZ}$ (q, &
& j)**2._wp + omega(q)*(1._wp - 2._wp*d_cbR_${XYZ}$ (q, j))))
end do
else if (wenoz) then
alpha(0:weno_num_stencils) = d_cbR_${XYZ}$ (0:weno_num_stencils, &
& j)*(1._wp + tau/beta(0:weno_num_stencils))
do q = 0, weno_num_stencils
alpha(q) = d_cbR_${XYZ}$ (q, j)*(1._wp + tau/beta(q))
end do
end if

omega = alpha/sum(alpha)
Expand Down Expand Up @@ -1057,18 +1060,19 @@ contains
& 1)*dvd(-1)*dvd(-2) + beta_coef_${XYZ}$ (j, 2, 2)*dvd(-2)*dvd(-2) + weno_eps

if (wenojs) then
alpha(0:weno_num_stencils) = d_cbL_${XYZ}$ (0:weno_num_stencils, &
& j)/(beta(0:weno_num_stencils)**2._wp)
do q = 0, weno_num_stencils
alpha(q) = d_cbL_${XYZ}$ (q, j)/(beta(q)**2._wp)
end do
else if (mapped_weno) then
alpha(0:weno_num_stencils) = d_cbL_${XYZ}$ (0:weno_num_stencils, &
& j)/(beta(0:weno_num_stencils)**2._wp)
do q = 0, weno_num_stencils
alpha(q) = d_cbL_${XYZ}$ (q, j)/(beta(q)**2._wp)
end do
omega = alpha/sum(alpha)
alpha(0:weno_num_stencils) = (d_cbL_${XYZ}$ (0:weno_num_stencils, &
& j)*(1._wp + d_cbL_${XYZ}$ (0:weno_num_stencils, &
& j) - 3._wp*omega(0:weno_num_stencils)) + omega(0:weno_num_stencils)**2._wp) &
& *(omega(0:weno_num_stencils)/(d_cbL_${XYZ}$ (0:weno_num_stencils, &
& j)**2._wp + omega(0:weno_num_stencils)*(1._wp &
& - 2._wp*d_cbL_${XYZ}$ (0:weno_num_stencils,j))))
do q = 0, weno_num_stencils
alpha(q) = (d_cbL_${XYZ}$ (q, j)*(1._wp + d_cbL_${XYZ}$ (q, &
& j) - 3._wp*omega(q)) + omega(q)**2._wp)*(omega(q)/(d_cbL_${XYZ}$ (q, &
& j)**2._wp + omega(q)*(1._wp - 2._wp*d_cbL_${XYZ}$ (q, j))))
end do
else if (wenoz) then
! Borges, et al. (2008)

Expand Down Expand Up @@ -1100,7 +1104,9 @@ contains
end do
end if

omega = alpha/sum(alpha)
omega(0) = alpha(0)/(alpha(0) + alpha(1) + alpha(2))
omega(1) = alpha(1)/(alpha(0) + alpha(1) + alpha(2))
omega(2) = alpha(2)/(alpha(0) + alpha(1) + alpha(2))

vL_rs_vf_${XYZ}$ (j, k, l, i) = omega(0)*poly(0) + omega(1)*poly(1) + omega(2)*poly(2)

Expand All @@ -1114,18 +1120,19 @@ contains
& 0)*dvd(-1) + poly_coef_cbR_${XYZ}$ (j, 2, 1)*dvd(-2)

if (wenojs) then
alpha(0:weno_num_stencils) = d_cbR_${XYZ}$ (0:weno_num_stencils, &
& j)/(beta(0:weno_num_stencils)**2._wp)
do q = 0, weno_num_stencils
alpha(q) = d_cbR_${XYZ}$ (q, j)/(beta(q)**2._wp)
end do
else if (mapped_weno) then
alpha(0:weno_num_stencils) = d_cbR_${XYZ}$ (0:weno_num_stencils, &
& j)/(beta(0:weno_num_stencils)**2._wp)
do q = 0, weno_num_stencils
alpha(q) = d_cbR_${XYZ}$ (q, j)/(beta(q)**2._wp)
end do
omega = alpha/sum(alpha)
alpha(0:weno_num_stencils) = (d_cbR_${XYZ}$ (0:weno_num_stencils, &
& j)*(1._wp + d_cbR_${XYZ}$ (0:weno_num_stencils, &
& j) - 3._wp*omega(0:weno_num_stencils)) + omega(0:weno_num_stencils)**2._wp) &
& *(omega(0:weno_num_stencils)/(d_cbR_${XYZ}$ (0:weno_num_stencils, &
& j)**2._wp + omega(0:weno_num_stencils)*(1._wp &
& - 2._wp*d_cbR_${XYZ}$ (0:weno_num_stencils,j))))
do q = 0, weno_num_stencils
alpha(q) = (d_cbR_${XYZ}$ (q, j)*(1._wp + d_cbR_${XYZ}$ (q, &
& j) - 3._wp*omega(q)) + omega(q)**2._wp)*(omega(q)/(d_cbR_${XYZ}$ (q, &
& j)**2._wp + omega(q)*(1._wp - 2._wp*d_cbR_${XYZ}$ (q, j))))
end do
else if (wenoz) then
$:GPU_LOOP(parallelism='[seq]')
do q = 0, weno_num_stencils
Expand All @@ -1138,7 +1145,9 @@ contains
end do
end if

omega = alpha/sum(alpha)
omega(0) = alpha(0)/(alpha(0) + alpha(1) + alpha(2))
omega(1) = alpha(1)/(alpha(0) + alpha(1) + alpha(2))
omega(2) = alpha(2)/(alpha(0) + alpha(1) + alpha(2))

vR_rs_vf_${XYZ}$ (j, k, l, i) = omega(0)*poly(0) + omega(1)*poly(1) + omega(2)*poly(2)
end do
Expand Down Expand Up @@ -1252,18 +1261,19 @@ contains
end if

if (wenojs) then
alpha(0:weno_num_stencils) = d_cbL_${XYZ}$ (0:weno_num_stencils, &
& j)/(beta(0:weno_num_stencils)**2._wp)
do q = 0, weno_num_stencils
alpha(q) = d_cbL_${XYZ}$ (q, j)/(beta(q)**2._wp)
end do
else if (mapped_weno) then
alpha(0:weno_num_stencils) = d_cbL_${XYZ}$ (0:weno_num_stencils, &
& j)/(beta(0:weno_num_stencils)**2._wp)
do q = 0, weno_num_stencils
alpha(q) = d_cbL_${XYZ}$ (q, j)/(beta(q)**2._wp)
end do
omega = alpha/sum(alpha)
alpha(0:weno_num_stencils) = (d_cbL_${XYZ}$ (0:weno_num_stencils, &
& j)*(1._wp + d_cbL_${XYZ}$ (0:weno_num_stencils, &
& j) - 3._wp*omega(0:weno_num_stencils)) + omega(0:weno_num_stencils)**2._wp) &
& *(omega(0:weno_num_stencils)/(d_cbL_${XYZ}$ (0:weno_num_stencils, &
& j)**2._wp + omega(0:weno_num_stencils)*(1._wp &
& - 2._wp*d_cbL_${XYZ}$ (0:weno_num_stencils,j))))
do q = 0, weno_num_stencils
alpha(q) = (d_cbL_${XYZ}$ (q, j)*(1._wp + d_cbL_${XYZ}$ (q, &
& j) - 3._wp*omega(q)) + omega(q)**2._wp)*(omega(q)/(d_cbL_${XYZ}$ (q, &
& j)**2._wp + omega(q)*(1._wp - 2._wp*d_cbL_${XYZ}$ (q, j))))
end do
else if (wenoz) then
! Castro, et al. (2010) Don & Borges (2013) also helps
tau = abs(beta(3) - beta(0)) ! Equation 50
Expand Down Expand Up @@ -1327,18 +1337,19 @@ contains
end if

if (wenojs) then
alpha(0:weno_num_stencils) = d_cbR_${XYZ}$ (0:weno_num_stencils, &
& j)/(beta(0:weno_num_stencils)**2._wp)
do q = 0, weno_num_stencils
alpha(q) = d_cbR_${XYZ}$ (q, j)/(beta(q)**2._wp)
end do
else if (mapped_weno) then
alpha(0:weno_num_stencils) = d_cbR_${XYZ}$ (0:weno_num_stencils, &
& j)/(beta(0:weno_num_stencils)**2._wp)
do q = 0, weno_num_stencils
alpha(q) = d_cbR_${XYZ}$ (q, j)/(beta(q)**2._wp)
end do
omega = alpha/sum(alpha)
alpha(0:weno_num_stencils) = (d_cbR_${XYZ}$ (0:weno_num_stencils, &
& j)*(1._wp + d_cbR_${XYZ}$ (0:weno_num_stencils, &
& j) - 3._wp*omega(0:weno_num_stencils)) + omega(0:weno_num_stencils)**2._wp) &
& *(omega(0:weno_num_stencils)/(d_cbR_${XYZ}$ (0:weno_num_stencils, &
& j)**2._wp + omega(0:weno_num_stencils)*(1._wp &
& - 2._wp*d_cbR_${XYZ}$ (0:weno_num_stencils,j))))
do q = 0, weno_num_stencils
alpha(q) = (d_cbR_${XYZ}$ (q, j)*(1._wp + d_cbR_${XYZ}$ (q, &
& j) - 3._wp*omega(q)) + omega(q)**2._wp)*(omega(q)/(d_cbR_${XYZ}$ (q, &
& j)**2._wp + omega(q)*(1._wp - 2._wp*d_cbR_${XYZ}$ (q, j))))
end do
else if (wenoz) then
$:GPU_LOOP(parallelism='[seq]')
do q = 0, weno_num_stencils
Expand Down
Loading