From 153af803cde34f5cf36c967429ea01c968b86c4c Mon Sep 17 00:00:00 2001
From: Pranay-Reddy-Kommera <pranay.kommera@gmail.com>
Date: Tue, 7 Dec 2021 00:30:59 -0800
Subject: [PATCH] Optimized parallel kernels in the scalars/scalars_mono

---
 .../dynamics/mpas_atm_time_integration.F      | 135 +++++++++++-------
 1 file changed, 80 insertions(+), 55 deletions(-)

diff --git a/src/core_atmosphere/dynamics/mpas_atm_time_integration.F b/src/core_atmosphere/dynamics/mpas_atm_time_integration.F
index 3a1a137daf..7d94891cdb 100755
--- a/src/core_atmosphere/dynamics/mpas_atm_time_integration.F
+++ b/src/core_atmosphere/dynamics/mpas_atm_time_integration.F
@@ -4155,16 +4155,19 @@ subroutine atm_advance_scalars_work( num_scalars_dummy, nCells, nVertLevels_dumm
                end do
             end do
 
-!$acc loop seq
-            do j=1,nAdvCellsForEdge(iEdge)
-               iAdvCell = advCellsForEdge(j,iEdge)
-!$acc loop vector collapse(2) private(scalar_weight)
+!!!!$acc loop seq
+!!!            do j=1,nAdvCellsForEdge(iEdge)
+!!!               iAdvCell = advCellsForEdge(j,iEdge)
 !DIR$ IVDEP
+!$acc loop vector collapse(2) private(scalar_weight,iAdvCell)
                do k=1,nVertLevels
 !DIR$ IVDEP
                   do iScalar=1,num_scalars
+!$acc loop seq
+                    do j=1,nAdvCellsForEdge(iEdge)
 !MGD OPENACC TO DO: scalar_weight is redundantly computed for each scalar
-                  scalar_weight = adv_coefs(j,iEdge) + sign(1.0_RKIND,uhAvg(k,iEdge))*adv_coefs_3rd(j,iEdge)
+                     iAdvCell = advCellsForEdge(j,iEdge)
+                     scalar_weight = adv_coefs(j,iEdge) + sign(1.0_RKIND,uhAvg(k,iEdge))*adv_coefs_3rd(j,iEdge)
                      horiz_flux_arr(iScalar,k,iEdge) = horiz_flux_arr(iScalar,k,iEdge) + scalar_weight * scalar_new(iScalar,k,iAdvCell)
                   end do
                end do
@@ -4183,9 +4186,9 @@ subroutine atm_advance_scalars_work( num_scalars_dummy, nCells, nVertLevels_dumm
 !DIR$ IVDEP
                do iScalar=1,num_scalars
 !MGD OPENACC TO DO: u_direction, u_positive, and u_negative are redundantly computed for each scalar
-               u_direction = sign(0.5_RKIND,uhAvg(k,iEdge))
-               u_positive = dvEdge(iEdge)*abs(u_direction + 0.5_RKIND)
-               u_negative = dvEdge(iEdge)*abs(u_direction - 0.5_RKIND)
+                  u_direction = sign(0.5_RKIND,uhAvg(k,iEdge))
+                  u_positive = dvEdge(iEdge)*abs(u_direction + 0.5_RKIND)
+                  u_negative = dvEdge(iEdge)*abs(u_direction - 0.5_RKIND)
                   horiz_flux_arr(iScalar,k,iEdge) = u_positive*scalar_new(iScalar,k,cell1) + u_negative*scalar_new(iScalar,k,cell2)
                end do
             end do
@@ -4200,7 +4203,6 @@ subroutine atm_advance_scalars_work( num_scalars_dummy, nCells, nVertLevels_dumm
 !  scalar update, for each column sum fluxes over horizontal edges, add physics tendency, and add vertical flux divergence in update.
 
 !$acc parallel vector_length(32)
-
 !$acc loop gang private(scalar_tend_column,iEdge,wdtn)
       do iCell=cellSolveStart,cellSolveEnd
 
@@ -4221,17 +4223,20 @@ subroutine atm_advance_scalars_work( num_scalars_dummy, nCells, nVertLevels_dumm
                end do
             end do
 
-!$acc loop seq
-            do i=1,nEdgesOnCell(iCell)
-               iEdge = edgesOnCell(i,iCell)
+!!!!$acc loop seq
+!!!            do i=1,nEdgesOnCell(iCell)
+!!!               iEdge = edgesOnCell(i,iCell)
 
                ! here we add the horizontal flux divergence into the scalar tendency.
                ! note that the scalar tendency is modified.
-!$acc loop vector collapse(2)
 !DIR$ IVDEP
+!$acc loop vector collapse(2) private(iEdge)
                do k=1,nVertLevels
 !DIR$ IVDEP
                   do iScalar=1,num_scalars
+!$acc loop seq
+                    do i=1,nEdgesOnCell(iCell)
+                        iEdge = edgesOnCell(i,iCell)
                         scalar_tend_column(iScalar,k) = scalar_tend_column(iScalar,k) &
                                - edgesOnCell_sign(i,iCell) * uhAvg(k,iEdge)*horiz_flux_arr(iScalar,k,iEdge)
                   end do
@@ -4278,8 +4283,8 @@ subroutine atm_advance_scalars_work( num_scalars_dummy, nCells, nVertLevels_dumm
             wdtn(iScalar,k) = wwAvg(k,iCell)*(fnm(k)*scalar_new(iScalar,k,iCell)+fnp(k)*scalar_new(iScalar,k-1,iCell))
          end do
 
-!$acc loop vector collapse(2) private(rho_zz_new_inv)
 !DIR$ IVDEP
+!$acc loop vector collapse(2) private(rho_zz_new_inv)
          do k=1,nVertLevels
 !DIR$ IVDEP
             do iScalar=1,num_scalars
@@ -4293,7 +4298,6 @@ subroutine atm_advance_scalars_work( num_scalars_dummy, nCells, nVertLevels_dumm
 
       end do
 !$acc end parallel
-
 !$acc end data
 
    end subroutine atm_advance_scalars_work
@@ -4536,7 +4540,7 @@ subroutine atm_advance_scalars_mono_work(block, state, nCells, nEdges, nVertLeve
 !$acc scalar_new, wdtn, scale_arr, scalar_old, flux_upwind_tmp, &
 !$acc s_min, s_max, flux_tmp, bdyMaskCell, bdyMaskEdge)
 
-!$acc parallel vector_length(32)
+!$acc parallel vector_length(128)
 !$acc loop gang vector collapse(3)
 
       do iCell=cellSolveStart,cellSolveEnd
@@ -4577,24 +4581,29 @@ subroutine atm_advance_scalars_mono_work(block, state, nCells, nEdges, nVertLeve
          end if
 
          !  begin with update of density
-!$acc parallel vector_length(32)
-!$acc loop gang vector
+!$acc parallel vector_length(128)
+!$acc loop gang vector collapse(2)
          do iCell=cellStart,cellEnd
-            rho_zz_int(:,iCell) = 0.0
+         do k = 1,nVertLevels
+            rho_zz_int(k,iCell) = 0.0
+         end do
          end do
 !$acc end parallel
 
 !$OMP BARRIER
 !$acc parallel vector_length(32)
-!$acc loop gang private(iEdge)
+!$acc loop gang vector collapse(2) private(iEdge)
          do iCell=cellSolveStart,cellSolveEnd
-!$acc loop seq
-            do i=1,nEdgesOnCell(iCell)
-               iEdge = edgesOnCell(i,iCell)
+!!!!$acc loop seq
+!!!            do i=1,nEdgesOnCell(iCell)
+!!!               iEdge = edgesOnCell(i,iCell)
    
 !DIR$ IVDEP
-!$acc loop vector
+!!!!$acc loop vector
                do k=1,nVertLevels
+!$acc loop seq
+                do i=1,nEdgesOnCell(iCell)
+                  iEdge = edgesOnCell(i,iCell)
                   rho_zz_int(k,iCell) = rho_zz_int(k,iCell) - edgesOnCell_sign(i,iCell) * uhAvg(k,iEdge) * dvEdge(iEdge) * invAreaCell(iCell)
                end do
                
@@ -4602,7 +4611,7 @@ subroutine atm_advance_scalars_mono_work(block, state, nCells, nEdges, nVertLeve
          end do
 !$acc end parallel
 
-!$acc parallel vector_length(32)
+!$acc parallel vector_length(128)
 !$acc loop gang vector collapse(2)
          do iCell=cellSolveStart,cellSolveEnd
 !DIR$ IVDEP
@@ -4617,7 +4626,7 @@ subroutine atm_advance_scalars_mono_work(block, state, nCells, nEdges, nVertLeve
       ! next, do one scalar at a time
 
       do iScalar = 1, num_scalars
-!$acc parallel vector_length(32)
+!$acc parallel vector_length(128)
 !$acc loop gang vector collapse(2)
          do iCell=cellStart,cellEnd
 !DIR$ IVDEP
@@ -4626,9 +4635,11 @@ subroutine atm_advance_scalars_mono_work(block, state, nCells, nEdges, nVertLeve
                scalar_new(k,iCell) = scalars_new(iScalar,k,iCell)
             end do
          end do
+!$acc end parallel
 
 ! ***** TEMPORARY TEST ******* WCS 20161012
-!$acc loop vector
+!$acc parallel vector_length(64)
+!$acc loop gang vector
             do k=1,nVertLevels
                scalar_old(k,nCells+1) = 0.
                scalar_new(k,nCells+1) = 0.
@@ -4846,37 +4857,44 @@ subroutine atm_advance_scalars_mono_work(block, state, nCells, nEdges, nVertLeve
          !
 
          !  upwind flux computation
-!$acc parallel vector_length(32)
-!$acc loop gang private(cell1, cell2)
+!$acc parallel vector_length(64)
+!$acc loop gang
          do iEdge=edgeStart,edgeEnd
-            cell1 = cellsOnEdge(1,iEdge)
-            cell2 = cellsOnEdge(2,iEdge)
+!!!            cell1 = cellsOnEdge(1,iEdge)
+!!!            cell2 = cellsOnEdge(2,iEdge)
 !DIR$ IVDEP
-!$acc loop vector
+!$acc loop vector private(cell1, cell2)
             do k=1, nVertLevels
+               cell1 = cellsOnEdge(1,iEdge)
+               cell2 = cellsOnEdge(2,iEdge)
                flux_upwind_tmp(k,iEdge) = dvEdge(iEdge) * dt *   &
                       (max(0.0_RKIND,uhAvg(k,iEdge))*scalar_old(k,cell1) + min(0.0_RKIND,uhAvg(k,iEdge))*scalar_old(k,cell2))
                flux_tmp(k,iEdge) = dt * flux_arr(k,iEdge) - flux_upwind_tmp(k,iEdge)
             end do
 
             if( config_apply_lbcs .and. (bdyMaskEdge(iEdge) == nRelaxZone) .or. (bdyMaskEdge(iEdge) == nRelaxZone-1) ) then
-               flux_tmp(:,iEdge) = 0.
-               flux_arr(:,iEdge) = flux_upwind_tmp(:,iEdge)
+!$acc loop vector
+             do k=1, nVertLevels
+               flux_tmp(k,iEdge) = 0.
+               flux_arr(k,iEdge) = flux_upwind_tmp(k,iEdge)
+             end do
             end if
 
          end do
 !$acc end parallel
 !$OMP BARRIER
-!$acc parallel vector_length(32)
-!$acc loop gang private(iEdge)
+!$acc parallel vector_length(128)
+!$acc loop gang vector collapse(2) private(iEdge)
          do iCell=cellSolveStart,cellSolveEnd
-!$acc loop seq
-            do i=1,nEdgesOnCell(iCell)
-               iEdge = edgesOnCell(i,iCell)
-
+!!!!$acc loop seq
+!!!            do i=1,nEdgesOnCell(iCell)
+!!!               iEdge = edgesOnCell(i,iCell)
 !DIR$ IVDEP
-!$acc loop vector
+!!!!$acc loop vector
                do k=1, nVertLevels
+!$acc loop seq
+                do i=1,nEdgesOnCell(iCell)
+                  iEdge = edgesOnCell(i,iCell)
                   scalar_new(k,iCell) = scalar_new(k,iCell) - edgesOnCell_sign(i,iCell) * flux_upwind_tmp(k,iEdge) * invAreaCell(iCell)
  
                   scale_arr(k,SCALE_OUT,iCell) = scale_arr(k,SCALE_OUT,iCell) &
@@ -4896,7 +4914,7 @@ subroutine atm_advance_scalars_mono_work(block, state, nCells, nEdges, nVertLeve
          ! worked through algebra and found equivalent form
          ! added benefit that it should address ifort single prec overflow issue
       if (local_advance_density) then
-!$acc parallel vector_length(32)
+!$acc parallel vector_length(128)
 !$acc loop gang vector collapse(2) private(scale_factor)
          do iCell=cellSolveStart,cellSolveEnd
 !DIR$ IVDEP
@@ -4913,7 +4931,7 @@ subroutine atm_advance_scalars_mono_work(block, state, nCells, nEdges, nVertLeve
          end do
 !$acc end parallel
       else
-!$acc parallel vector_length(32)
+!$acc parallel vector_length(128)
 !$acc loop gang vector collapse(2) private(scale_factor)
          do iCell=cellSolveStart,cellSolveEnd
 !DIR$ IVDEP
@@ -4947,7 +4965,7 @@ subroutine atm_advance_scalars_mono_work(block, state, nCells, nEdges, nVertLeve
 
 
 !$acc parallel vector_length(32)
-!$acc loop gang private(cell1,cell2)
+!$acc loop gang worker private(cell1,cell2)
          do iEdge=edgeStart,edgeEnd
             cell1 = cellsOnEdge(1,iEdge)
             cell2 = cellsOnEdge(2,iEdge)
@@ -4961,7 +4979,10 @@ subroutine atm_advance_scalars_mono_work(block, state, nCells, nEdges, nVertLeve
                end do
 
                if( config_apply_lbcs .and. (bdyMaskEdge(iEdge) == nRelaxZone) .or. (bdyMaskEdge(iEdge) == nRelaxZone-1) ) then
-                  flux_arr(:,iEdge) = 0.
+!$acc loop vector
+                do k=1, nVertLevels
+                  flux_arr(k,iEdge) = 0.
+                end do
                end if
 
             end if
@@ -4974,7 +4995,7 @@ subroutine atm_advance_scalars_mono_work(block, state, nCells, nEdges, nVertLeve
          ! moved assignment to scalar_new from separate loop (see commented code below)
          ! into the following loops. Avoids having to save elements of flux array
 !$acc parallel vector_length(32)
-!$acc loop gang private(cell1, cell2)
+!$acc loop gang worker private(cell1, cell2)
          do iEdge=edgeStart,edgeEnd
             cell1 = cellsOnEdge(1,iEdge)
             cell2 = cellsOnEdge(2,iEdge)
@@ -4995,7 +5016,7 @@ subroutine atm_advance_scalars_mono_work(block, state, nCells, nEdges, nVertLeve
         ! rescale the vertical flux
         !
 !$OMP BARRIER
-!$acc parallel vector_length(32)
+!$acc parallel vector_length(128)
 !$acc loop gang vector collapse(2) private(flux)
          do iCell=cellSolveStart,cellSolveEnd
 !DIR$ IVDEP
@@ -5012,14 +5033,17 @@ subroutine atm_advance_scalars_mono_work(block, state, nCells, nEdges, nVertLeve
          !  do the scalar update now that we have the fluxes
          !
 !$acc parallel vector_length(32)
-!$acc loop gang private(iEdge)
+!$acc loop gang
          do iCell=cellSolveStart,cellSolveEnd
-!$acc loop seq
-            do i=1,nEdgesOnCell(iCell)
-               iEdge = edgesOnCell(i,iCell)
+!!!!$acc loop seq
+!!!            do i=1,nEdgesOnCell(iCell)
+!!!               iEdge = edgesOnCell(i,iCell)
 !DIR$ IVDEP
-!$acc loop vector
+!$acc loop vector private(iEdge)
                do k=1,nVertLevels
+!$acc loop seq
+                do i=1,nEdgesOnCell(iCell)
+                  iEdge = edgesOnCell(i,iCell)
                   scalar_new(k,iCell) = scalar_new(k,iCell) - edgesOnCell_sign(i,iCell)*flux_arr(k,iEdge) * invAreaCell(iCell)
                end do
             end do
@@ -5069,10 +5093,11 @@ subroutine atm_advance_scalars_mono_work(block, state, nCells, nEdges, nVertLeve
 !!! MGD OPENACC TO DO: here is the original code that we can probably handle
 !!! better than below...
 !!!
-!$acc parallel vector_length(32)
-!$acc loop gang vector collapse(2)
+!$acc parallel vector_length(64)
+!$acc loop gang
          do iCell=cellStart,cellEnd
             if(bdyMaskCell(iCell) <= nSpecZone) then ! regional_MPAS does spec zone update after transport.
+!$acc loop vector
                do k=1, nVertLevels
                   scalars_new(iScalar,k,iCell) = max(0.0_RKIND,scalar_new(k,iCell))
                end do
@@ -5081,7 +5106,7 @@ subroutine atm_advance_scalars_mono_work(block, state, nCells, nEdges, nVertLeve
 !$acc end parallel
       end do !  loop over scalars
 #else
-!$acc parallel vector_length(32)
+!$acc parallel vector_length(64)
 !$acc loop gang
          do iCell=cellStart,cellEnd
             if(bdyMaskCell(iCell) <= nSpecZone) then ! regional_MPAS does spec zone update after transport.