diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml index cadfe220f3..563566788e 100644 --- a/.github/workflows/bench.yml +++ b/.github/workflows/bench.yml @@ -36,6 +36,7 @@ jobs: labels: gt flag: p device: cpu + interface: none build_script: "" - cluster: phoenix name: Georgia Tech | Phoenix (NVHPC) @@ -43,14 +44,32 @@ jobs: labels: gt flag: p device: gpu + interface: acc build_script: "" + - cluster: phoenix + name: Georgia Tech | Phoenix (NVHPC) + group: phoenix + labels: gt + flag: p + device: gpu + interface: omp + build_script: "" + - cluster: frontier + name: Oak Ridge | Frontier (CCE) + group: phoenix + labels: frontier + flag: f + device: gpu + interface: acc + build_script: "bash .github/workflows/frontier/build.sh gpu acc bench" - cluster: frontier name: Oak Ridge | Frontier (CCE) group: phoenix labels: frontier flag: f device: gpu - build_script: "bash .github/workflows/frontier/build.sh gpu bench" + interface: omp + build_script: "bash .github/workflows/frontier/build.sh gpu omp bench" runs-on: group: ${{ matrix.group }} labels: ${{ matrix.labels }} @@ -80,29 +99,29 @@ jobs: - name: Bench (Master v. PR) run: | - (cd pr && bash .github/workflows/${{ matrix.cluster }}/submit-bench.sh .github/workflows/${{ matrix.cluster }}/bench.sh ${{ matrix.device }}) & - (cd master && bash .github/workflows/${{ matrix.cluster }}/submit-bench.sh .github/workflows/${{ matrix.cluster }}/bench.sh ${{ matrix.device }}) & + (cd pr && bash .github/workflows/${{ matrix.cluster }}/submit-bench.sh .github/workflows/${{ matrix.cluster }}/bench.sh ${{ matrix.device }} ${{ matrix.interface }}) & + (cd master && bash .github/workflows/${{ matrix.cluster }}/submit-bench.sh .github/workflows/${{ matrix.cluster }}/bench.sh ${{ matrix.device }} ${{ matrix.interface }}) & wait %1 && wait %2 - name: Generate & Post Comment run: | (cd pr && . ./mfc.sh load -c ${{ matrix.flag }} -m g) - (cd pr && ./mfc.sh bench_diff ../master/bench-${{ matrix.device }}.yaml ../pr/bench-${{ matrix.device }}.yaml) + (cd pr && ./mfc.sh bench_diff ../master/bench-${{ matrix.device }}-${{ matrix.interface }}.yaml ../pr/bench-${{ matrix.device }}-${{ matrix.interface }}.yaml) - name: Print Logs if: always() run: | - cat pr/bench-${{ matrix.device }}.* 2>/dev/null || true - cat master/bench-${{ matrix.device }}.* 2>/dev/null || true + cat pr/bench-${{ matrix.device }}-${{ matrix.interface }}.* 2>/dev/null || true + cat master/bench-${{ matrix.device }}-${{ matrix.interface }}.* 2>/dev/null || true # All other runners (non-Phoenix) just run without special env - name: Archive Logs (Frontier) if: always() && matrix.cluster != 'phoenix' uses: actions/upload-artifact@v4 with: - name: ${{ matrix.cluster }}-${{ matrix.device }} + name: ${{ matrix.cluster }}-${{ matrix.device }}-${{ matrix.interface }} path: | - pr/bench-${{ matrix.device }}.* + pr/bench-${{ matrix.device }}-${{ matrix.interface }}.* pr/build/benchmarks/* - master/bench-${{ matrix.device }}.* + master/bench-${{ matrix.device }}-${{ matrix.interface }}.* master/build/benchmarks/* diff --git a/.github/workflows/frontier/bench.sh b/.github/workflows/frontier/bench.sh index 31a514d45d..35b4c5950e 100644 --- a/.github/workflows/frontier/bench.sh +++ b/.github/workflows/frontier/bench.sh @@ -1,12 +1,18 @@ #!/bin/bash n_ranks=12 - +device_opts="" if [ "$job_device" = "gpu" ]; then gpus=$(rocm-smi --showid | awk '{print $1}' | grep -Eo '[0-9]+' | uniq | tr '\n' ' ') n_ranks=$(echo "$gpus" | wc -w) # number of GPUs on node gpu_ids=$(echo "$gpus" | tr ' ' '\n' | tr '\n' ' ' | sed 's/ $//') # GPU IDs from rocm-smi - device_opts="--gpu -g $gpu_ids" + device_opts+="--gpu" + if [ "$job_interface" = "acc" ]; then + device_opts+=" acc" + elif [ "$job_interface" = "omp" ]; then + device_opts+=" mp" + fi + device_opts+=" -g $gpu_ids" fi if [ "$job_device" = "gpu" ]; then diff --git a/.github/workflows/frontier/build.sh b/.github/workflows/frontier/build.sh index c959eb7d5c..70c29204c1 100644 --- a/.github/workflows/frontier/build.sh +++ b/.github/workflows/frontier/build.sh @@ -1,13 +1,21 @@ #!/bin/bash +job_device=$1 +job_interface=$2 +run_bench=$3 build_opts="" -if [ "$1" = "gpu" ]; then - build_opts="--gpu" +if [ "$job_device" = "gpu" ]; then + build_opts+="--gpu" + if [ "$job_interface" = "acc" ]; then + build_opts+=" acc" + elif [ "$job_interface" = "omp" ]; then + build_opts+=" mp" + fi fi . ./mfc.sh load -c f -m g -if [ "$2" == "bench" ]; then +if [ "$run_bench" == "bench" ]; then for dir in benchmarks/*/; do dirname=$(basename "$dir") ./mfc.sh run "$dir/case.py" --case-optimization -j 8 --dry-run $build_opts diff --git a/.github/workflows/frontier/submit-bench.sh b/.github/workflows/frontier/submit-bench.sh index ff3fabdb37..9dbc5e5817 100644 --- a/.github/workflows/frontier/submit-bench.sh +++ b/.github/workflows/frontier/submit-bench.sh @@ -24,7 +24,7 @@ else fi -job_slug="`basename "$1" | sed 's/\.sh$//' | sed 's/[^a-zA-Z0-9]/-/g'`-$2" +job_slug="`basename "$1" | sed 's/\.sh$//' | sed 's/[^a-zA-Z0-9]/-/g'`-$2-$3" sbatch <:-Mfreeform> $<$:-cpp> - $<$:-Minfo=inline> + $<$:-Minfo=inline> $<$:-Minfo=accel> ) @@ -405,13 +406,13 @@ HANDLE_SOURCES(syscheck OFF) # * LAPACK (optional) Should be linked with LAPACK function(MFC_SETUP_TARGET) - cmake_parse_arguments(ARGS "OpenACC;MPI;SILO;HDF5;FFTW;LAPACK" "TARGET" "SOURCES" ${ARGN}) + cmake_parse_arguments(ARGS "OpenACC;MPI;SILO;HDF5;FFTW;LAPACK;OpenMP" "TARGET" "SOURCES" ${ARGN}) add_executable(${ARGS_TARGET} ${ARGS_SOURCES}) set(IPO_TARGETS ${ARGS_TARGET}) # Here we need to split into "library" and "executable" to perform IPO on the NVIDIA compiler. # A little hacky, but it *is* an edge-case for *one* compiler. - if (NVHPC_USE_TWO_PASS_IPO) + if (NVHPC_USE_TWO_PASS_IPO AND NOT(MFC_OpenMP AND ARGS_OpenMP)) add_library(${ARGS_TARGET}_lib OBJECT ${ARGS_SOURCES}) target_compile_options(${ARGS_TARGET}_lib PRIVATE $<$:-Mextract=lib:${ARGS_TARGET}_lib> @@ -459,7 +460,7 @@ function(MFC_SETUP_TARGET) endif() if (ARGS_FFTW) - if (MFC_OpenACC AND ARGS_OpenACC) + if ((MFC_OpenACC AND ARGS_OpenACC) OR (MFC_OpenMP AND ARGS_OpenMP)) if (CMAKE_Fortran_COMPILER_ID STREQUAL "NVHPC" OR CMAKE_Fortran_COMPILER_ID STREQUAL "PGI") find_package(CUDAToolkit REQUIRED) target_link_libraries(${a_target} PRIVATE CUDA::cudart CUDA::cufft) @@ -478,16 +479,43 @@ function(MFC_SETUP_TARGET) target_link_libraries(${a_target} PRIVATE LAPACK::LAPACK) endif() - if (MFC_OpenACC AND ARGS_OpenACC) - find_package(OpenACC) + if ((MFC_OpenACC AND ARGS_OpenACC) OR (MFC_OpenMP AND ARGS_OpenMP)) + if ((MFC_OpenACC AND ARGS_OpenACC)) + find_package(OpenACC) - # This should be equivalent to if (NOT OpenACC_FC_FOUND) - if (NOT TARGET OpenACC::OpenACC_Fortran) - message(FATAL_ERROR "OpenACC + Fortran is unsupported.") - endif() + # This should be equivalent to if (NOT OpenACC_FC_FOUND) + if (NOT TARGET OpenACC::OpenACC_Fortran) + message(FATAL_ERROR "OpenACC + Fortran is unsupported.") + endif() - target_link_libraries(${a_target} PRIVATE OpenACC::OpenACC_Fortran) - target_compile_definitions(${a_target} PRIVATE MFC_OpenACC) + target_link_libraries(${a_target} PRIVATE OpenACC::OpenACC_Fortran) + target_compile_definitions(${a_target} PRIVATE MFC_OpenACC MFC_GPU) + elseif((MFC_OpenMP AND ARGS_OpenMP)) + find_package(OpenMP) + + # This should be equivalent to if (NOT OpenACC_FC_FOUND) + if (NOT TARGET OpenMP::OpenMP_Fortran) + message(FATAL_ERROR "OpenMP + Fortran is unsupported.") + endif() + set(ENV{OMP_TARGET_OFFLOAD} [MANDATORY]) + # target_link_libraries(${a_target} PRIVATE OpenMP::OpenMP_Fortran) + target_compile_definitions(${a_target} PRIVATE MFC_OpenMP MFC_GPU) + + if(CMAKE_Fortran_COMPILER_ID STREQUAL "NVHPC" OR CMAKE_Fortran_COMPILER_ID STREQUAL "PGI") + target_compile_options(${a_target} PRIVATE "-mp=gpu" "-Minfo=mp") + target_link_options(${a_target} PRIVATE "-mp=gpu") + set_target_properties(${a_target} PROPERTIES Fortran_FLAGS "-mp=gpu -gpu=ccall") + elseif(CMAKE_Fortran_COMPILER_ID STREQUAL "Intel") + target_compile_options(${a_target} PRIVATE -fopenmp -fopenmp-targets=spir64) + target_link_options(${a_target} PRIVATE -fopenmp -fopenmp-targets=spir64) + elseif(CMAKE_Fortran_COMPILER_ID STREQUAL "Cray") + target_compile_options(${a_target} PRIVATE -fopenmp) + target_link_options(${a_target} PRIVATE -fopenmp) + elseif(CMAKE_Fortran_COMPILER_ID STREQUAL "LLVMFlang") + target_compile_options(${a_target} PRIVATE -fopenmp --offload-arch=gfx90a) + target_link_options(${a_target} PRIVATE -fopenmp --offload-arch=gfx90a) + endif() + endif() if (CMAKE_Fortran_COMPILER_ID STREQUAL "GNU") # FIXME: This should work with other cards than gfx90a ones. @@ -528,7 +556,7 @@ function(MFC_SETUP_TARGET) if (CMAKE_BUILD_TYPE STREQUAL "Debug") target_compile_options(${a_target} - PRIVATE -gpu=autocompare,debug + PRIVATE -gpu=debug ) endif() elseif(CMAKE_Fortran_COMPILER_ID STREQUAL "Cray") @@ -544,6 +572,9 @@ function(MFC_SETUP_TARGET) find_package(hipfort COMPONENTS hip CONFIG REQUIRED) target_link_libraries(${a_target} PRIVATE hipfort::hip hipfort::hipfort-amdgcn) + elseif(CMAKE_Fortran_COMPILER_ID STREQUAL "LLVMFlang") + find_package(hipfort COMPONENTS hip CONFIG REQUIRED) + target_link_libraries(${a_target} PRIVATE hipfort::hip hipfort::hipfort-amdgcn flang_rt.hostdevice) endif() elseif (CMAKE_Fortran_COMPILER_ID STREQUAL "Cray") target_compile_options(${a_target} PRIVATE "SHELL:-h noacc" "SHELL:-x acc") @@ -579,7 +610,7 @@ endif() if (MFC_SIMULATION) MFC_SETUP_TARGET(TARGET simulation SOURCES "${simulation_SRCs}" - MPI OpenACC FFTW) + MPI FFTW OpenACC OpenMP) endif() if (MFC_POST_PROCESS) @@ -594,7 +625,7 @@ endif() if (MFC_SYSCHECK) MFC_SETUP_TARGET(TARGET syscheck SOURCES "${syscheck_SRCs}" - MPI OpenACC) + MPI OpenACC OpenMP) endif() if (MFC_DOCUMENTATION) diff --git a/docs/documentation/gpuParallelization.md b/docs/documentation/gpuParallelization.md index 8c64996599..f3d2089057 100644 --- a/docs/documentation/gpuParallelization.md +++ b/docs/documentation/gpuParallelization.md @@ -44,14 +44,19 @@ Note: Ordering is not guaranteed or stable, so use key-value pairing when using **Macro Invocation** -Uses FYPP eval directive using `$:` +Uses FYPP eval directive using `#:call` -`$:GPU_PARALLEL_LOOP(...)` +```C +#:call GPU_PARALLEL_LOOP(...) + {code} +#:endcall GPU_PARALLEL_LOOP +``` **Parameters** | name | data type | Default Value | description | |------------------|---------------------|-------------------|-------------------------------------------------------------------------------------------| +| `code` | code | Required | Region of code where the GPU parallelizes loops | | `collapse` | integer | None | Number of loops to combine into 1 loop | | `parallelism` | string list | '\[gang,vector\]' | Parallelism granularity to use for this loop | | `default` | string | 'present' | Implicit assumptions compiler should make | @@ -69,6 +74,7 @@ Uses FYPP eval directive using `$:` | `deviceptr` | string list | None | Pointer variables that are already allocated on GPU memory | | `attach` | string list | None | Attaches device pointer to device targets on entrance, then detach on exit | | `extraAccArgs` | string | None | String of any extra arguments added to the OpenACC directive | +| `extraOmpArgs` | string | None | String of any extra arguments added to the OpenMP directive | **Parameter Restrictions** @@ -88,9 +94,15 @@ Uses FYPP eval directive using `$:` **Example** -```python - $:GPU_PARALLEL_LOOP(collapse=3, private='[tmp, r]', reduction='[[vol, avg], [max_val]]', reductionOp='[+, MAX]') - $:GPU_PARALLEL_LOOP(collapse=2, private='[sum_holder]', copyin='[starting_sum]', copyout='[eigenval,C]') +```C + #:call GPU_PARALLEL_LOOP(collapse=3, private='[tmp, r]', reduction='[[vol, avg], [max_val]]', reductionOp='[+, MAX]') + {code} + ... + #:endcall GPU_PARALLEL_LOOP + #:call GPU_PARALLEL_LOOP(collapse=2, private='[sum_holder]', copyin='[starting_sum]', copyout='[eigenval,C]') + {code} + ... + #:endcall GPU_PARALLEL_LOOP ``` @@ -115,6 +127,7 @@ Uses FYPP eval directive using `$:` | `reduction` | 2-level string list | None | Variables unique to each iteration and reduced at the end | | `reductionOp` | string list | None | Operator that each list of reduction will reduce with | | `extraAccArgs` | string | None | String of any extra arguments added to the OpenACC directive | +| `extraOmpArgs` | string | None | String of any extra arguments added to the OpenMP directive | **Parameter Restrictions** @@ -154,23 +167,25 @@ Uses FYPP call directive using `#:call` **Parameters** -| name | data type | Default Value | description | -|------------------|---------------------|-------------------|-------------------------------------------------------------------------------------------| -| `default` | string | 'present' | Implicit assumptions compiler should make | -| `private` | string list | None | Variables that are private to each iteration/thread | -| `firstprivate` | string list | None | Initialized variables that are private to each iteration/thread | -| `reduction` | 2-level string list | None | Variables unique to each iteration and reduced at the end | -| `reductionOp` | string list | None | Operator that each list of reduction will reduce with | -| `copy` | string list | None | Allocates and copies data to GPU on entrance, then deallocated and copies to CPU on exit | -| `copyin` | string list | None | Allocates and copies data to GPU on entrance and then deallocated on exit | -| `copyinReadOnly` | string list | None | Allocates and copies readonly data to GPU and then deallocated on exit | -| `copyout` | string list | None | Allocates data on GPU on entrance and then deallocates and copies to CPU on exit | -| `create` | string list | None | Allocates data on GPU on entrance and then deallocates on exit | -| `no_create` | string list | None | Use data in CPU memory unless data is already in GPU memory | -| `present` | string list | None | Data that must be present in GPU memory. Increment counter on entrance, decrement on exit | -| `deviceptr` | string list | None | Pointer variables that are already allocated on GPU memory | -| `attach` | string list | None | Attaches device pointer to device targets on entrance, then detach on exit | -| `extraAccArgs` | string | None | String of any extra arguments added to the OpenACC directive | +| name | data type | Default Value | description | +|------------------|---------------------|---------------|-------------------------------------------------------------------------------------------| +| `code` | code | Required | Region of code where a kernel is launched on the GPU | +| `default` | string | 'present' | Implicit assumptions compiler should make | +| `private` | string list | None | Variables that are private to each iteration/thread | +| `firstprivate` | string list | None | Initialized variables that are private to each iteration/thread | +| `reduction` | 2-level string list | None | Variables unique to each iteration and reduced at the end | +| `reductionOp` | string list | None | Operator that each list of reduction will reduce with | +| `copy` | string list | None | Allocates and copies data to GPU on entrance, then deallocated and copies to CPU on exit | +| `copyin` | string list | None | Allocates and copies data to GPU on entrance and then deallocated on exit | +| `copyinReadOnly` | string list | None | Allocates and copies readonly data to GPU and then deallocated on exit | +| `copyout` | string list | None | Allocates data on GPU on entrance and then deallocates and copies to CPU on exit | +| `create` | string list | None | Allocates data on GPU on entrance and then deallocates on exit | +| `no_create` | string list | None | Use data in CPU memory unless data is already in GPU memory | +| `present` | string list | None | Data that must be present in GPU memory. Increment counter on entrance, decrement on exit | +| `deviceptr` | string list | None | Pointer variables that are already allocated on GPU memory | +| `attach` | string list | None | Attaches device pointer to device targets on entrance, then detach on exit | +| `extraAccArgs` | string | None | String of any extra arguments added to the OpenACC directive | +| `extraOmpArgs` | string | None | String of any extra arguments added to the OpenMP directive | **Parameter Restrictions** @@ -234,6 +249,8 @@ Uses FYPP call directive using `#:call` | `attach` | string list | None | Attaches device pointer to device targets on entrance, then detach on exit | | `default` | string | None | Implicit assumptions compiler should make | | `extraAccArgs` | string | None | String of any extra arguments added to the OpenACC directive | +| `extraOmpArgs` | string | None | String of any extra arguments added to the OpenMP directive | + **Parameter Restrictions** @@ -274,6 +291,8 @@ Uses FYPP eval directive using `$:` | `create` | string list | None | Allocates data on GPU on entrance | | `attach` | string list | None | Attaches device pointer to device targets on entrance | | `extraAccArgs` | string | None | String of any extra arguments added to the OpenACC directive | +| `extraOmpArgs` | string | None | String of any extra arguments added to the OpenMP directive | + **Example** @@ -301,6 +320,8 @@ Uses FYPP eval directive using `$:` | `delete` | string list | None | Deallocates data on GPU on exit | | `detach` | string list | None | Detach device pointer from device targets on exit | | `extraAccArgs` | string | None | String of any extra arguments added to the OpenACC directive | +| `extraOmpArgs` | string | None | String of any extra arguments added to the OpenMP directive | + **Example** @@ -333,6 +354,8 @@ Uses FYPP eval directive using `$:` | `deviceptr` | string list | None | Pointer variables that are already allocated on GPU memory | | `link` | string list | None | Declare global link, and only allocate when variable used in data clause. | | `extraAccArgs` | string | None | String of any extra arguments added to the OpenACC directive | +| `extraOmpArgs` | string | None | String of any extra arguments added to the OpenMP directive | + **Additional information** @@ -366,6 +389,8 @@ Uses FYPP eval directive using `$:` | `host` | string list | None | Updates data from GPU to CPU | | `device` | string list | None | Updates data from CPU to GPU | | `extraAccArgs` | string | None | String of any extra arguments added to the OpenACC directive | +| `extraOmpArgs` | string | None | String of any extra arguments added to the OpenMP directive | + **Example** @@ -391,11 +416,14 @@ Uses FYPP call directive using `#:call` **Parameters** -| name | data type | Default Value | description | -|----------------|-------------|---------------|------------------------------------------------------------------| -| `code` | code | Required | Region of code where GPU memory addresses is accessible | -| `use_device` | string list | None | Use GPU memory address of variable instead of CPU memory address | -| `extraAccArgs` | string | None | String of any extra arguments added to the OpenACC directive | +| name | data type | Default Value | description | +|-------------------|-------------|---------------|------------------------------------------------------------------| +| `code` | code | Required | Region of code where GPU memory addresses is accessible | +| `use_device_addr` | string list | None | Use GPU memory address of variable instead of CPU memory address | +| `use_device_ptr` | string list | None | Use GPU pointer of pointers instead of CPU pointer | +| `extraAccArgs` | string | None | String of any extra arguments added to the OpenACC directive | +| `extraOmpArgs` | string | None | String of any extra arguments added to the OpenMP directive | + **Parameter Restrictions** @@ -436,6 +464,8 @@ Uses FYPP eval directive using `$:` | name | data type | Default Value | description | |----------------|-----------|---------------|--------------------------------------------------------------| | `extraAccArgs` | string | None | String of any extra arguments added to the OpenACC directive | +| `extraOmpArgs` | string | None | String of any extra arguments added to the OpenMP directive | + **Example** @@ -460,6 +490,8 @@ Uses FYPP eval directive using `$:` |----------------|-----------|---------------|--------------------------------------------------------------| | `atomic` | string | Required | Which atomic operation is performed | | `extraAccArgs` | string | None | String of any extra arguments added to the OpenACC directive | +| `extraOmpArgs` | string | None | String of any extra arguments added to the OpenMP directive | + **Parameter Restrictions** @@ -517,6 +549,9 @@ Uses FYPP eval directive using `$:` | `nohost` | boolean | False | Do not compile procedure code for CPU | | `cray_inline` | boolean | False | Inline procedure on cray compiler | | `extraAccArgs` | string | None | String of any extra arguments added to the OpenACC directive | +| `extraOmpArgs` | string | None | String of any extra arguments added to the OpenMP directive | + + **Parameter Restrictions** @@ -555,6 +590,9 @@ Uses FYPP eval directive using `$:` | `cache` | string list | Required | Data that should to stored in cache | | `extraAccArgs` | string | None | String of any extra arguments added to the OpenACC directive | +**NOTE** +Does not do anything for OpenMP currently + **Example** ```python @@ -570,27 +608,32 @@ Uses FYPP eval directive using `$:` ## Compiler agnostic tools ## OpenMP tools + ```bash OMP_DISPLAY_ENV=true | false | verbose ``` + - Prints out the internal control values and environment variables at the beginning of the program if `true` or `verbose` - `verbose` will also print out vendor-specific internal control values and environment variables ```bash OMP_TARGET_OFFLOAD = MANDATORY | DISABLED | DEFAULT ``` + - Quick way to turn off off-load (`DISABLED`) or make it abort if a GPU isn't found (`MANDATORY`) -- Great first test: does the problem disappear when you drop back to the CPU? +- Great first test: does the problem disappear when you drop back to the CPU? ```bash OMP_THREAD_LIMIT= ``` + - Sets the maximum number of OpenMP threads to use in a contention group - Might be useful in checking for issues with contention or race conditions ```bash OMP_DISPLAY_AFFINITY=TRUE ``` + - Will display affinity bindings for each OpenMP thread, containing hostname, process identifier, OS thread identifier, OpenMP thread identifier, and affinity binding. ## Cray Compiler Tools @@ -620,6 +663,7 @@ CRAY_ACC_FORCE_EARLY_INIT=1 ```bash CRAY_ACC_PRESENT_DUMP_SAVE_NAMES=1 ``` + - Will cause `acc_present_dump()` to output variable names and file locations in addition to variable mappings - Add `acc_present_dump()` around hotspots to help find problems with data movements - Helps more if adding `CRAY_ACC_DEBUG` environment variable @@ -631,12 +675,14 @@ CRAY_ACC_PRESENT_DUMP_SAVE_NAMES=1 ```bash STATIC_RANDOM_SEED=1 ``` -- Forces the seed returned by `RANDOM_SEED` to be constant, so it generates the same sequence of random numbers + +- Forces the seed returned by `RANDOM_SEED` to be constant, so it generates the same sequence of random numbers - Useful for testing issues with randomized data ```bash NVCOMPILER_TERM=option[,option] ``` + - `[no]debug`: Enables/disables just-in-time debugging (debugging invoked on error) - `[no]trace`: Enables/disables stack traceback on error @@ -645,17 +691,19 @@ NVCOMPILER_TERM=option[,option] ```bash NVCOMPILER_ACC_NOTIFY= ``` + - Assign the environment variable to a bitmask to print out information to stderr for the following - kernel launches: 1 - data transfers: 2 - region entry/exit: 4 - wait operation of synchronizations with the device: 8 - device memory allocations and deallocations: 16 -- 1 (kernels only) is the usual first step.3 (kernels + copies) is great for "why is it so slow?" +- 1 (kernels only) is the usual first step.3 (kernels + copies) is great for "why is it so slow?" ```bash NVCOMPILER_ACC_TIME=1 ``` + - Lightweight profiler - prints a tidy end-of-run table with per-region and per-kernel times and bytes moved - Do not use with CUDA profiler at the same time @@ -663,8 +711,9 @@ NVCOMPILER_ACC_TIME=1 ```bash NVCOMPILER_ACC_DEBUG=1 ``` + - Spews everything the runtime sees: host/device addresses, mapping events, present-table look-ups, etc. -- Great for "partially present" or "pointer went missing" errors. +- Great for "partially present" or "pointer went missing" errors. - [Doc for NVCOMPILER_ACC_DEBUG](https://docs.nvidia.com/hpc-sdk/archive/20.9/pdf/hpc209openacc_gs.pdf) - Ctrl+F for `NVCOMPILER_ACC_DEBUG` @@ -673,6 +722,7 @@ NVCOMPILER_ACC_DEBUG=1 ```bash LIBOMPTARGET_PROFILE=run.json ``` + - Emits a Chrome-trace (JSON) timeline you can open in chrome://tracing or Speedscope - Great lightweight profiler when Nsight is overkill. - Granularity in µs via `LIBOMPTARGET_PROFILE_GRANULARITY` (default 500). @@ -680,9 +730,10 @@ LIBOMPTARGET_PROFILE=run.json ```bash LIBOMPTARGET_INFO= ``` + - Prints out different types of runtime information - Human-readable log of data-mapping inserts/updates, kernel launches, copies, waits. -- Perfect first stop for "why is nothing copied?" +- Perfect first stop for "why is nothing copied?" - Flags - Print all data arguments upon entering an OpenMP device kernel: 0x01 - Indicate when a mapped address already exists in the device mapping table: 0x02 @@ -694,6 +745,7 @@ LIBOMPTARGET_INFO= ```bash LIBOMPTARGET_DEBUG=1 ``` + - Developer-level trace (host-side) - Much noisier than `INFO` - Only works if the runtime was built with `-DOMPTARGET_DEBUG`. @@ -701,12 +753,14 @@ LIBOMPTARGET_DEBUG=1 ```bash LIBOMPTARGET_JIT_OPT_LEVEL=-O{0,1,2,3} ``` -- This environment variable can be used to change the optimization pipeline used to optimize the embedded device code as part of the device JIT. + +- This environment variable can be used to change the optimization pipeline used to optimize the embedded device code as part of the device JIT. - The value corresponds to the `-O{0,1,2,3}` command line argument passed to clang. ```bash LIBOMPTARGET_JIT_SKIP_OPT=1 ``` + - This environment variable can be used to skip the optimization pipeline during JIT compilation. - If set, the image will only be passed through the backend. - The backend is invoked with the `LIBOMPTARGET_JIT_OPT_LEVEL` flag. @@ -718,6 +772,6 @@ LIBOMPTARGET_JIT_SKIP_OPT=1 - [NVHPC & OpenACC Docs](https://docs.nvidia.com/hpc-sdk/compilers/hpc-compilers-user-guide/index.html?highlight=NVCOMPILER_#environment-variables) - [NVHPC & OpenMP Docs](https://docs.nvidia.com/hpc-sdk/compilers/hpc-compilers-user-guide/index.html?highlight=NVCOMPILER_#id2) - [LLVM & OpenMP Docs](https://openmp.llvm.org/design/Runtimes.html) - - NVHPC is built on top of LLVM + - NVHPC is built on top of LLVM - [OpenMP Docs](https://www.openmp.org/spec-html/5.1/openmp.html) - [OpenACC Docs](https://www.openacc.org/sites/default/files/inline-files/OpenACC.2.7.pdf) diff --git a/src/common/include/acc_macros.fpp b/src/common/include/acc_macros.fpp new file mode 100644 index 0000000000..147473250d --- /dev/null +++ b/src/common/include/acc_macros.fpp @@ -0,0 +1,311 @@ +#:include 'shared_parallel_macros.fpp' + +#:def GEN_COPY_STR(copy) + #:set copy_val = GEN_PARENTHESES_CLAUSE('copy', copy) + $:copy_val +#:enddef + +#:def GEN_COPYIN_STR(copyin, readonly) + #:assert isinstance(readonly, bool) + #:set copyin_val = GEN_PARENTHESES_CLAUSE('copyin', copyin) + #:if copyin is not None and readonly == True + #:set index = copyin_val.find('copyin(') + len('copyin(') + #:set copyin_val = copyin_val[:index] + 'readonly:' + copyin_val[index:] + #:endif + $:copyin_val +#:enddef + +#:def GEN_COPYOUT_STR(copyout) + #:set copyout_val = GEN_PARENTHESES_CLAUSE('copyout', copyout) + $:copyout_val +#:enddef + +#:def GEN_CREATE_STR(create) + #:set create_val = GEN_PARENTHESES_CLAUSE('create', create) + $:create_val +#:enddef + +#:def GEN_NOCREATE_STR(no_create) + #:set nocreate_val = GEN_PARENTHESES_CLAUSE('no_create', no_create) + $:nocreate_val +#:enddef + +#:def GEN_DELETE_STR(delete) + #:set delete_val = GEN_PARENTHESES_CLAUSE('delete', delete) + $:delete_val +#:enddef + +#:def GEN_PRESENT_STR(present) + #:set present_val = GEN_PARENTHESES_CLAUSE('present', present) + $:present_val +#:enddef + +#:def GEN_DEVICEPTR_STR(deviceptr) + #:set deviceptr_val = GEN_PARENTHESES_CLAUSE('deviceptr', deviceptr) + $:deviceptr_val +#:enddef + +#:def GEN_ATTACH_STR(attach) + #:set attach_val = GEN_PARENTHESES_CLAUSE('attach', attach) + $:attach_val +#:enddef + +#:def GEN_DETACH_STR(detach) + #:set detach_val = GEN_PARENTHESES_CLAUSE('detach', detach) + $:detach_val +#:enddef + +#:def GEN_LINK_STR(link) + #:set link_val = GEN_PARENTHESES_CLAUSE('link', link) + $:link_val +#:enddef + +#:def GEN_DEFAULT_STR(default) + #:if default is not None + #:assert isinstance(default, str) + #:assert (default == 'present' or default == 'none') + #:set default_val = 'default(' + default + ') ' + #:else + #:set default_val = '' + #:endif + $:default_val +#:enddef + +#:def GEN_HOST_STR(host) + #:set host_val = GEN_PARENTHESES_CLAUSE('host', host) + $:host_val +#:enddef + +#:def GEN_DEVICE_STR(device) + #:set device_val = GEN_PARENTHESES_CLAUSE('device', device) + $:device_val +#:enddef + +#:def GEN_USE_DEVICE_STR(use_device) + #:set use_device_val = GEN_PARENTHESES_CLAUSE('use_device', use_device) + $:use_device_val +#:enddef + +#:def GEN_PARALLELISM_STR(parallelism) + #:if parallelism is not None + #:assert isinstance(parallelism, str) + #:assert parallelism[0] == '[' and parallelism[-1] == ']' + #:set parallelism_list = [x.strip() for x in parallelism.strip('[]').split(',')] + $:ASSERT_LIST(parallelism_list, str) + #:assert all((element == 'gang' or element == 'worker' or & + & element == 'vector' or element == 'seq') for element in parallelism_list) + #:set parallelism_val = ' '.join(parallelism_list) + ' ' + #:else + #:set parallelism_val = '' + #:endif + $:parallelism_val +#:enddef + +#:def ACC_PARALLEL(code, private=None, default='present', firstprivate=None, reduction=None, reductionOp=None, & + & copy=None, copyin=None, copyinReadOnly=None, copyout=None, create=None, & + & no_create=None, present=None, deviceptr=None, attach=None, extraAccArgs=None) + #:set default_val = GEN_DEFAULT_STR(default) + #:set private_val = GEN_PRIVATE_STR(private, False).strip('\n') + GEN_PRIVATE_STR(firstprivate, True).strip('\n') + #:set reduction_val = GEN_REDUCTION_STR(reduction, reductionOp) + #:set copy_val = GEN_COPY_STR(copy) + #:set copyin_val = GEN_COPYIN_STR(copyin, False).strip('\n') + GEN_COPYIN_STR(copyinReadOnly, True).strip('\n') + #:set copyout_val = GEN_COPYOUT_STR(copyout) + #:set create_val = GEN_CREATE_STR(create) + #:set no_create_val = GEN_NOCREATE_STR(no_create) + #:set present_val = GEN_PRESENT_STR(present) + #:set deviceptr_val = GEN_DEVICEPTR_STR(deviceptr) + #:set attach_val = GEN_ATTACH_STR(attach) + #:set extraAccArgs_val = GEN_EXTRA_ARGS_STR(extraAccArgs) + #:set acc_clause_val = default_val.strip('\n') + private_val.strip('\n') + reduction_val.strip('\n') + & + & copy_val.strip('\n') + copyin_val.strip('\n') + & + & copyout_val.strip('\n') + create_val.strip('\n') + & + & no_create_val.strip('\n') + present_val.strip('\n') + & + & deviceptr_val.strip('\n') + attach_val.strip('\n') + #:set acc_directive = '!$acc parallel ' + & + & acc_clause_val + extraAccArgs_val.strip('\n') + #:set end_acc_directive = '!$acc end parallel' + $:acc_directive + $:code + $:end_acc_directive +#:enddef + +#:def ACC_PARALLEL_LOOP(code, collapse=None, private=None, parallelism='[gang, vector]', & + & default='present', firstprivate=None, reduction=None, reductionOp=None, & + & copy=None, copyin=None, copyinReadOnly=None, copyout=None, create=None, & + & no_create=None, present=None, deviceptr=None, attach=None, extraAccArgs=None) + #:set collapse_val = GEN_COLLAPSE_STR(collapse) + #:set parallelism_val = GEN_PARALLELISM_STR(parallelism) + #:set default_val = GEN_DEFAULT_STR(default) + #:set private_val = GEN_PRIVATE_STR(private, False).strip('\n') + GEN_PRIVATE_STR(firstprivate, True).strip('\n') + #:set reduction_val = GEN_REDUCTION_STR(reduction, reductionOp) + #:set copy_val = GEN_COPY_STR(copy) + #:set copyin_val = GEN_COPYIN_STR(copyin, False).strip('\n') + GEN_COPYIN_STR(copyinReadOnly, True).strip('\n') + #:set copyout_val = GEN_COPYOUT_STR(copyout) + #:set create_val = GEN_CREATE_STR(create) + #:set no_create_val = GEN_NOCREATE_STR(no_create) + #:set present_val = GEN_PRESENT_STR(present) + #:set deviceptr_val = GEN_DEVICEPTR_STR(deviceptr) + #:set attach_val = GEN_ATTACH_STR(attach) + #:set extraAccArgs_val = GEN_EXTRA_ARGS_STR(extraAccArgs) + #:set clause_val = collapse_val.strip('\n') + parallelism_val.strip('\n') + & + & default_val.strip('\n') + private_val.strip('\n') + reduction_val.strip('\n') + & + & copy_val.strip('\n') + copyin_val.strip('\n') + & + & copyout_val.strip('\n') + create_val.strip('\n') + & + & no_create_val.strip('\n') + present_val.strip('\n') + & + & deviceptr_val.strip('\n') + attach_val.strip('\n') + #:set acc_directive = '!$acc parallel loop ' + & + & clause_val + extraAccArgs_val.strip('\n') + #:set acc_end_directive = '!$acc end parallel loop' + $:acc_directive + $:code + $:acc_end_directive +#:enddef + +#:def ACC_ROUTINE(function_name=None, parallelism=None, nohost=False, extraAccArgs=None) + #:set parallelism_val = GEN_PARALLELISM_STR(parallelism) + #:assert isinstance(nohost, bool) + #:if nohost == True + #:set nohost_val = 'nohost' + #:else + #:set nohost_val = '' + #:endif + #:set extraAccArgs_val = GEN_EXTRA_ARGS_STR(extraAccArgs) + #:set clause_val = parallelism_val.strip('\n') + nohost_val.strip('\n') + #:set acc_directive = '!$acc routine ' + & + & clause_val + extraAccArgs_val.strip('\n') + $:acc_directive +#:enddef + +#:def ACC_DECLARE(copy=None, copyin=None, copyinReadOnly=None, copyout=None, create=None, present=None, deviceptr=None, link=None, extraAccArgs=None) + #:set copy_val = GEN_COPY_STR(copy) + #:set copyin_val = GEN_COPYIN_STR(copyin, False).strip('\n') + GEN_COPYIN_STR(copyinReadOnly, True).strip('\n') + #:set copyout_val = GEN_COPYOUT_STR(copyout) + #:set create_val = GEN_CREATE_STR(create) + #:set present_val = GEN_PRESENT_STR(present) + #:set deviceptr_val = GEN_DEVICEPTR_STR(deviceptr) + #:set link_val = GEN_LINK_STR(link) + #:set extraAccArgs_val = GEN_EXTRA_ARGS_STR(extraAccArgs) + #:set clause_val = copy_val.strip('\n') + copyin_val.strip('\n') + & + & copyout_val.strip('\n') + create_val.strip('\n') + & + & present_val.strip('\n') + deviceptr_val.strip('\n') + & + & link_val.strip('\n') + #:set acc_directive = '!$acc declare ' + clause_val + extraAccArgs_val.strip('\n') + $:acc_directive +#:enddef + +#:def ACC_LOOP(collapse=None, parallelism=None, data_dependency=None, reduction=None, reductionOp=None, private=None, extraAccArgs=None) + #:set collapse_val = GEN_COLLAPSE_STR(collapse) + #:set parallelism_val = GEN_PARALLELISM_STR(parallelism) + #:if data_dependency is not None + #:assert isinstance(data_dependency, str) + #:assert (data_dependency == 'auto' or data_dependency == 'independent') + #:set data_dependency_val = data_dependency + #:else + #:set data_dependency_val = '' + #:endif + #:set private_val = GEN_PRIVATE_STR(private, False) + #:set reduction_val = GEN_REDUCTION_STR(reduction, reductionOp) + #:set extraAccArgs_val = GEN_EXTRA_ARGS_STR(extraAccArgs) + #:set clause_val = collapse_val.strip('\n') + parallelism_val.strip('\n') + & + & data_dependency_val.strip('\n') + private_val.strip('\n') + & + & reduction_val.strip('\n') + #:set acc_directive = '!$acc loop ' + & + & clause_val + extraAccArgs_val.strip('\n') + $:acc_directive +#:enddef + +#:def ACC_DATA(code, copy=None, copyin=None, copyinReadOnly=None, copyout=None, create=None, no_create=None, present=None, deviceptr=None, attach=None, default=None, extraAccArgs=None) + #:assert code is not None + #:assert isinstance(code, str) + #:if code == '' or code.isspace() + #:stop 'GPU_DATA macro has no effect on the code as it is not surrounding any code' + #:endif + #:set copy_val = GEN_COPY_STR(copy) + #:set copyin_val = GEN_COPYIN_STR(copyin, False).strip('\n') + GEN_COPYIN_STR(copyinReadOnly, True).strip('\n') + #:set copyout_val = GEN_COPYOUT_STR(copyout) + #:set create_val = GEN_CREATE_STR(create) + #:set no_create_val = GEN_NOCREATE_STR(no_create) + #:set present_val = GEN_PRESENT_STR(present) + #:set deviceptr_val = GEN_DEVICEPTR_STR(deviceptr) + #:set attach_val = GEN_ATTACH_STR(attach) + #:set default_val = GEN_DEFAULT_STR(default) + #:set extraAccArgs_val = GEN_EXTRA_ARGS_STR(extraAccArgs) + #:set clause_val = copy_val.strip('\n') + copyin_val.strip('\n') + & + & copyout_val.strip('\n') + create_val.strip('\n') + & + & no_create_val.strip('\n') + present_val.strip('\n') + & + & deviceptr_val.strip('\n') + attach_val.strip('\n') + & + & default_val.strip('\n') + #:set acc_directive = '!$acc data ' + clause_val + extraAccArgs_val.strip('\n') + #:set end_acc_directive = '!$acc end data' + $:acc_directive + $:code + $:end_acc_directive +#:enddef + +#:def ACC_ENTER_DATA(copyin=None, copyinReadOnly=None, create=None, attach=None, extraAccArgs=None) + #:set copyin_val = GEN_COPYIN_STR(copyin, False).strip('\n') + GEN_COPYIN_STR(copyinReadOnly, True).strip('\n') + #:set create_val = GEN_CREATE_STR(create) + #:set attach_val = GEN_ATTACH_STR(attach) + #! #:set to_val = GEN_TO_STR(copyin) + #! #:set alloc_val = GEN_ALLOC_STR(create) + #! #:set alloc_val2 = GEN_ALLOC_STR(attach) + #:set extraAccArgs_val = GEN_EXTRA_ARGS_STR(extraAccArgs) + #! #:set extraMpArgs_val = '' + #:set acc_clause_val = copyin_val.strip('\n') + create_val.strip('\n') + attach_val.strip('\n') + #! #:set mp_clause_val = to_val.strip('\n') + alloc_val.strip('\n') + alloc_val2.strip('\n') + #:set acc_directive = '!$acc enter data ' + acc_clause_val + extraAccArgs_val.strip('\n') + $:acc_directive +#:enddef + +#:def ACC_EXIT_DATA(copyout=None, delete=None, detach=None, extraAccArgs=None) + #:set copyout_val = GEN_COPYOUT_STR(copyout) + #:set delete_val = GEN_DELETE_STR(delete) + #:set detach_val = GEN_DETACH_STR(detach) + #:set extraAccArgs_val = GEN_EXTRA_ARGS_STR(extraAccArgs) + #:set clause_val = copyout_val.strip('\n') + delete_val.strip('\n') + detach_val.strip('\n') + #:set acc_directive = '!$acc exit data ' + clause_val + extraAccArgs_val.strip('\n') + $:acc_directive +#:enddef + +#:def ACC_UPDATE(host=None, device=None, extraAccArgs=None) + #:set host_val = GEN_HOST_STR(host) + #:set device_val = GEN_DEVICE_STR(device) + #:set extraAccArgs_val = GEN_EXTRA_ARGS_STR(extraAccArgs) + #:set clause_val = host_val.strip('\n') + device_val.strip('\n') + #:set acc_directive = '!$acc update ' + clause_val + extraAccArgs_val.strip('\n') + $:acc_directive +#:enddef + +#:def ACC_HOST_DATA(code, use_device=None, extraAccArgs=None) + #:assert code is not None + #:assert isinstance(code, str) + #:if code == '' or code.isspace() + #:stop 'GPU_HOST_DATA macro has no effect on the code as it is not surrounding any code' + #:endif + #:set use_device_val = GEN_USE_DEVICE_STR(use_device) + #:set extraAccArgs_val = GEN_EXTRA_ARGS_STR(extraAccArgs) + #:set clause_val = use_device_val.strip('\n') + #:set acc_directive = '!$acc host_data ' + clause_val + extraAccArgs_val.strip('\n') + #:set end_acc_directive = '!$acc end host_data' + $:acc_directive + $:code + $:end_acc_directive +#:enddef + +#:def ACC_ATOMIC(atomic, extraAccArgs=None) + #:assert isinstance(atomic, str) + #:assert (atomic == 'read' or atomic == 'write' or atomic == 'update' or atomic == 'capture') + #:set atomic_val = atomic + #:set extraAccArgs_val = GEN_EXTRA_ARGS_STR(extraAccArgs) + #:set clause_val = atomic_val.strip('\n') + #:set acc_directive = '!$acc atomic ' + clause_val + extraAccArgs_val.strip('\n') + $:acc_directive +#:enddef + +#:def ACC_WAIT(extraAccArgs=None) + #:set extraAccArgs_val = GEN_EXTRA_ARGS_STR(extraAccArgs) + #:set clause_val = '' + #:set acc_directive = '!$acc wait ' + clause_val + extraAccArgs_val.strip('\n') + $:acc_directive +#:enddef +! New line at end of file is required for FYPP \ No newline at end of file diff --git a/src/common/include/macros.fpp b/src/common/include/macros.fpp index 69241c99ef..5e0d77d2f1 100644 --- a/src/common/include/macros.fpp +++ b/src/common/include/macros.fpp @@ -65,7 +65,18 @@ @:LOG({'@:ALLOCATE(${re.sub(' +', ' ', ', '.join(args))}$)'}) #:set allocated_variables = ', '.join(args) allocate (${allocated_variables}$) - $:GPU_ENTER_DATA(create=('[' + allocated_variables + ']')) + #:set cleaned = [] + #:for a in args + #:set s = a.rstrip() + #:if s.endswith(')') + #:set rev = s[::-1] + #:set pos = next(i for i, ch, d in ( (j, c, sum(1 if t==')' else -1 if t=='(' else 0 for t in rev[:j+1])) for j, c in enumerate(rev) ) if ch == '(' and d == 0 ) + #:set s = s[:len(s)-1-pos] + #:endif + $:cleaned.append(s) + #:endfor + #:set joined = ', '.join(cleaned) + $:GPU_ENTER_DATA(create='[' + joined + ']') #:enddef ALLOCATE #:def DEALLOCATE(*args) diff --git a/src/common/include/omp_macros.fpp b/src/common/include/omp_macros.fpp new file mode 100644 index 0000000000..1c2fb9c985 --- /dev/null +++ b/src/common/include/omp_macros.fpp @@ -0,0 +1,343 @@ +#:include 'shared_parallel_macros.fpp' + +#:set NVIDIA_COMPILER_ID="NVHPC" +#:set PGI_COMPILER_ID="PGI" +#:set INTEL_COMPILER_ID="Intel" +#:set CCE_COMPILER_ID="Cray" +#:set AMD_COMPILER_ID="LLVMFlang" + +#:def OMP_MAP_STR(map_type, var_list) + #:assert map_type is not None + #:assert isinstance(map_type, str) + #:if var_list is not None + #:set map_clause = 'map(' + map_type + ':' + #:set map_val = GEN_CLAUSE(map_clause, var_list) + #:else + #:set map_val = '' + #:endif + $:map_val +#:enddef + +#:def OMP_DEFAULT_STR(default) + #:if default is not None + #:assert isinstance(default, str) + #:assert (default == 'present' or default == 'none') + #:if default == 'present' + #:if MFC_COMPILER == NVIDIA_COMPILER_ID or MFC_COMPILER == PGI_COMPILER_ID + #:set default_val = 'defaultmap(tofrom:aggregate) defaultmap(tofrom:allocatable) defaultmap(tofrom:pointer) ' + #:elif MFC_COMPILER == CCE_COMPILER_ID + #:set default_val = 'defaultmap(present:aggregate) defaultmap(present:allocatable) defaultmap(present:pointer) ' + #:elif MFC_COMPILER == AMD_COMPILER_ID + #:set default_val = '' + #:else + #:set default_val = 'defaultmap(tofrom:aggregate) defaultmap(tofrom:allocatable) defaultmap(tofrom:pointer) ' + #:endif + #:elif default == 'none' + #:stop 'Not Supported Yet' + #:endif + #:else + #:set default_val = '' + #:endif + $:default_val +#:enddef + +#:def OMP_COPY_STR(copy) + #:set copy_val = OMP_MAP_STR('tofrom', copy) + $:copy_val +#:enddef + +#:def OMP_COPYIN_STR(copyin) + #:set copyin_val = OMP_MAP_STR('to', copyin) + $:copyin_val +#:enddef + +#:def OMP_COPYOUT_STR(copyout) + #:set copyout_val = OMP_MAP_STR('from', copyout) + $:copyout_val +#:enddef + +#:def OMP_CREATE_STR(create) + #:set create_val = OMP_MAP_STR('always,alloc', create) + $:create_val +#:enddef + +#:def OMP_DELETE_STR(delete) + #:set create_val = OMP_MAP_STR('release', delete) + $:create_val +#:enddef + +#:def OMP_NOCREATE_STR(no_create) + #:if no_create is not None + #:stop 'no_create is not supported yet' + #:endif + #:set no_create_val = '' + $:no_create_val +#:enddef + +#:def OMP_PRESENT_STR(present) + #:set present_val = OMP_MAP_STR('present,alloc', present) + $:present_val +#:enddef + +#:def OMP_DEVICEPTR_STR(deviceptr) + #:set deviceptr_val = GEN_PARENTHESES_CLAUSE('use_device_ptr', deviceptr) + $:deviceptr_val +#:enddef + +#! #:def OMP_ATTACH_STR(attach) + #! #:set attach_val = OMP_MAP_STR('always,to', attach) + #! $:attach_val +#! #:enddef + +#! #:def OMP_DETACH_STR(detach) + #! #:set detach_val = OMP_MAP_STR('always,from', detach) + #! $:detach_val +#! #:enddef + +#:def OMP_TO_STR(to) + #:set to_val = GEN_PARENTHESES_CLAUSE('to', to) + $:to_val +#:enddef + +#:def OMP_FROM_STR(to) + #:set from_val = GEN_PARENTHESES_CLAUSE('from', to) + $:from_val +#:enddef + +#:def OMP_PARALLELISM_STR(parallelism) + #:set temp = '' + $:temp +#:enddef + +#:def OMP_USE_DEVICE_ADDR_STR(use_device_addr) + #:set use_device_addr_val = GEN_PARENTHESES_CLAUSE('use_device_addr', use_device_addr) + $:use_device_addr_val +#:enddef + +#:def OMP_USE_DEVICE_PTR_STR(use_device_ptr) + #:set use_device_ptr_val = GEN_PARENTHESES_CLAUSE('use_device_ptr', use_device_ptr) + $:use_device_ptr_val +#:enddef + +#:def OMP_PARALLEL(code, private=None, default='present', firstprivate=None, reduction=None, reductionOp=None, & + & copy=None, copyin=None, copyinReadOnly=None, copyout=None, create=None, & + & no_create=None, present=None, deviceptr=None, attach=None, extraOmpArgs=None) + #:set default_val = OMP_DEFAULT_STR(default) + #:set private_val = GEN_PRIVATE_STR(private, False).strip('\n') + GEN_PRIVATE_STR(firstprivate, True).strip('\n') + #:set reduction_val = GEN_REDUCTION_STR(reduction, reductionOp) + #:set copy_val = OMP_COPY_STR(copy) + #:set copyin_val = OMP_COPYIN_STR(copyin).strip('\n') + OMP_COPYIN_STR(copyinReadOnly).strip('\n') + #:set copyout_val = OMP_COPYOUT_STR(copyout) + #:set create_val = OMP_CREATE_STR(create) + #:set no_create_val = OMP_NOCREATE_STR(no_create) + #:set present_val = OMP_PRESENT_STR(present) + #:set deviceptr_val = OMP_DEVICEPTR_STR(deviceptr) + #:set attach_val = OMP_MAP_STR('always,tofrom', attach) + #:set extraOmpArgs_val = GEN_EXTRA_ARGS_STR(extraOmpArgs) + #:set omp_clause_val = default_val.strip('\n') + private_val.strip('\n') + reduction_val.strip('\n') + & + & copy_val.strip('\n') + copyin_val.strip('\n') + & + & copyout_val.strip('\n') + create_val.strip('\n') + & + & no_create_val.strip('\n') + present_val.strip('\n') + & + & deviceptr_val.strip('\n') + attach_val.strip('\n') + + #:set omp_clause_val = omp_clause_val.strip('\n') + #:set omp_directive = '!$omp target teams ' + omp_clause_val + extraOmpArgs_val.strip('\n') + + #:set omp_end_directive = '!$omp end target teams' + $:omp_directive + $:code + $:omp_end_directive +#:enddef + +#:def OMP_PARALLEL_LOOP(code, collapse=None, private=None, parallelism='[gang, vector]', & + & default='present', firstprivate=None, reduction=None, reductionOp=None, & + & copy=None, copyin=None, copyinReadOnly=None, copyout=None, create=None, & + & no_create=None, present=None, deviceptr=None, attach=None, extraOmpArgs=None) + + #:set collapse_val = GEN_COLLAPSE_STR(collapse) + #:set parallelism_val = OMP_PARALLELISM_STR(parallelism) + #:set default_val = OMP_DEFAULT_STR(default) + #:set private_val = GEN_PRIVATE_STR(private, False).strip('\n') + GEN_PRIVATE_STR(firstprivate, True).strip('\n') + #:set reduction_val = GEN_REDUCTION_STR(reduction, reductionOp) + #:set copy_val = OMP_COPY_STR(copy) + #:set copyin_val = OMP_COPYIN_STR(copyin).strip('\n') + OMP_COPYIN_STR(copyinReadOnly).strip('\n') + #:set copyout_val = OMP_COPYOUT_STR(copyout) + #:set create_val = OMP_CREATE_STR(create) + #:set no_create_val = OMP_NOCREATE_STR(no_create) + #:set present_val = OMP_PRESENT_STR(present) + #:set deviceptr_val = OMP_DEVICEPTR_STR(deviceptr) + #:set attach_val = OMP_MAP_STR('always,tofrom', attach) + #:set extraOmpArgs_val = GEN_EXTRA_ARGS_STR(extraOmpArgs) + #:set clause_val = collapse_val.strip('\n') + parallelism_val.strip('\n') + & + & default_val.strip('\n') + private_val.strip('\n') + reduction_val.strip('\n') + & + & copy_val.strip('\n') + copyin_val.strip('\n') + & + & copyout_val.strip('\n') + create_val.strip('\n') + & + & no_create_val.strip('\n') + present_val.strip('\n') + & + & deviceptr_val.strip('\n') + attach_val.strip('\n') + #! Hardcoding the parallelism for now + + #:if MFC_COMPILER == NVIDIA_COMPILER_ID or MFC_COMPILER == PGI_COMPILER_ID + #:set omp_start_directive = '!$omp target teams loop defaultmap(firstprivate:scalar) bind(teams,parallel) ' + #:set omp_end_directive = '!$omp end target teams loop' + #:elif MFC_COMPILER == CCE_COMPILER_ID + #:set omp_start_directive = '!$omp target teams distribute parallel do simd defaultmap(firstprivate:scalar) ' + #:set omp_end_directive = '!$omp end target teams distribute parallel do simd' + #:elif MFC_COMPILER == AMD_COMPILER_ID + #:set omp_start_directive = '!$omp target teams distribute parallel do ' + #:set omp_end_directive = '!$omp end target teams distribute parallel do' + #:else + #:set omp_start_directive = '!$omp target teams loop defaultmap(firstprivate:scalar) bind(teams,parallel) ' + #:set omp_end_directive = '!$omp end target teams loop' + #:endif + + #:set omp_directive = omp_start_directive + clause_val + extraOmpArgs_val.strip('\n') + $:omp_directive + $:code + $:omp_end_directive +#:enddef + +#:def OMP_ROUTINE(function_name, nohost, extraOmpArgs) + #:assert isinstance(nohost, bool) + #:if nohost == True + #:set nohost_val = 'device_type(nohost) ' + #:else + #:set nohost_val = 'device_type(any) ' + #:endif + #:set extraOmpArgs_val = GEN_EXTRA_ARGS_STR(extraOmpArgs) + #:if function_name is not None + #:set function_name_val = '(' + function_name + ') ' + #:else + #:set function_name_val = '' + #:endif + + #:if MFC_COMPILER == AMD_COMPILER_ID + #:set clause_val = '' + #:else + #:set clause_val = nohost_val.strip('\n') + #:endif + + #:set omp_directive = '!$omp declare target ' + & + & clause_val + extraOmpArgs_val.strip('\n') + $:omp_directive +#:enddef + +#:def OMP_DECLARE(copyin=None, copyinReadOnly=None, create=None, link=None, extraOmpArgs=None) + #:set copyin_val = OMP_TO_STR(copyin).strip('\n') + OMP_TO_STR(copyinReadOnly).strip('\n') + #:set create_val = GEN_CLAUSE('(', create) + #:set link_val = GEN_LINK_STR(link) + #:set extraOmpArgs_val = GEN_EXTRA_ARGS_STR(extraOmpArgs) + #:set clause_val = copyin_val.strip('\n') + & + & create_val.strip('\n') + link_val.strip('\n') + #:set omp_directive = '!$omp declare target ' + clause_val + extraOmpArgs_val.strip('\n') + $:omp_directive +#:enddef + +#! Not fully implemented yet (ignores most args right now) +#:def OMP_LOOP(collapse=None, parallelism=None, data_dependency=None, reduction=None, reductionOp=None, private=None, extraOmpArgs=None) + #:if MFC_COMPILER == NVIDIA_COMPILER_ID or MFC_COMPILER == PGI_COMPILER_ID + #:set omp_directive = '!$omp loop bind(thread)' + #:elif MFC_COMPILER == CCE_COMPILER_ID or MFC_COMPILER == AMD_COMPILER_ID + #:set omp_directive = '' + #:else + #:set omp_directive = '' + #:endif + $:omp_directive +#:enddef + +#:def OMP_DATA(code, copy=None, copyin=None, copyinReadOnly=None, copyout=None, create=None, no_create=None, present=None, deviceptr=None, attach=None, default=None, extraOmpArgs=None) + #:assert code is not None + #:assert isinstance(code, str) + #:if code == '' or code.isspace() + #:stop 'GPU_DATA macro has no effect on the code as it is not surrounding any code' + #:endif + #:set copy_val = OMP_COPY_STR(copy) + #:set copyin_val = OMP_COPYIN_STR(copyin).strip('\n') + OMP_COPYIN_STR(copyinReadOnly).strip('\n') + #:set copyout_val = OMP_COPYOUT_STR(copyout) + #:set create_val = OMP_CREATE_STR(create) + #:set no_create_val = OMP_NOCREATE_STR(no_create) + #:set present_val = OMP_PRESENT_STR(present) + #:set deviceptr_val = OMP_DEVICEPTR_STR(deviceptr) + #:set attach_val = OMP_MAP_STR('always,tofrom', attach) + #:set default_val = OMP_DEFAULT_STR(default) + #:set extraOmpArgs_val = GEN_EXTRA_ARGS_STR(extraOmpArgs) + #:set clause_val = copy_val.strip('\n') + copyin_val.strip('\n') + & + & copyout_val.strip('\n') + create_val.strip('\n') + & + & no_create_val.strip('\n') + present_val.strip('\n') + & + & deviceptr_val.strip('\n') + attach_val.strip('\n') + & + & default_val.strip('\n') + #:set omp_directive = '!$omp target data ' + clause_val + extraOmpArgs_val.strip('\n') + #:set end_omp_directive = '!$omp end target data' + $:omp_directive + $:code + $:end_omp_directive +#:enddef + +#:def OMP_ENTER_DATA(copyin=None, copyinReadOnly=None, create=None, attach=None, extraOmpArgs=None) + #:set copyin_val = OMP_COPYIN_STR(copyin).strip('\n') + OMP_COPYIN_STR(copyinReadOnly).strip('\n') + #:set create_val = OMP_CREATE_STR(create) + #:set attach_val = OMP_MAP_STR('always,to', attach) + #:set extraOmpArgs_val = GEN_EXTRA_ARGS_STR(extraOmpArgs) + #:set omp_clause_val = copyin_val.strip('\n') + create_val.strip('\n') + attach_val.strip('\n') + #:set omp_directive = '!$omp target enter data ' + omp_clause_val + extraOmpArgs_val.strip('\n') + $:omp_directive +#:enddef + +#:def OMP_EXIT_DATA(copyout=None, delete=None, detach=None, extraOmpArgs=None) + #:set copyout_val = OMP_COPYOUT_STR(copyout) + #:set delete_val = OMP_DELETE_STR(delete) + #:set detach_val = OMP_MAP_STR('always,from', detach) + #:set extraOmpArgs_val = GEN_EXTRA_ARGS_STR(extraOmpArgs) + #:set clause_val = copyout_val.strip('\n') + delete_val.strip('\n') + detach_val.strip('\n') + #:set omp_directive = '!$omp target exit data ' + clause_val + extraOmpArgs_val.strip('\n') + $:omp_directive +#:enddef + +#:def OMP_UPDATE(host=None, device=None, extraOmpArgs=None) + #:set host_val = OMP_FROM_STR(host) + #:set device_val = OMP_TO_STR(device) + #:set extraOmpArgs_val = GEN_EXTRA_ARGS_STR(extraOmpArgs) + #:set clause_val = host_val.strip('\n') + device_val.strip('\n') + #:set omp_directive = '!$omp target update ' + clause_val + extraOmpArgs_val.strip('\n') + $:omp_directive +#:enddef + +#:def OMP_HOST_DATA(code, use_device_addr, use_device_ptr, extraOmpArgs) + #:assert code is not None + #:assert isinstance(code, str) + #:if code == '' or code.isspace() + #:stop 'GPU_HOST_DATA macro has no effect on the code as it is not surrounding any code' + #:endif + #:set use_device_addr_val = OMP_USE_DEVICE_ADDR_STR(use_device_addr) + #:set use_device_ptr_val = OMP_USE_DEVICE_PTR_STR(use_device_ptr) + #:set extraOmpArgs_val = GEN_EXTRA_ARGS_STR(extraOmpArgs) + #:set clause_val = use_device_addr_val.strip('\n') + use_device_ptr_val.strip('\n') + #:set omp_directive = '!$omp target data ' + clause_val + extraOmpArgs_val.strip('\n') + #:set omp_end_directive = '!$omp end target data' + $:omp_directive + $:code + $:omp_end_directive +#:enddef + +#:def OMP_ATOMIC(atomic, extraOmpArgs=None) + #:assert isinstance(atomic, str) + #:assert (atomic == 'read' or atomic == 'write' or atomic == 'update' or atomic == 'capture') + #:set atomic_val = atomic + #:set extraOmpArgs_val = GEN_EXTRA_ARGS_STR(extraOmpArgs) + #:set clause_val = atomic_val.strip('\n') + #:set omp_directive = '!$omp atomic ' + clause_val + extraOmpArgs_val.strip('\n') + $:omp_directive +#:enddef + +#:def OMP_WAIT(extraOmpArgs=None) + #:set extraOmpArgs_val = GEN_EXTRA_ARGS_STR(extraOmpArgs) + #:set clause_val = '' + #:set omp_directive = '!$omp barrier ' + clause_val + extraOmpArgs_val.strip('\n') + $:omp_directive +#:enddef + +#:def UNDEF_AMD(code) + #:if MFC_COMPILER != AMD_COMPILER_ID + $:code + #:endif +#:enddef +! New line at end of file is required for FYPP diff --git a/src/common/include/parallel_macros.fpp b/src/common/include/parallel_macros.fpp index 8d0a5a673b..61bc30b431 100644 --- a/src/common/include/parallel_macros.fpp +++ b/src/common/include/parallel_macros.fpp @@ -1,425 +1,223 @@ -#:mute +#:include 'shared_parallel_macros.fpp' +#:include 'omp_macros.fpp' +#:include 'acc_macros.fpp' -#:def ASSERT_LIST(data, datatype) - #:assert data is not None - #:assert isinstance(data, list) - #:assert len(data) != 0 - #:assert all(isinstance(element, datatype) for element in data) -#:enddef +#:def GPU_PARALLEL(code, private=None, default='present', firstprivate=None, reduction=None, reductionOp=None, & + & copy=None, copyin=None, copyinReadOnly=None, copyout=None, create=None, & + & no_create=None, present=None, deviceptr=None, attach=None, extraAccArgs=None, extraOmpArgs=None) -#:def GEN_PARENTHESES_CLAUSE(clause_name, clause_str) - #:set clause_regex = re.compile(',(?![^(]*\\))') - #:assert isinstance(clause_name, str) - #:if clause_str is not None - #:set count = 0 - #:assert isinstance(clause_str, str) - #:assert clause_str[0] == '[' and clause_str[-1] == ']' - #:for c in clause_str - #:if c == '(' - #:set count = count + 1 - #:elif c == ')' - #:set count = count - 1 - #:endif - #:if c == ',' and count > 1 - #:stop 'Nested parentheses with comma inside is not supported. Incorrect clause: {}'.format(clause_str) - #:elif count < 0 - #:stop 'Missing parentheses. Incorrect clause: {}'.format(clause_str) - #:endif - #:endfor - #:set clause_str = re.sub(clause_regex, ';', clause_str) - #:set clause_list = [x.strip() for x in clause_str.strip('[]').split(';')] - $:ASSERT_LIST(clause_list, str) - #:set clause_str = clause_name + '(' + ', '.join(clause_list) + ') ' - #:else - #:set clause_str = '' - #:endif - $:clause_str -#:enddef + #:set acc_code = ACC_PARALLEL(code, private, default, firstprivate, reduction, reductionOp, copy, copyin, copyinReadOnly, copyout, create, no_create, present, deviceptr, attach, extraAccArgs) + #:set omp_code = OMP_PARALLEL(code, private, default, firstprivate, reduction, reductionOp, copy, copyin, copyinReadOnly, copyout, create, no_create, present, deviceptr, attach, extraOmpArgs) -#:def GEN_PRIVATE_STR(private, initialized_values) - #:assert isinstance(initialized_values, bool) - #:if initialized_values == True - #:set private_val = GEN_PARENTHESES_CLAUSE('firstprivate', private) - #:else - #:set private_val = GEN_PARENTHESES_CLAUSE('private', private) - #:endif - $:private_val -#:enddef - -#:def GEN_COPY_STR(copy) - #:set copy_val = GEN_PARENTHESES_CLAUSE('copy', copy) - $:copy_val -#:enddef +#if defined(MFC_OpenACC) + $:acc_code +#elif defined(MFC_OpenMP) + $:omp_code +#else + $:code +#endif -#:def GEN_COPYIN_STR(copyin, readonly) - #:assert isinstance(readonly, bool) - #:set copyin_val = GEN_PARENTHESES_CLAUSE('copyin', copyin) - #:if copyin is not None and readonly == True - #:set index = copyin_val.find('copyin(') + len('copyin(') - #:set copyin_val = copyin_val[:index] + 'readonly:' + copyin_val[index:] - #:endif - $:copyin_val #:enddef -#:def GEN_COPYOUT_STR(copyout) - #:set copyout_val = GEN_PARENTHESES_CLAUSE('copyout', copyout) - $:copyout_val -#:enddef +#:def GPU_PARALLEL_LOOP(code, collapse=None, private=None, parallelism='[gang, vector]', & + & default='present', firstprivate=None, reduction=None, reductionOp=None, & + & copy=None, copyin=None, copyinReadOnly=None, copyout=None, create=None, & + & no_create=None, present=None, deviceptr=None, attach=None, extraAccArgs=None, extraOmpArgs=None) -#:def GEN_CREATE_STR(create) - #:set create_val = GEN_PARENTHESES_CLAUSE('create', create) - $:create_val -#:enddef + #:set acc_code = ACC_PARALLEL_LOOP(code, collapse, private, parallelism, default, firstprivate, reduction, reductionOp, copy, copyin, copyinReadOnly, copyout, create, no_create, present, deviceptr, attach, extraAccArgs) + #:set omp_code = OMP_PARALLEL_LOOP(code, collapse, private, parallelism, default, firstprivate, reduction, reductionOp, copy, copyin, copyinReadOnly, copyout, create, no_create, present, deviceptr, attach, extraOmpArgs) -#:def GEN_NOCREATE_STR(no_create) - #:set nocreate_val = GEN_PARENTHESES_CLAUSE('no_create', no_create) - $:nocreate_val +#if defined(MFC_OpenACC) + $:acc_code +#elif defined(MFC_OpenMP) + $:omp_code +#else + $:code +#endif #:enddef -#:def GEN_DELETE_STR(delete) - #:set delete_val = GEN_PARENTHESES_CLAUSE('delete', delete) - $:delete_val -#:enddef +#:def GPU_ROUTINE(function_name=None, parallelism=None, nohost=False, cray_inline=False, extraAccArgs=None, extraOmpArgs=None) + #:assert isinstance(cray_inline, bool) + #:set acc_directive = ACC_ROUTINE(function_name=function_name, parallelism=parallelism, nohost=nohost, extraAccArgs=extraAccArgs) + #:set omp_directive = OMP_ROUTINE(function_name=function_name, nohost=nohost, extraOmpArgs=extraOmpArgs) -#:def GEN_PRESENT_STR(present) - #:set present_val = GEN_PARENTHESES_CLAUSE('present', present) - $:present_val + #:if cray_inline == True + #:if not isinstance(function_name, str) + #:stop "When inlining for Cray Compiler, function name must be given and given as a string" + #:endif + #:set cray_directive = ('!DIR$ INLINEALWAYS ' + function_name).strip('\n') +#ifdef _CRAYFTN + $:cray_directive +#elif MFC_OpenACC + $:acc_directive +#elif MFC_OpenMP + $:omp_directive +#endif + #:else +#if MFC_OpenACC + $:acc_directive +#elif MFC_OpenMP + $:omp_directive +#endif + #:endif #:enddef -#:def GEN_DEVICEPTR_STR(deviceptr) - #:set deviceptr_val = GEN_PARENTHESES_CLAUSE('deviceptr', deviceptr) - $:deviceptr_val -#:enddef +#:def GPU_DECLARE(copy=None, copyin=None, copyinReadOnly=None, copyout=None, create=None, present=None, deviceptr=None, link=None, extraAccArgs=None, extraOmpArgs=None) + #:set acc_code = ACC_DECLARE(copy=copy, copyin=copyin, copyinReadOnly=copyinReadOnly, copyout=copyout, create=create, present=present, deviceptr=deviceptr, link=link, extraAccArgs=None) + #:assert copyout is None + #:assert present is None + #:assert deviceptr is None + #:assert copy is None + #:set omp_code = OMP_DECLARE(copyin=copyin, copyinReadOnly=copyinReadOnly, create=create, link=link, extraOmpArgs=extraOmpArgs) -#:def GEN_ATTACH_STR(attach) - #:set attach_val = GEN_PARENTHESES_CLAUSE('attach', attach) - $:attach_val +#if defined(MFC_OpenACC) + $:acc_code +#elif defined(MFC_OpenMP) + $:omp_code +#endif #:enddef -#:def GEN_DETACH_STR(detach) - #:set detach_val = GEN_PARENTHESES_CLAUSE('detach', detach) - $:detach_val -#:enddef +#:def GPU_LOOP(collapse=None, parallelism=None, data_dependency=None, reduction=None, reductionOp=None, private=None, extraAccArgs=None, extraOmpArgs=None) + #:set acc_code = ACC_LOOP(collapse=collapse, parallelism=parallelism, data_dependency=data_dependency, reduction=reduction, reductionOp=reductionOp, private=private, extraAccArgs=extraAccArgs) + #:set omp_code = OMP_LOOP(collapse=collapse, parallelism=parallelism, data_dependency=data_dependency, reduction=reduction, reductionOp=reductionOp, private=private, extraOmpArgs=extraOmpArgs) -#:def GEN_LINK_STR(link) - #:set link_val = GEN_PARENTHESES_CLAUSE('link', link) - $:link_val +#if defined(MFC_OpenACC) + $:acc_code +#elif defined(MFC_OpenMP) + $:omp_code +#endif #:enddef -#:def GEN_EXTRA_ARGS_STR(extraArgs) - #:if extraArgs is not None - #:assert isinstance(extraArgs, str) - #:set extraArgs_val = extraArgs - #:else - #:set extraArgs_val = '' - #:endif - $:extraArgs_val -#:enddef +#:def GPU_DATA(code, copy=None, copyin=None, copyinReadOnly=None, copyout=None, create=None, no_create=None, present=None, deviceptr=None, attach=None, default=None, extraAccArgs=None, extraOmpArgs=None) + #:set acc_code = ACC_DATA(code=code, copy=copy, copyin=copyin, copyinReadOnly=copyinReadOnly, copyout=copyout, create=create, no_create=no_create, present=present, deviceptr=deviceptr, attach=attach, default=default, extraAccArgs=extraAccArgs) + #:set omp_code = OMP_DATA(code=code, copy=copy, copyin=copyin, copyinReadOnly=copyinReadOnly, copyout=copyout, create=create, no_create=no_create, present=present, deviceptr=deviceptr, attach=attach, default=default, extraOmpArgs=extraOmpArgs) -#:def GEN_PARALLELISM_STR(parallelism) - #:if parallelism is not None - #:assert isinstance(parallelism, str) - #:assert parallelism[0] == '[' and parallelism[-1] == ']' - #:set parallelism_list = [x.strip() for x in parallelism.strip('[]').split(',')] - $:ASSERT_LIST(parallelism_list, str) - #:assert all((element == 'gang' or element == 'worker' or & - & element == 'vector' or element == 'seq') for element in parallelism_list) - #:set parallelism_val = ' '.join(parallelism_list) + ' ' - #:else - #:set parallelism_val = '' - #:endif - $:parallelism_val +#if defined(MFC_OpenACC) + $:acc_code +#elif defined(MFC_OpenMP) + $:omp_code +#else + $:code +#endif #:enddef -#:def GEN_COLLAPSE_STR(collapse) - #:if collapse is not None - #:set collapse = int(collapse) - #:assert isinstance(collapse, int) - #:assert collapse > 1 - #:set collapse_val = 'collapse(' + str(collapse) + ') ' - #:else - #:set collapse_val = '' - #:endif - $:collapse_val -#:enddef +#:def GPU_HOST_DATA(code, use_device_addr=None, use_device_ptr=None, extraAccArgs=None, extraOmpArgs=None) -#:def GEN_DEFAULT_STR(default) - #:if default is not None - #:assert isinstance(default, str) - #:assert (default == 'present' or default == 'none') - #:set default_val = 'default(' + default + ') ' + #:if use_device_addr is not None and use_device_ptr is not None + #:set use_device_addr_end_index = len(use_device_addr) - 1 + #:set use_device = use_device_addr + use_device_ptr + $:use_device[use_device_addr_end_index] = ',' + $:use_device[use_device_addr_end_index + 1] = ' ' + #:elif use_device_addr is not None or use_device_ptr is not None + #:if use_device_addr is not None + #:set use_device = use_device_addr + #:else + #:set use_device = use_device_ptr + #:endif #:else - #:set default_val = '' + #:set use_device = None #:endif - $:default_val -#:enddef + #:set acc_code = ACC_HOST_DATA(code=code, use_device=use_device, extraAccArgs=extraAccArgs) + #:set omp_code = OMP_HOST_DATA(code=code, use_device_addr=use_device_addr, use_device_ptr=use_device_ptr, extraOmpArgs=extraOmpArgs) -#:def GEN_REDUCTION_STR(reduction, reductionOp) - #:if reduction is not None and reductionOp is not None - #:assert isinstance(reduction, str) - #:assert isinstance(reductionOp, str) - #:assert reduction[0] == '[' and reduction[-1] == ']' - #:assert reductionOp[0] == '[' and reductionOp[-1] == ']' - #:set reduction = reduction.replace(' ', '') - #:set reduction = reduction[1:-1] - #:set reduction_list = reduction.split('],') - #:set reduction_list = [str + ']' for str in reduction_list] - #:assert all(str[0] == '[' and str[-1] == ']' for str in reduction_list) - - #:set reductionOp_list = [x.strip() for x in reductionOp.strip('[]').split(',')] - $:ASSERT_LIST(reduction_list, str) - $:ASSERT_LIST(reductionOp_list, str) - #:assert len(reduction_list) == len(reductionOp_list) - #:set reduction_val = '' - #:for i in range(len(reduction_list)) - #:set temp_clause = GEN_PARENTHESES_CLAUSE('reduction', reduction_list[i]).strip('\n') - #:set ind = temp_clause.find('reduction(') + len('reduction(') - #:set reduction_val = reduction_val.strip('\n') + temp_clause[:ind] + reductionOp_list[i] + ':' + temp_clause[ind:] - #:endfor - #:elif reduction is not None or reductionOp is not None - #:stop 'Cannot set the reduction list or reduction operation without setting the other' - #:else - #:set reduction_val = '' - #:endif - $:reduction_val +#if defined(MFC_OpenACC) + $:acc_code +#elif defined(MFC_OpenMP) + $:omp_code +#else + $:code +#endif #:enddef -#:def GEN_HOST_STR(host) - #:set host_val = GEN_PARENTHESES_CLAUSE('host', host) - $:host_val -#:enddef +#:def GPU_ENTER_DATA(copyin=None, copyinReadOnly=None, create=None, attach=None, extraAccArgs=None, extraOmpArgs=None) + #:set acc_code = ACC_ENTER_DATA(copyin=copyin, copyinReadOnly=copyinReadOnly, create=create, attach=attach, extraAccArgs=extraAccArgs) + #:set omp_code = OMP_ENTER_DATA(copyin=copyin, copyinReadOnly=copyinReadOnly, create=create, attach=attach, extraOmpArgs=extraOmpArgs) -#:def GEN_DEVICE_STR(device) - #:set device_val = GEN_PARENTHESES_CLAUSE('device', device) - $:device_val +#if defined(MFC_OpenACC) + $:acc_code +#elif defined(MFC_OpenMP) + $:omp_code +#endif #:enddef -#:def GEN_USE_DEVICE_STR(use_device) - #:set use_device_val = GEN_PARENTHESES_CLAUSE('use_device', use_device) - $:use_device_val -#:enddef +#:def GPU_EXIT_DATA(copyout=None, delete=None, detach=None, extraAccArgs=None, extraOmpArgs=None) + #:set acc_code = ACC_EXIT_DATA(copyout=copyout, delete=delete, detach=detach, extraAccArgs=extraAccArgs) + #:set omp_code = OMP_EXIT_DATA(copyout=copyout, delete=delete, detach=detach, extraOmpArgs=extraOmpArgs) -#:def GPU_PARALLEL(code, private=None, default='present', firstprivate=None, reduction=None, reductionOp=None, & - & copy=None, copyin=None, copyinReadOnly=None, copyout=None, create=None, & - & no_create=None, present=None, deviceptr=None, attach=None, extraAccArgs=None) - #:set default_val = GEN_DEFAULT_STR(default) - #:set private_val = GEN_PRIVATE_STR(private, False).strip('\n') + GEN_PRIVATE_STR(firstprivate, True).strip('\n') - #:set reduction_val = GEN_REDUCTION_STR(reduction, reductionOp) - #:set copy_val = GEN_COPY_STR(copy) - #:set copyin_val = GEN_COPYIN_STR(copyin, False).strip('\n') + GEN_COPYIN_STR(copyinReadOnly, True).strip('\n') - #:set copyout_val = GEN_COPYOUT_STR(copyout) - #:set create_val = GEN_CREATE_STR(create) - #:set no_create_val = GEN_NOCREATE_STR(no_create) - #:set present_val = GEN_PRESENT_STR(present) - #:set deviceptr_val = GEN_DEVICEPTR_STR(deviceptr) - #:set attach_val = GEN_ATTACH_STR(attach) - #:set extraAccArgs_val = GEN_EXTRA_ARGS_STR(extraAccArgs) - #:set clause_val = default_val.strip('\n') + private_val.strip('\n') + reduction_val.strip('\n') + & - & copy_val.strip('\n') + copyin_val.strip('\n') + & - & copyout_val.strip('\n') + create_val.strip('\n') + & - & no_create_val.strip('\n') + present_val.strip('\n') + & - & deviceptr_val.strip('\n') + attach_val.strip('\n') - #:set acc_directive = '!$acc parallel ' + & - & clause_val + extraAccArgs_val.strip('\n') - #:set end_acc_directive = '!$acc end parallel' - $:acc_directive - $:code - $:end_acc_directive +#if defined(MFC_OpenACC) + $:acc_code +#elif defined(MFC_OpenMP) + $:omp_code +#endif #:enddef +#:def GPU_ATOMIC(atomic, extraAccArgs=None, extraOmpArgs=None) + #:set acc_code = ACC_ATOMIC(atomic=atomic, extraAccArgs=extraAccArgs) + #:set omp_code = OMP_ATOMIC(atomic=atomic, extraOmpArgs=extraOmpArgs) -#:def GPU_PARALLEL_LOOP(collapse=None, private=None, parallelism='[gang, vector]', & - & default='present', firstprivate=None, reduction=None, reductionOp=None, & - & copy=None, copyin=None, copyinReadOnly=None, copyout=None, create=None, & - & no_create=None, present=None, deviceptr=None, attach=None, extraAccArgs=None) - #:set collapse_val = GEN_COLLAPSE_STR(collapse) - #:set parallelism_val = GEN_PARALLELISM_STR(parallelism) - #:set default_val = GEN_DEFAULT_STR(default) - #:set private_val = GEN_PRIVATE_STR(private, False).strip('\n') + GEN_PRIVATE_STR(firstprivate, True).strip('\n') - #:set reduction_val = GEN_REDUCTION_STR(reduction, reductionOp) - #:set copy_val = GEN_COPY_STR(copy) - #:set copyin_val = GEN_COPYIN_STR(copyin, False).strip('\n') + GEN_COPYIN_STR(copyinReadOnly, True).strip('\n') - #:set copyout_val = GEN_COPYOUT_STR(copyout) - #:set create_val = GEN_CREATE_STR(create) - #:set no_create_val = GEN_NOCREATE_STR(no_create) - #:set present_val = GEN_PRESENT_STR(present) - #:set deviceptr_val = GEN_DEVICEPTR_STR(deviceptr) - #:set attach_val = GEN_ATTACH_STR(attach) - #:set extraAccArgs_val = GEN_EXTRA_ARGS_STR(extraAccArgs) - #:set clause_val = collapse_val.strip('\n') + parallelism_val.strip('\n') + & - & default_val.strip('\n') + private_val.strip('\n') + reduction_val.strip('\n') + & - & copy_val.strip('\n') + copyin_val.strip('\n') + & - & copyout_val.strip('\n') + create_val.strip('\n') + & - & no_create_val.strip('\n') + present_val.strip('\n') + & - & deviceptr_val.strip('\n') + attach_val.strip('\n') - #:set acc_directive = '!$acc parallel loop ' + & - & clause_val + extraAccArgs_val.strip('\n') - $:acc_directive -#:enddef - -#:def GPU_ROUTINE(function_name=None, parallelism=None, nohost=False, cray_inline=False, extraAccArgs=None) - #:assert isinstance(cray_inline, bool) - #:set parallelism_val = GEN_PARALLELISM_STR(parallelism) - #:assert isinstance(nohost, bool) - #:if nohost == True - #:set nohost_val = 'nohost' - #:else - #:set nohost_val = '' - #:endif - #:set extraAccArgs_val = GEN_EXTRA_ARGS_STR(extraAccArgs) - #:set clause_val = parallelism_val.strip('\n') + nohost_val.strip('\n') - #:set acc_directive = '!$acc routine ' + & - & clause_val + extraAccArgs_val.strip('\n') - #:if cray_inline == True - #:if not isinstance(function_name, str) - #:stop "When inlining for Cray Compiler, function name must be given and given as a string" - #:endif - #:set cray_directive = ('!DIR$ INLINEALWAYS ' + function_name).strip('\n') -#ifdef _CRAYFTN - $:cray_directive -#else - $:acc_directive +#if defined(MFC_OpenACC) + $:acc_code +#elif defined(MFC_OpenMP) + $:omp_code #endif - #:else - $:acc_directive - #:endif #:enddef -#:def GPU_DECLARE(copy=None, copyin=None, copyinReadOnly=None, copyout=None, create=None, present=None, deviceptr=None, link=None, extraAccArgs=None) - #:set copy_val = GEN_COPY_STR(copy) - #:set copyin_val = GEN_COPYIN_STR(copyin, False).strip('\n') + GEN_COPYIN_STR(copyinReadOnly, True).strip('\n') - #:set copyout_val = GEN_COPYOUT_STR(copyout) - #:set create_val = GEN_CREATE_STR(create) - #:set present_val = GEN_PRESENT_STR(present) - #:set deviceptr_val = GEN_DEVICEPTR_STR(deviceptr) - #:set link_val = GEN_LINK_STR(link) - #:set extraAccArgs_val = GEN_EXTRA_ARGS_STR(extraAccArgs) - #:set clause_val = copy_val.strip('\n') + copyin_val.strip('\n') + & - & copyout_val.strip('\n') + create_val.strip('\n') + & - & present_val.strip('\n') + deviceptr_val.strip('\n') + & - & link_val.strip('\n') - #:set acc_directive = '!$acc declare ' + clause_val + extraAccArgs_val.strip('\n') - $:acc_directive -#:enddef +#:def GPU_UPDATE(host=None, device=None, extraAccArgs=None, extraOmpArgs=None) + #:set acc_code = ACC_UPDATE(host=host, device=device, extraAccArgs=extraAccArgs) + #:set omp_code = OMP_UPDATE(host=host, device=device, extraOmpArgs=extraOmpArgs) -#:def GPU_LOOP(collapse=None, parallelism=None, data_dependency=None, reduction=None, reductionOp=None, private=None, extraAccArgs=None) - #:set collapse_val = GEN_COLLAPSE_STR(collapse) - #:set parallelism_val = GEN_PARALLELISM_STR(parallelism) - #:if data_dependency is not None - #:assert isinstance(data_dependency, str) - #:assert (data_dependency == 'auto' or data_dependency == 'independent') - #:set data_dependency_val = data_dependency - #:else - #:set data_dependency_val = '' - #:endif - #:set private_val = GEN_PRIVATE_STR(private, False) - #:set reduction_val = GEN_REDUCTION_STR(reduction, reductionOp) - #:set extraAccArgs_val = GEN_EXTRA_ARGS_STR(extraAccArgs) - #:set clause_val = collapse_val.strip('\n') + parallelism_val.strip('\n') + & - & data_dependency_val.strip('\n') + private_val.strip('\n') + & - & reduction_val.strip('\n') - #:set acc_directive = '!$acc loop ' + & - & clause_val + extraAccArgs_val.strip('\n') - $:acc_directive +#if defined(MFC_OpenACC) + $:acc_code +#elif defined(MFC_OpenMP) + $:omp_code +#endif #:enddef -#:def GPU_DATA(code, copy=None, copyin=None, copyinReadOnly=None, copyout=None, create=None, no_create=None, present=None, deviceptr=None, attach=None, default=None, extraAccArgs=None) - #:assert code is not None - #:assert isinstance(code, str) - #:if code == '' or code.isspace() - #:stop 'GPU_DATA macro has no effect on the code as it is not surrounding any code' - #:endif - #:set copy_val = GEN_COPY_STR(copy) - #:set copyin_val = GEN_COPYIN_STR(copyin, False).strip('\n') + GEN_COPYIN_STR(copyinReadOnly, True).strip('\n') - #:set copyout_val = GEN_COPYOUT_STR(copyout) - #:set create_val = GEN_CREATE_STR(create) - #:set no_create_val = GEN_NOCREATE_STR(no_create) - #:set present_val = GEN_PRESENT_STR(present) - #:set deviceptr_val = GEN_DEVICEPTR_STR(deviceptr) - #:set attach_val = GEN_ATTACH_STR(attach) - #:set default_val = GEN_DEFAULT_STR(default) - #:set extraAccArgs_val = GEN_EXTRA_ARGS_STR(extraAccArgs) - #:set clause_val = copy_val.strip('\n') + copyin_val.strip('\n') + & - & copyout_val.strip('\n') + create_val.strip('\n') + & - & no_create_val.strip('\n') + present_val.strip('\n') + & - & deviceptr_val.strip('\n') + attach_val.strip('\n') + & - & default_val.strip('\n') - #:set acc_directive = '!$acc data ' + clause_val + extraAccArgs_val.strip('\n') - #:set end_acc_directive = '!$acc end data' - $:acc_directive - $:code - $:end_acc_directive -#:enddef +#:def GPU_WAIT(extraAccArgs=None, extraOmpArgs=None) + #:set acc_code = ACC_WAIT(extraAccArgs=extraAccArgs) + #:set omp_code = OMP_WAIT(extraOmpArgs=extraOmpArgs) -#:def GPU_HOST_DATA(code, use_device=None, extraAccArgs=None) - #:assert code is not None - #:assert isinstance(code, str) - #:if code == '' or code.isspace() - #:stop 'GPU_HOST_DATA macro has no effect on the code as it is not surrounding any code' - #:endif - #:set use_device_val = GEN_USE_DEVICE_STR(use_device) - #:set extraAccArgs_val = GEN_EXTRA_ARGS_STR(extraAccArgs) - #:set clause_val = use_device_val.strip('\n') - #:set acc_directive = '!$acc host_data ' + clause_val + extraAccArgs_val.strip('\n') - #:set end_acc_directive = '!$acc end host_data' - $:acc_directive - $:code - $:end_acc_directive +#if defined(MFC_OpenACC) + $:acc_code +#elif defined(MFC_OpenMP) + $:omp_code +#endif #:enddef -#:def GPU_ENTER_DATA(copyin=None, copyinReadOnly=None, create=None, attach=None, extraAccArgs=None) - #:set copyin_val = GEN_COPYIN_STR(copyin, False).strip('\n') + GEN_COPYIN_STR(copyinReadOnly, True).strip('\n') - #:set create_val = GEN_CREATE_STR(create) - #:set attach_val = GEN_ATTACH_STR(attach) - #:set extraAccArgs_val = GEN_EXTRA_ARGS_STR(extraAccArgs) - #:set clause_val = copyin_val.strip('\n') + create_val.strip('\n') + attach_val.strip('\n') - #:set acc_directive = '!$acc enter data ' + clause_val + extraAccArgs_val.strip('\n') - $:acc_directive -#:enddef +#:def USE_GPU_MODULE() + +#if defined(MFC_OpenACC) + use openacc +#elif defined(MFC_OpenMP) + use omp_lib +#endif -#:def GPU_EXIT_DATA(copyout=None, delete=None, detach=None, extraAccArgs=None) - #:set copyout_val = GEN_COPYOUT_STR(copyout) - #:set delete_val = GEN_DELETE_STR(delete) - #:set detach_val = GEN_DETACH_STR(detach) - #:set extraAccArgs_val = GEN_EXTRA_ARGS_STR(extraAccArgs) - #:set clause_val = copyout_val.strip('\n') + delete_val.strip('\n') + detach_val.strip('\n') - #:set acc_directive = '!$acc exit data ' + clause_val + extraAccArgs_val.strip('\n') - $:acc_directive #:enddef -#:def GPU_CACHE(cache, extraAccArgs=None) - #:set cache_val = GEN_PARENTHESES_CLAUSE('cache', cache) - #:set extraAccArgs_val = GEN_EXTRA_ARGS_STR(extraAccArgs) - #:set clause_val = cache_val.strip('\n') - #:set acc_directive = '!$acc ' + clause_val + extraAccArgs_val.strip('\n') - $:acc_directive +#:def DEF_AMD(code) + #:if MFC_COMPILER == AMD_COMPILER_ID + $:code + #:endif #:enddef -#:def GPU_ATOMIC(atomic, extraAccArgs=None) - #:assert isinstance(atomic, str) - #:assert (atomic == 'read' or atomic == 'write' or atomic == 'update' or atomic == 'capture') - #:set atomic_val = atomic - #:set extraAccArgs_val = GEN_EXTRA_ARGS_STR(extraAccArgs) - #:set clause_val = atomic_val.strip('\n') - #:set acc_directive = '!$acc atomic ' + clause_val + extraAccArgs_val.strip('\n') - $:acc_directive +#:def UNDEF_CCE(code) + #:if MFC_COMPILER != CCE_COMPILER_ID + $:code + #:endif #:enddef -#:def GPU_UPDATE(host=None, device=None, extraAccArgs=None) - #:set host_val = GEN_HOST_STR(host) - #:set device_val = GEN_DEVICE_STR(device) - #:set extraAccArgs_val = GEN_EXTRA_ARGS_STR(extraAccArgs) - #:set clause_val = host_val.strip('\n') + device_val.strip('\n') - #:set acc_directive = '!$acc update ' + clause_val + extraAccArgs_val.strip('\n') - $:acc_directive +#:def DEF_CCE(code) + #:if MFC_COMPILER == CCE_COMPILER_ID + $:code + #:endif #:enddef -#:def GPU_WAIT(extraAccArgs=None) - #:set extraAccArgs_val = GEN_EXTRA_ARGS_STR(extraAccArgs) - #:set clause_val = '' - #:set acc_directive = '!$acc wait ' + clause_val + extraAccArgs_val.strip('\n') - $:acc_directive +#:def UNDEF_NVIDIA(code) + #:if MFC_COMPILER != NVIDIA_COMPILER_ID and MFC_COMPILER != PGI_COMPILER_ID + $:code + #:endif #:enddef -#:endmute +#:set USING_NVHPC = (MFC_COMPILER == NVIDIA_COMPILER_ID or MFC_COMPILER == PGI_COMPILER_ID) +#:set USING_CCE = (MFC_COMPILER == CCE_COMPILER_ID) ! New line at end of file is required for FYPP diff --git a/src/common/include/shared_parallel_macros.fpp b/src/common/include/shared_parallel_macros.fpp new file mode 100644 index 0000000000..61134a3df3 --- /dev/null +++ b/src/common/include/shared_parallel_macros.fpp @@ -0,0 +1,110 @@ +#:def ASSERT_LIST(data, datatype) + #:assert data is not None + #:assert isinstance(data, list) + #:assert len(data) != 0 + #:assert all(isinstance(element, datatype) for element in data) +#:enddef + +#:def GEN_CLAUSE(clause_name, clause_str) + #:set clause_regex = re.compile(',(?![^(]*\\))') + #:assert isinstance(clause_name, str) + #:if clause_str is not None + #:set count = 0 + #:assert isinstance(clause_str, str) + #:assert clause_str[0] == '[' and clause_str[-1] == ']' + #:for c in clause_str + #:if c == '(' + #:set count = count + 1 + #:elif c == ')' + #:set count = count - 1 + #:endif + #:if c == ',' and count > 1 + #:stop 'Nested parentheses with comma inside is not supported. Incorrect clause: {}'.format(clause_str) + #:elif count < 0 + #:stop 'Missing parentheses. Incorrect clause: {}'.format(clause_str) + #:endif + #:endfor + #:set clause_str = re.sub(clause_regex, ';', clause_str) + #:set clause_list = [x.strip() for x in clause_str.strip('[]').split(';')] + $:ASSERT_LIST(clause_list, str) + #:set clause_str = clause_name + ', '.join(clause_list) + ') ' + #:else + #:set clause_str = '' + #:endif + $:clause_str +#:enddef + +#:def GEN_PARENTHESES_CLAUSE(clause_name, clause_str) + #:assert isinstance(clause_name, str) + #:if clause_str is not None + #:assert isinstance(clause_str, str) + #:set clause = clause_name + '(' + #:set clause_str = GEN_CLAUSE(clause, clause_str) + #:else + #:set clause_str = '' + #:endif + $:clause_str +#:enddef + +#:def GEN_PRIVATE_STR(private, initialized_values) + #:assert isinstance(initialized_values, bool) + #:if initialized_values == True + #:set private_val = GEN_PARENTHESES_CLAUSE('firstprivate', private) + #:else + #:set private_val = GEN_PARENTHESES_CLAUSE('private', private) + #:endif + $:private_val +#:enddef + +#:def GEN_REDUCTION_STR(reduction, reductionOp) + #:if reduction is not None and reductionOp is not None + #:assert isinstance(reduction, str) + #:assert isinstance(reductionOp, str) + #:assert reduction[0] == '[' and reduction[-1] == ']' + #:assert reductionOp[0] == '[' and reductionOp[-1] == ']' + #:set reduction = reduction.replace(' ', '') + #:set reduction = reduction[1:-1] + #:set reduction_list = reduction.split('],') + #:set reduction_list = [str + ']' for str in reduction_list] + #:assert all(str[0] == '[' and str[-1] == ']' for str in reduction_list) + + #:set reductionOp_list = [x.strip() for x in reductionOp.strip('[]').split(',')] + $:ASSERT_LIST(reduction_list, str) + $:ASSERT_LIST(reductionOp_list, str) + #:assert len(reduction_list) == len(reductionOp_list) + #:set reduction_val = '' + #:for i in range(len(reduction_list)) + #:set temp_clause = GEN_PARENTHESES_CLAUSE('reduction', reduction_list[i]).strip('\n') + #:set ind = temp_clause.find('reduction(') + len('reduction(') + #:set reduction_val = reduction_val.strip('\n') + temp_clause[:ind] + reductionOp_list[i] + ':' + temp_clause[ind:] + #:endfor + #:elif reduction is not None or reductionOp is not None + #:stop 'Cannot set the reduction list or reduction operation without setting the other' + #:else + #:set reduction_val = '' + #:endif + $:reduction_val +#:enddef + +#:def GEN_COLLAPSE_STR(collapse) + #:if collapse is not None + #:set collapse = int(collapse) + #:assert isinstance(collapse, int) + #:assert collapse > 1 + #:set collapse_val = 'collapse(' + str(collapse) + ') ' + #:else + #:set collapse_val = '' + #:endif + $:collapse_val +#:enddef + +#:def GEN_EXTRA_ARGS_STR(extraArgs) + #:if extraArgs is not None + #:assert isinstance(extraArgs, str) + #:set extraArgs_val = extraArgs + #:else + #:set extraArgs_val = '' + #:endif + $:extraArgs_val +#:enddef +! New line at end of file is required for FYPP \ No newline at end of file diff --git a/src/common/m_boundary_common.fpp b/src/common/m_boundary_common.fpp index f89278a86d..81d4bc8d65 100644 --- a/src/common/m_boundary_common.fpp +++ b/src/common/m_boundary_common.fpp @@ -26,9 +26,6 @@ module m_boundary_common type(scalar_field), dimension(:, :), allocatable :: bc_buffers $:GPU_DECLARE(create='[bc_buffers]') - type(scalar_field), dimension(1) :: jac_sf - $:GPU_DECLARE(create='[jac_sf]') - #ifdef MFC_MPI integer, dimension(1:3, -1:1) :: MPI_BC_TYPE_TYPE, MPI_BC_BUFFER_TYPE #endif @@ -91,59 +88,61 @@ contains if (bc_x%beg >= 0) then call s_mpi_sendrecv_variables_buffers(q_prim_vf, 1, -1, sys_size, pb_in, mv_in) else - $:GPU_PARALLEL_LOOP(collapse=2) - do l = 0, p - do k = 0, n - select case (int(bc_type(1, -1)%sf(0, k, l))) - case (BC_CHAR_SUP_OUTFLOW:BC_GHOST_EXTRAP) - call s_ghost_cell_extrapolation(q_prim_vf, 1, -1, k, l) - case (BC_REFLECTIVE) - call s_symmetry(q_prim_vf, 1, -1, k, l, pb_in, mv_in) - case (BC_PERIODIC) - call s_periodic(q_prim_vf, 1, -1, k, l, pb_in, mv_in) - case (BC_SLIP_WALL) - call s_slip_wall(q_prim_vf, 1, -1, k, l) - case (BC_NO_SLIP_WALL) - call s_no_slip_wall(q_prim_vf, 1, -1, k, l) - case (BC_DIRICHLET) - call s_dirichlet(q_prim_vf, 1, -1, k, l) - end select - - if (qbmm .and. (.not. polytropic) .and. & - (bc_type(1, -1)%sf(0, k, l) <= BC_GHOST_EXTRAP)) then - call s_qbmm_extrapolation(1, -1, k, l, pb_in, mv_in) - end if + #:call GPU_PARALLEL_LOOP(collapse=2) + do l = 0, p + do k = 0, n + select case (int(bc_type(1, -1)%sf(0, k, l))) + case (BC_CHAR_SUP_OUTFLOW:BC_GHOST_EXTRAP) + call s_ghost_cell_extrapolation(q_prim_vf, 1, -1, k, l) + case (BC_REFLECTIVE) + call s_symmetry(q_prim_vf, 1, -1, k, l, pb_in, mv_in) + case (BC_PERIODIC) + call s_periodic(q_prim_vf, 1, -1, k, l, pb_in, mv_in) + case (BC_SLIP_WALL) + call s_slip_wall(q_prim_vf, 1, -1, k, l) + case (BC_NO_SLIP_WALL) + call s_no_slip_wall(q_prim_vf, 1, -1, k, l) + case (BC_DIRICHLET) + call s_dirichlet(q_prim_vf, 1, -1, k, l) + end select + + if (qbmm .and. (.not. polytropic) .and. & + (bc_type(1, -1)%sf(0, k, l) <= BC_GHOST_EXTRAP)) then + call s_qbmm_extrapolation(1, -1, k, l, pb_in, mv_in) + end if + end do end do - end do + #:endcall GPU_PARALLEL_LOOP end if if (bc_x%end >= 0) then call s_mpi_sendrecv_variables_buffers(q_prim_vf, 1, 1, sys_size, pb_in, mv_in) else - $:GPU_PARALLEL_LOOP(collapse=2) - do l = 0, p - do k = 0, n - select case (int(bc_type(1, 1)%sf(0, k, l))) - case (BC_CHAR_SUP_OUTFLOW:BC_GHOST_EXTRAP) ! Ghost-cell extrap. BC at end - call s_ghost_cell_extrapolation(q_prim_vf, 1, 1, k, l) - case (BC_REFLECTIVE) - call s_symmetry(q_prim_vf, 1, 1, k, l, pb_in, mv_in) - case (BC_PERIODIC) - call s_periodic(q_prim_vf, 1, 1, k, l, pb_in, mv_in) - case (BC_SLIP_WALL) - call s_slip_wall(q_prim_vf, 1, 1, k, l) - case (BC_NO_SLIP_WALL) - call s_no_slip_wall(q_prim_vf, 1, 1, k, l) - case (BC_DIRICHLET) - call s_dirichlet(q_prim_vf, 1, 1, k, l) - end select - - if (qbmm .and. (.not. polytropic) .and. & - (bc_type(1, 1)%sf(0, k, l) <= BC_GHOST_EXTRAP)) then - call s_qbmm_extrapolation(1, 1, k, l, pb_in, mv_in) - end if + #:call GPU_PARALLEL_LOOP(collapse=2) + do l = 0, p + do k = 0, n + select case (int(bc_type(1, 1)%sf(0, k, l))) + case (BC_CHAR_SUP_OUTFLOW:BC_GHOST_EXTRAP) ! Ghost-cell extrap. BC at end + call s_ghost_cell_extrapolation(q_prim_vf, 1, 1, k, l) + case (BC_REFLECTIVE) + call s_symmetry(q_prim_vf, 1, 1, k, l, pb_in, mv_in) + case (BC_PERIODIC) + call s_periodic(q_prim_vf, 1, 1, k, l, pb_in, mv_in) + case (BC_SLIP_WALL) + call s_slip_wall(q_prim_vf, 1, 1, k, l) + case (BC_NO_SLIP_WALL) + call s_no_slip_wall(q_prim_vf, 1, 1, k, l) + case (BC_DIRICHLET) + call s_dirichlet(q_prim_vf, 1, 1, k, l) + end select + + if (qbmm .and. (.not. polytropic) .and. & + (bc_type(1, 1)%sf(0, k, l) <= BC_GHOST_EXTRAP)) then + call s_qbmm_extrapolation(1, 1, k, l, pb_in, mv_in) + end if + end do end do - end do + #:endcall GPU_PARALLEL_LOOP end if ! Population of Buffers in y-direction @@ -153,62 +152,64 @@ contains if (bc_y%beg >= 0) then call s_mpi_sendrecv_variables_buffers(q_prim_vf, 2, -1, sys_size, pb_in, mv_in) else - $:GPU_PARALLEL_LOOP(collapse=2) - do l = 0, p - do k = -buff_size, m + buff_size - select case (int(bc_type(2, -1)%sf(k, 0, l))) - case (BC_CHAR_SUP_OUTFLOW:BC_GHOST_EXTRAP) - call s_ghost_cell_extrapolation(q_prim_vf, 2, -1, k, l) - case (BC_AXIS) - call s_axis(q_prim_vf, pb_in, mv_in, k, l) - case (BC_REFLECTIVE) - call s_symmetry(q_prim_vf, 2, -1, k, l, pb_in, mv_in) - case (BC_PERIODIC) - call s_periodic(q_prim_vf, 2, -1, k, l, pb_in, mv_in) - case (BC_SLIP_WALL) - call s_slip_wall(q_prim_vf, 2, -1, k, l) - case (BC_NO_SLIP_WALL) - call s_no_slip_wall(q_prim_vf, 2, -1, k, l) - case (BC_DIRICHLET) - call s_dirichlet(q_prim_vf, 2, -1, k, l) - end select - - if (qbmm .and. (.not. polytropic) .and. & - (bc_type(2, -1)%sf(k, 0, l) <= BC_GHOST_EXTRAP) .and. & - (bc_type(2, -1)%sf(k, 0, l) /= BC_AXIS)) then - call s_qbmm_extrapolation(2, -1, k, l, pb_in, mv_in) - end if + #:call GPU_PARALLEL_LOOP(collapse=2) + do l = 0, p + do k = -buff_size, m + buff_size + select case (int(bc_type(2, -1)%sf(k, 0, l))) + case (BC_CHAR_SUP_OUTFLOW:BC_GHOST_EXTRAP) + call s_ghost_cell_extrapolation(q_prim_vf, 2, -1, k, l) + case (BC_AXIS) + call s_axis(q_prim_vf, pb_in, mv_in, k, l) + case (BC_REFLECTIVE) + call s_symmetry(q_prim_vf, 2, -1, k, l, pb_in, mv_in) + case (BC_PERIODIC) + call s_periodic(q_prim_vf, 2, -1, k, l, pb_in, mv_in) + case (BC_SLIP_WALL) + call s_slip_wall(q_prim_vf, 2, -1, k, l) + case (BC_NO_SLIP_WALL) + call s_no_slip_wall(q_prim_vf, 2, -1, k, l) + case (BC_DIRICHLET) + call s_dirichlet(q_prim_vf, 2, -1, k, l) + end select + + if (qbmm .and. (.not. polytropic) .and. & + (bc_type(2, -1)%sf(k, 0, l) <= BC_GHOST_EXTRAP) .and. & + (bc_type(2, -1)%sf(k, 0, l) /= BC_AXIS)) then + call s_qbmm_extrapolation(2, -1, k, l, pb_in, mv_in) + end if + end do end do - end do + #:endcall GPU_PARALLEL_LOOP end if if (bc_y%end >= 0) then call s_mpi_sendrecv_variables_buffers(q_prim_vf, 2, 1, sys_size, pb_in, mv_in) else - $:GPU_PARALLEL_LOOP(collapse=2) - do l = 0, p - do k = -buff_size, m + buff_size - select case (int(bc_type(2, 1)%sf(k, 0, l))) - case (BC_CHAR_SUP_OUTFLOW:BC_GHOST_EXTRAP) - call s_ghost_cell_extrapolation(q_prim_vf, 2, 1, k, l) - case (BC_REFLECTIVE) - call s_symmetry(q_prim_vf, 2, 1, k, l, pb_in, mv_in) - case (BC_PERIODIC) - call s_periodic(q_prim_vf, 2, 1, k, l, pb_in, mv_in) - case (BC_SLIP_WALL) - call s_slip_wall(q_prim_vf, 2, 1, k, l) - case (BC_NO_SLIP_WALL) - call s_no_slip_wall(q_prim_vf, 2, 1, k, l) - case (BC_DIRICHLET) - call s_dirichlet(q_prim_vf, 2, 1, k, l) - end select - - if (qbmm .and. (.not. polytropic) .and. & - (bc_type(2, 1)%sf(k, 0, l) <= BC_GHOST_EXTRAP)) then - call s_qbmm_extrapolation(2, 1, k, l, pb_in, mv_in) - end if + #:call GPU_PARALLEL_LOOP(collapse=2) + do l = 0, p + do k = -buff_size, m + buff_size + select case (int(bc_type(2, 1)%sf(k, 0, l))) + case (BC_CHAR_SUP_OUTFLOW:BC_GHOST_EXTRAP) + call s_ghost_cell_extrapolation(q_prim_vf, 2, 1, k, l) + case (BC_REFLECTIVE) + call s_symmetry(q_prim_vf, 2, 1, k, l, pb_in, mv_in) + case (BC_PERIODIC) + call s_periodic(q_prim_vf, 2, 1, k, l, pb_in, mv_in) + case (BC_SLIP_WALL) + call s_slip_wall(q_prim_vf, 2, 1, k, l) + case (BC_NO_SLIP_WALL) + call s_no_slip_wall(q_prim_vf, 2, 1, k, l) + case (BC_DIRICHLET) + call s_dirichlet(q_prim_vf, 2, 1, k, l) + end select + + if (qbmm .and. (.not. polytropic) .and. & + (bc_type(2, 1)%sf(k, 0, l) <= BC_GHOST_EXTRAP)) then + call s_qbmm_extrapolation(2, 1, k, l, pb_in, mv_in) + end if + end do end do - end do + #:endcall GPU_PARALLEL_LOOP end if ! Population of Buffers in z-direction @@ -218,65 +219,67 @@ contains if (bc_z%beg >= 0) then call s_mpi_sendrecv_variables_buffers(q_prim_vf, 3, -1, sys_size, pb_in, mv_in) else - $:GPU_PARALLEL_LOOP(collapse=2) - do l = -buff_size, n + buff_size - do k = -buff_size, m + buff_size - select case (int(bc_type(3, -1)%sf(k, l, 0))) - case (BC_CHAR_SUP_OUTFLOW:BC_GHOST_EXTRAP) - call s_ghost_cell_extrapolation(q_prim_vf, 3, -1, k, l) - case (BC_REFLECTIVE) - call s_symmetry(q_prim_vf, 3, -1, k, l, pb_in, mv_in) - case (BC_PERIODIC) - call s_periodic(q_prim_vf, 3, -1, k, l, pb_in, mv_in) - case (BC_SLIP_WALL) - call s_slip_wall(q_prim_vf, 3, -1, k, l) - case (BC_NO_SLIP_WALL) - call s_no_slip_wall(q_prim_vf, 3, -1, k, l) - case (BC_DIRICHLET) - call s_dirichlet(q_prim_vf, 3, -1, k, l) - end select - - if (qbmm .and. (.not. polytropic) .and. & - (bc_type(3, -1)%sf(k, l, 0) <= BC_GHOST_EXTRAP)) then - call s_qbmm_extrapolation(3, -1, k, l, pb_in, mv_in) - end if + #:call GPU_PARALLEL_LOOP(collapse=2) + do l = -buff_size, n + buff_size + do k = -buff_size, m + buff_size + select case (int(bc_type(3, -1)%sf(k, l, 0))) + case (BC_CHAR_SUP_OUTFLOW:BC_GHOST_EXTRAP) + call s_ghost_cell_extrapolation(q_prim_vf, 3, -1, k, l) + case (BC_REFLECTIVE) + call s_symmetry(q_prim_vf, 3, -1, k, l, pb_in, mv_in) + case (BC_PERIODIC) + call s_periodic(q_prim_vf, 3, -1, k, l, pb_in, mv_in) + case (BC_SLIP_WALL) + call s_slip_wall(q_prim_vf, 3, -1, k, l) + case (BC_NO_SLIP_WALL) + call s_no_slip_wall(q_prim_vf, 3, -1, k, l) + case (BC_DIRICHLET) + call s_dirichlet(q_prim_vf, 3, -1, k, l) + end select + + if (qbmm .and. (.not. polytropic) .and. & + (bc_type(3, -1)%sf(k, l, 0) <= BC_GHOST_EXTRAP)) then + call s_qbmm_extrapolation(3, -1, k, l, pb_in, mv_in) + end if + end do end do - end do + #:endcall GPU_PARALLEL_LOOP end if if (bc_z%end >= 0) then call s_mpi_sendrecv_variables_buffers(q_prim_vf, 3, 1, sys_size, pb_in, mv_in) else - $:GPU_PARALLEL_LOOP(collapse=2) - do l = -buff_size, n + buff_size - do k = -buff_size, m + buff_size - select case (int(bc_type(3, 1)%sf(k, l, 0))) - case (BC_CHAR_SUP_OUTFLOW:BC_GHOST_EXTRAP) - call s_ghost_cell_extrapolation(q_prim_vf, 3, 1, k, l) - case (BC_REFLECTIVE) - call s_symmetry(q_prim_vf, 3, 1, k, l, pb_in, mv_in) - case (BC_PERIODIC) - call s_periodic(q_prim_vf, 3, 1, k, l, pb_in, mv_in) - case (BC_SlIP_WALL) - call s_slip_wall(q_prim_vf, 3, 1, k, l) - case (BC_NO_SLIP_WALL) - call s_no_slip_wall(q_prim_vf, 3, 1, k, l) - case (BC_DIRICHLET) - call s_dirichlet(q_prim_vf, 3, 1, k, l) - end select - - if (qbmm .and. (.not. polytropic) .and. & - (bc_type(3, 1)%sf(k, l, 0) <= BC_GHOST_EXTRAP)) then - call s_qbmm_extrapolation(3, 1, k, l, pb_in, mv_in) - end if + #:call GPU_PARALLEL_LOOP(collapse=2) + do l = -buff_size, n + buff_size + do k = -buff_size, m + buff_size + select case (int(bc_type(3, 1)%sf(k, l, 0))) + case (BC_CHAR_SUP_OUTFLOW:BC_GHOST_EXTRAP) + call s_ghost_cell_extrapolation(q_prim_vf, 3, 1, k, l) + case (BC_REFLECTIVE) + call s_symmetry(q_prim_vf, 3, 1, k, l, pb_in, mv_in) + case (BC_PERIODIC) + call s_periodic(q_prim_vf, 3, 1, k, l, pb_in, mv_in) + case (BC_SlIP_WALL) + call s_slip_wall(q_prim_vf, 3, 1, k, l) + case (BC_NO_SLIP_WALL) + call s_no_slip_wall(q_prim_vf, 3, 1, k, l) + case (BC_DIRICHLET) + call s_dirichlet(q_prim_vf, 3, 1, k, l) + end select + + if (qbmm .and. (.not. polytropic) .and. & + (bc_type(3, 1)%sf(k, l, 0) <= BC_GHOST_EXTRAP)) then + call s_qbmm_extrapolation(3, 1, k, l, pb_in, mv_in) + end if + end do end do - end do + #:endcall GPU_PARALLEL_LOOP end if ! END: Population of Buffers in z-direction end subroutine s_populate_variables_buffers - pure subroutine s_ghost_cell_extrapolation(q_prim_vf, bc_dir, bc_loc, k, l) + subroutine s_ghost_cell_extrapolation(q_prim_vf, bc_dir, bc_loc, k, l) $:GPU_ROUTINE(function_name='s_ghost_cell_extrapolation', & & parallelism='[seq]', cray_inline=True) type(scalar_field), dimension(sys_size), intent(inout) :: q_prim_vf @@ -337,7 +340,7 @@ contains end subroutine s_ghost_cell_extrapolation - pure subroutine s_symmetry(q_prim_vf, bc_dir, bc_loc, k, l, pb_in, mv_in) + subroutine s_symmetry(q_prim_vf, bc_dir, bc_loc, k, l, pb_in, mv_in) $:GPU_ROUTINE(parallelism='[seq]') type(scalar_field), dimension(sys_size), intent(inout) :: q_prim_vf real(wp), optional, dimension(idwbuff(1)%beg:, idwbuff(2)%beg:, idwbuff(3)%beg:, 1:, 1:), intent(inout) :: pb_in, mv_in @@ -597,7 +600,7 @@ contains end subroutine s_symmetry - pure subroutine s_periodic(q_prim_vf, bc_dir, bc_loc, k, l, pb_in, mv_in) + subroutine s_periodic(q_prim_vf, bc_dir, bc_loc, k, l, pb_in, mv_in) $:GPU_ROUTINE(parallelism='[seq]') type(scalar_field), dimension(sys_size), intent(inout) :: q_prim_vf real(wp), optional, dimension(idwbuff(1)%beg:, idwbuff(2)%beg:, idwbuff(3)%beg:, 1:, 1:), intent(inout) :: pb_in, mv_in @@ -736,7 +739,7 @@ contains end subroutine s_periodic - pure subroutine s_axis(q_prim_vf, pb_in, mv_in, k, l) + subroutine s_axis(q_prim_vf, pb_in, mv_in, k, l) $:GPU_ROUTINE(parallelism='[seq]') type(scalar_field), dimension(sys_size), intent(inout) :: q_prim_vf real(wp), dimension(idwbuff(1)%beg:, idwbuff(2)%beg:, idwbuff(3)%beg:, 1:, 1:), intent(inout) :: pb_in, mv_in @@ -795,7 +798,7 @@ contains end subroutine s_axis - pure subroutine s_slip_wall(q_prim_vf, bc_dir, bc_loc, k, l) + subroutine s_slip_wall(q_prim_vf, bc_dir, bc_loc, k, l) $:GPU_ROUTINE(function_name='s_slip_wall',parallelism='[seq]', & & cray_inline=True) type(scalar_field), dimension(sys_size), intent(inout) :: q_prim_vf @@ -886,7 +889,7 @@ contains end subroutine s_slip_wall - pure subroutine s_no_slip_wall(q_prim_vf, bc_dir, bc_loc, k, l) + subroutine s_no_slip_wall(q_prim_vf, bc_dir, bc_loc, k, l) $:GPU_ROUTINE(function_name='s_no_slip_wall',parallelism='[seq]', & & cray_inline=True) @@ -1014,7 +1017,7 @@ contains end subroutine s_no_slip_wall - pure subroutine s_dirichlet(q_prim_vf, bc_dir, bc_loc, k, l) + subroutine s_dirichlet(q_prim_vf, bc_dir, bc_loc, k, l) $:GPU_ROUTINE(function_name='s_dirichlet',parallelism='[seq]', & & cray_inline=True) type(scalar_field), dimension(sys_size), intent(inout) :: q_prim_vf @@ -1079,7 +1082,7 @@ contains end subroutine s_dirichlet - pure subroutine s_qbmm_extrapolation(bc_dir, bc_loc, k, l, pb_in, mv_in) + subroutine s_qbmm_extrapolation(bc_dir, bc_loc, k, l, pb_in, mv_in) $:GPU_ROUTINE(parallelism='[seq]') real(wp), optional, dimension(idwbuff(1)%beg:, idwbuff(2)%beg:, idwbuff(3)%beg:, 1:, 1:), intent(inout) :: pb_in, mv_in integer, intent(in) :: bc_dir, bc_loc @@ -1162,37 +1165,39 @@ contains if (bc_x%beg >= 0) then call s_mpi_sendrecv_variables_buffers(c_divs, 1, -1, num_dims + 1) else - $:GPU_PARALLEL_LOOP(collapse=2) - do l = 0, p - do k = 0, n - select case (bc_type(1, -1)%sf(0, k, l)) - case (BC_PERIODIC) - call s_color_function_periodic(c_divs, 1, -1, k, l) - case (BC_REFLECTIVE) - call s_color_function_reflective(c_divs, 1, -1, k, l) - case default - call s_color_function_ghost_cell_extrapolation(c_divs, 1, -1, k, l) - end select - end do - end do + #:call GPU_PARALLEL_LOOP(collapse=2) + do l = 0, p + do k = 0, n + select case (bc_type(1, -1)%sf(0, k, l)) + case (BC_PERIODIC) + call s_color_function_periodic(c_divs, 1, -1, k, l) + case (BC_REFLECTIVE) + call s_color_function_reflective(c_divs, 1, -1, k, l) + case default + call s_color_function_ghost_cell_extrapolation(c_divs, 1, -1, k, l) + end select + end do + end do + #:endcall GPU_PARALLEL_LOOP end if if (bc_x%end >= 0) then call s_mpi_sendrecv_variables_buffers(c_divs, 1, 1, num_dims + 1) else - $:GPU_PARALLEL_LOOP(collapse=2) - do l = 0, p - do k = 0, n - select case (bc_type(1, 1)%sf(0, k, l)) - case (BC_PERIODIC) - call s_color_function_periodic(c_divs, 1, 1, k, l) - case (BC_REFLECTIVE) - call s_color_function_reflective(c_divs, 1, 1, k, l) - case default - call s_color_function_ghost_cell_extrapolation(c_divs, 1, 1, k, l) - end select - end do - end do + #:call GPU_PARALLEL_LOOP(collapse=2) + do l = 0, p + do k = 0, n + select case (bc_type(1, 1)%sf(0, k, l)) + case (BC_PERIODIC) + call s_color_function_periodic(c_divs, 1, 1, k, l) + case (BC_REFLECTIVE) + call s_color_function_reflective(c_divs, 1, 1, k, l) + case default + call s_color_function_ghost_cell_extrapolation(c_divs, 1, 1, k, l) + end select + end do + end do + #:endcall GPU_PARALLEL_LOOP end if if (n == 0) return @@ -1201,37 +1206,39 @@ contains if (bc_y%beg >= 0) then call s_mpi_sendrecv_variables_buffers(c_divs, 2, -1, num_dims + 1) else - $:GPU_PARALLEL_LOOP(collapse=2) - do l = 0, p - do k = -buff_size, m + buff_size - select case (bc_type(2, -1)%sf(k, 0, l)) - case (BC_PERIODIC) - call s_color_function_periodic(c_divs, 2, -1, k, l) - case (BC_REFLECTIVE) - call s_color_function_reflective(c_divs, 2, -1, k, l) - case default - call s_color_function_ghost_cell_extrapolation(c_divs, 2, -1, k, l) - end select - end do - end do + #:call GPU_PARALLEL_LOOP(collapse=2) + do l = 0, p + do k = -buff_size, m + buff_size + select case (bc_type(2, -1)%sf(k, 0, l)) + case (BC_PERIODIC) + call s_color_function_periodic(c_divs, 2, -1, k, l) + case (BC_REFLECTIVE) + call s_color_function_reflective(c_divs, 2, -1, k, l) + case default + call s_color_function_ghost_cell_extrapolation(c_divs, 2, -1, k, l) + end select + end do + end do + #:endcall GPU_PARALLEL_LOOP end if if (bc_y%end >= 0) then call s_mpi_sendrecv_variables_buffers(c_divs, 2, 1, num_dims + 1) else - $:GPU_PARALLEL_LOOP(collapse=2) - do l = 0, p - do k = -buff_size, m + buff_size - select case (bc_type(2, 1)%sf(k, 0, l)) - case (BC_PERIODIC) - call s_color_function_periodic(c_divs, 2, 1, k, l) - case (BC_REFLECTIVE) - call s_color_function_reflective(c_divs, 2, 1, k, l) - case default - call s_color_function_ghost_cell_extrapolation(c_divs, 2, 1, k, l) - end select - end do - end do + #:call GPU_PARALLEL_LOOP(collapse=2) + do l = 0, p + do k = -buff_size, m + buff_size + select case (bc_type(2, 1)%sf(k, 0, l)) + case (BC_PERIODIC) + call s_color_function_periodic(c_divs, 2, 1, k, l) + case (BC_REFLECTIVE) + call s_color_function_reflective(c_divs, 2, 1, k, l) + case default + call s_color_function_ghost_cell_extrapolation(c_divs, 2, 1, k, l) + end select + end do + end do + #:endcall GPU_PARALLEL_LOOP end if if (p == 0) return @@ -1240,41 +1247,43 @@ contains if (bc_z%beg >= 0) then call s_mpi_sendrecv_variables_buffers(c_divs, 3, -1, num_dims + 1) else - $:GPU_PARALLEL_LOOP(collapse=2) - do l = -buff_size, n + buff_size - do k = -buff_size, m + buff_size - select case (bc_type(3, -1)%sf(k, l, 0)) - case (BC_PERIODIC) - call s_color_function_periodic(c_divs, 3, -1, k, l) - case (BC_REFLECTIVE) - call s_color_function_reflective(c_divs, 3, -1, k, l) - case default - call s_color_function_ghost_cell_extrapolation(c_divs, 3, -1, k, l) - end select - end do - end do + #:call GPU_PARALLEL_LOOP(collapse=2) + do l = -buff_size, n + buff_size + do k = -buff_size, m + buff_size + select case (bc_type(3, -1)%sf(k, l, 0)) + case (BC_PERIODIC) + call s_color_function_periodic(c_divs, 3, -1, k, l) + case (BC_REFLECTIVE) + call s_color_function_reflective(c_divs, 3, -1, k, l) + case default + call s_color_function_ghost_cell_extrapolation(c_divs, 3, -1, k, l) + end select + end do + end do + #:endcall GPU_PARALLEL_LOOP end if if (bc_z%end >= 0) then call s_mpi_sendrecv_variables_buffers(c_divs, 3, 1, num_dims + 1) else - $:GPU_PARALLEL_LOOP(collapse=2) - do l = -buff_size, n + buff_size - do k = -buff_size, m + buff_size - select case (bc_type(3, 1)%sf(k, l, 0)) - case (BC_PERIODIC) - call s_color_function_periodic(c_divs, 3, 1, k, l) - case (BC_REFLECTIVE) - call s_color_function_reflective(c_divs, 3, 1, k, l) - case default - call s_color_function_ghost_cell_extrapolation(c_divs, 3, 1, k, l) - end select - end do - end do + #:call GPU_PARALLEL_LOOP(collapse=2) + do l = -buff_size, n + buff_size + do k = -buff_size, m + buff_size + select case (bc_type(3, 1)%sf(k, l, 0)) + case (BC_PERIODIC) + call s_color_function_periodic(c_divs, 3, 1, k, l) + case (BC_REFLECTIVE) + call s_color_function_reflective(c_divs, 3, 1, k, l) + case default + call s_color_function_ghost_cell_extrapolation(c_divs, 3, 1, k, l) + end select + end do + end do + #:endcall GPU_PARALLEL_LOOP end if end subroutine s_populate_capillary_buffers - pure subroutine s_color_function_periodic(c_divs, bc_dir, bc_loc, k, l) + subroutine s_color_function_periodic(c_divs, bc_dir, bc_loc, k, l) $:GPU_ROUTINE(function_name='s_color_function_periodic', & & parallelism='[seq]', cray_inline=True) type(scalar_field), dimension(num_dims + 1), intent(inout) :: c_divs @@ -1329,7 +1338,7 @@ contains end subroutine s_color_function_periodic - pure subroutine s_color_function_reflective(c_divs, bc_dir, bc_loc, k, l) + subroutine s_color_function_reflective(c_divs, bc_dir, bc_loc, k, l) $:GPU_ROUTINE(function_name='s_color_function_reflective', & & parallelism='[seq]', cray_inline=True) type(scalar_field), dimension(num_dims + 1), intent(inout) :: c_divs @@ -1408,7 +1417,7 @@ contains end subroutine s_color_function_reflective - pure subroutine s_color_function_ghost_cell_extrapolation(c_divs, bc_dir, bc_loc, k, l) + subroutine s_color_function_ghost_cell_extrapolation(c_divs, bc_dir, bc_loc, k, l) $:GPU_ROUTINE(function_name='s_color_function_ghost_cell_extrapolation', & & parallelism='[seq]', cray_inline=True) type(scalar_field), dimension(num_dims + 1), intent(inout) :: c_divs @@ -1463,63 +1472,63 @@ contains end subroutine s_color_function_ghost_cell_extrapolation - impure subroutine s_populate_F_igr_buffers(bc_type, jac) + impure subroutine s_populate_F_igr_buffers(bc_type, jac_sf) type(integer_field), dimension(1:num_dims, -1:1), intent(in) :: bc_type - real(wp), target, dimension(idwbuff(1)%beg:, idwbuff(2)%beg:, idwbuff(3)%beg:), intent(inout) :: jac + type(scalar_field), dimension(1:), intent(inout) :: jac_sf integer :: j, k, l - #:call GPU_PARALLEL() - jac_sf(1)%sf => jac - #:endcall GPU_PARALLEL - if (bc_x%beg >= 0) then call s_mpi_sendrecv_variables_buffers(jac_sf, 1, -1, 1) else - $:GPU_PARALLEL_LOOP(collapse=2) - do l = 0, p - do k = 0, n - select case (bc_type(1, -1)%sf(0, k, l)) - case (BC_PERIODIC) - do j = 1, buff_size - jac(-j, k, l) = jac(m - j + 1, k, l) - end do - case (BC_REFLECTIVE) - do j = 1, buff_size - jac(-j, k, l) = jac(j - 1, k, l) - end do - case default - do j = 1, buff_size - jac(-j, k, l) = jac(0, k, l) - end do - end select + #:call GPU_PARALLEL_LOOP(collapse=2) + do l = 0, p + do k = 0, n + select case (bc_type(1, -1)%sf(0, k, l)) + case (BC_PERIODIC) + do j = 1, buff_size + jac_sf(1)%sf(-j, k, l) = jac_sf(1)%sf(m - j + 1, k, l) + end do + case (BC_REFLECTIVE) + do j = 1, buff_size + jac_sf(1)%sf(-j, k, l) = jac_sf(1)%sf(j - 1, k, l) + end do + case default + do j = 1, buff_size + jac_sf(1)%sf(-j, k, l) = jac_sf(1)%sf(0, k, l) + end do + end select + end do end do - end do + #:endcall GPU_PARALLEL_LOOP + end if if (bc_x%end >= 0) then call s_mpi_sendrecv_variables_buffers(jac_sf, 1, 1, 1) else - $:GPU_PARALLEL_LOOP(collapse=2) - do l = 0, p - do k = 0, n - select case (bc_type(1, 1)%sf(0, k, l)) - case (BC_PERIODIC) - do j = 1, buff_size - jac(m + j, k, l) = jac(j - 1, k, l) - end do - case (BC_REFLECTIVE) - do j = 1, buff_size - jac(m + j, k, l) = jac(m - (j - 1), k, l) - end do - case default - do j = 1, buff_size - jac(m + j, k, l) = jac(m, k, l) - end do - end select + #:call GPU_PARALLEL_LOOP(collapse=2) + do l = 0, p + do k = 0, n + select case (bc_type(1, 1)%sf(0, k, l)) + case (BC_PERIODIC) + do j = 1, buff_size + jac_sf(1)%sf(m + j, k, l) = jac_sf(1)%sf(j - 1, k, l) + end do + case (BC_REFLECTIVE) + do j = 1, buff_size + jac_sf(1)%sf(m + j, k, l) = jac_sf(1)%sf(m - (j - 1), k, l) + end do + case default + do j = 1, buff_size + jac_sf(1)%sf(m + j, k, l) = jac_sf(1)%sf(m, k, l) + end do + end select + end do end do - end do + #:endcall GPU_PARALLEL_LOOP + end if if (n == 0) then @@ -1527,49 +1536,52 @@ contains else if (bc_y%beg >= 0) then call s_mpi_sendrecv_variables_buffers(jac_sf, 2, -1, 1) else - $:GPU_PARALLEL_LOOP(collapse=2) - do l = 0, p - do k = idwbuff(1)%beg, idwbuff(1)%end - select case (bc_type(2, -1)%sf(k, 0, l)) - case (BC_PERIODIC) - do j = 1, buff_size - jac(k, -j, l) = jac(k, n - j + 1, l) - end do - case (BC_REFLECTIVE) - do j = 1, buff_size - jac(k, -j, l) = jac(k, j - 1, l) - end do - case default - do j = 1, buff_size - jac(k, -j, l) = jac(k, 0, l) - end do - end select + #:call GPU_PARALLEL_LOOP(collapse=2) + do l = 0, p + do k = idwbuff(1)%beg, idwbuff(1)%end + select case (bc_type(2, -1)%sf(k, 0, l)) + case (BC_PERIODIC) + do j = 1, buff_size + jac_sf(1)%sf(k, -j, l) = jac_sf(1)%sf(k, n - j + 1, l) + end do + case (BC_REFLECTIVE) + do j = 1, buff_size + jac_sf(1)%sf(k, -j, l) = jac_sf(1)%sf(k, j - 1, l) + end do + case default + do j = 1, buff_size + jac_sf(1)%sf(k, -j, l) = jac_sf(1)%sf(k, 0, l) + end do + end select + end do end do - end do + #:endcall GPU_PARALLEL_LOOP + end if if (bc_y%end >= 0) then call s_mpi_sendrecv_variables_buffers(jac_sf, 2, 1, 1) else - $:GPU_PARALLEL_LOOP(collapse=2) - do l = 0, p - do k = idwbuff(1)%beg, idwbuff(1)%end - select case (bc_type(2, 1)%sf(k, 0, l)) - case (BC_PERIODIC) - do j = 1, buff_size - jac(k, n + j, l) = jac(k, j - 1, l) - end do - case (BC_REFLECTIVE) - do j = 1, buff_size - jac(k, n + j, l) = jac(k, n - (j - 1), l) - end do - case default - do j = 1, buff_size - jac(k, n + j, l) = jac(k, n, l) - end do - end select + #:call GPU_PARALLEL_LOOP(collapse=2) + do l = 0, p + do k = idwbuff(1)%beg, idwbuff(1)%end + select case (bc_type(2, 1)%sf(k, 0, l)) + case (BC_PERIODIC) + do j = 1, buff_size + jac_sf(1)%sf(k, n + j, l) = jac_sf(1)%sf(k, j - 1, l) + end do + case (BC_REFLECTIVE) + do j = 1, buff_size + jac_sf(1)%sf(k, n + j, l) = jac_sf(1)%sf(k, n - (j - 1), l) + end do + case default + do j = 1, buff_size + jac_sf(1)%sf(k, n + j, l) = jac_sf(1)%sf(k, n, l) + end do + end select + end do end do - end do + #:endcall GPU_PARALLEL_LOOP end if if (p == 0) then @@ -1577,49 +1589,51 @@ contains else if (bc_z%beg >= 0) then call s_mpi_sendrecv_variables_buffers(jac_sf, 3, -1, 1) else - $:GPU_PARALLEL_LOOP(collapse=2) - do l = idwbuff(2)%beg, idwbuff(2)%end - do k = idwbuff(1)%beg, idwbuff(1)%end - select case (bc_type(3, -1)%sf(k, l, 0)) - case (BC_PERIODIC) - do j = 1, buff_size - jac(k, l, -j) = jac(k, l, p - j + 1) - end do - case (BC_REFLECTIVE) - do j = 1, buff_size - jac(k, l, -j) = jac(k, l, j - 1) - end do - case default - do j = 1, buff_size - jac(k, l, -j) = jac(k, l, 0) - end do - end select + #:call GPU_PARALLEL_LOOP(collapse=2) + do l = idwbuff(2)%beg, idwbuff(2)%end + do k = idwbuff(1)%beg, idwbuff(1)%end + select case (bc_type(3, -1)%sf(k, l, 0)) + case (BC_PERIODIC) + do j = 1, buff_size + jac_sf(1)%sf(k, l, -j) = jac_sf(1)%sf(k, l, p - j + 1) + end do + case (BC_REFLECTIVE) + do j = 1, buff_size + jac_sf(1)%sf(k, l, -j) = jac_sf(1)%sf(k, l, j - 1) + end do + case default + do j = 1, buff_size + jac_sf(1)%sf(k, l, -j) = jac_sf(1)%sf(k, l, 0) + end do + end select + end do end do - end do + #:endcall GPU_PARALLEL_LOOP end if if (bc_z%end >= 0) then call s_mpi_sendrecv_variables_buffers(jac_sf, 3, 1, 1) else - $:GPU_PARALLEL_LOOP(collapse=2) - do l = idwbuff(2)%beg, idwbuff(2)%end - do k = idwbuff(1)%beg, idwbuff(1)%end - select case (bc_type(3, 1)%sf(k, l, 0)) - case (BC_PERIODIC) - do j = 1, buff_size - jac(k, l, p + j) = jac(k, l, j - 1) - end do - case (BC_REFLECTIVE) - do j = 1, buff_size - jac(k, l, p + j) = jac(k, l, p - (j - 1)) - end do - case default - do j = 1, buff_size - jac(k, l, p + j) = jac(k, l, p) - end do - end select + #:call GPU_PARALLEL_LOOP(collapse=2) + do l = idwbuff(2)%beg, idwbuff(2)%end + do k = idwbuff(1)%beg, idwbuff(1)%end + select case (bc_type(3, 1)%sf(k, l, 0)) + case (BC_PERIODIC) + do j = 1, buff_size + jac_sf(1)%sf(k, l, p + j) = jac_sf(1)%sf(k, l, j - 1) + end do + case (BC_REFLECTIVE) + do j = 1, buff_size + jac_sf(1)%sf(k, l, p + j) = jac_sf(1)%sf(k, l, p - (j - 1)) + end do + case default + do j = 1, buff_size + jac_sf(1)%sf(k, l, p + j) = jac_sf(1)%sf(k, l, p) + end do + end select + end do end do - end do + #:endcall GPU_PARALLEL_LOOP end if end subroutine s_populate_F_igr_buffers diff --git a/src/common/m_chemistry.fpp b/src/common/m_chemistry.fpp index 35067688f8..4ba51e9564 100644 --- a/src/common/m_chemistry.fpp +++ b/src/common/m_chemistry.fpp @@ -19,6 +19,13 @@ module m_chemistry implicit none + #:block DEF_AMD + real(wp) :: molecular_weights_nonparameter(10) = & + (/2.016, 1.008, 15.999, 31.998, 17.007, 18.015, 33.006, & + 34.014, 39.95, 28.014/) + $:GPU_DECLARE(create='[molecular_weights_nonparameter]') + #:endblock DEF_AMD + type(int_bounds_info) :: isc1, isc2, isc3 $:GPU_DECLARE(create='[isc1, isc2, isc3]') integer, dimension(3) :: offsets @@ -122,33 +129,37 @@ contains real(wp), dimension(num_species) :: Ys real(wp), dimension(num_species) :: omega - $:GPU_PARALLEL_LOOP(collapse=3, private='[Ys, omega]') - do z = bounds(3)%beg, bounds(3)%end - do y = bounds(2)%beg, bounds(2)%end - do x = bounds(1)%beg, bounds(1)%end + #:call GPU_PARALLEL_LOOP(collapse=3, private='[Ys, omega, T]') + do z = bounds(3)%beg, bounds(3)%end + do y = bounds(2)%beg, bounds(2)%end + do x = bounds(1)%beg, bounds(1)%end - $:GPU_LOOP(parallelism='[seq]') - do eqn = chemxb, chemxe - Ys(eqn - chemxb + 1) = q_prim_qp(eqn)%sf(x, y, z) - end do - - rho = q_cons_qp(contxe)%sf(x, y, z) - T = q_T_sf%sf(x, y, z) + $:GPU_LOOP(parallelism='[seq]') + do eqn = chemxb, chemxe + Ys(eqn - chemxb + 1) = q_prim_qp(eqn)%sf(x, y, z) + end do - call get_net_production_rates(rho, T, Ys, omega) + rho = q_cons_qp(contxe)%sf(x, y, z) + T = q_T_sf%sf(x, y, z) - $:GPU_LOOP(parallelism='[seq]') - do eqn = chemxb, chemxe + call get_net_production_rates(rho, T, Ys, omega) - omega_m = molecular_weights(eqn - chemxb + 1)*omega(eqn - chemxb + 1) + $:GPU_LOOP(parallelism='[seq]') + do eqn = chemxb, chemxe + #:block UNDEF_AMD + omega_m = molecular_weights(eqn - chemxb + 1)*omega(eqn - chemxb + 1) + #:endblock UNDEF_AMD + #:block DEF_AMD + omega_m = molecular_weights_nonparameter(eqn - chemxb + 1)*omega(eqn - chemxb + 1) + #:endblock DEF_AMD + rhs_vf(eqn)%sf(x, y, z) = rhs_vf(eqn)%sf(x, y, z) + omega_m - rhs_vf(eqn)%sf(x, y, z) = rhs_vf(eqn)%sf(x, y, z) + omega_m + end do end do - end do end do - end do + #:endcall GPU_PARALLEL_LOOP end subroutine s_compute_chemistry_reaction_flux @@ -180,116 +191,114 @@ contains offsets = 0 offsets(idir) = 1 - $:GPU_PARALLEL_LOOP(collapse=3, private='[Ys_L, Ys_R, Ys_cell, & - & Xs_L, Xs_R, mass_diffusivities_mixavg1, mass_diffusivities_mixavg2, & - & mass_diffusivities_mixavg_Cell, h_l, h_r, Xs_cell, h_k, & - & dXk_dxi,Mass_Diffu_Flux]', copyin='[offsets]') - do z = isc3%beg, isc3%end - do y = isc2%beg, isc2%end - do x = isc1%beg, isc1%end - ! Calculate grid spacing using direction-based indexing - select case (idir) - case (1) - grid_spacing = x_cc(x + 1) - x_cc(x) - case (2) - grid_spacing = y_cc(y + 1) - y_cc(y) - case (3) - grid_spacing = z_cc(z + 1) - z_cc(z) - end select - - ! Extract species mass fractions - $:GPU_LOOP(parallelism='[seq]') - do i = chemxb, chemxe - Ys_L(i - chemxb + 1) = q_prim_qp(i)%sf(x, y, z) - Ys_R(i - chemxb + 1) = q_prim_qp(i)%sf(x + offsets(1), y + offsets(2), z + offsets(3)) - Ys_cell(i - chemxb + 1) = 0.5_wp*(Ys_L(i - chemxb + 1) + Ys_R(i - chemxb + 1)) - end do - - ! Calculate molecular weights and mole fractions - call get_mixture_molecular_weight(Ys_L, MW_L) - call get_mixture_molecular_weight(Ys_R, MW_R) - MW_cell = 0.5_wp*(MW_L + MW_R) - - call get_mole_fractions(MW_L, Ys_L, Xs_L) - call get_mole_fractions(MW_R, Ys_R, Xs_R) - - ! Calculate gas constants and thermodynamic properties - Rgas_L = gas_constant/MW_L - Rgas_R = gas_constant/MW_R - - P_L = q_prim_qp(E_idx)%sf(x, y, z) - P_R = q_prim_qp(E_idx)%sf(x + offsets(1), y + offsets(2), z + offsets(3)) - - rho_L = q_prim_qp(1)%sf(x, y, z) - rho_R = q_prim_qp(1)%sf(x + offsets(1), y + offsets(2), z + offsets(3)) - - T_L = P_L/rho_L/Rgas_L - T_R = P_R/rho_R/Rgas_R - - rho_cell = 0.5_wp*(rho_L + rho_R) - dT_dxi = (T_R - T_L)/grid_spacing - - ! Get transport properties - call get_species_mass_diffusivities_mixavg(P_L, T_L, Ys_L, mass_diffusivities_mixavg1) - call get_species_mass_diffusivities_mixavg(P_R, T_R, Ys_R, mass_diffusivities_mixavg2) - - call get_mixture_thermal_conductivity_mixavg(T_L, Ys_L, lambda_L) - call get_mixture_thermal_conductivity_mixavg(T_R, Ys_R, lambda_R) - - call get_species_enthalpies_rt(T_L, h_l) - call get_species_enthalpies_rt(T_R, h_r) - - ! Calculate species properties and gradients - $:GPU_LOOP(parallelism='[seq]') - do i = chemxb, chemxe - h_l(i - chemxb + 1) = h_l(i - chemxb + 1)*gas_constant*T_L/molecular_weights(i - chemxb + 1) - h_r(i - chemxb + 1) = h_r(i - chemxb + 1)*gas_constant*T_R/molecular_weights(i - chemxb + 1) - Xs_cell(i - chemxb + 1) = 0.5_wp*(Xs_L(i - chemxb + 1) + Xs_R(i - chemxb + 1)) - h_k(i - chemxb + 1) = 0.5_wp*(h_l(i - chemxb + 1) + h_r(i - chemxb + 1)) - dXk_dxi(i - chemxb + 1) = (Xs_R(i - chemxb + 1) - Xs_L(i - chemxb + 1))/grid_spacing - end do - - ! Calculate mixture-averaged diffusivities - $:GPU_LOOP(parallelism='[seq]') - do i = chemxb, chemxe - mass_diffusivities_mixavg_Cell(i - chemxb + 1) = & - (mass_diffusivities_mixavg2(i - chemxb + 1) + mass_diffusivities_mixavg1(i - chemxb + 1))/2.0_wp - end do - - lambda_Cell = 0.5_wp*(lambda_R + lambda_L) - - ! Calculate mass diffusion fluxes - rho_Vic = 0.0_wp - Mass_Diffu_Energy = 0.0_wp - - $:GPU_LOOP(parallelism='[seq]') - do eqn = chemxb, chemxe - Mass_Diffu_Flux(eqn - chemxb + 1) = rho_cell*mass_diffusivities_mixavg_Cell(eqn - chemxb + 1)* & - molecular_weights(eqn - chemxb + 1)/MW_cell*dXk_dxi(eqn - chemxb + 1) - rho_Vic = rho_Vic + Mass_Diffu_Flux(eqn - chemxb + 1) - Mass_Diffu_Energy = Mass_Diffu_Energy + h_k(eqn - chemxb + 1)*Mass_Diffu_Flux(eqn - chemxb + 1) - end do - - ! Apply corrections for mass conservation - $:GPU_LOOP(parallelism='[seq]') - do eqn = chemxb, chemxe - Mass_Diffu_Energy = Mass_Diffu_Energy - h_k(eqn - chemxb + 1)*Ys_cell(eqn - chemxb + 1)*rho_Vic - Mass_Diffu_Flux(eqn - chemxb + 1) = Mass_Diffu_Flux(eqn - chemxb + 1) - rho_Vic*Ys_cell(eqn - chemxb + 1) - end do - - ! Add thermal conduction contribution - Mass_Diffu_Energy = lambda_Cell*dT_dxi + Mass_Diffu_Energy - - ! Update flux arrays - flux_src_vf(E_idx)%sf(x, y, z) = flux_src_vf(E_idx)%sf(x, y, z) - Mass_Diffu_Energy - - $:GPU_LOOP(parallelism='[seq]') - do eqn = chemxb, chemxe - flux_src_vf(eqn)%sf(x, y, z) = flux_src_vf(eqn)%sf(x, y, z) - Mass_diffu_Flux(eqn - chemxb + 1) + #:call GPU_PARALLEL_LOOP(collapse=3, private='[Ys_L, Ys_R, Ys_cell, Xs_L, Xs_R, mass_diffusivities_mixavg1, mass_diffusivities_mixavg2, mass_diffusivities_mixavg_Cell, h_l, h_r, Xs_cell, h_k, dXk_dxi,Mass_Diffu_Flux]', copyin='[offsets]') + do z = isc3%beg, isc3%end + do y = isc2%beg, isc2%end + do x = isc1%beg, isc1%end + ! Calculate grid spacing using direction-based indexing + select case (idir) + case (1) + grid_spacing = x_cc(x + 1) - x_cc(x) + case (2) + grid_spacing = y_cc(y + 1) - y_cc(y) + case (3) + grid_spacing = z_cc(z + 1) - z_cc(z) + end select + + ! Extract species mass fractions + $:GPU_LOOP(parallelism='[seq]') + do i = chemxb, chemxe + Ys_L(i - chemxb + 1) = q_prim_qp(i)%sf(x, y, z) + Ys_R(i - chemxb + 1) = q_prim_qp(i)%sf(x + offsets(1), y + offsets(2), z + offsets(3)) + Ys_cell(i - chemxb + 1) = 0.5_wp*(Ys_L(i - chemxb + 1) + Ys_R(i - chemxb + 1)) + end do + + ! Calculate molecular weights and mole fractions + call get_mixture_molecular_weight(Ys_L, MW_L) + call get_mixture_molecular_weight(Ys_R, MW_R) + MW_cell = 0.5_wp*(MW_L + MW_R) + + call get_mole_fractions(MW_L, Ys_L, Xs_L) + call get_mole_fractions(MW_R, Ys_R, Xs_R) + + ! Calculate gas constants and thermodynamic properties + Rgas_L = gas_constant/MW_L + Rgas_R = gas_constant/MW_R + + P_L = q_prim_qp(E_idx)%sf(x, y, z) + P_R = q_prim_qp(E_idx)%sf(x + offsets(1), y + offsets(2), z + offsets(3)) + + rho_L = q_prim_qp(1)%sf(x, y, z) + rho_R = q_prim_qp(1)%sf(x + offsets(1), y + offsets(2), z + offsets(3)) + + T_L = P_L/rho_L/Rgas_L + T_R = P_R/rho_R/Rgas_R + + rho_cell = 0.5_wp*(rho_L + rho_R) + dT_dxi = (T_R - T_L)/grid_spacing + + ! Get transport properties + call get_species_mass_diffusivities_mixavg(P_L, T_L, Ys_L, mass_diffusivities_mixavg1) + call get_species_mass_diffusivities_mixavg(P_R, T_R, Ys_R, mass_diffusivities_mixavg2) + + call get_mixture_thermal_conductivity_mixavg(T_L, Ys_L, lambda_L) + call get_mixture_thermal_conductivity_mixavg(T_R, Ys_R, lambda_R) + + call get_species_enthalpies_rt(T_L, h_l) + call get_species_enthalpies_rt(T_R, h_r) + + ! Calculate species properties and gradients + $:GPU_LOOP(parallelism='[seq]') + do i = chemxb, chemxe + h_l(i - chemxb + 1) = h_l(i - chemxb + 1)*gas_constant*T_L/molecular_weights(i - chemxb + 1) + h_r(i - chemxb + 1) = h_r(i - chemxb + 1)*gas_constant*T_R/molecular_weights(i - chemxb + 1) + Xs_cell(i - chemxb + 1) = 0.5_wp*(Xs_L(i - chemxb + 1) + Xs_R(i - chemxb + 1)) + h_k(i - chemxb + 1) = 0.5_wp*(h_l(i - chemxb + 1) + h_r(i - chemxb + 1)) + dXk_dxi(i - chemxb + 1) = (Xs_R(i - chemxb + 1) - Xs_L(i - chemxb + 1))/grid_spacing + end do + + ! Calculate mixture-averaged diffusivities + $:GPU_LOOP(parallelism='[seq]') + do i = chemxb, chemxe + mass_diffusivities_mixavg_Cell(i - chemxb + 1) = & + (mass_diffusivities_mixavg2(i - chemxb + 1) + mass_diffusivities_mixavg1(i - chemxb + 1))/2.0_wp + end do + + lambda_Cell = 0.5_wp*(lambda_R + lambda_L) + + ! Calculate mass diffusion fluxes + rho_Vic = 0.0_wp + Mass_Diffu_Energy = 0.0_wp + + $:GPU_LOOP(parallelism='[seq]') + do eqn = chemxb, chemxe + Mass_Diffu_Flux(eqn - chemxb + 1) = rho_cell*mass_diffusivities_mixavg_Cell(eqn - chemxb + 1)* & + molecular_weights(eqn - chemxb + 1)/MW_cell*dXk_dxi(eqn - chemxb + 1) + rho_Vic = rho_Vic + Mass_Diffu_Flux(eqn - chemxb + 1) + Mass_Diffu_Energy = Mass_Diffu_Energy + h_k(eqn - chemxb + 1)*Mass_Diffu_Flux(eqn - chemxb + 1) + end do + + ! Apply corrections for mass conservation + $:GPU_LOOP(parallelism='[seq]') + do eqn = chemxb, chemxe + Mass_Diffu_Energy = Mass_Diffu_Energy - h_k(eqn - chemxb + 1)*Ys_cell(eqn - chemxb + 1)*rho_Vic + Mass_Diffu_Flux(eqn - chemxb + 1) = Mass_Diffu_Flux(eqn - chemxb + 1) - rho_Vic*Ys_cell(eqn - chemxb + 1) + end do + + ! Add thermal conduction contribution + Mass_Diffu_Energy = lambda_Cell*dT_dxi + Mass_Diffu_Energy + + ! Update flux arrays + flux_src_vf(E_idx)%sf(x, y, z) = flux_src_vf(E_idx)%sf(x, y, z) - Mass_Diffu_Energy + + $:GPU_LOOP(parallelism='[seq]') + do eqn = chemxb, chemxe + flux_src_vf(eqn)%sf(x, y, z) = flux_src_vf(eqn)%sf(x, y, z) - Mass_diffu_Flux(eqn - chemxb + 1) + end do end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP end if end subroutine s_compute_chemistry_diffusion_flux diff --git a/src/common/m_compute_levelset.fpp b/src/common/m_compute_levelset.fpp index 99398e761c..fa7865889b 100644 --- a/src/common/m_compute_levelset.fpp +++ b/src/common/m_compute_levelset.fpp @@ -27,7 +27,7 @@ module m_compute_levelset contains - pure subroutine s_circle_levelset(ib_patch_id, levelset, levelset_norm) + subroutine s_circle_levelset(ib_patch_id, levelset, levelset_norm) type(levelset_field), intent(INOUT), optional :: levelset type(levelset_norm_field), intent(INOUT), optional :: levelset_norm @@ -63,7 +63,7 @@ contains end subroutine s_circle_levelset - pure subroutine s_airfoil_levelset(ib_patch_id, levelset, levelset_norm) + subroutine s_airfoil_levelset(ib_patch_id, levelset, levelset_norm) type(levelset_field), intent(INOUT), optional :: levelset type(levelset_norm_field), intent(INOUT), optional :: levelset_norm @@ -146,7 +146,7 @@ contains end subroutine s_airfoil_levelset - pure subroutine s_3D_airfoil_levelset(ib_patch_id, levelset, levelset_norm) + subroutine s_3D_airfoil_levelset(ib_patch_id, levelset, levelset_norm) type(levelset_field), intent(INOUT), optional :: levelset type(levelset_norm_field), intent(INOUT), optional :: levelset_norm @@ -250,7 +250,7 @@ contains end subroutine s_3D_airfoil_levelset !> Initialize IBM module - pure subroutine s_rectangle_levelset(ib_patch_id, levelset, levelset_norm) + subroutine s_rectangle_levelset(ib_patch_id, levelset, levelset_norm) type(levelset_field), intent(INOUT), optional :: levelset type(levelset_norm_field), intent(INOUT), optional :: levelset_norm @@ -348,7 +348,7 @@ contains end subroutine s_rectangle_levelset - pure subroutine s_cuboid_levelset(ib_patch_id, levelset, levelset_norm) + subroutine s_cuboid_levelset(ib_patch_id, levelset, levelset_norm) type(levelset_field), intent(INOUT), optional :: levelset type(levelset_norm_field), intent(INOUT), optional :: levelset_norm @@ -465,7 +465,7 @@ contains end subroutine s_cuboid_levelset - pure subroutine s_sphere_levelset(ib_patch_id, levelset, levelset_norm) + subroutine s_sphere_levelset(ib_patch_id, levelset, levelset_norm) type(levelset_field), intent(INOUT), optional :: levelset type(levelset_norm_field), intent(INOUT), optional :: levelset_norm @@ -502,7 +502,7 @@ contains end subroutine s_sphere_levelset - pure subroutine s_cylinder_levelset(ib_patch_id, levelset, levelset_norm) + subroutine s_cylinder_levelset(ib_patch_id, levelset, levelset_norm) type(levelset_field), intent(INOUT), optional :: levelset type(levelset_norm_field), intent(INOUT), optional :: levelset_norm diff --git a/src/common/m_constants.fpp b/src/common/m_constants.fpp index 1d2e53d206..e75e753804 100644 --- a/src/common/m_constants.fpp +++ b/src/common/m_constants.fpp @@ -10,6 +10,7 @@ module m_constants real(wp), parameter :: dflt_real = -1.e6_wp !< Default real value real(wp), parameter :: sgm_eps = 1.e-16_wp !< Segmentation tolerance + real(wp), parameter :: Chem_Tolerance = 1.e-16_wp !< Speed of Sound Tolerance in Chemistry real(wp), parameter :: small_alf = 1.e-11_wp !< Small alf tolerance real(wp), parameter :: pi = 3.141592653589793_wp !< Pi real(wp), parameter :: verysmall = 1.e-12_wp !< Very small number diff --git a/src/common/m_derived_types.fpp b/src/common/m_derived_types.fpp index 9c55e8b895..e74171f581 100644 --- a/src/common/m_derived_types.fpp +++ b/src/common/m_derived_types.fpp @@ -455,3 +455,4 @@ module m_derived_types end type cell_num_bounds end module m_derived_types + diff --git a/src/common/m_finite_differences.fpp b/src/common/m_finite_differences.fpp index 2430374f4f..c01953e216 100644 --- a/src/common/m_finite_differences.fpp +++ b/src/common/m_finite_differences.fpp @@ -8,7 +8,7 @@ module m_finite_differences contains - pure subroutine s_compute_fd_divergence(div, fields, ix_s, iy_s, iz_s) + subroutine s_compute_fd_divergence(div, fields, ix_s, iy_s, iz_s) type(scalar_field), intent(INOUT) :: div type(scalar_field), intent(IN) :: fields(1:3) @@ -18,44 +18,45 @@ contains real(wp) :: divergence - $:GPU_PARALLEL_LOOP(collapse=3, private='[divergence]') - do x = ix_s%beg, ix_s%end - do y = iy_s%beg, iy_s%end - do z = iz_s%beg, iz_s%end - - if (x == ix_s%beg) then - divergence = (-3._wp*fields(1)%sf(x, y, z) + 4._wp*fields(1)%sf(x + 1, y, z) - fields(1)%sf(x + 2, y, z))/(x_cc(x + 2) - x_cc(x)) - else if (x == ix_s%end) then - divergence = (+3._wp*fields(1)%sf(x, y, z) - 4._wp*fields(1)%sf(x - 1, y, z) + fields(1)%sf(x - 2, y, z))/(x_cc(x) - x_cc(x - 2)) - else - divergence = (fields(1)%sf(x + 1, y, z) - fields(1)%sf(x - 1, y, z))/(x_cc(x + 1) - x_cc(x - 1)) - end if - - if (n > 0) then - if (y == iy_s%beg) then - divergence = divergence + (-3._wp*fields(2)%sf(x, y, z) + 4._wp*fields(2)%sf(x, y + 1, z) - fields(2)%sf(x, y + 2, z))/(y_cc(y + 2) - y_cc(y)) - else if (y == iy_s%end) then - divergence = divergence + (+3._wp*fields(2)%sf(x, y, z) - 4._wp*fields(2)%sf(x, y - 1, z) + fields(2)%sf(x, y - 2, z))/(y_cc(y) - y_cc(y - 2)) + #:call GPU_PARALLEL_LOOP(collapse=3, private='[divergence]') + do x = ix_s%beg, ix_s%end + do y = iy_s%beg, iy_s%end + do z = iz_s%beg, iz_s%end + + if (x == ix_s%beg) then + divergence = (-3._wp*fields(1)%sf(x, y, z) + 4._wp*fields(1)%sf(x + 1, y, z) - fields(1)%sf(x + 2, y, z))/(x_cc(x + 2) - x_cc(x)) + else if (x == ix_s%end) then + divergence = (+3._wp*fields(1)%sf(x, y, z) - 4._wp*fields(1)%sf(x - 1, y, z) + fields(1)%sf(x - 2, y, z))/(x_cc(x) - x_cc(x - 2)) else - divergence = divergence + (fields(2)%sf(x, y + 1, z) - fields(2)%sf(x, y - 1, z))/(y_cc(y + 1) - y_cc(y - 1)) + divergence = (fields(1)%sf(x + 1, y, z) - fields(1)%sf(x - 1, y, z))/(x_cc(x + 1) - x_cc(x - 1)) end if - end if - if (p > 0) then - if (z == iz_s%beg) then - divergence = divergence + (-3._wp*fields(3)%sf(x, y, z) + 4._wp*fields(3)%sf(x, y, z + 1) - fields(3)%sf(x, y, z + 2))/(z_cc(z + 2) - z_cc(z)) - else if (z == iz_s%end) then - divergence = divergence + (+3._wp*fields(3)%sf(x, y, z) - 4._wp*fields(3)%sf(x, y, z - 1) + fields(2)%sf(x, y, z - 2))/(z_cc(z) - z_cc(z - 2)) - else - divergence = divergence + (fields(3)%sf(x, y, z + 1) - fields(3)%sf(x, y, z - 1))/(z_cc(z + 1) - z_cc(z - 1)) + if (n > 0) then + if (y == iy_s%beg) then + divergence = divergence + (-3._wp*fields(2)%sf(x, y, z) + 4._wp*fields(2)%sf(x, y + 1, z) - fields(2)%sf(x, y + 2, z))/(y_cc(y + 2) - y_cc(y)) + else if (y == iy_s%end) then + divergence = divergence + (+3._wp*fields(2)%sf(x, y, z) - 4._wp*fields(2)%sf(x, y - 1, z) + fields(2)%sf(x, y - 2, z))/(y_cc(y) - y_cc(y - 2)) + else + divergence = divergence + (fields(2)%sf(x, y + 1, z) - fields(2)%sf(x, y - 1, z))/(y_cc(y + 1) - y_cc(y - 1)) + end if + end if + + if (p > 0) then + if (z == iz_s%beg) then + divergence = divergence + (-3._wp*fields(3)%sf(x, y, z) + 4._wp*fields(3)%sf(x, y, z + 1) - fields(3)%sf(x, y, z + 2))/(z_cc(z + 2) - z_cc(z)) + else if (z == iz_s%end) then + divergence = divergence + (+3._wp*fields(3)%sf(x, y, z) - 4._wp*fields(3)%sf(x, y, z - 1) + fields(2)%sf(x, y, z - 2))/(z_cc(z) - z_cc(z - 2)) + else + divergence = divergence + (fields(3)%sf(x, y, z + 1) - fields(3)%sf(x, y, z - 1))/(z_cc(z + 1) - z_cc(z - 1)) + end if end if - end if - div%sf(x, y, z) = div%sf(x, y, z) + divergence + div%sf(x, y, z) = div%sf(x, y, z) + divergence + end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP end subroutine s_compute_fd_divergence @@ -69,8 +70,8 @@ contains !! @param q Number of cells in the s-coordinate direction !! @param s_cc Locations of the cell-centers in the s-coordinate direction !! @param fd_coeff_s Finite-diff. coefficients in the s-coordinate direction - pure subroutine s_compute_finite_difference_coefficients(q, s_cc, fd_coeff_s, local_buff_size, & - fd_number_in, fd_order_in, offset_s) + subroutine s_compute_finite_difference_coefficients(q, s_cc, fd_coeff_s, local_buff_size, & + fd_number_in, fd_order_in, offset_s) integer :: lB, lE !< loop bounds integer, intent(IN) :: q diff --git a/src/common/m_helper.fpp b/src/common/m_helper.fpp index 718626d00f..4c9b0dc101 100644 --- a/src/common/m_helper.fpp +++ b/src/common/m_helper.fpp @@ -47,7 +47,7 @@ contains !! @param vftmp is the void fraction !! @param Rtmp is the bubble radii !! @param ntmp is the output number bubble density - pure subroutine s_comp_n_from_prim(vftmp, Rtmp, ntmp, weights) + subroutine s_comp_n_from_prim(vftmp, Rtmp, ntmp, weights) $:GPU_ROUTINE(parallelism='[seq]') real(wp), intent(in) :: vftmp real(wp), dimension(nb), intent(in) :: Rtmp @@ -61,7 +61,7 @@ contains end subroutine s_comp_n_from_prim - pure subroutine s_comp_n_from_cons(vftmp, nRtmp, ntmp, weights) + subroutine s_comp_n_from_cons(vftmp, nRtmp, ntmp, weights) $:GPU_ROUTINE(parallelism='[seq]') real(wp), intent(in) :: vftmp real(wp), dimension(nb), intent(in) :: nRtmp @@ -224,7 +224,7 @@ contains !! @param peclet Peclet number !! @param Re_trans Real part of the transport coefficients !! @param Im_trans Imaginary part of the transport coefficients - pure elemental subroutine s_transcoeff(omega, peclet, Re_trans, Im_trans) + elemental subroutine s_transcoeff(omega, peclet, Re_trans, Im_trans) real(wp), intent(in) :: omega, peclet real(wp), intent(out) :: Re_trans, Im_trans @@ -243,7 +243,7 @@ contains end subroutine s_transcoeff - pure elemental subroutine s_int_to_str(i, res) + elemental subroutine s_int_to_str(i, res) integer, intent(in) :: i character(len=*), intent(inout) :: res @@ -307,7 +307,7 @@ contains !> This procedure swaps two real numbers. !! @param lhs Left-hand side. !! @param rhs Right-hand side. - pure elemental subroutine s_swap(lhs, rhs) + elemental subroutine s_swap(lhs, rhs) real(wp), intent(inout) :: lhs, rhs real(wp) :: ltemp @@ -320,7 +320,7 @@ contains !> This procedure creates a transformation matrix. !! @param p Parameters for the transformation. !! @return Transformation matrix. - pure function f_create_transform_matrix(param, center) result(out_matrix) + function f_create_transform_matrix(param, center) result(out_matrix) type(ic_model_parameters), intent(in) :: param real(wp), dimension(1:3), optional, intent(in) :: center @@ -381,7 +381,7 @@ contains !> This procedure transforms a vector by a matrix. !! @param vec Vector to transform. !! @param matrix Transformation matrix. - pure subroutine s_transform_vec(vec, matrix) + subroutine s_transform_vec(vec, matrix) real(wp), dimension(1:3), intent(inout) :: vec real(wp), dimension(1:4, 1:4), intent(in) :: matrix @@ -396,7 +396,7 @@ contains !> This procedure transforms a triangle by a matrix, one vertex at a time. !! @param triangle Triangle to transform. !! @param matrix Transformation matrix. - pure subroutine s_transform_triangle(triangle, matrix, matrix_n) + subroutine s_transform_triangle(triangle, matrix, matrix_n) type(t_triangle), intent(inout) :: triangle real(wp), dimension(1:4, 1:4), intent(in) :: matrix, matrix_n @@ -414,7 +414,7 @@ contains !> This procedure transforms a model by a matrix, one triangle at a time. !! @param model Model to transform. !! @param matrix Transformation matrix. - pure subroutine s_transform_model(model, matrix, matrix_n) + subroutine s_transform_model(model, matrix, matrix_n) type(t_model), intent(inout) :: model real(wp), dimension(1:4, 1:4), intent(in) :: matrix, matrix_n @@ -430,7 +430,7 @@ contains !> This procedure creates a bounding box for a model. !! @param model Model to create bounding box for. !! @return Bounding box. - pure function f_create_bbox(model) result(bbox) + function f_create_bbox(model) result(bbox) type(t_model), intent(in) :: model type(t_bbox) :: bbox @@ -459,7 +459,7 @@ contains !! @param lhs logical input. !! @param rhs other logical input. !! @return xored result. - pure elemental function f_xor(lhs, rhs) result(res) + elemental function f_xor(lhs, rhs) result(res) logical, intent(in) :: lhs, rhs logical :: res @@ -470,7 +470,7 @@ contains !> This procedure converts logical to 1 or 0. !! @param perdicate A Logical argument. !! @return 1 if .true., 0 if .false.. - pure elemental function f_logical_to_int(predicate) result(int) + elemental function f_logical_to_int(predicate) result(int) logical, intent(in) :: predicate integer :: int @@ -486,7 +486,7 @@ contains !! @param x is the input value !! @param l is the degree !! @return P is the unassociated legendre polynomial evaluated at x - pure recursive function unassociated_legendre(x, l) result(result_P) + recursive function unassociated_legendre(x, l) result(result_P) integer, intent(in) :: l real(wp), intent(in) :: x @@ -508,7 +508,7 @@ contains !! @param l is the degree !! @param m_order is the order !! @return Y is the spherical harmonic function evaluated at x and phi - pure recursive function spherical_harmonic_func(x, phi, l, m_order) result(Y) + recursive function spherical_harmonic_func(x, phi, l, m_order) result(Y) integer, intent(in) :: l, m_order real(wp), intent(in) :: x, phi @@ -530,7 +530,7 @@ contains !! @param l is the degree !! @param m_order is the order !! @return P is the associated legendre polynomial evaluated at x - pure recursive function associated_legendre(x, l, m_order) result(result_P) + recursive function associated_legendre(x, l, m_order) result(result_P) integer, intent(in) :: l, m_order real(wp), intent(in) :: x @@ -555,7 +555,7 @@ contains !> This function calculates the double factorial value of an integer !! @param n_in is the input integer !! @return R is the double factorial value of n - pure elemental function double_factorial(n_in) result(R_result) + elemental function double_factorial(n_in) result(R_result) integer, intent(in) :: n_in integer, parameter :: int64_kind = selected_int_kind(18) ! 18 bytes for 64-bit integer @@ -569,7 +569,7 @@ contains !> The following function calculates the factorial value of an integer !! @param n_in is the input integer !! @return R is the factorial value of n - pure elemental function factorial(n_in) result(R_result) + elemental function factorial(n_in) result(R_result) integer, intent(in) :: n_in integer, parameter :: int64_kind = selected_int_kind(18) ! 18 bytes for 64-bit integer diff --git a/src/common/m_helper_basic.fpp b/src/common/m_helper_basic.fpp index 697e919077..eca8c3ccb7 100644 --- a/src/common/m_helper_basic.fpp +++ b/src/common/m_helper_basic.fpp @@ -26,7 +26,7 @@ contains !! @param b Second number. !! @param tol_input Relative error (default = 1.e-10_wp). !! @return Result of the comparison. - logical pure elemental function f_approx_equal(a, b, tol_input) result(res) + logical elemental function f_approx_equal(a, b, tol_input) result(res) $:GPU_ROUTINE(parallelism='[seq]') real(wp), intent(in) :: a, b real(wp), optional, intent(in) :: tol_input @@ -52,7 +52,7 @@ contains !! @param b Array that contains several point numbers. !! @param tol_input Relative error (default = 1e-10_wp). !! @return Result of the comparison. - logical pure function f_approx_in_array(a, b, tol_input) result(res) + logical function f_approx_in_array(a, b, tol_input) result(res) $:GPU_ROUTINE(parallelism='[seq]') real(wp), intent(in) :: a real(wp), intent(in) :: b(:) @@ -78,7 +78,7 @@ contains !> Checks if a real(wp) variable is of default value. !! @param var Variable to check. - logical pure elemental function f_is_default(var) result(res) + logical elemental function f_is_default(var) result(res) $:GPU_ROUTINE(parallelism='[seq]') real(wp), intent(in) :: var @@ -87,13 +87,14 @@ contains !> Checks if ALL elements of a real(wp) array are of default value. !! @param var_array Array to check. - logical pure function f_all_default(var_array) result(res) + logical function f_all_default(var_array) result(res) real(wp), intent(in) :: var_array(:) - ! logical :: res_array(size(var_array)) - ! integer :: i res = all(f_is_default(var_array)) + !logical :: res_array(size(var_array)) + !integer :: i + ! do i = 1, size(var_array) ! res_array(i) = f_is_default(var_array(i)) ! end do @@ -103,16 +104,16 @@ contains !> Checks if a real(wp) variable is an integer. !! @param var Variable to check. - logical pure elemental function f_is_integer(var) result(res) + logical elemental function f_is_integer(var) result(res) $:GPU_ROUTINE(parallelism='[seq]') real(wp), intent(in) :: var res = f_approx_equal(var, real(nint(var), wp)) end function f_is_integer - pure subroutine s_configure_coordinate_bounds(recon_type, weno_polyn, muscl_polyn, & - igr_order, buff_size, idwint, idwbuff, & - viscous, bubbles_lagrange, m, n, p, num_dims, igr, ib) + subroutine s_configure_coordinate_bounds(recon_type, weno_polyn, muscl_polyn, & + igr_order, buff_size, idwint, idwbuff, & + viscous, bubbles_lagrange, m, n, p, num_dims, igr, ib) integer, intent(in) :: recon_type, weno_polyn, muscl_polyn integer, intent(in) :: m, n, p, num_dims, igr_order @@ -166,7 +167,7 @@ contains !! @param m Number of cells in x-axis !! @param n Number of cells in y-axis !! @param p Number of cells in z-axis - pure elemental subroutine s_update_cell_bounds(bounds, m, n, p) + elemental subroutine s_update_cell_bounds(bounds, m, n, p) type(cell_num_bounds), intent(out) :: bounds integer, intent(in) :: m, n, p diff --git a/src/common/m_model.fpp b/src/common/m_model.fpp index 734e83f4af..0ae508a172 100644 --- a/src/common/m_model.fpp +++ b/src/common/m_model.fpp @@ -424,7 +424,7 @@ contains end subroutine s_model_write !> This procedure frees the memory allocated for an STL mesh. - pure subroutine s_model_free(model) + subroutine s_model_free(model) type(t_model), intent(inout) :: model @@ -532,7 +532,7 @@ contains !! @param ray Ray. !! @param triangle Triangle. !! @return True if the ray intersects the triangle, false otherwise. - pure elemental function f_intersects_triangle(ray, triangle) result(intersects) + elemental function f_intersects_triangle(ray, triangle) result(intersects) type(t_ray), intent(in) :: ray type(t_triangle), intent(in) :: triangle @@ -592,7 +592,8 @@ contains !! @param boundary_v Output boundary vertices/normals. !! @param boundary_vertex_count Output total boundary vertex count !! @param boundary_edge_count Output total boundary edge counts - pure subroutine f_check_boundary(model, boundary_v, boundary_vertex_count, boundary_edge_count) + subroutine f_check_boundary(model, boundary_v, boundary_vertex_count, boundary_edge_count) + type(t_model), intent(in) :: model real(wp), allocatable, intent(out), dimension(:, :, :) :: boundary_v !< Output boundary vertices/normals integer, intent(out) :: boundary_vertex_count, boundary_edge_count !< Output boundary vertex/edge count @@ -705,7 +706,8 @@ contains !! @param edge Edges end points to be registered !! @param edge_index Edge index iterator !! @param edge_count Total number of edges - pure subroutine f_register_edge(temp_boundary_v, edge, edge_index, edge_count) + subroutine f_register_edge(temp_boundary_v, edge, edge_index, edge_count) + integer, intent(inout) :: edge_index !< Edge index iterator integer, intent(inout) :: edge_count !< Total number of edges real(wp), intent(in), dimension(1:2, 1:2) :: edge !< Edges end points to be registered @@ -723,7 +725,8 @@ contains !! @param boundary_edge_count Output total number of boundary edges !! @param spacing Dimensions of the current levelset cell !! @param interpolate Logical output - pure subroutine f_check_interpolation_2D(boundary_v, boundary_edge_count, spacing, interpolate) + subroutine f_check_interpolation_2D(boundary_v, boundary_edge_count, spacing, interpolate) + logical, intent(inout) :: interpolate !< Logical indicator of interpolation integer, intent(in) :: boundary_edge_count !< Number of boundary edges real(wp), intent(in), dimension(1:boundary_edge_count, 1:3, 1:2) :: boundary_v @@ -753,7 +756,8 @@ contains !! @param model Model to search in. !! @param spacing Dimensions of the current levelset cell !! @param interpolate Logical output - pure subroutine f_check_interpolation_3D(model, spacing, interpolate) + subroutine f_check_interpolation_3D(model, spacing, interpolate) + logical, intent(inout) :: interpolate type(t_model), intent(in) :: model real(wp), dimension(1:3), intent(in) :: spacing @@ -799,7 +803,8 @@ contains !! @param spacing Dimensions of the current levelset cell !! @param interpolated_boundary_v Output all the boundary vertices of the interpolated 2D model !! @param total_vertices Total number of vertices after interpolation - pure subroutine f_interpolate_2D(boundary_v, boundary_edge_count, spacing, interpolated_boundary_v, total_vertices) + subroutine f_interpolate_2D(boundary_v, boundary_edge_count, spacing, interpolated_boundary_v, total_vertices) + real(wp), intent(in), dimension(:, :, :) :: boundary_v real(wp), dimension(1:3), intent(in) :: spacing real(wp), allocatable, intent(inout), dimension(:, :) :: interpolated_boundary_v @@ -1042,7 +1047,8 @@ contains !! @param point The cell centers of the current level cell !! @param normals The output levelset normals !! @param distance The output levelset distance - pure subroutine f_distance_normals_3D(model, point, normals, distance) + subroutine f_distance_normals_3D(model, point, normals, distance) + type(t_model), intent(IN) :: model real(wp), dimension(1:3), intent(in) :: point real(wp), dimension(1:3), intent(out) :: normals @@ -1104,7 +1110,8 @@ contains !! @param point The cell centers of the current levelset cell !! @param spacing Dimensions of the current levelset cell !! @return Distance which the levelset distance without interpolation - pure function f_distance(boundary_v, boundary_edge_count, point) result(distance) + function f_distance(boundary_v, boundary_edge_count, point) result(distance) + integer, intent(in) :: boundary_edge_count real(wp), intent(in), dimension(1:boundary_edge_count, 1:3, 1:2) :: boundary_v real(wp), dimension(1:3), intent(in) :: point @@ -1134,7 +1141,8 @@ contains !! @param boundary_edge_count Output the total number of boundary edges !! @param point The cell centers of the current levelset cell !! @param normals Output levelset normals without interpolation - pure subroutine f_normals(boundary_v, boundary_edge_count, point, normals) + subroutine f_normals(boundary_v, boundary_edge_count, point, normals) + integer, intent(in) :: boundary_edge_count real(wp), intent(in), dimension(1:boundary_edge_count, 1:3, 1:2) :: boundary_v real(wp), dimension(1:3), intent(in) :: point @@ -1169,7 +1177,8 @@ contains end subroutine f_normals !> This procedure calculates the barycentric facet area - pure subroutine f_tri_area(tri, tri_area) + subroutine f_tri_area(tri, tri_area) + real(wp), dimension(1:3, 1:3), intent(in) :: tri real(wp), intent(out) :: tri_area real(wp), dimension(1:3) :: AB, AC, cross @@ -1192,7 +1201,8 @@ contains !! @param total_vertices Total number of vertices after interpolation !! @param point The cell centers of the current levelset cell !! @return Distance which the levelset distance without interpolation - pure function f_interpolated_distance(interpolated_boundary_v, total_vertices, point) result(distance) + function f_interpolated_distance(interpolated_boundary_v, total_vertices, point) result(distance) + integer, intent(in) :: total_vertices real(wp), intent(in), dimension(1:total_vertices, 1:3) :: interpolated_boundary_v real(wp), dimension(1:3), intent(in) :: point diff --git a/src/common/m_mpi_common.fpp b/src/common/m_mpi_common.fpp index 07463137f2..c2b17105a7 100644 --- a/src/common/m_mpi_common.fpp +++ b/src/common/m_mpi_common.fpp @@ -757,144 +757,153 @@ contains #:for mpi_dir in [1, 2, 3] if (mpi_dir == ${mpi_dir}$) then #:if mpi_dir == 1 - $:GPU_PARALLEL_LOOP(collapse=4,private='[r]') - do l = 0, p - do k = 0, n - do j = 0, buff_size - 1 - do i = 1, nVar - r = (i - 1) + v_size*(j + buff_size*(k + (n + 1)*l)) - buff_send(r) = q_comm(i)%sf(j + pack_offset, k, l) + #:call GPU_PARALLEL_LOOP(collapse=4,private='[r]') + do l = 0, p + do k = 0, n + do j = 0, buff_size - 1 + do i = 1, nVar + r = (i - 1) + v_size*(j + buff_size*(k + (n + 1)*l)) + buff_send(r) = q_comm(i)%sf(j + pack_offset, k, l) + end do end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP if (qbmm_comm) then - $:GPU_PARALLEL_LOOP(collapse=4,private='[r]') - do l = 0, p - do k = 0, n - do j = 0, buff_size - 1 - do i = nVar + 1, nVar + 4 - do q = 1, nb - r = (i - 1) + (q - 1)*4 + v_size* & - (j + buff_size*(k + (n + 1)*l)) - buff_send(r) = pb_in(j + pack_offset, k, l, i - nVar, q) + #:call GPU_PARALLEL_LOOP(collapse=4,private='[r]') + do l = 0, p + do k = 0, n + do j = 0, buff_size - 1 + do i = nVar + 1, nVar + 4 + do q = 1, nb + r = (i - 1) + (q - 1)*4 + v_size* & + (j + buff_size*(k + (n + 1)*l)) + buff_send(r) = pb_in(j + pack_offset, k, l, i - nVar, q) + end do end do end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP - $:GPU_PARALLEL_LOOP(collapse=5,private='[r]') - do l = 0, p - do k = 0, n - do j = 0, buff_size - 1 - do i = nVar + 1, nVar + 4 - do q = 1, nb - r = (i - 1) + (q - 1)*4 + nb*4 + v_size* & - (j + buff_size*(k + (n + 1)*l)) - buff_send(r) = mv_in(j + pack_offset, k, l, i - nVar, q) + #:call GPU_PARALLEL_LOOP(collapse=5,private='[r]') + do l = 0, p + do k = 0, n + do j = 0, buff_size - 1 + do i = nVar + 1, nVar + 4 + do q = 1, nb + r = (i - 1) + (q - 1)*4 + nb*4 + v_size* & + (j + buff_size*(k + (n + 1)*l)) + buff_send(r) = mv_in(j + pack_offset, k, l, i - nVar, q) + end do end do end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP end if #:elif mpi_dir == 2 - $:GPU_PARALLEL_LOOP(collapse=4,private='[r]') - do i = 1, nVar - do l = 0, p - do k = 0, buff_size - 1 - do j = -buff_size, m + buff_size - r = (i - 1) + v_size* & - ((j + buff_size) + (m + 2*buff_size + 1)* & - (k + buff_size*l)) - buff_send(r) = q_comm(i)%sf(j, k + pack_offset, l) + #:call GPU_PARALLEL_LOOP(collapse=4,private='[r]') + do i = 1, nVar + do l = 0, p + do k = 0, buff_size - 1 + do j = -buff_size, m + buff_size + r = (i - 1) + v_size* & + ((j + buff_size) + (m + 2*buff_size + 1)* & + (k + buff_size*l)) + buff_send(r) = q_comm(i)%sf(j, k + pack_offset, l) + end do end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP if (qbmm_comm) then - $:GPU_PARALLEL_LOOP(collapse=5,private='[r]') - do i = nVar + 1, nVar + 4 - do l = 0, p - do k = 0, buff_size - 1 - do j = -buff_size, m + buff_size - do q = 1, nb - r = (i - 1) + (q - 1)*4 + v_size* & - ((j + buff_size) + (m + 2*buff_size + 1)* & - (k + buff_size*l)) - buff_send(r) = pb_in(j, k + pack_offset, l, i - nVar, q) + #:call GPU_PARALLEL_LOOP(collapse=5,private='[r]') + do i = nVar + 1, nVar + 4 + do l = 0, p + do k = 0, buff_size - 1 + do j = -buff_size, m + buff_size + do q = 1, nb + r = (i - 1) + (q - 1)*4 + v_size* & + ((j + buff_size) + (m + 2*buff_size + 1)* & + (k + buff_size*l)) + buff_send(r) = pb_in(j, k + pack_offset, l, i - nVar, q) + end do end do end do end do end do - end do - - $:GPU_PARALLEL_LOOP(collapse=5,private='[r]') - do i = nVar + 1, nVar + 4 - do l = 0, p - do k = 0, buff_size - 1 - do j = -buff_size, m + buff_size - do q = 1, nb - r = (i - 1) + (q - 1)*4 + nb*4 + v_size* & - ((j + buff_size) + (m + 2*buff_size + 1)* & - (k + buff_size*l)) - buff_send(r) = mv_in(j, k + pack_offset, l, i - nVar, q) + #:endcall GPU_PARALLEL_LOOP + + #:call GPU_PARALLEL_LOOP(collapse=5,private='[r]') + do i = nVar + 1, nVar + 4 + do l = 0, p + do k = 0, buff_size - 1 + do j = -buff_size, m + buff_size + do q = 1, nb + r = (i - 1) + (q - 1)*4 + nb*4 + v_size* & + ((j + buff_size) + (m + 2*buff_size + 1)* & + (k + buff_size*l)) + buff_send(r) = mv_in(j, k + pack_offset, l, i - nVar, q) + end do end do end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP end if #:else - $:GPU_PARALLEL_LOOP(collapse=4,private='[r]') - do i = 1, nVar - do l = 0, buff_size - 1 - do k = -buff_size, n + buff_size - do j = -buff_size, m + buff_size - r = (i - 1) + v_size* & - ((j + buff_size) + (m + 2*buff_size + 1)* & - ((k + buff_size) + (n + 2*buff_size + 1)*l)) - buff_send(r) = q_comm(i)%sf(j, k, l + pack_offset) + #:call GPU_PARALLEL_LOOP(collapse=4,private='[r]') + do i = 1, nVar + do l = 0, buff_size - 1 + do k = -buff_size, n + buff_size + do j = -buff_size, m + buff_size + r = (i - 1) + v_size* & + ((j + buff_size) + (m + 2*buff_size + 1)* & + ((k + buff_size) + (n + 2*buff_size + 1)*l)) + buff_send(r) = q_comm(i)%sf(j, k, l + pack_offset) + end do end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP if (qbmm_comm) then - $:GPU_PARALLEL_LOOP(collapse=5,private='[r]') - do i = nVar + 1, nVar + 4 - do l = 0, buff_size - 1 - do k = -buff_size, n + buff_size - do j = -buff_size, m + buff_size - do q = 1, nb - r = (i - 1) + (q - 1)*4 + v_size* & - ((j + buff_size) + (m + 2*buff_size + 1)* & - ((k + buff_size) + (n + 2*buff_size + 1)*l)) - buff_send(r) = pb_in(j, k, l + pack_offset, i - nVar, q) + #:call GPU_PARALLEL_LOOP(collapse=5,private='[r]') + do i = nVar + 1, nVar + 4 + do l = 0, buff_size - 1 + do k = -buff_size, n + buff_size + do j = -buff_size, m + buff_size + do q = 1, nb + r = (i - 1) + (q - 1)*4 + v_size* & + ((j + buff_size) + (m + 2*buff_size + 1)* & + ((k + buff_size) + (n + 2*buff_size + 1)*l)) + buff_send(r) = pb_in(j, k, l + pack_offset, i - nVar, q) + end do end do end do end do end do - end do - - $:GPU_PARALLEL_LOOP(collapse=5,private='[r]') - do i = nVar + 1, nVar + 4 - do l = 0, buff_size - 1 - do k = -buff_size, n + buff_size - do j = -buff_size, m + buff_size - do q = 1, nb - r = (i - 1) + (q - 1)*4 + nb*4 + v_size* & - ((j + buff_size) + (m + 2*buff_size + 1)* & - ((k + buff_size) + (n + 2*buff_size + 1)*l)) - buff_send(r) = mv_in(j, k, l + pack_offset, i - nVar, q) + #:endcall GPU_PARALLEL_LOOP + + #:call GPU_PARALLEL_LOOP(collapse=5,private='[r]') + do i = nVar + 1, nVar + 4 + do l = 0, buff_size - 1 + do k = -buff_size, n + buff_size + do j = -buff_size, m + buff_size + do q = 1, nb + r = (i - 1) + (q - 1)*4 + nb*4 + v_size* & + ((j + buff_size) + (m + 2*buff_size + 1)* & + ((k + buff_size) + (n + 2*buff_size + 1)*l)) + buff_send(r) = mv_in(j, k, l + pack_offset, i - nVar, q) + end do end do end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP end if #:endif end if @@ -906,7 +915,7 @@ contains #:for rdma_mpi in [False, True] if (rdma_mpi .eqv. ${'.true.' if rdma_mpi else '.false.'}$) then #:if rdma_mpi - #:call GPU_HOST_DATA(use_device='[buff_send, buff_recv]') + #:call GPU_HOST_DATA(use_device_addr='[buff_send, buff_recv]') call nvtxStartRange("RHS-COMM-SENDRECV-RDMA") call MPI_SENDRECV( & @@ -949,167 +958,176 @@ contains #:for mpi_dir in [1, 2, 3] if (mpi_dir == ${mpi_dir}$) then #:if mpi_dir == 1 - $:GPU_PARALLEL_LOOP(collapse=4,private='[r]') - do l = 0, p - do k = 0, n - do j = -buff_size, -1 - do i = 1, nVar - r = (i - 1) + v_size* & - (j + buff_size*((k + 1) + (n + 1)*l)) - q_comm(i)%sf(j + unpack_offset, k, l) = buff_recv(r) + #:call GPU_PARALLEL_LOOP(collapse=4,private='[r]') + do l = 0, p + do k = 0, n + do j = -buff_size, -1 + do i = 1, nVar + r = (i - 1) + v_size* & + (j + buff_size*((k + 1) + (n + 1)*l)) + q_comm(i)%sf(j + unpack_offset, k, l) = buff_recv(r) #if defined(__INTEL_COMPILER) - if (ieee_is_nan(q_comm(i)%sf(j, k, l))) then - print *, "Error", j, k, l, i - error stop "NaN(s) in recv" - end if + if (ieee_is_nan(q_comm(i)%sf(j, k, l))) then + print *, "Error", j, k, l, i + error stop "NaN(s) in recv" + end if #endif + end do end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP if (qbmm_comm) then - $:GPU_PARALLEL_LOOP(collapse=5,private='[r]') - do l = 0, p - do k = 0, n - do j = -buff_size, -1 - do i = nVar + 1, nVar + 4 - do q = 1, nb - r = (i - 1) + (q - 1)*4 + v_size* & - (j + buff_size*((k + 1) + (n + 1)*l)) - pb_in(j + unpack_offset, k, l, i - nVar, q) = buff_recv(r) + #:call GPU_PARALLEL_LOOP(collapse=5,private='[r]') + do l = 0, p + do k = 0, n + do j = -buff_size, -1 + do i = nVar + 1, nVar + 4 + do q = 1, nb + r = (i - 1) + (q - 1)*4 + v_size* & + (j + buff_size*((k + 1) + (n + 1)*l)) + pb_in(j + unpack_offset, k, l, i - nVar, q) = buff_recv(r) + end do end do end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP - $:GPU_PARALLEL_LOOP(collapse=5,private='[r]') - do l = 0, p - do k = 0, n - do j = -buff_size, -1 - do i = nVar + 1, nVar + 4 - do q = 1, nb - r = (i - 1) + (q - 1)*4 + nb*4 + v_size* & - (j + buff_size*((k + 1) + (n + 1)*l)) - mv_in(j + unpack_offset, k, l, i - nVar, q) = buff_recv(r) + #:call GPU_PARALLEL_LOOP(collapse=5,private='[r]') + do l = 0, p + do k = 0, n + do j = -buff_size, -1 + do i = nVar + 1, nVar + 4 + do q = 1, nb + r = (i - 1) + (q - 1)*4 + nb*4 + v_size* & + (j + buff_size*((k + 1) + (n + 1)*l)) + mv_in(j + unpack_offset, k, l, i - nVar, q) = buff_recv(r) + end do end do end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP end if #:elif mpi_dir == 2 - $:GPU_PARALLEL_LOOP(collapse=4,private='[r]') - do i = 1, nVar - do l = 0, p - do k = -buff_size, -1 - do j = -buff_size, m + buff_size - r = (i - 1) + v_size* & - ((j + buff_size) + (m + 2*buff_size + 1)* & - ((k + buff_size) + buff_size*l)) - q_comm(i)%sf(j, k + unpack_offset, l) = buff_recv(r) + #:call GPU_PARALLEL_LOOP(collapse=4,private='[r]') + do i = 1, nVar + do l = 0, p + do k = -buff_size, -1 + do j = -buff_size, m + buff_size + r = (i - 1) + v_size* & + ((j + buff_size) + (m + 2*buff_size + 1)* & + ((k + buff_size) + buff_size*l)) + q_comm(i)%sf(j, k + unpack_offset, l) = buff_recv(r) #if defined(__INTEL_COMPILER) - if (ieee_is_nan(q_comm(i)%sf(j, k, l))) then - print *, "Error", j, k, l, i - error stop "NaN(s) in recv" - end if + if (ieee_is_nan(q_comm(i)%sf(j, k, l))) then + print *, "Error", j, k, l, i + error stop "NaN(s) in recv" + end if #endif + end do end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP if (qbmm_comm) then - $:GPU_PARALLEL_LOOP(collapse=5,private='[r]') - do i = nVar + 1, nVar + 4 - do l = 0, p - do k = -buff_size, -1 - do j = -buff_size, m + buff_size - do q = 1, nb - r = (i - 1) + (q - 1)*4 + v_size* & - ((j + buff_size) + (m + 2*buff_size + 1)* & - ((k + buff_size) + buff_size*l)) - pb_in(j, k + unpack_offset, l, i - nVar, q) = buff_recv(r) + #:call GPU_PARALLEL_LOOP(collapse=5,private='[r]') + do i = nVar + 1, nVar + 4 + do l = 0, p + do k = -buff_size, -1 + do j = -buff_size, m + buff_size + do q = 1, nb + r = (i - 1) + (q - 1)*4 + v_size* & + ((j + buff_size) + (m + 2*buff_size + 1)* & + ((k + buff_size) + buff_size*l)) + pb_in(j, k + unpack_offset, l, i - nVar, q) = buff_recv(r) + end do end do end do end do end do - end do - - $:GPU_PARALLEL_LOOP(collapse=5,private='[r]') - do i = nVar + 1, nVar + 4 - do l = 0, p - do k = -buff_size, -1 - do j = -buff_size, m + buff_size - do q = 1, nb - r = (i - 1) + (q - 1)*4 + nb*4 + v_size* & - ((j + buff_size) + (m + 2*buff_size + 1)* & - ((k + buff_size) + buff_size*l)) - mv_in(j, k + unpack_offset, l, i - nVar, q) = buff_recv(r) + #:endcall GPU_PARALLEL_LOOP + + #:call GPU_PARALLEL_LOOP(collapse=5,private='[r]') + do i = nVar + 1, nVar + 4 + do l = 0, p + do k = -buff_size, -1 + do j = -buff_size, m + buff_size + do q = 1, nb + r = (i - 1) + (q - 1)*4 + nb*4 + v_size* & + ((j + buff_size) + (m + 2*buff_size + 1)* & + ((k + buff_size) + buff_size*l)) + mv_in(j, k + unpack_offset, l, i - nVar, q) = buff_recv(r) + end do end do end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP end if #:else ! Unpacking buffer from bc_z%beg - $:GPU_PARALLEL_LOOP(collapse=4,private='[r]') - do i = 1, nVar - do l = -buff_size, -1 - do k = -buff_size, n + buff_size - do j = -buff_size, m + buff_size - r = (i - 1) + v_size* & - ((j + buff_size) + (m + 2*buff_size + 1)* & - ((k + buff_size) + (n + 2*buff_size + 1)* & - (l + buff_size))) - q_comm(i)%sf(j, k, l + unpack_offset) = buff_recv(r) + #:call GPU_PARALLEL_LOOP(collapse=4,private='[r]') + do i = 1, nVar + do l = -buff_size, -1 + do k = -buff_size, n + buff_size + do j = -buff_size, m + buff_size + r = (i - 1) + v_size* & + ((j + buff_size) + (m + 2*buff_size + 1)* & + ((k + buff_size) + (n + 2*buff_size + 1)* & + (l + buff_size))) + q_comm(i)%sf(j, k, l + unpack_offset) = buff_recv(r) #if defined(__INTEL_COMPILER) - if (ieee_is_nan(q_comm(i)%sf(j, k, l))) then - print *, "Error", j, k, l, i - error stop "NaN(s) in recv" - end if + if (ieee_is_nan(q_comm(i)%sf(j, k, l))) then + print *, "Error", j, k, l, i + error stop "NaN(s) in recv" + end if #endif + end do end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP if (qbmm_comm) then - $:GPU_PARALLEL_LOOP(collapse=5,private='[r]') - do i = nVar + 1, nVar + 4 - do l = -buff_size, -1 - do k = -buff_size, n + buff_size - do j = -buff_size, m + buff_size - do q = 1, nb - r = (i - 1) + (q - 1)*4 + v_size* & - ((j + buff_size) + (m + 2*buff_size + 1)* & - ((k + buff_size) + (n + 2*buff_size + 1)* & - (l + buff_size))) - pb_in(j, k, l + unpack_offset, i - nVar, q) = buff_recv(r) + #:call GPU_PARALLEL_LOOP(collapse=5,private='[r]') + do i = nVar + 1, nVar + 4 + do l = -buff_size, -1 + do k = -buff_size, n + buff_size + do j = -buff_size, m + buff_size + do q = 1, nb + r = (i - 1) + (q - 1)*4 + v_size* & + ((j + buff_size) + (m + 2*buff_size + 1)* & + ((k + buff_size) + (n + 2*buff_size + 1)* & + (l + buff_size))) + pb_in(j, k, l + unpack_offset, i - nVar, q) = buff_recv(r) + end do end do end do end do end do - end do - - $:GPU_PARALLEL_LOOP(collapse=5,private='[r]') - do i = nVar + 1, nVar + 4 - do l = -buff_size, -1 - do k = -buff_size, n + buff_size - do j = -buff_size, m + buff_size - do q = 1, nb - r = (i - 1) + (q - 1)*4 + nb*4 + v_size* & - ((j + buff_size) + (m + 2*buff_size + 1)* & - ((k + buff_size) + (n + 2*buff_size + 1)* & - (l + buff_size))) - mv_in(j, k, l + unpack_offset, i - nVar, q) = buff_recv(r) + #:endcall GPU_PARALLEL_LOOP + + #:call GPU_PARALLEL_LOOP(collapse=5,private='[r]') + do i = nVar + 1, nVar + 4 + do l = -buff_size, -1 + do k = -buff_size, n + buff_size + do j = -buff_size, m + buff_size + do q = 1, nb + r = (i - 1) + (q - 1)*4 + nb*4 + v_size* & + ((j + buff_size) + (m + 2*buff_size + 1)* & + ((k + buff_size) + (n + 2*buff_size + 1)* & + (l + buff_size))) + mv_in(j, k, l + unpack_offset, i - nVar, q) = buff_recv(r) + end do end do end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP end if #:endif end if diff --git a/src/common/m_nvtx.f90 b/src/common/m_nvtx.f90 index ce3273751a..524bf77781 100644 --- a/src/common/m_nvtx.f90 +++ b/src/common/m_nvtx.f90 @@ -25,7 +25,7 @@ module m_nvtx type(c_ptr) :: message ! ascii char end type nvtxEventAttributes -#if defined(MFC_OpenACC) && defined(__PGI) +#if defined(MFC_GPU) && defined(__PGI) interface nvtxRangePush ! push range with custom label and standard color @@ -58,7 +58,7 @@ subroutine nvtxStartRange(name, id) integer, intent(IN), optional :: id type(nvtxEventAttributes) :: event -#if defined(MFC_OpenACC) && defined(__PGI) +#if defined(MFC_GPU) && defined(__PGI) tempName = trim(name)//c_null_char @@ -74,7 +74,7 @@ subroutine nvtxStartRange(name, id) end subroutine nvtxStartRange subroutine nvtxEndRange -#if defined(MFC_OpenACC) && defined(__PGI) +#if defined(MFC_GPU) && defined(__PGI) call nvtxRangePop #endif end subroutine nvtxEndRange diff --git a/src/common/m_phase_change.fpp b/src/common/m_phase_change.fpp index 27cadf555a..82d8f41389 100644 --- a/src/common/m_phase_change.fpp +++ b/src/common/m_phase_change.fpp @@ -42,7 +42,7 @@ module m_phase_change real(wp) :: A, B, C, D !> @} - $:GPU_DECLARE(create='[max_iter,pCr,TCr,mixM,lp,vp,A,B,C,D]') + $:GPU_DECLARE(create='[A,B,C,D]') contains @@ -80,7 +80,7 @@ contains !! model, also considering mass depletion, depending on the incoming !! state conditions. !! @param q_cons_vf Cell-average conservative variables - pure subroutine s_infinite_relaxation_k(q_cons_vf) + subroutine s_infinite_relaxation_k(q_cons_vf) type(scalar_field), dimension(sys_size), intent(inout) :: q_cons_vf real(wp) :: pS, pSOV, pSSL !< equilibrium pressure for mixture, overheated vapor, and subcooled liquid @@ -89,189 +89,187 @@ contains real(wp) :: rho, rM, m1, m2, MCT !< total density, total reacting mass, individual reacting masses real(wp) :: TvF !< total volume fraction - $:GPU_DECLARE(create='[pS,pSOV,pSSL,TS,TSOV,TSSL,TSatOV,TSatSL]') - $:GPU_DECLARE(create='[rhoe,dynE,rhos,rho,rM,m1,m2,MCT,TvF]') + ! $:GPU_DECLARE(create='[pS,pSOV,pSSL,TS,TSOV,TSSL,TSatOV,TSatSL]') + ! $:GPU_DECLARE(create='[rhoe,dynE,rhos,rho,rM,m1,m2,MCT,TvF]') real(wp), dimension(num_fluids) :: p_infOV, p_infpT, p_infSL, sk, hk, gk, ek, rhok - $:GPU_DECLARE(create='[p_infOV,p_infpT,p_infSL,sk,hk,gk,ek,rhok]') + ! $:GPU_DECLARE(create='[p_infOV,p_infpT,p_infSL,sk,hk,gk,ek,rhok]') !< Generic loop iterators integer :: i, j, k, l ! starting equilibrium solver - $:GPU_PARALLEL_LOOP(collapse=3, private='[p_infOV, p_infpT, p_infSL, & - & sk, hk, gk, ek, rhok,pS, pSOV, pSSL, & - & TS, TSOV, TSatOV, TSatSL, TSSL, rhoe, & - & dynE, rhos, rho, rM, m1, m2, MCT, TvF]') - do j = 0, m - do k = 0, n - do l = 0, p + #:call GPU_PARALLEL_LOOP(collapse=3, private='[p_infOV, p_infpT, p_infSL, sk, hk, gk, ek, rhok,pS, pSOV, pSSL, TS, TSOV, TSatOV, TSatSL, TSSL, rhoe, dynE, rhos, rho, rM, m1, m2, MCT, TvF]') + do j = 0, m + do k = 0, n + do l = 0, p - rho = 0.0_wp; TvF = 0.0_wp - $:GPU_LOOP(parallelism='[seq]') - do i = 1, num_fluids + rho = 0.0_wp; TvF = 0.0_wp + $:GPU_LOOP(parallelism='[seq]') + do i = 1, num_fluids - ! Mixture density - rho = rho + q_cons_vf(i + contxb - 1)%sf(j, k, l) + ! Mixture density + rho = rho + q_cons_vf(i + contxb - 1)%sf(j, k, l) - ! Total Volume Fraction - TvF = TvF + q_cons_vf(i + advxb - 1)%sf(j, k, l) + ! Total Volume Fraction + TvF = TvF + q_cons_vf(i + advxb - 1)%sf(j, k, l) - end do + end do - ! calculating the total reacting mass for the phase change process. By hypothesis, this should not change - ! throughout the phase-change process. - rM = q_cons_vf(lp + contxb - 1)%sf(j, k, l) + q_cons_vf(vp + contxb - 1)%sf(j, k, l) + ! calculating the total reacting mass for the phase change process. By hypothesis, this should not change + ! throughout the phase-change process. + rM = q_cons_vf(lp + contxb - 1)%sf(j, k, l) + q_cons_vf(vp + contxb - 1)%sf(j, k, l) - ! correcting negative (reacting) mass fraction values in case they happen - call s_correct_partial_densities(MCT, q_cons_vf, rM, j, k, l) + ! correcting negative (reacting) mass fraction values in case they happen + call s_correct_partial_densities(MCT, q_cons_vf, rM, j, k, l) - ! fixing m1 and m2 AFTER correcting the partial densities. Note that these values must be stored for the phase - ! change process that will happen a posteriori - m1 = q_cons_vf(lp + contxb - 1)%sf(j, k, l) + ! fixing m1 and m2 AFTER correcting the partial densities. Note that these values must be stored for the phase + ! change process that will happen a posteriori + m1 = q_cons_vf(lp + contxb - 1)%sf(j, k, l) - m2 = q_cons_vf(vp + contxb - 1)%sf(j, k, l) + m2 = q_cons_vf(vp + contxb - 1)%sf(j, k, l) - ! kinetic energy as an auxiliary variable to the calculation of the total internal energy - dynE = 0.0_wp - $:GPU_LOOP(parallelism='[seq]') - do i = momxb, momxe + ! kinetic energy as an auxiliary variable to the calculation of the total internal energy + dynE = 0.0_wp + $:GPU_LOOP(parallelism='[seq]') + do i = momxb, momxe - dynE = dynE + 5.0e-1_wp*q_cons_vf(i)%sf(j, k, l)**2/rho + dynE = dynE + 5.0e-1_wp*q_cons_vf(i)%sf(j, k, l)**2/rho - end do + end do - ! calculating the total energy that MUST be preserved throughout the pT- and pTg-relaxation procedures - ! at each of the cells. The internal energy is calculated as the total energy minus the kinetic - ! energy to preserved its value at sharp interfaces - rhoe = q_cons_vf(E_idx)%sf(j, k, l) - dynE + ! calculating the total energy that MUST be preserved throughout the pT- and pTg-relaxation procedures + ! at each of the cells. The internal energy is calculated as the total energy minus the kinetic + ! energy to preserved its value at sharp interfaces + rhoe = q_cons_vf(E_idx)%sf(j, k, l) - dynE - ! Calling pT-equilibrium for either finishing phase-change module, or as an IC for the pTg-equilibrium - ! for this case, MFL cannot be either 0 or 1, so I chose it to be 2 - call s_infinite_pt_relaxation_k(j, k, l, 2, pS, p_infpT, q_cons_vf, rhoe, TS) + ! Calling pT-equilibrium for either finishing phase-change module, or as an IC for the pTg-equilibrium + ! for this case, MFL cannot be either 0 or 1, so I chose it to be 2 + call s_infinite_pt_relaxation_k(j, k, l, 2, pS, p_infpT, q_cons_vf, rhoe, TS) - ! check if pTg-equilibrium is required - ! NOTE that NOTHING else needs to be updated OTHER than the individual partial densities - ! given the outputs from the pT- and pTg-equilibrium solvers are just p and one of the partial masses - ! (pTg- case) - if ((relax_model == 6) .and. ((q_cons_vf(lp + contxb - 1)%sf(j, k, l) > mixM*rM) & - .and. (q_cons_vf(vp + contxb - 1)%sf(j, k, l) > mixM*rM)) & - .and. (pS < pCr) .and. (TS < TCr)) then + ! check if pTg-equilibrium is required + ! NOTE that NOTHING else needs to be updated OTHER than the individual partial densities + ! given the outputs from the pT- and pTg-equilibrium solvers are just p and one of the partial masses + ! (pTg- case) + if ((relax_model == 6) .and. ((q_cons_vf(lp + contxb - 1)%sf(j, k, l) > mixM*rM) & + .and. (q_cons_vf(vp + contxb - 1)%sf(j, k, l) > mixM*rM)) & + .and. (pS < pCr) .and. (TS < TCr)) then - ! Checking if phase change is needed, by checking whether the final solution is either subcoooled - ! liquid or overheated vapor. + ! Checking if phase change is needed, by checking whether the final solution is either subcoooled + ! liquid or overheated vapor. - ! overheated vapor case - ! depleting the mass of liquid - q_cons_vf(lp + contxb - 1)%sf(j, k, l) = mixM*rM + ! overheated vapor case + ! depleting the mass of liquid + q_cons_vf(lp + contxb - 1)%sf(j, k, l) = mixM*rM - ! transferring the total mass to vapor - q_cons_vf(vp + contxb - 1)%sf(j, k, l) = (1.0_wp - mixM)*rM + ! transferring the total mass to vapor + q_cons_vf(vp + contxb - 1)%sf(j, k, l) = (1.0_wp - mixM)*rM - ! calling pT-equilibrium for overheated vapor, which is MFL = 0 - call s_infinite_pt_relaxation_k(j, k, l, 0, pSOV, p_infOV, q_cons_vf, rhoe, TSOV) + ! calling pT-equilibrium for overheated vapor, which is MFL = 0 + call s_infinite_pt_relaxation_k(j, k, l, 0, pSOV, p_infOV, q_cons_vf, rhoe, TSOV) - ! calculating Saturation temperature - call s_TSat(pSOV, TSatOV, TSOV) + ! calculating Saturation temperature + call s_TSat(pSOV, TSatOV, TSOV) - ! subcooled liquid case - ! transferring the total mass to liquid - q_cons_vf(lp + contxb - 1)%sf(j, k, l) = (1.0_wp - mixM)*rM + ! subcooled liquid case + ! transferring the total mass to liquid + q_cons_vf(lp + contxb - 1)%sf(j, k, l) = (1.0_wp - mixM)*rM - ! depleting the mass of vapor - q_cons_vf(vp + contxb - 1)%sf(j, k, l) = mixM*rM + ! depleting the mass of vapor + q_cons_vf(vp + contxb - 1)%sf(j, k, l) = mixM*rM - ! calling pT-equilibrium for subcooled liquid, which is MFL = 1 - call s_infinite_pt_relaxation_k(j, k, l, 1, pSSL, p_infSL, q_cons_vf, rhoe, TSSL) + ! calling pT-equilibrium for subcooled liquid, which is MFL = 1 + call s_infinite_pt_relaxation_k(j, k, l, 1, pSSL, p_infSL, q_cons_vf, rhoe, TSSL) - ! calculating Saturation temperature - call s_TSat(pSSL, TSatSL, TSSL) + ! calculating Saturation temperature + call s_TSat(pSSL, TSatSL, TSSL) - ! checking the conditions for overheated vapor and subcooled liquide - if (TSOV > TSatOV) then + ! checking the conditions for overheated vapor and subcooled liquide + if (TSOV > TSatOV) then - ! Assigning pressure - pS = pSOV + ! Assigning pressure + pS = pSOV - ! Assigning Temperature - TS = TSOV + ! Assigning Temperature + TS = TSOV - ! correcting the liquid partial density - q_cons_vf(lp + contxb - 1)%sf(j, k, l) = mixM*rM + ! correcting the liquid partial density + q_cons_vf(lp + contxb - 1)%sf(j, k, l) = mixM*rM - ! correcting the vapor partial density - q_cons_vf(vp + contxb - 1)%sf(j, k, l) = (1.0_wp - mixM)*rM + ! correcting the vapor partial density + q_cons_vf(vp + contxb - 1)%sf(j, k, l) = (1.0_wp - mixM)*rM - elseif (TSSL < TSatSL) then + elseif (TSSL < TSatSL) then - ! Assigning pressure - pS = pSSL + ! Assigning pressure + pS = pSSL - ! Assigning Temperature - TS = TSSL + ! Assigning Temperature + TS = TSSL - ! correcting the liquid partial density - q_cons_vf(lp + contxb - 1)%sf(j, k, l) = (1.0_wp - mixM)*rM + ! correcting the liquid partial density + q_cons_vf(lp + contxb - 1)%sf(j, k, l) = (1.0_wp - mixM)*rM - ! correcting the vapor partial density - q_cons_vf(vp + contxb - 1)%sf(j, k, l) = mixM*rM + ! correcting the vapor partial density + q_cons_vf(vp + contxb - 1)%sf(j, k, l) = mixM*rM - else + else - ! returning partial pressures to what they were from the homogeneous solver - ! liquid - q_cons_vf(lp + contxb - 1)%sf(j, k, l) = m1 + ! returning partial pressures to what they were from the homogeneous solver + ! liquid + q_cons_vf(lp + contxb - 1)%sf(j, k, l) = m1 - ! vapor - q_cons_vf(vp + contxb - 1)%sf(j, k, l) = m2 + ! vapor + q_cons_vf(vp + contxb - 1)%sf(j, k, l) = m2 - ! calling the pTg-equilibrium solver - call s_infinite_ptg_relaxation_k(j, k, l, pS, p_infpT, rhoe, q_cons_vf, TS) + ! calling the pTg-equilibrium solver + call s_infinite_ptg_relaxation_k(j, k, l, pS, p_infpT, rhoe, q_cons_vf, TS) - end if + end if - end if + end if - ! Calculations AFTER equilibrium + ! Calculations AFTER equilibrium - ! entropy - sk(1:num_fluids) = cvs(1:num_fluids)*log((TS**gs_min(1:num_fluids)) & - /((pS + ps_inf(1:num_fluids))**(gs_min(1:num_fluids) - 1.0_wp))) + qvps(1:num_fluids) + ! entropy + sk(1:num_fluids) = cvs(1:num_fluids)*log((TS**gs_min(1:num_fluids)) & + /((pS + ps_inf(1:num_fluids))**(gs_min(1:num_fluids) - 1.0_wp))) + qvps(1:num_fluids) - ! enthalpy - hk(1:num_fluids) = gs_min(1:num_fluids)*cvs(1:num_fluids)*TS & - + qvs(1:num_fluids) + ! enthalpy + hk(1:num_fluids) = gs_min(1:num_fluids)*cvs(1:num_fluids)*TS & + + qvs(1:num_fluids) - ! Gibbs-free energy - gk(1:num_fluids) = hk(1:num_fluids) - TS*sk(1:num_fluids) + ! Gibbs-free energy + gk(1:num_fluids) = hk(1:num_fluids) - TS*sk(1:num_fluids) - ! densities - rhok(1:num_fluids) = (pS + ps_inf(1:num_fluids)) & - /((gs_min(1:num_fluids) - 1)*cvs(1:num_fluids)*TS) + ! densities + rhok(1:num_fluids) = (pS + ps_inf(1:num_fluids)) & + /((gs_min(1:num_fluids) - 1)*cvs(1:num_fluids)*TS) - ! internal energy - ek(1:num_fluids) = (pS + gs_min(1:num_fluids) & - *ps_inf(1:num_fluids))/(pS + ps_inf(1:num_fluids)) & - *cvs(1:num_fluids)*TS + qvs(1:num_fluids) + ! internal energy + ek(1:num_fluids) = (pS + gs_min(1:num_fluids) & + *ps_inf(1:num_fluids))/(pS + ps_inf(1:num_fluids)) & + *cvs(1:num_fluids)*TS + qvs(1:num_fluids) - ! calculating volume fractions, internal energies, and total entropy - rhos = 0.0_wp - $:GPU_LOOP(parallelism='[seq]') - do i = 1, num_fluids + ! calculating volume fractions, internal energies, and total entropy + rhos = 0.0_wp + $:GPU_LOOP(parallelism='[seq]') + do i = 1, num_fluids - ! volume fractions - q_cons_vf(i + advxb - 1)%sf(j, k, l) = q_cons_vf(i + contxb - 1)%sf(j, k, l)/rhok(i) + ! volume fractions + q_cons_vf(i + advxb - 1)%sf(j, k, l) = q_cons_vf(i + contxb - 1)%sf(j, k, l)/rhok(i) - ! alpha*rho*e - q_cons_vf(i + intxb - 1)%sf(j, k, l) = q_cons_vf(i + contxb - 1)%sf(j, k, l)*ek(i) + ! alpha*rho*e + q_cons_vf(i + intxb - 1)%sf(j, k, l) = q_cons_vf(i + contxb - 1)%sf(j, k, l)*ek(i) - ! Total entropy - rhos = rhos + q_cons_vf(i + contxb - 1)%sf(j, k, l)*sk(i) + ! Total entropy + rhos = rhos + q_cons_vf(i + contxb - 1)%sf(j, k, l)*sk(i) + end do end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP end subroutine s_infinite_relaxation_k @@ -279,14 +277,14 @@ contains !! @param j generic loop iterator for x direction !! @param k generic loop iterator for y direction !! @param l generic loop iterator for z direction - !! @param MFL flag that tells whether the fluid is pure gas (0), pure liquid (1), or a mixture (2) + !! @param MFL flag that tells whether the fluid is gas (0), liquid (1), or a mixture (2) !! @param pS equilibrium pressure at the interface !! @param p_infpT stiffness for the participating fluids under pT-equilibrium !! @param rM sum of the reacting masses !! @param q_cons_vf Cell-average conservative variables !! @param rhoe mixture energy !! @param TS equilibrium temperature at the interface - pure subroutine s_infinite_pt_relaxation_k(j, k, l, MFL, pS, p_infpT, q_cons_vf, rhoe, TS) + subroutine s_infinite_pt_relaxation_k(j, k, l, MFL, pS, p_infpT, q_cons_vf, rhoe, TS) $:GPU_ROUTINE(function_name='s_infinite_pt_relaxation_k', & & parallelism='[seq]', cray_inline=True) @@ -386,7 +384,7 @@ contains !! @param rhoe mixture energy !! @param q_cons_vf Cell-average conservative variables !! @param TS equilibrium temperature at the interface - pure subroutine s_infinite_ptg_relaxation_k(j, k, l, pS, p_infpT, rhoe, q_cons_vf, TS) + subroutine s_infinite_ptg_relaxation_k(j, k, l, pS, p_infpT, rhoe, q_cons_vf, TS) $:GPU_ROUTINE(function_name='s_infinite_ptg_relaxation_k', & & parallelism='[seq]', cray_inline=True) @@ -507,7 +505,7 @@ contains !! @param j generic loop iterator for x direction !! @param k generic loop iterator for y direction !! @param l generic loop iterator for z direction - pure subroutine s_correct_partial_densities(MCT, q_cons_vf, rM, j, k, l) + subroutine s_correct_partial_densities(MCT, q_cons_vf, rM, j, k, l) $:GPU_ROUTINE(function_name='s_correct_partial_densities', & & parallelism='[seq]', cray_inline=True) @@ -566,7 +564,7 @@ contains !! @param pS equilibrium pressure at the interface !! @param q_cons_vf Cell-average conservative variables !! @param TJac Transpose of the Jacobian Matrix - pure subroutine s_compute_jacobian_matrix(InvJac, j, Jac, k, l, mCPD, mCVGP, mCVGP2, pS, q_cons_vf, TJac) + subroutine s_compute_jacobian_matrix(InvJac, j, Jac, k, l, mCPD, mCVGP, mCVGP2, pS, q_cons_vf, TJac) $:GPU_ROUTINE(function_name='s_compute_jacobian_matrix', & & parallelism='[seq]', cray_inline=True) @@ -669,7 +667,7 @@ contains !! @param pS equilibrium pressure at the interface !! @param rhoe mixture energy !! @param R2D (2D) residue array - pure subroutine s_compute_pTg_residue(j, k, l, mCPD, mCVGP, mQD, q_cons_vf, pS, rhoe, R2D) + subroutine s_compute_pTg_residue(j, k, l, mCPD, mCVGP, mQD, q_cons_vf, pS, rhoe, R2D) $:GPU_ROUTINE(function_name='s_compute_pTg_residue', & & parallelism='[seq]', cray_inline=True) @@ -716,7 +714,7 @@ contains !! @param pSat Saturation Pressure !! @param TSat Saturation Temperature !! @param TSIn equilibrium Temperature - pure elemental subroutine s_TSat(pSat, TSat, TSIn) + elemental subroutine s_TSat(pSat, TSat, TSIn) $:GPU_ROUTINE(function_name='s_TSat',parallelism='[seq]', & & cray_inline=True) diff --git a/src/common/m_variables_conversion.fpp b/src/common/m_variables_conversion.fpp index 085c7e7ff1..4d4bf60fa3 100644 --- a/src/common/m_variables_conversion.fpp +++ b/src/common/m_variables_conversion.fpp @@ -53,10 +53,10 @@ module m_variables_conversion $:GPU_DECLARE(create='[gammas,gs_min,pi_infs,ps_inf,cvs,qvs,qvps]') #endif - real(wp), allocatable, dimension(:) :: Gs - integer, allocatable, dimension(:) :: bubrs - real(wp), allocatable, dimension(:, :) :: Res - $:GPU_DECLARE(create='[bubrs,Gs,Res]') + real(wp), allocatable, dimension(:) :: Gs_vc + integer, allocatable, dimension(:) :: bubrs_vc + real(wp), allocatable, dimension(:, :) :: Res_vc + $:GPU_DECLARE(create='[bubrs_vc,Gs_vc,Res_vc]') integer :: is1b, is2b, is3b, is1e, is2e, is3e $:GPU_DECLARE(create='[is1b,is2b,is3b,is1e,is2e,is3e]') @@ -464,10 +464,10 @@ contains end subroutine s_convert_species_to_mixture_variables - pure subroutine s_convert_species_to_mixture_variables_acc(rho_K, & - gamma_K, pi_inf_K, qv_K, & - alpha_K, alpha_rho_K, Re_K, & - G_K, G) + subroutine s_convert_species_to_mixture_variables_acc(rho_K, & + gamma_K, pi_inf_K, qv_K, & + alpha_K, alpha_rho_K, Re_K, & + G_K, G) $:GPU_ROUTINE(function_name='s_convert_species_to_mixture_variables_acc', & & parallelism='[seq]', cray_inline=True) @@ -516,7 +516,7 @@ contains if (present(G_K)) then G_K = 0._wp do i = 1, num_fluids - !TODO: change to use Gs directly here? + !TODO: change to use Gs_vc directly here? !TODO: Make this changes as well for GPUs G_K = G_K + alpha_K(i)*G(i) end do @@ -531,7 +531,7 @@ contains if (Re_size(i) > 0) Re_K(i) = 0._wp do j = 1, Re_size(i) - Re_K(i) = alpha_K(Re_idx(i, j))/Res(i, j) & + Re_K(i) = alpha_K(Re_idx(i, j))/Res_vc(i, j) & + Re_K(i) end do @@ -543,9 +543,9 @@ contains end subroutine s_convert_species_to_mixture_variables_acc - pure subroutine s_convert_species_to_mixture_variables_bubbles_acc(rho_K, & - gamma_K, pi_inf_K, qv_K, & - alpha_K, alpha_rho_K, Re_K) + subroutine s_convert_species_to_mixture_variables_bubbles_acc(rho_K, & + gamma_K, pi_inf_K, qv_K, & + alpha_K, alpha_rho_K, Re_K) $:GPU_ROUTINE(function_name='s_convert_species_to_mixture_variables_bubbles_acc', & & parallelism='[seq]', cray_inline=True) @@ -594,7 +594,7 @@ contains if (Re_size(i) > 0) Re_K(i) = 0._wp do j = 1, Re_size(i) - Re_K(i) = (1._wp - alpha_K(Re_idx(i, j)))/Res(i, j) & + Re_K(i) = (1._wp - alpha_K(Re_idx(i, j)))/Res_vc(i, j) & + Re_K(i) end do @@ -624,7 +624,7 @@ contains @:ALLOCATE(cvs (1:num_fluids)) @:ALLOCATE(qvs (1:num_fluids)) @:ALLOCATE(qvps (1:num_fluids)) - @:ALLOCATE(Gs (1:num_fluids)) + @:ALLOCATE(Gs_vc (1:num_fluids)) #else @:ALLOCATE(gammas (1:num_fluids)) @:ALLOCATE(gs_min (1:num_fluids)) @@ -633,46 +633,46 @@ contains @:ALLOCATE(cvs (1:num_fluids)) @:ALLOCATE(qvs (1:num_fluids)) @:ALLOCATE(qvps (1:num_fluids)) - @:ALLOCATE(Gs (1:num_fluids)) + @:ALLOCATE(Gs_vc (1:num_fluids)) #endif do i = 1, num_fluids gammas(i) = fluid_pp(i)%gamma gs_min(i) = 1.0_wp/gammas(i) + 1.0_wp pi_infs(i) = fluid_pp(i)%pi_inf - Gs(i) = fluid_pp(i)%G + Gs_vc(i) = fluid_pp(i)%G ps_inf(i) = pi_infs(i)/(1.0_wp + gammas(i)) cvs(i) = fluid_pp(i)%cv qvs(i) = fluid_pp(i)%qv qvps(i) = fluid_pp(i)%qvp end do - $:GPU_UPDATE(device='[gammas,gs_min,pi_infs,ps_inf,cvs,qvs,qvps,Gs]') + $:GPU_UPDATE(device='[gammas,gs_min,pi_infs,ps_inf,cvs,qvs,qvps,Gs_vc]') #ifdef MFC_SIMULATION if (viscous) then - @:ALLOCATE(Res(1:2, 1:Re_size_max)) + @:ALLOCATE(Res_vc(1:2, 1:Re_size_max)) do i = 1, 2 do j = 1, Re_size(i) - Res(i, j) = fluid_pp(Re_idx(i, j))%Re(i) + Res_vc(i, j) = fluid_pp(Re_idx(i, j))%Re(i) end do end do - $:GPU_UPDATE(device='[Res,Re_idx,Re_size]') + $:GPU_UPDATE(device='[Res_vc,Re_idx,Re_size]') end if #endif if (bubbles_euler) then #ifdef MFC_SIMULATION - @:ALLOCATE(bubrs(1:nb)) + @:ALLOCATE(bubrs_vc(1:nb)) #else - @:ALLOCATE(bubrs(1:nb)) + @:ALLOCATE(bubrs_vc(1:nb)) #endif do i = 1, nb - bubrs(i) = bub_idx%rs(i) + bubrs_vc(i) = bub_idx%rs(i) end do - $:GPU_UPDATE(device='[bubrs]') + $:GPU_UPDATE(device='[bubrs_vc]') end if #ifdef MFC_POST_PROCESS @@ -737,7 +737,7 @@ contains end subroutine s_initialize_variables_conversion_module !Initialize mv at the quadrature nodes based on the initialized moments and sigma - pure subroutine s_initialize_mv(qK_cons_vf, mv) + subroutine s_initialize_mv(qK_cons_vf, mv) type(scalar_field), dimension(sys_size), intent(in) :: qK_cons_vf @@ -770,7 +770,7 @@ contains end subroutine s_initialize_mv !Initialize pb at the quadrature nodes using isothermal relations (Preston model) - pure subroutine s_initialize_pb(qK_cons_vf, mv, pb) + subroutine s_initialize_pb(qK_cons_vf, mv, pb) type(scalar_field), dimension(sys_size), intent(in) :: qK_cons_vf real(wp), dimension(idwint(1)%beg:, idwint(2)%beg:, idwint(3)%beg:, 1:, 1:), intent(in) :: mv @@ -873,300 +873,299 @@ contains end if #:endif - $:GPU_PARALLEL_LOOP(collapse=3, private='[alpha_K, alpha_rho_K, Re_K, & - & nRtmp, rho_K, gamma_K, pi_inf_K,qv_K, & - & dyn_pres_K, rhoYks, B]') - do l = ibounds(3)%beg, ibounds(3)%end - do k = ibounds(2)%beg, ibounds(2)%end - do j = ibounds(1)%beg, ibounds(1)%end - dyn_pres_K = 0._wp - - if (igr) then - if (num_fluids == 1) then - alpha_rho_K(1) = qK_cons_vf(contxb)%sf(j, k, l) - alpha_K(1) = 1._wp + #:call GPU_PARALLEL_LOOP(collapse=3, private='[alpha_K, alpha_rho_K, Re_K, nRtmp, rho_K, gamma_K, pi_inf_K,qv_K, dyn_pres_K, rhoYks, B, T]') + do l = ibounds(3)%beg, ibounds(3)%end + do k = ibounds(2)%beg, ibounds(2)%end + do j = ibounds(1)%beg, ibounds(1)%end + dyn_pres_K = 0._wp + + if (igr) then + if (num_fluids == 1) then + alpha_rho_K(1) = qK_cons_vf(contxb)%sf(j, k, l) + alpha_K(1) = 1._wp + else + $:GPU_LOOP(parallelism='[seq]') + do i = 1, num_fluids - 1 + alpha_rho_K(i) = qK_cons_vf(i)%sf(j, k, l) + alpha_K(i) = qK_cons_vf(advxb + i - 1)%sf(j, k, l) + end do + + alpha_rho_K(num_fluids) = qK_cons_vf(num_fluids)%sf(j, k, l) + alpha_K(num_fluids) = 1._wp - sum(alpha_K(1:num_fluids - 1)) + end if else $:GPU_LOOP(parallelism='[seq]') - do i = 1, num_fluids - 1 + do i = 1, num_fluids alpha_rho_K(i) = qK_cons_vf(i)%sf(j, k, l) alpha_K(i) = qK_cons_vf(advxb + i - 1)%sf(j, k, l) end do - - alpha_rho_K(num_fluids) = qK_cons_vf(num_fluids)%sf(j, k, l) - alpha_K(num_fluids) = 1._wp - sum(alpha_K(1:num_fluids - 1)) end if - else - $:GPU_LOOP(parallelism='[seq]') - do i = 1, num_fluids - alpha_rho_K(i) = qK_cons_vf(i)%sf(j, k, l) - alpha_K(i) = qK_cons_vf(advxb + i - 1)%sf(j, k, l) - end do - end if - if (model_eqns /= 4) then + if (model_eqns /= 4) then #ifdef MFC_SIMULATION - ! If in simulation, use acc mixture subroutines - if (elasticity) then - call s_convert_species_to_mixture_variables_acc(rho_K, gamma_K, pi_inf_K, qv_K, alpha_K, & - alpha_rho_K, Re_K, G_K, Gs) - else if (bubbles_euler) then - call s_convert_species_to_mixture_variables_bubbles_acc(rho_K, gamma_K, pi_inf_K, qv_K, & - alpha_K, alpha_rho_K, Re_K) - else - call s_convert_species_to_mixture_variables_acc(rho_K, gamma_K, pi_inf_K, qv_K, & - alpha_K, alpha_rho_K, Re_K) - end if + ! If in simulation, use acc mixture subroutines + if (elasticity) then + call s_convert_species_to_mixture_variables_acc(rho_K, gamma_K, pi_inf_K, qv_K, alpha_K, & + alpha_rho_K, Re_K, G_K, Gs_vc) + else if (bubbles_euler) then + call s_convert_species_to_mixture_variables_bubbles_acc(rho_K, gamma_K, pi_inf_K, qv_K, & + alpha_K, alpha_rho_K, Re_K) + else + call s_convert_species_to_mixture_variables_acc(rho_K, gamma_K, pi_inf_K, qv_K, & + alpha_K, alpha_rho_K, Re_K) + end if #else - ! If pre-processing, use non acc mixture subroutines - if (elasticity) then - call s_convert_to_mixture_variables(qK_cons_vf, j, k, l, & - rho_K, gamma_K, pi_inf_K, qv_K, Re_K, G_K, fluid_pp(:)%G) - else - call s_convert_to_mixture_variables(qK_cons_vf, j, k, l, & - rho_K, gamma_K, pi_inf_K, qv_K) - end if + ! If pre-processing, use non acc mixture subroutines + if (elasticity) then + call s_convert_to_mixture_variables(qK_cons_vf, j, k, l, & + rho_K, gamma_K, pi_inf_K, qv_K, Re_K, G_K, fluid_pp(:)%G) + else + call s_convert_to_mixture_variables(qK_cons_vf, j, k, l, & + rho_K, gamma_K, pi_inf_K, qv_K) + end if #endif - end if - - if (relativity) then - if (n == 0) then - B(1) = Bx0 - B(2) = qK_cons_vf(B_idx%beg)%sf(j, k, l) - B(3) = qK_cons_vf(B_idx%beg + 1)%sf(j, k, l) - else - B(1) = qK_cons_vf(B_idx%beg)%sf(j, k, l) - B(2) = qK_cons_vf(B_idx%beg + 1)%sf(j, k, l) - B(3) = qK_cons_vf(B_idx%beg + 2)%sf(j, k, l) end if - B2 = B(1)**2 + B(2)**2 + B(3)**2 - - m2 = 0._wp - $:GPU_LOOP(parallelism='[seq]') - do i = momxb, momxe - m2 = m2 + qK_cons_vf(i)%sf(j, k, l)**2 - end do - S = 0._wp - $:GPU_LOOP(parallelism='[seq]') - do i = 1, 3 - S = S + qK_cons_vf(momxb + i - 1)%sf(j, k, l)*B(i) - end do + if (relativity) then + if (n == 0) then + B(1) = Bx0 + B(2) = qK_cons_vf(B_idx%beg)%sf(j, k, l) + B(3) = qK_cons_vf(B_idx%beg + 1)%sf(j, k, l) + else + B(1) = qK_cons_vf(B_idx%beg)%sf(j, k, l) + B(2) = qK_cons_vf(B_idx%beg + 1)%sf(j, k, l) + B(3) = qK_cons_vf(B_idx%beg + 2)%sf(j, k, l) + end if + B2 = B(1)**2 + B(2)**2 + B(3)**2 - E = qK_cons_vf(E_idx)%sf(j, k, l) + m2 = 0._wp + $:GPU_LOOP(parallelism='[seq]') + do i = momxb, momxe + m2 = m2 + qK_cons_vf(i)%sf(j, k, l)**2 + end do - D = 0._wp - $:GPU_LOOP(parallelism='[seq]') - do i = 1, contxe - D = D + qK_cons_vf(i)%sf(j, k, l) - end do + S = 0._wp + $:GPU_LOOP(parallelism='[seq]') + do i = 1, 3 + S = S + qK_cons_vf(momxb + i - 1)%sf(j, k, l)*B(i) + end do - ! Newton-Raphson - W = E + D - $:GPU_LOOP(parallelism='[seq]') - do iter = 1, relativity_cons_to_prim_max_iter - Ga = (W + B2)*W/sqrt((W + B2)**2*W**2 - (m2*W**2 + S**2*(2*W + B2))) - pres = (W - D*Ga)/((gamma_K + 1)*Ga**2) ! Thermal pressure from EOS - f = W - pres + (1 - 1/(2*Ga**2))*B2 - S**2/(2*W**2) - E - D - - ! The first equation below corrects a typo in (Mignone & Bodo, 2006) - ! m2*W**2 → 2*m2*W**2, which would cancel with the 2* in other terms - ! This corrected version is not used as the second equation empirically converges faster. - ! First equation is kept for further investigation. - ! dGa_dW = -Ga**3 * ( S**2*(3*W**2+3*W*B2+B2**2) + m2*W**2 ) / (W**3 * (W+B2)**3) ! first (corrected) - dGa_dW = -Ga**3*(2*S**2*(3*W**2 + 3*W*B2 + B2**2) + m2*W**2)/(2*W**3*(W + B2)**3) ! second (in paper) - - dp_dW = (Ga*(1 + D*dGa_dW) - 2*W*dGa_dW)/((gamma_K + 1)*Ga**3) - df_dW = 1 - dp_dW + (B2/Ga**3)*dGa_dW + S**2/W**3 - - dW = -f/df_dW - W = W + dW - if (abs(dW) < 1.e-12_wp*W) exit - end do + E = qK_cons_vf(E_idx)%sf(j, k, l) - ! Recalculate pressure using converged W - Ga = (W + B2)*W/sqrt((W + B2)**2*W**2 - (m2*W**2 + S**2*(2*W + B2))) - qK_prim_vf(E_idx)%sf(j, k, l) = (W - D*Ga)/((gamma_K + 1)*Ga**2) + D = 0._wp + $:GPU_LOOP(parallelism='[seq]') + do i = 1, contxe + D = D + qK_cons_vf(i)%sf(j, k, l) + end do - ! Recover the other primitive variables - $:GPU_LOOP(parallelism='[seq]') - do i = 1, 3 - qK_prim_vf(momxb + i - 1)%sf(j, k, l) = (qK_cons_vf(momxb + i - 1)%sf(j, k, l) + (S/W)*B(i))/(W + B2) - end do - qK_prim_vf(1)%sf(j, k, l) = D/Ga ! Hard-coded for single-component for now + ! Newton-Raphson + W = E + D + $:GPU_LOOP(parallelism='[seq]') + do iter = 1, relativity_cons_to_prim_max_iter + Ga = (W + B2)*W/sqrt((W + B2)**2*W**2 - (m2*W**2 + S**2*(2*W + B2))) + pres = (W - D*Ga)/((gamma_K + 1)*Ga**2) ! Thermal pressure from EOS + f = W - pres + (1 - 1/(2*Ga**2))*B2 - S**2/(2*W**2) - E - D + + ! The first equation below corrects a typo in (Mignone & Bodo, 2006) + ! m2*W**2 → 2*m2*W**2, which would cancel with the 2* in other terms + ! This corrected version is not used as the second equation empirically converges faster. + ! First equation is kept for further investigation. + ! dGa_dW = -Ga**3 * ( S**2*(3*W**2+3*W*B2+B2**2) + m2*W**2 ) / (W**3 * (W+B2)**3) ! first (corrected) + dGa_dW = -Ga**3*(2*S**2*(3*W**2 + 3*W*B2 + B2**2) + m2*W**2)/(2*W**3*(W + B2)**3) ! second (in paper) + + dp_dW = (Ga*(1 + D*dGa_dW) - 2*W*dGa_dW)/((gamma_K + 1)*Ga**3) + df_dW = 1 - dp_dW + (B2/Ga**3)*dGa_dW + S**2/W**3 + + dW = -f/df_dW + W = W + dW + if (abs(dW) < 1.e-12_wp*W) exit + end do - $:GPU_LOOP(parallelism='[seq]') - do i = B_idx%beg, B_idx%end - qK_prim_vf(i)%sf(j, k, l) = qK_cons_vf(i)%sf(j, k, l) - end do + ! Recalculate pressure using converged W + Ga = (W + B2)*W/sqrt((W + B2)**2*W**2 - (m2*W**2 + S**2*(2*W + B2))) + qK_prim_vf(E_idx)%sf(j, k, l) = (W - D*Ga)/((gamma_K + 1)*Ga**2) - cycle ! skip all the non-relativistic conversions below - end if + ! Recover the other primitive variables + $:GPU_LOOP(parallelism='[seq]') + do i = 1, 3 + qK_prim_vf(momxb + i - 1)%sf(j, k, l) = (qK_cons_vf(momxb + i - 1)%sf(j, k, l) + (S/W)*B(i))/(W + B2) + end do + qK_prim_vf(1)%sf(j, k, l) = D/Ga ! Hard-coded for single-component for now - if (chemistry) then - rho_K = 0._wp - $:GPU_LOOP(parallelism='[seq]') - do i = chemxb, chemxe - rho_K = rho_K + max(0._wp, qK_cons_vf(i)%sf(j, k, l)) - end do + $:GPU_LOOP(parallelism='[seq]') + do i = B_idx%beg, B_idx%end + qK_prim_vf(i)%sf(j, k, l) = qK_cons_vf(i)%sf(j, k, l) + end do - $:GPU_LOOP(parallelism='[seq]') - do i = 1, contxe - qK_prim_vf(i)%sf(j, k, l) = rho_K - end do + cycle ! skip all the non-relativistic conversions below + end if - $:GPU_LOOP(parallelism='[seq]') - do i = chemxb, chemxe - qK_prim_vf(i)%sf(j, k, l) = max(0._wp, qK_cons_vf(i)%sf(j, k, l)/rho_K) - end do - else - $:GPU_LOOP(parallelism='[seq]') - do i = 1, contxe - qK_prim_vf(i)%sf(j, k, l) = qK_cons_vf(i)%sf(j, k, l) - end do - end if + if (chemistry) then + rho_K = 0._wp + $:GPU_LOOP(parallelism='[seq]') + do i = chemxb, chemxe + rho_K = rho_K + max(0._wp, qK_cons_vf(i)%sf(j, k, l)) + end do -#ifdef MFC_SIMULATION - rho_K = max(rho_K, sgm_eps) -#endif + $:GPU_LOOP(parallelism='[seq]') + do i = 1, contxe + qK_prim_vf(i)%sf(j, k, l) = rho_K + end do - $:GPU_LOOP(parallelism='[seq]') - do i = momxb, momxe - if (model_eqns /= 4) then - qK_prim_vf(i)%sf(j, k, l) = qK_cons_vf(i)%sf(j, k, l) & - /rho_K - dyn_pres_K = dyn_pres_K + 5.e-1_wp*qK_cons_vf(i)%sf(j, k, l) & - *qK_prim_vf(i)%sf(j, k, l) + $:GPU_LOOP(parallelism='[seq]') + do i = chemxb, chemxe + qK_prim_vf(i)%sf(j, k, l) = max(0._wp, qK_cons_vf(i)%sf(j, k, l)/rho_K) + end do else - qK_prim_vf(i)%sf(j, k, l) = qK_cons_vf(i)%sf(j, k, l) & - /qK_cons_vf(1)%sf(j, k, l) + $:GPU_LOOP(parallelism='[seq]') + do i = 1, contxe + qK_prim_vf(i)%sf(j, k, l) = qK_cons_vf(i)%sf(j, k, l) + end do end if - end do - if (chemistry) then +#ifdef MFC_SIMULATION + rho_K = max(rho_K, sgm_eps) +#endif + $:GPU_LOOP(parallelism='[seq]') - do i = 1, num_species - rhoYks(i) = qK_cons_vf(chemxb + i - 1)%sf(j, k, l) + do i = momxb, momxe + if (model_eqns /= 4) then + qK_prim_vf(i)%sf(j, k, l) = qK_cons_vf(i)%sf(j, k, l) & + /rho_K + dyn_pres_K = dyn_pres_K + 5.e-1_wp*qK_cons_vf(i)%sf(j, k, l) & + *qK_prim_vf(i)%sf(j, k, l) + else + qK_prim_vf(i)%sf(j, k, l) = qK_cons_vf(i)%sf(j, k, l) & + /qK_cons_vf(1)%sf(j, k, l) + end if end do - T = q_T_sf%sf(j, k, l) - end if + if (chemistry) then + $:GPU_LOOP(parallelism='[seq]') + do i = 1, num_species + rhoYks(i) = qK_cons_vf(chemxb + i - 1)%sf(j, k, l) + end do - if (mhd) then - if (n == 0) then - pres_mag = 0.5_wp*(Bx0**2 + qK_cons_vf(B_idx%beg)%sf(j, k, l)**2 + qK_cons_vf(B_idx%beg + 1)%sf(j, k, l)**2) + T = q_T_sf%sf(j, k, l) + end if + + if (mhd) then + if (n == 0) then + pres_mag = 0.5_wp*(Bx0**2 + qK_cons_vf(B_idx%beg)%sf(j, k, l)**2 + qK_cons_vf(B_idx%beg + 1)%sf(j, k, l)**2) + else + pres_mag = 0.5_wp*(qK_cons_vf(B_idx%beg)%sf(j, k, l)**2 + qK_cons_vf(B_idx%beg + 1)%sf(j, k, l)**2 + qK_cons_vf(B_idx%beg + 2)%sf(j, k, l)**2) + end if else - pres_mag = 0.5_wp*(qK_cons_vf(B_idx%beg)%sf(j, k, l)**2 + qK_cons_vf(B_idx%beg + 1)%sf(j, k, l)**2 + qK_cons_vf(B_idx%beg + 2)%sf(j, k, l)**2) + pres_mag = 0._wp end if - else - pres_mag = 0._wp - end if - call s_compute_pressure(qK_cons_vf(E_idx)%sf(j, k, l), & - qK_cons_vf(alf_idx)%sf(j, k, l), & - dyn_pres_K, pi_inf_K, gamma_K, rho_K, & - qv_K, rhoYks, pres, T, pres_mag=pres_mag) + call s_compute_pressure(qK_cons_vf(E_idx)%sf(j, k, l), & + qK_cons_vf(alf_idx)%sf(j, k, l), & + dyn_pres_K, pi_inf_K, gamma_K, rho_K, & + qv_K, rhoYks, pres, T, pres_mag=pres_mag) - qK_prim_vf(E_idx)%sf(j, k, l) = pres + qK_prim_vf(E_idx)%sf(j, k, l) = pres - if (chemistry) then - q_T_sf%sf(j, k, l) = T - end if + if (chemistry) then + q_T_sf%sf(j, k, l) = T + end if - if (bubbles_euler) then - $:GPU_LOOP(parallelism='[seq]') - do i = 1, nb - nRtmp(i) = qK_cons_vf(bubrs(i))%sf(j, k, l) - end do + if (bubbles_euler) then + $:GPU_LOOP(parallelism='[seq]') + do i = 1, nb + nRtmp(i) = qK_cons_vf(bubrs_vc(i))%sf(j, k, l) + end do - vftmp = qK_cons_vf(alf_idx)%sf(j, k, l) + vftmp = qK_cons_vf(alf_idx)%sf(j, k, l) - if (qbmm) then - !Get nb (constant across all R0 bins) - nbub_sc = qK_cons_vf(bubxb)%sf(j, k, l) + if (qbmm) then + !Get nb (constant across all R0 bins) + nbub_sc = qK_cons_vf(bubxb)%sf(j, k, l) - !Convert cons to prim - $:GPU_LOOP(parallelism='[seq]') - do i = bubxb, bubxe - qK_prim_vf(i)%sf(j, k, l) = qK_cons_vf(i)%sf(j, k, l)/nbub_sc - end do - !Need to keep track of nb in the primitive variable list (converted back to true value before output) + !Convert cons to prim + $:GPU_LOOP(parallelism='[seq]') + do i = bubxb, bubxe + qK_prim_vf(i)%sf(j, k, l) = qK_cons_vf(i)%sf(j, k, l)/nbub_sc + end do + !Need to keep track of nb in the primitive variable list (converted back to true value before output) #ifdef MFC_SIMULATION - qK_prim_vf(bubxb)%sf(j, k, l) = qK_cons_vf(bubxb)%sf(j, k, l) + qK_prim_vf(bubxb)%sf(j, k, l) = qK_cons_vf(bubxb)%sf(j, k, l) #endif - else - if (adv_n) then - qK_prim_vf(n_idx)%sf(j, k, l) = qK_cons_vf(n_idx)%sf(j, k, l) - nbub_sc = qK_prim_vf(n_idx)%sf(j, k, l) else - call s_comp_n_from_cons(vftmp, nRtmp, nbub_sc, weight) + if (adv_n) then + qK_prim_vf(n_idx)%sf(j, k, l) = qK_cons_vf(n_idx)%sf(j, k, l) + nbub_sc = qK_prim_vf(n_idx)%sf(j, k, l) + else + call s_comp_n_from_cons(vftmp, nRtmp, nbub_sc, weight) + end if + + $:GPU_LOOP(parallelism='[seq]') + do i = bubxb, bubxe + qK_prim_vf(i)%sf(j, k, l) = qK_cons_vf(i)%sf(j, k, l)/nbub_sc + end do end if + end if + if (mhd) then $:GPU_LOOP(parallelism='[seq]') - do i = bubxb, bubxe - qK_prim_vf(i)%sf(j, k, l) = qK_cons_vf(i)%sf(j, k, l)/nbub_sc + do i = B_idx%beg, B_idx%end + qK_prim_vf(i)%sf(j, k, l) = qK_cons_vf(i)%sf(j, k, l) end do end if - end if - - if (mhd) then - $:GPU_LOOP(parallelism='[seq]') - do i = B_idx%beg, B_idx%end - qK_prim_vf(i)%sf(j, k, l) = qK_cons_vf(i)%sf(j, k, l) - end do - end if - if (elasticity) then - $:GPU_LOOP(parallelism='[seq]') - do i = strxb, strxe - qK_prim_vf(i)%sf(j, k, l) = qK_cons_vf(i)%sf(j, k, l)/rho_K - end do - end if + if (elasticity) then + $:GPU_LOOP(parallelism='[seq]') + do i = strxb, strxe + qK_prim_vf(i)%sf(j, k, l) = qK_cons_vf(i)%sf(j, k, l)/rho_K + end do + end if - if (hypoelasticity) then - $:GPU_LOOP(parallelism='[seq]') - do i = strxb, strxe - ! subtracting elastic contribution for pressure calculation - if (G_K > verysmall) then - if (cont_damage) G_K = G_K*max((1._wp - qK_cons_vf(damage_idx)%sf(j, k, l)), 0._wp) - qK_prim_vf(E_idx)%sf(j, k, l) = qK_prim_vf(E_idx)%sf(j, k, l) - & - ((qK_prim_vf(i)%sf(j, k, l)**2._wp)/(4._wp*G_K))/gamma_K - ! Double for shear stresses - if (any(i == shear_indices)) then + if (hypoelasticity) then + $:GPU_LOOP(parallelism='[seq]') + do i = strxb, strxe + ! subtracting elastic contribution for pressure calculation + if (G_K > verysmall) then + if (cont_damage) G_K = G_K*max((1._wp - qK_cons_vf(damage_idx)%sf(j, k, l)), 0._wp) qK_prim_vf(E_idx)%sf(j, k, l) = qK_prim_vf(E_idx)%sf(j, k, l) - & ((qK_prim_vf(i)%sf(j, k, l)**2._wp)/(4._wp*G_K))/gamma_K + ! Double for shear stresses + if (any(i == shear_indices)) then + qK_prim_vf(E_idx)%sf(j, k, l) = qK_prim_vf(E_idx)%sf(j, k, l) - & + ((qK_prim_vf(i)%sf(j, k, l)**2._wp)/(4._wp*G_K))/gamma_K + end if end if - end if - end do - end if + end do + end if - if (hyperelasticity) then - $:GPU_LOOP(parallelism='[seq]') - do i = xibeg, xiend - qK_prim_vf(i)%sf(j, k, l) = qK_cons_vf(i)%sf(j, k, l)/rho_K - end do - end if + if (hyperelasticity) then + $:GPU_LOOP(parallelism='[seq]') + do i = xibeg, xiend + qK_prim_vf(i)%sf(j, k, l) = qK_cons_vf(i)%sf(j, k, l)/rho_K + end do + end if - if (.not. igr .or. num_fluids > 1) then - $:GPU_LOOP(parallelism='[seq]') - do i = advxb, advxe - qK_prim_vf(i)%sf(j, k, l) = qK_cons_vf(i)%sf(j, k, l) - end do - end if + if (.not. igr .or. num_fluids > 1) then + $:GPU_LOOP(parallelism='[seq]') + do i = advxb, advxe + qK_prim_vf(i)%sf(j, k, l) = qK_cons_vf(i)%sf(j, k, l) + end do + end if - if (surface_tension) then - qK_prim_vf(c_idx)%sf(j, k, l) = qK_cons_vf(c_idx)%sf(j, k, l) - end if + if (surface_tension) then + qK_prim_vf(c_idx)%sf(j, k, l) = qK_cons_vf(c_idx)%sf(j, k, l) + end if - if (cont_damage) qK_prim_vf(damage_idx)%sf(j, k, l) = qK_cons_vf(damage_idx)%sf(j, k, l) + if (cont_damage) qK_prim_vf(damage_idx)%sf(j, k, l) = qK_cons_vf(damage_idx)%sf(j, k, l) #ifdef MFC_POST_PROCESS - if (bubbles_lagrange) qK_prim_vf(beta_idx)%sf(j, k, l) = qK_cons_vf(beta_idx)%sf(j, k, l) + if (bubbles_lagrange) qK_prim_vf(beta_idx)%sf(j, k, l) = qK_cons_vf(beta_idx)%sf(j, k, l) #endif + end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP end subroutine s_convert_conservative_to_primitive_variables @@ -1493,113 +1492,113 @@ contains ! Computing the flux variables from the primitive variables, without ! accounting for the contribution of either viscosity or capillarity #ifdef MFC_SIMULATION - $:GPU_PARALLEL_LOOP(collapse=3, private='[alpha_rho_K, vel_K, & - & alpha_K, Re_K, Y_K]') - do l = is3b, is3e - do k = is2b, is2e - do j = is1b, is1e - - $:GPU_LOOP(parallelism='[seq]') - do i = 1, contxe - alpha_rho_K(i) = qK_prim_vf(j, k, l, i) - end do - - $:GPU_LOOP(parallelism='[seq]') - do i = advxb, advxe - alpha_K(i - E_idx) = qK_prim_vf(j, k, l, i) - end do - $:GPU_LOOP(parallelism='[seq]') - do i = 1, num_vels - vel_K(i) = qK_prim_vf(j, k, l, contxe + i) - end do + #:call GPU_PARALLEL_LOOP(collapse=3, private='[alpha_rho_K, vel_K, alpha_K, Re_K, Y_K]') + do l = is3b, is3e + do k = is2b, is2e + do j = is1b, is1e - vel_K_sum = 0._wp - $:GPU_LOOP(parallelism='[seq]') - do i = 1, num_vels - vel_K_sum = vel_K_sum + vel_K(i)**2._wp - end do - - pres_K = qK_prim_vf(j, k, l, E_idx) - if (elasticity) then - call s_convert_species_to_mixture_variables_acc(rho_K, gamma_K, pi_inf_K, qv_K, & - alpha_K, alpha_rho_K, Re_K, & - G_K, Gs) - else if (bubbles_euler) then - call s_convert_species_to_mixture_variables_bubbles_acc(rho_K, gamma_K, & - pi_inf_K, qv_K, alpha_K, alpha_rho_K, Re_K) - else - call s_convert_species_to_mixture_variables_acc(rho_K, gamma_K, pi_inf_K, qv_K, & - alpha_K, alpha_rho_K, Re_K) - end if + $:GPU_LOOP(parallelism='[seq]') + do i = 1, contxe + alpha_rho_K(i) = qK_prim_vf(j, k, l, i) + end do - ! Computing the energy from the pressure + $:GPU_LOOP(parallelism='[seq]') + do i = advxb, advxe + alpha_K(i - E_idx) = qK_prim_vf(j, k, l, i) + end do + $:GPU_LOOP(parallelism='[seq]') + do i = 1, num_vels + vel_K(i) = qK_prim_vf(j, k, l, contxe + i) + end do - if (chemistry) then + vel_K_sum = 0._wp $:GPU_LOOP(parallelism='[seq]') - do i = chemxb, chemxe - Y_K(i - chemxb + 1) = qK_prim_vf(j, k, l, i) + do i = 1, num_vels + vel_K_sum = vel_K_sum + vel_K(i)**2._wp end do - !Computing the energy from the internal energy of the mixture - call get_mixture_molecular_weight(Y_k, mix_mol_weight) - R_gas = gas_constant/mix_mol_weight - T_K = pres_K/rho_K/R_gas - call get_mixture_energy_mass(T_K, Y_K, E_K) - E_K = rho_K*E_K + 5.e-1_wp*rho_K*vel_K_sum - else - ! Computing the energy from the pressure - E_K = gamma_K*pres_K + pi_inf_K & - + 5.e-1_wp*rho_K*vel_K_sum + qv_K - end if - ! mass flux, this should be \alpha_i \rho_i u_i - $:GPU_LOOP(parallelism='[seq]') - do i = 1, contxe - FK_vf(j, k, l, i) = alpha_rho_K(i)*vel_K(dir_idx(1)) - end do + pres_K = qK_prim_vf(j, k, l, E_idx) + if (elasticity) then + call s_convert_species_to_mixture_variables_acc(rho_K, gamma_K, pi_inf_K, qv_K, & + alpha_K, alpha_rho_K, Re_K, & + G_K, Gs_vc) + else if (bubbles_euler) then + call s_convert_species_to_mixture_variables_bubbles_acc(rho_K, gamma_K, & + pi_inf_K, qv_K, alpha_K, alpha_rho_K, Re_K) + else + call s_convert_species_to_mixture_variables_acc(rho_K, gamma_K, pi_inf_K, qv_K, & + alpha_K, alpha_rho_K, Re_K) + end if - $:GPU_LOOP(parallelism='[seq]') - do i = 1, num_vels - FK_vf(j, k, l, contxe + dir_idx(i)) = & - rho_K*vel_K(dir_idx(1)) & - *vel_K(dir_idx(i)) & - + pres_K*dir_flg(dir_idx(i)) - end do + ! Computing the energy from the pressure - ! energy flux, u(E+p) - FK_vf(j, k, l, E_idx) = vel_K(dir_idx(1))*(E_K + pres_K) + if (chemistry) then + $:GPU_LOOP(parallelism='[seq]') + do i = chemxb, chemxe + Y_K(i - chemxb + 1) = qK_prim_vf(j, k, l, i) + end do + !Computing the energy from the internal energy of the mixture + call get_mixture_molecular_weight(Y_k, mix_mol_weight) + R_gas = gas_constant/mix_mol_weight + T_K = pres_K/rho_K/R_gas + call get_mixture_energy_mass(T_K, Y_K, E_K) + E_K = rho_K*E_K + 5.e-1_wp*rho_K*vel_K_sum + else + ! Computing the energy from the pressure + E_K = gamma_K*pres_K + pi_inf_K & + + 5.e-1_wp*rho_K*vel_K_sum + qv_K + end if - ! Species advection Flux, \rho*u*Y - if (chemistry) then + ! mass flux, this should be \alpha_i \rho_i u_i $:GPU_LOOP(parallelism='[seq]') - do i = 1, num_species - FK_vf(j, k, l, i - 1 + chemxb) = vel_K(dir_idx(1))*(rho_K*Y_K(i)) + do i = 1, contxe + FK_vf(j, k, l, i) = alpha_rho_K(i)*vel_K(dir_idx(1)) end do - end if - if (riemann_solver == 1 .or. riemann_solver == 4) then $:GPU_LOOP(parallelism='[seq]') - do i = advxb, advxe - FK_vf(j, k, l, i) = 0._wp - FK_src_vf(j, k, l, i) = alpha_K(i - E_idx) + do i = 1, num_vels + FK_vf(j, k, l, contxe + dir_idx(i)) = & + rho_K*vel_K(dir_idx(1)) & + *vel_K(dir_idx(i)) & + + pres_K*dir_flg(dir_idx(i)) end do - else - ! Could be bubbles_euler! - $:GPU_LOOP(parallelism='[seq]') - do i = advxb, advxe - FK_vf(j, k, l, i) = vel_K(dir_idx(1))*alpha_K(i - E_idx) - end do + ! energy flux, u(E+p) + FK_vf(j, k, l, E_idx) = vel_K(dir_idx(1))*(E_K + pres_K) - $:GPU_LOOP(parallelism='[seq]') - do i = advxb, advxe - FK_src_vf(j, k, l, i) = vel_K(dir_idx(1)) - end do + ! Species advection Flux, \rho*u*Y + if (chemistry) then + $:GPU_LOOP(parallelism='[seq]') + do i = 1, num_species + FK_vf(j, k, l, i - 1 + chemxb) = vel_K(dir_idx(1))*(rho_K*Y_K(i)) + end do + end if - end if + if (riemann_solver == 1 .or. riemann_solver == 4) then + $:GPU_LOOP(parallelism='[seq]') + do i = advxb, advxe + FK_vf(j, k, l, i) = 0._wp + FK_src_vf(j, k, l, i) = alpha_K(i - E_idx) + end do + + else + ! Could be bubbles_euler! + $:GPU_LOOP(parallelism='[seq]') + do i = advxb, advxe + FK_vf(j, k, l, i) = vel_K(dir_idx(1))*alpha_K(i - E_idx) + end do + $:GPU_LOOP(parallelism='[seq]') + do i = advxb, advxe + FK_src_vf(j, k, l, i) = vel_K(dir_idx(1)) + end do + + end if + + end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP #endif end subroutine s_convert_primitive_to_flux_variables @@ -1612,21 +1611,21 @@ contains #endif #ifdef MFC_SIMULATION - @:DEALLOCATE(gammas, gs_min, pi_infs, ps_inf, cvs, qvs, qvps, Gs) + @:DEALLOCATE(gammas, gs_min, pi_infs, ps_inf, cvs, qvs, qvps, Gs_vc) if (bubbles_euler) then - @:DEALLOCATE(bubrs) + @:DEALLOCATE(bubrs_vc) end if #else - @:DEALLOCATE(gammas, gs_min, pi_infs, ps_inf, cvs, qvs, qvps, Gs) + @:DEALLOCATE(gammas, gs_min, pi_infs, ps_inf, cvs, qvs, qvps, Gs_vc) if (bubbles_euler) then - @:DEALLOCATE(bubrs) + @:DEALLOCATE(bubrs_vc) end if #endif end subroutine s_finalize_variables_conversion_module #ifndef MFC_PRE_PROCESS - pure subroutine s_compute_speed_of_sound(pres, rho, gamma, pi_inf, H, adv, vel_sum, c_c, c) + subroutine s_compute_speed_of_sound(pres, rho, gamma, pi_inf, H, adv, vel_sum, c_c, c) $:GPU_ROUTINE(function_name='s_compute_speed_of_sound', & & parallelism='[seq]', cray_inline=True) @@ -1692,7 +1691,7 @@ contains #endif #ifndef MFC_PRE_PROCESS - pure subroutine s_compute_fast_magnetosonic_speed(rho, c, B, norm, c_fast, h) + subroutine s_compute_fast_magnetosonic_speed(rho, c, B, norm, c_fast, h) $:GPU_ROUTINE(function_name='s_compute_fast_magnetosonic_speed', & & parallelism='[seq]', cray_inline=True) diff --git a/src/post_process/m_derived_variables.fpp b/src/post_process/m_derived_variables.fpp index 96ef4a6f00..bc1242a97d 100644 --- a/src/post_process/m_derived_variables.fpp +++ b/src/post_process/m_derived_variables.fpp @@ -118,7 +118,7 @@ contains !! ratio. The latter is stored in the derived flow quantity !! storage variable, q_sf. !! @param q_sf Specific heat ratio - pure subroutine s_derive_specific_heat_ratio(q_sf) + subroutine s_derive_specific_heat_ratio(q_sf) real(wp), & dimension(-offset_x%beg:m + offset_x%end, & @@ -145,7 +145,7 @@ contains !! values of the liquid stiffness, which are stored in the !! derived flow quantity storage variable, q_sf. !! @param q_sf Liquid stiffness - pure subroutine s_derive_liquid_stiffness(q_sf) + subroutine s_derive_liquid_stiffness(q_sf) real(wp), & dimension(-offset_x%beg:m + offset_x%end, & @@ -174,7 +174,7 @@ contains !! derived flow quantity storage variable, q_sf. !! @param q_prim_vf Primitive variables !! @param q_sf Speed of sound - pure subroutine s_derive_sound_speed(q_prim_vf, q_sf) + subroutine s_derive_sound_speed(q_prim_vf, q_sf) type(scalar_field), & dimension(sys_size), & @@ -231,7 +231,7 @@ contains !! @param i Component indicator !! @param q_prim_vf Primitive variables !! @param q_sf Flux limiter - pure subroutine s_derive_flux_limiter(i, q_prim_vf, q_sf) + subroutine s_derive_flux_limiter(i, q_prim_vf, q_sf) integer, intent(in) :: i @@ -325,7 +325,7 @@ contains !! @param b right-hane-side !! @param sol Solution !! @param ndim Problem size - pure subroutine s_solve_linear_system(A, b, sol, ndim) + subroutine s_solve_linear_system(A, b, sol, ndim) integer, intent(in) :: ndim real(wp), dimension(ndim, ndim), intent(inout) :: A @@ -375,7 +375,7 @@ contains !! @param i Vorticity component indicator !! @param q_prim_vf Primitive variables !! @param q_sf Vorticity component - pure subroutine s_derive_vorticity_component(i, q_prim_vf, q_sf) + subroutine s_derive_vorticity_component(i, q_prim_vf, q_sf) integer, intent(in) :: i @@ -477,7 +477,7 @@ contains !! quantity storage variable, q_sf. !! @param q_prim_vf Primitive variables !! @param q_sf Q_M - pure subroutine s_derive_qm(q_prim_vf, q_sf) + subroutine s_derive_qm(q_prim_vf, q_sf) type(scalar_field), & dimension(sys_size), & intent(in) :: q_prim_vf diff --git a/src/pre_process/m_assign_variables.fpp b/src/pre_process/m_assign_variables.fpp index f60573b1b9..e8f6e1574b 100644 --- a/src/pre_process/m_assign_variables.fpp +++ b/src/pre_process/m_assign_variables.fpp @@ -102,8 +102,8 @@ contains !! @param eta pseudo volume fraction !! @param q_prim_vf Primitive variables !! @param patch_id_fp Array to track patch ids - pure subroutine s_assign_patch_mixture_primitive_variables(patch_id, j, k, l, & - eta, q_prim_vf, patch_id_fp) + subroutine s_assign_patch_mixture_primitive_variables(patch_id, j, k, l, & + eta, q_prim_vf, patch_id_fp) $:GPU_ROUTINE(parallelism='[seq]') integer, intent(in) :: patch_id @@ -191,7 +191,7 @@ contains !! @param k the y-dir node index !! @param l the z-dir node index !! @param q_prim_vf Primitive variables - pure subroutine s_perturb_primitive(j, k, l, q_prim_vf) + subroutine s_perturb_primitive(j, k, l, q_prim_vf) integer, intent(in) :: j, k, l type(scalar_field), dimension(1:sys_size), intent(inout) :: q_prim_vf diff --git a/src/pre_process/m_icpp_patches.fpp b/src/pre_process/m_icpp_patches.fpp index 3ed5cfd03c..1c7b035ac3 100644 --- a/src/pre_process/m_icpp_patches.fpp +++ b/src/pre_process/m_icpp_patches.fpp @@ -1689,7 +1689,7 @@ contains end subroutine s_convert_cylindrical_to_cartesian_coord - pure function f_convert_cyl_to_cart(cyl) result(cart) + function f_convert_cyl_to_cart(cyl) result(cart) $:GPU_ROUTINE(parallelism='[seq]') @@ -1715,7 +1715,8 @@ contains !! @param myth Angle !! @param offset Thickness !! @param a Starting position - pure elemental function f_r(myth, offset, a) + elemental function f_r(myth, offset, a) + $:GPU_ROUTINE(parallelism='[seq]') real(wp), intent(in) :: myth, offset, a real(wp) :: b diff --git a/src/simulation/m_acoustic_src.fpp b/src/simulation/m_acoustic_src.fpp index b14528b9d5..3da61d2e46 100644 --- a/src/simulation/m_acoustic_src.fpp +++ b/src/simulation/m_acoustic_src.fpp @@ -166,175 +166,179 @@ contains sim_time = t_step*dt - $:GPU_PARALLEL_LOOP(collapse=3) - do l = 0, p - do k = 0, n - do j = 0, m - mass_src(j, k, l) = 0._wp - mom_src(1, j, k, l) = 0._wp - e_src(j, k, l) = 0._wp - if (n > 0) mom_src(2, j, k, l) = 0._wp - if (p > 0) mom_src(3, j, k, l) = 0._wp + #:call GPU_PARALLEL_LOOP(collapse=3) + do l = 0, p + do k = 0, n + do j = 0, m + mass_src(j, k, l) = 0._wp + mom_src(1, j, k, l) = 0._wp + e_src(j, k, l) = 0._wp + if (n > 0) mom_src(2, j, k, l) = 0._wp + if (p > 0) mom_src(3, j, k, l) = 0._wp + end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP ! Keep outer loop sequel because different sources can have very different number of points do ai = 1, num_source ! Skip if the pulse has not started yet for sine and square waves - if (sim_time < delay(ai) .and. (pulse(ai) == 1 .or. pulse(ai) == 3)) cycle + if (.not. (sim_time < delay(ai) .and. (pulse(ai) == 1 .or. pulse(ai) == 3))) then - ! Decide if frequency need to be converted from wavelength - freq_conv_flag = f_is_default(frequency(ai)) - gauss_conv_flag = f_is_default(gauss_sigma_time(ai)) + ! Decide if frequency need to be converted from wavelength + freq_conv_flag = f_is_default(frequency(ai)) + gauss_conv_flag = f_is_default(gauss_sigma_time(ai)) - num_points = source_spatials_num_points(ai) ! Use scalar to force firstprivate to prevent GPU bug + num_points = source_spatials_num_points(ai) ! Use scalar to force firstprivate to prevent GPU bug - ! Calculate the broadband source - period_BB = 0._wp - sl_BB = 0._wp - ffre_BB = 0._wp - sum_BB = 0._wp + ! Calculate the broadband source + period_BB = 0._wp + sl_BB = 0._wp + ffre_BB = 0._wp + sum_BB = 0._wp - ! Allocate buffers for random phase shift - allocate (phi_rn(1:bb_num_freq(ai))) + ! Allocate buffers for random phase shift + allocate (phi_rn(1:bb_num_freq(ai))) - if (pulse(ai) == 4) then - call random_number(phi_rn(1:bb_num_freq(ai))) - ! Ensure all the ranks have the same random phase shift - call s_mpi_send_random_number(phi_rn, bb_num_freq(ai)) - end if + if (pulse(ai) == 4) then + call random_number(phi_rn(1:bb_num_freq(ai))) + ! Ensure all the ranks have the same random phase shift + call s_mpi_send_random_number(phi_rn, bb_num_freq(ai)) + end if - $:GPU_LOOP(reduction='[[sum_BB]]', reductionOp='[+]') - do k = 1, bb_num_freq(ai) - ! Acoustic period of the wave at each discrete frequency - period_BB = 1._wp/(bb_lowest_freq(ai) + k*bb_bandwidth(ai)) - ! Spectral level at each frequency - sl_BB = broadband_spectral_level_constant*mag(ai) + k*mag(ai)/broadband_spectral_level_growth_rate - ! Source term corresponding to each frequencies - ffre_BB = sqrt((2._wp*sl_BB*bb_bandwidth(ai)))*cos((sim_time)*2._wp*pi/period_BB + 2._wp*pi*phi_rn(k)) - ! Sum up the source term of each frequency to obtain the total source term for broadband wave - sum_BB = sum_BB + ffre_BB - end do + $:GPU_LOOP(reduction='[[sum_BB]]', reductionOp='[+]') + do k = 1, bb_num_freq(ai) + ! Acoustic period of the wave at each discrete frequency + period_BB = 1._wp/(bb_lowest_freq(ai) + k*bb_bandwidth(ai)) + ! Spectral level at each frequency + sl_BB = broadband_spectral_level_constant*mag(ai) + k*mag(ai)/broadband_spectral_level_growth_rate + ! Source term corresponding to each frequencies + ffre_BB = sqrt((2._wp*sl_BB*bb_bandwidth(ai)))*cos((sim_time)*2._wp*pi/period_BB + 2._wp*pi*phi_rn(k)) + ! Sum up the source term of each frequency to obtain the total source term for broadband wave + sum_BB = sum_BB + ffre_BB + end do - deallocate (phi_rn) + deallocate (phi_rn) - $:GPU_PARALLEL_LOOP(private='[myalpha,myalpha_rho]') - do i = 1, num_points - j = source_spatials(ai)%coord(1, i) - k = source_spatials(ai)%coord(2, i) - l = source_spatials(ai)%coord(3, i) + #:call GPU_PARALLEL_LOOP(private='[myalpha,myalpha_rho]') + do i = 1, num_points + j = source_spatials(ai)%coord(1, i) + k = source_spatials(ai)%coord(2, i) + l = source_spatials(ai)%coord(3, i) - ! Compute speed of sound - myRho = 0._wp - B_tait = 0._wp - small_gamma = 0._wp + ! Compute speed of sound + myRho = 0._wp + B_tait = 0._wp + small_gamma = 0._wp - $:GPU_LOOP(parallelism='[seq]') - do q = 1, num_fluids - myalpha_rho(q) = q_cons_vf(q)%sf(j, k, l) - myalpha(q) = q_cons_vf(advxb + q - 1)%sf(j, k, l) - end do - - if (bubbles_euler) then - if (num_fluids > 2) then $:GPU_LOOP(parallelism='[seq]') - do q = 1, num_fluids - 1 - myRho = myRho + myalpha_rho(q) - B_tait = B_tait + myalpha(q)*pi_infs(q) - small_gamma = small_gamma + myalpha(q)*gammas(q) + do q = 1, num_fluids + myalpha_rho(q) = q_cons_vf(q)%sf(j, k, l) + myalpha(q) = q_cons_vf(advxb + q - 1)%sf(j, k, l) end do - else - myRho = myalpha_rho(1) - B_tait = pi_infs(1) - small_gamma = gammas(1) - end if - end if - if ((.not. bubbles_euler) .or. (mpp_lim .and. (num_fluids > 2))) then - $:GPU_LOOP(parallelism='[seq]') - do q = 1, num_fluids - myRho = myRho + myalpha_rho(q) - B_tait = B_tait + myalpha(q)*pi_infs(q) - small_gamma = small_gamma + myalpha(q)*gammas(q) - end do - end if + if (bubbles_euler) then + if (num_fluids > 2) then + $:GPU_LOOP(parallelism='[seq]') + do q = 1, num_fluids - 1 + myRho = myRho + myalpha_rho(q) + B_tait = B_tait + myalpha(q)*pi_infs(q) + small_gamma = small_gamma + myalpha(q)*gammas(q) + end do + else + myRho = myalpha_rho(1) + B_tait = pi_infs(1) + small_gamma = gammas(1) + end if + end if - small_gamma = 1._wp/small_gamma + 1._wp - c = sqrt(small_gamma*(q_prim_vf(E_idx)%sf(j, k, l) + ((small_gamma - 1._wp)/small_gamma)*B_tait)/myRho) + if ((.not. bubbles_euler) .or. (mpp_lim .and. (num_fluids > 2))) then + $:GPU_LOOP(parallelism='[seq]') + do q = 1, num_fluids + myRho = myRho + myalpha_rho(q) + B_tait = B_tait + myalpha(q)*pi_infs(q) + small_gamma = small_gamma + myalpha(q)*gammas(q) + end do + end if - ! Wavelength to frequency conversion - if (pulse(ai) == 1 .or. pulse(ai) == 3) frequency_local = f_frequency_local(freq_conv_flag, ai, c) - if (pulse(ai) == 2) gauss_sigma_time_local = f_gauss_sigma_time_local(gauss_conv_flag, ai, c) + small_gamma = 1._wp/small_gamma + 1._wp + c = sqrt(small_gamma*(q_prim_vf(E_idx)%sf(j, k, l) + ((small_gamma - 1._wp)/small_gamma)*B_tait)/myRho) - ! Update momentum source term - call s_source_temporal(sim_time, c, ai, mom_label, frequency_local, gauss_sigma_time_local, source_temporal, sum_BB) - mom_src_diff = source_temporal*source_spatials(ai)%val(i) + ! Wavelength to frequency conversion + if (pulse(ai) == 1 .or. pulse(ai) == 3) frequency_local = f_frequency_local(freq_conv_flag, ai, c) + if (pulse(ai) == 2) gauss_sigma_time_local = f_gauss_sigma_time_local(gauss_conv_flag, ai, c) - if (dipole(ai)) then ! Double amplitude & No momentum source term (only works for Planar) - mass_src(j, k, l) = mass_src(j, k, l) + 2._wp*mom_src_diff/c - if (model_eqns /= 4) E_src(j, k, l) = E_src(j, k, l) + 2._wp*mom_src_diff*c/(small_gamma - 1._wp) - cycle - end if + ! Update momentum source term + call s_source_temporal(sim_time, c, ai, mom_label, frequency_local, gauss_sigma_time_local, source_temporal, sum_BB) + mom_src_diff = source_temporal*source_spatials(ai)%val(i) - if (n == 0) then ! 1D - mom_src(1, j, k, l) = mom_src(1, j, k, l) + mom_src_diff*sign(1._wp, dir(ai)) ! Left or right-going wave - - elseif (p == 0) then ! 2D - if (support(ai) < 5) then ! Planar - mom_src(1, j, k, l) = mom_src(1, j, k, l) + mom_src_diff*cos(dir(ai)) - mom_src(2, j, k, l) = mom_src(2, j, k, l) + mom_src_diff*sin(dir(ai)) - else - mom_src(1, j, k, l) = mom_src(1, j, k, l) + mom_src_diff*cos(source_spatials(ai)%angle(i)) - mom_src(2, j, k, l) = mom_src(2, j, k, l) + mom_src_diff*sin(source_spatials(ai)%angle(i)) - end if - - else ! 3D - if (support(ai) < 5) then ! Planar - mom_src(1, j, k, l) = mom_src(1, j, k, l) + mom_src_diff*cos(dir(ai)) - mom_src(2, j, k, l) = mom_src(2, j, k, l) + mom_src_diff*sin(dir(ai)) - else - mom_src(1, j, k, l) = mom_src(1, j, k, l) + mom_src_diff*source_spatials(ai)%xyz_to_r_ratios(1, i) - mom_src(2, j, k, l) = mom_src(2, j, k, l) + mom_src_diff*source_spatials(ai)%xyz_to_r_ratios(2, i) - mom_src(3, j, k, l) = mom_src(3, j, k, l) + mom_src_diff*source_spatials(ai)%xyz_to_r_ratios(3, i) - end if - end if + if (dipole(ai)) then ! Double amplitude & No momentum source term (only works for Planar) + mass_src(j, k, l) = mass_src(j, k, l) + 2._wp*mom_src_diff/c + if (model_eqns /= 4) E_src(j, k, l) = E_src(j, k, l) + 2._wp*mom_src_diff*c/(small_gamma - 1._wp) + cycle + end if - ! Update mass source term - if (support(ai) < 5) then ! Planar - mass_src_diff = mom_src_diff/c - else ! Spherical or cylindrical support - ! Mass source term must be calculated differently using a correction term for spherical and cylindrical support - call s_source_temporal(sim_time, c, ai, mass_label, frequency_local, gauss_sigma_time_local, source_temporal, sum_BB) - mass_src_diff = source_temporal*source_spatials(ai)%val(i) - end if - mass_src(j, k, l) = mass_src(j, k, l) + mass_src_diff + if (n == 0) then ! 1D + mom_src(1, j, k, l) = mom_src(1, j, k, l) + mom_src_diff*sign(1._wp, dir(ai)) ! Left or right-going wave + + elseif (p == 0) then ! 2D + if (support(ai) < 5) then ! Planar + mom_src(1, j, k, l) = mom_src(1, j, k, l) + mom_src_diff*cos(dir(ai)) + mom_src(2, j, k, l) = mom_src(2, j, k, l) + mom_src_diff*sin(dir(ai)) + else + mom_src(1, j, k, l) = mom_src(1, j, k, l) + mom_src_diff*cos(source_spatials(ai)%angle(i)) + mom_src(2, j, k, l) = mom_src(2, j, k, l) + mom_src_diff*sin(source_spatials(ai)%angle(i)) + end if + + else ! 3D + if (support(ai) < 5) then ! Planar + mom_src(1, j, k, l) = mom_src(1, j, k, l) + mom_src_diff*cos(dir(ai)) + mom_src(2, j, k, l) = mom_src(2, j, k, l) + mom_src_diff*sin(dir(ai)) + else + mom_src(1, j, k, l) = mom_src(1, j, k, l) + mom_src_diff*source_spatials(ai)%xyz_to_r_ratios(1, i) + mom_src(2, j, k, l) = mom_src(2, j, k, l) + mom_src_diff*source_spatials(ai)%xyz_to_r_ratios(2, i) + mom_src(3, j, k, l) = mom_src(3, j, k, l) + mom_src_diff*source_spatials(ai)%xyz_to_r_ratios(3, i) + end if + end if - ! Update energy source term - if (model_eqns /= 4) then - E_src(j, k, l) = E_src(j, k, l) + mass_src_diff*c**2._wp/(small_gamma - 1._wp) - end if + ! Update mass source term + if (support(ai) < 5) then ! Planar + mass_src_diff = mom_src_diff/c + else ! Spherical or cylindrical support + ! Mass source term must be calculated differently using a correction term for spherical and cylindrical support + call s_source_temporal(sim_time, c, ai, mass_label, frequency_local, gauss_sigma_time_local, source_temporal, sum_BB) + mass_src_diff = source_temporal*source_spatials(ai)%val(i) + end if + mass_src(j, k, l) = mass_src(j, k, l) + mass_src_diff - end do + ! Update energy source term + if (model_eqns /= 4) then + E_src(j, k, l) = E_src(j, k, l) + mass_src_diff*c**2._wp/(small_gamma - 1._wp) + end if + + end do + #:endcall GPU_PARALLEL_LOOP + end if end do ! Update the rhs variables - $:GPU_PARALLEL_LOOP(collapse=3) - do l = 0, p - do k = 0, n - do j = 0, m - $:GPU_LOOP(parallelism='[seq]') - do q = contxb, contxe - rhs_vf(q)%sf(j, k, l) = rhs_vf(q)%sf(j, k, l) + mass_src(j, k, l) - end do - $:GPU_LOOP(parallelism='[seq]') - do q = momxb, momxe - rhs_vf(q)%sf(j, k, l) = rhs_vf(q)%sf(j, k, l) + mom_src(q - contxe, j, k, l) + #:call GPU_PARALLEL_LOOP(collapse=3) + do l = 0, p + do k = 0, n + do j = 0, m + $:GPU_LOOP(parallelism='[seq]') + do q = contxb, contxe + rhs_vf(q)%sf(j, k, l) = rhs_vf(q)%sf(j, k, l) + mass_src(j, k, l) + end do + $:GPU_LOOP(parallelism='[seq]') + do q = momxb, momxe + rhs_vf(q)%sf(j, k, l) = rhs_vf(q)%sf(j, k, l) + mom_src(q - contxe, j, k, l) + end do + rhs_vf(E_idx)%sf(j, k, l) = rhs_vf(E_idx)%sf(j, k, l) + e_src(j, k, l) end do - rhs_vf(E_idx)%sf(j, k, l) = rhs_vf(E_idx)%sf(j, k, l) + e_src(j, k, l) end do end do - end do + #:endcall GPU_PARALLEL_LOOP end subroutine s_acoustic_src_calculations !> This subroutine gives the temporally varying amplitude of the pulse @@ -345,7 +349,7 @@ contains !! @param frequency_local Frequency at the spatial location for sine and square waves !! @param gauss_sigma_time_local sigma in time for Gaussian pulse !! @param source Source term amplitude - pure elemental subroutine s_source_temporal(sim_time, c, ai, term_index, frequency_local, gauss_sigma_time_local, source, sum_BB) + elemental subroutine s_source_temporal(sim_time, c, ai, term_index, frequency_local, gauss_sigma_time_local, source, sum_BB) $:GPU_ROUTINE(parallelism='[seq]') integer, intent(in) :: ai, term_index real(wp), intent(in) :: sim_time, c, sum_BB @@ -504,7 +508,7 @@ contains !! @param source Source term amplitude !! @param angle Angle of the source term with respect to the x-axis (for 2D or 2D axisymmetric) !! @param xyz_to_r_ratios Ratios of the [xyz]-component of the source term to the magnitude (for 3D) - pure subroutine s_source_spatial(j, k, l, loc, ai, source, angle, xyz_to_r_ratios) + subroutine s_source_spatial(j, k, l, loc, ai, source, angle, xyz_to_r_ratios) integer, intent(in) :: j, k, l, ai real(wp), dimension(3), intent(in) :: loc real(wp), intent(out) :: source, angle, xyz_to_r_ratios(3) @@ -540,7 +544,7 @@ contains !! @param sig Sigma value for the Gaussian distribution !! @param r Displacement from source to current point !! @param source Source term amplitude - pure subroutine s_source_spatial_planar(ai, sig, r, source) + subroutine s_source_spatial_planar(ai, sig, r, source) integer, intent(in) :: ai real(wp), intent(in) :: sig, r(3) real(wp), intent(out) :: source @@ -570,7 +574,7 @@ contains !! @param source Source term amplitude !! @param angle Angle of the source term with respect to the x-axis (for 2D or 2D axisymmetric) !! @param xyz_to_r_ratios Ratios of the [xyz]-component of the source term to the magnitude (for 3D) - pure subroutine s_source_spatial_transducer(ai, sig, r, source, angle, xyz_to_r_ratios) + subroutine s_source_spatial_transducer(ai, sig, r, source, angle, xyz_to_r_ratios) integer, intent(in) :: ai real(wp), intent(in) :: sig, r(3) real(wp), intent(out) :: source, angle, xyz_to_r_ratios(3) @@ -615,7 +619,7 @@ contains !! @param source Source term amplitude !! @param angle Angle of the source term with respect to the x-axis (for 2D or 2D axisymmetric) !! @param xyz_to_r_ratios Ratios of the [xyz]-component of the source term to the magnitude (for 3D) - pure subroutine s_source_spatial_transducer_array(ai, sig, r, source, angle, xyz_to_r_ratios) + subroutine s_source_spatial_transducer_array(ai, sig, r, source, angle, xyz_to_r_ratios) integer, intent(in) :: ai real(wp), intent(in) :: sig, r(3) real(wp), intent(out) :: source, angle, xyz_to_r_ratios(3) @@ -697,7 +701,7 @@ contains !! @param ai Acoustic source index !! @param c Speed of sound !! @return frequency_local Converted frequency - pure elemental function f_frequency_local(freq_conv_flag, ai, c) + elemental function f_frequency_local(freq_conv_flag, ai, c) $:GPU_ROUTINE(parallelism='[seq]') logical, intent(in) :: freq_conv_flag integer, intent(in) :: ai @@ -716,7 +720,7 @@ contains !! @param c Speed of sound !! @param ai Acoustic source index !! @return gauss_sigma_time_local Converted Gaussian sigma time - pure elemental function f_gauss_sigma_time_local(gauss_conv_flag, ai, c) + function f_gauss_sigma_time_local(gauss_conv_flag, ai, c) $:GPU_ROUTINE(parallelism='[seq]') logical, intent(in) :: gauss_conv_flag integer, intent(in) :: ai diff --git a/src/simulation/m_body_forces.fpp b/src/simulation/m_body_forces.fpp index 41a103fcee..e3fbeb822e 100644 --- a/src/simulation/m_body_forces.fpp +++ b/src/simulation/m_body_forces.fpp @@ -10,9 +10,7 @@ module m_body_forces use m_nvtx -#ifdef MFC_OpenACC - use openacc -#endif +! $:USE_GPU_MODULE() implicit none @@ -79,18 +77,19 @@ contains type(scalar_field), dimension(sys_size), intent(in) :: q_cons_vf integer :: i, j, k, l !< standard iterators - $:GPU_PARALLEL_LOOP(collapse=3) - do l = 0, p - do k = 0, n - do j = 0, m - rhoM(j, k, l) = 0._wp - do i = 1, num_fluids - rhoM(j, k, l) = rhoM(j, k, l) + & - q_cons_vf(contxb + i - 1)%sf(j, k, l) + #:call GPU_PARALLEL_LOOP(collapse=3) + do l = 0, p + do k = 0, n + do j = 0, m + rhoM(j, k, l) = 0._wp + do i = 1, num_fluids + rhoM(j, k, l) = rhoM(j, k, l) + & + q_cons_vf(contxb + i - 1)%sf(j, k, l) + end do end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP end subroutine s_compute_mixture_density @@ -109,60 +108,64 @@ contains call s_compute_acceleration(mytime) call s_compute_mixture_density(q_cons_vf) - $:GPU_PARALLEL_LOOP(collapse=4) - do i = momxb, E_idx - do l = 0, p - do k = 0, n - do j = 0, m - rhs_vf(i)%sf(j, k, l) = 0._wp + #:call GPU_PARALLEL_LOOP(collapse=4) + do i = momxb, E_idx + do l = 0, p + do k = 0, n + do j = 0, m + rhs_vf(i)%sf(j, k, l) = 0._wp + end do end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP if (bf_x) then ! x-direction body forces - $:GPU_PARALLEL_LOOP(collapse=3) - do l = 0, p - do k = 0, n - do j = 0, m - rhs_vf(momxb)%sf(j, k, l) = rhs_vf(momxb)%sf(j, k, l) + & - rhoM(j, k, l)*accel_bf(1) - rhs_vf(E_idx)%sf(j, k, l) = rhs_vf(E_idx)%sf(j, k, l) + & - q_cons_vf(momxb)%sf(j, k, l)*accel_bf(1) + #:call GPU_PARALLEL_LOOP(collapse=3) + do l = 0, p + do k = 0, n + do j = 0, m + rhs_vf(momxb)%sf(j, k, l) = rhs_vf(momxb)%sf(j, k, l) + & + rhoM(j, k, l)*accel_bf(1) + rhs_vf(E_idx)%sf(j, k, l) = rhs_vf(E_idx)%sf(j, k, l) + & + q_cons_vf(momxb)%sf(j, k, l)*accel_bf(1) + end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP end if if (bf_y) then ! y-direction body forces - $:GPU_PARALLEL_LOOP(collapse=3) - do l = 0, p - do k = 0, n - do j = 0, m - rhs_vf(momxb + 1)%sf(j, k, l) = rhs_vf(momxb + 1)%sf(j, k, l) + & - rhoM(j, k, l)*accel_bf(2) - rhs_vf(E_idx)%sf(j, k, l) = rhs_vf(E_idx)%sf(j, k, l) + & - q_cons_vf(momxb + 1)%sf(j, k, l)*accel_bf(2) + #:call GPU_PARALLEL_LOOP(collapse=3) + do l = 0, p + do k = 0, n + do j = 0, m + rhs_vf(momxb + 1)%sf(j, k, l) = rhs_vf(momxb + 1)%sf(j, k, l) + & + rhoM(j, k, l)*accel_bf(2) + rhs_vf(E_idx)%sf(j, k, l) = rhs_vf(E_idx)%sf(j, k, l) + & + q_cons_vf(momxb + 1)%sf(j, k, l)*accel_bf(2) + end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP end if if (bf_z) then ! z-direction body forces - $:GPU_PARALLEL_LOOP(collapse=3) - do l = 0, p - do k = 0, n - do j = 0, m - rhs_vf(momxe)%sf(j, k, l) = rhs_vf(momxe)%sf(j, k, l) + & - (rhoM(j, k, l))*accel_bf(3) - rhs_vf(E_idx)%sf(j, k, l) = rhs_vf(E_idx)%sf(j, k, l) + & - q_cons_vf(momxe)%sf(j, k, l)*accel_bf(3) + #:call GPU_PARALLEL_LOOP(collapse=3) + do l = 0, p + do k = 0, n + do j = 0, m + rhs_vf(momxe)%sf(j, k, l) = rhs_vf(momxe)%sf(j, k, l) + & + (rhoM(j, k, l))*accel_bf(3) + rhs_vf(E_idx)%sf(j, k, l) = rhs_vf(E_idx)%sf(j, k, l) + & + q_cons_vf(momxe)%sf(j, k, l)*accel_bf(3) + end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP end if diff --git a/src/simulation/m_bubbles.fpp b/src/simulation/m_bubbles.fpp index 0ec758dc22..7c6b84a37f 100644 --- a/src/simulation/m_bubbles.fpp +++ b/src/simulation/m_bubbles.fpp @@ -40,7 +40,7 @@ contains !! @param f_bub_adv_src Source for bubble volume fraction !! @param f_divu Divergence of velocity !! @param fCson Speed of sound from fP (EL) - pure elemental function f_rddot(fRho, fP, fR, fV, fR0, fpb, fpbdot, alf, fntait, fBtait, f_bub_adv_src, f_divu, fCson) + elemental function f_rddot(fRho, fP, fR, fV, fR0, fpb, fpbdot, alf, fntait, fBtait, f_bub_adv_src, f_divu, fCson) $:GPU_ROUTINE(parallelism='[seq]') real(wp), intent(in) :: fRho, fP, fR, fV, fR0, fpb, fpbdot, alf real(wp), intent(in) :: fntait, fBtait, f_bub_adv_src, f_divu @@ -81,7 +81,7 @@ contains !! @param fR Current bubble radius !! @param fV Current bubble velocity !! @param fpb Internal bubble pressure - pure elemental function f_cpbw(fR0, fR, fV, fpb) + elemental function f_cpbw(fR0, fR, fV, fpb) $:GPU_ROUTINE(parallelism='[seq]') real(wp), intent(in) :: fR0, fR, fV, fpb @@ -100,7 +100,7 @@ contains !! @param fCpinf Driving bubble pressure !! @param fntait Tait EOS parameter !! @param fBtait Tait EOS parameter - pure elemental function f_H(fCpbw, fCpinf, fntait, fBtait) + elemental function f_H(fCpbw, fCpinf, fntait, fBtait) $:GPU_ROUTINE(parallelism='[seq]') real(wp), intent(in) :: fCpbw, fCpinf, fntait, fBtait @@ -120,7 +120,7 @@ contains !! @param fntait Tait EOS parameter !! @param fBtait Tait EOS parameter !! @param fH Bubble enthalpy - pure elemental function f_cgas(fCpinf, fntait, fBtait, fH) + elemental function f_cgas(fCpinf, fntait, fBtait, fH) $:GPU_ROUTINE(parallelism='[seq]') real(wp), intent(in) :: fCpinf, fntait, fBtait, fH @@ -143,7 +143,7 @@ contains !! @param fBtait Tait EOS parameter !! @param advsrc Advection equation source term !! @param divu Divergence of velocity - pure elemental function f_cpinfdot(fRho, fP, falf, fntait, fBtait, advsrc, divu) + elemental function f_cpinfdot(fRho, fP, falf, fntait, fBtait, advsrc, divu) $:GPU_ROUTINE(parallelism='[seq]') real(wp), intent(in) :: fRho, fP, falf, fntait, fBtait, advsrc, divu @@ -173,7 +173,7 @@ contains !! @param fV Current bubble velocity !! @param fR0 Equilibrium bubble radius !! @param fpbdot Time derivative of the internal bubble pressure - pure elemental function f_Hdot(fCpbw, fCpinf, fCpinf_dot, fntait, fBtait, fR, fV, fR0, fpbdot) + elemental function f_Hdot(fCpbw, fCpinf, fCpinf_dot, fntait, fBtait, fR, fV, fR0, fpbdot) $:GPU_ROUTINE(parallelism='[seq]') real(wp), intent(in) :: fCpbw, fCpinf, fCpinf_dot, fntait, fBtait real(wp), intent(in) :: fR, fV, fR0, fpbdot @@ -209,7 +209,7 @@ contains !! @param fV Current bubble velocity !! @param fR0 Equilibrium bubble radius !! @param fCpbw Boundary wall pressure - pure elemental function f_rddot_RP(fCp, fRho, fR, fV, fCpbw) + elemental function f_rddot_RP(fCp, fRho, fR, fV, fCpbw) $:GPU_ROUTINE(parallelism='[seq]') real(wp), intent(in) :: fCp, fRho, fR, fV, fCpbw @@ -232,7 +232,7 @@ contains !! @param fcgas Current gas sound speed !! @param fntait Tait EOS parameter !! @param fBtait Tait EOS parameter - pure elemental function f_rddot_G(fCpbw, fR, fV, fH, fHdot, fcgas, fntait, fBtait) + elemental function f_rddot_G(fCpbw, fR, fV, fH, fHdot, fcgas, fntait, fBtait) $:GPU_ROUTINE(parallelism='[seq]') real(wp), intent(in) :: fCpbw, fR, fV, fH, fHdot real(wp), intent(in) :: fcgas, fntait, fBtait @@ -255,7 +255,7 @@ contains !! @param fR Current bubble radius !! @param fV Current bubble velocity !! @param fpb Internal bubble pressure - pure elemental function f_cpbw_KM(fR0, fR, fV, fpb) + elemental function f_cpbw_KM(fR0, fR, fV, fpb) $:GPU_ROUTINE(parallelism='[seq]') real(wp), intent(in) :: fR0, fR, fV, fpb real(wp) :: f_cpbw_KM @@ -282,7 +282,7 @@ contains !! @param fV Current bubble velocity !! @param fR0 Equilibrium bubble radius !! @param fC Current sound speed - pure elemental function f_rddot_KM(fpbdot, fCp, fCpbw, fRho, fR, fV, fR0, fC) + elemental function f_rddot_KM(fpbdot, fCp, fCpbw, fRho, fR, fV, fR0, fC) $:GPU_ROUTINE(parallelism='[seq]') real(wp), intent(in) :: fpbdot, fCp, fCpbw real(wp), intent(in) :: fRho, fR, fV, fR0, fC @@ -316,7 +316,7 @@ contains !> Subroutine that computes bubble wall properties for vapor bubbles !! @param pb Internal bubble pressure !! @param iR0 Current bubble size index - pure elemental subroutine s_bwproperty(pb_in, iR0, chi_vw_out, k_mw_out, rho_mw_out) + elemental subroutine s_bwproperty(pb_in, iR0, chi_vw_out, k_mw_out, rho_mw_out) $:GPU_ROUTINE(parallelism='[seq]') real(wp), intent(in) :: pb_in integer, intent(in) :: iR0 @@ -346,7 +346,7 @@ contains !! @param fbeta_c Mass transfer coefficient (EL) !! @param fR_m Mixture gas constant (EL) !! @param fgamma_m Mixture gamma (EL) - pure elemental subroutine s_vflux(fR, fV, fpb, fmass_v, iR0, vflux, fmass_n, fbeta_c, fR_m, fgamma_m) + elemental subroutine s_vflux(fR, fV, fpb, fmass_v, iR0, vflux, fmass_n, fbeta_c, fR_m, fgamma_m) $:GPU_ROUTINE(parallelism='[seq]') real(wp), intent(in) :: fR real(wp), intent(in) :: fV @@ -404,7 +404,7 @@ contains !! @param fbeta_t Mass transfer coefficient (EL) !! @param fR_m Mixture gas constant (EL) !! @param fgamma_m Mixture gamma (EL) - pure elemental function f_bpres_dot(fvflux, fR, fV, fpb, fmass_v, iR0, fbeta_t, fR_m, fgamma_m) + elemental function f_bpres_dot(fvflux, fR, fV, fpb, fmass_v, iR0, fbeta_t, fR_m, fgamma_m) $:GPU_ROUTINE(parallelism='[seq]') real(wp), intent(in) :: fvflux real(wp), intent(in) :: fR @@ -461,10 +461,10 @@ contains !! @param fbeta_t Heat transfer coefficient (EL) !! @param fCson Speed of sound (EL) !! @param adap_dt_stop Fail-safe exit if max iteration count reached - pure subroutine s_advance_step(fRho, fP, fR, fV, fR0, fpb, fpbdot, alf, & - fntait, fBtait, f_bub_adv_src, f_divu, & - bub_id, fmass_v, fmass_n, fbeta_c, & - fbeta_t, fCson, adap_dt_stop) + subroutine s_advance_step(fRho, fP, fR, fV, fR0, fpb, fpbdot, alf, & + fntait, fBtait, f_bub_adv_src, f_divu, & + bub_id, fmass_v, fmass_n, fbeta_c, & + fbeta_t, fCson, adap_dt_stop) $:GPU_ROUTINE(function_name='s_advance_step',parallelism='[seq]', & & cray_inline=True) @@ -594,9 +594,9 @@ contains !! @param f_divu Divergence of velocity !! @param fCson Speed of sound (EL) !! @param h Time step size - pure subroutine s_initial_substep_h(fRho, fP, fR, fV, fR0, fpb, fpbdot, alf, & - fntait, fBtait, f_bub_adv_src, f_divu, & - fCson, h) + subroutine s_initial_substep_h(fRho, fP, fR, fV, fR0, fpb, fpbdot, alf, & + fntait, fBtait, f_bub_adv_src, f_divu, & + fCson, h) $:GPU_ROUTINE(function_name='s_initial_substep_h',parallelism='[seq]', & & cray_inline=True) @@ -676,11 +676,11 @@ contains !! @param myV_tmp Bubble radial velocity at each stage !! @param myPb_tmp Internal bubble pressure at each stage (EL) !! @param myMv_tmp Mass of vapor in the bubble at each stage (EL) - pure subroutine s_advance_substep(err, fRho, fP, fR, fV, fR0, fpb, fpbdot, alf, & - fntait, fBtait, f_bub_adv_src, f_divu, & - bub_id, fmass_v, fmass_n, fbeta_c, & - fbeta_t, fCson, h, & - myR_tmp, myV_tmp, myPb_tmp, myMv_tmp) + subroutine s_advance_substep(err, fRho, fP, fR, fV, fR0, fpb, fpbdot, alf, & + fntait, fBtait, f_bub_adv_src, f_divu, & + bub_id, fmass_v, fmass_n, fbeta_c, & + fbeta_t, fCson, h, & + myR_tmp, myV_tmp, myPb_tmp, myMv_tmp) $:GPU_ROUTINE(function_name='s_advance_substep',parallelism='[seq]', & & cray_inline=True) @@ -778,8 +778,8 @@ contains !! @param fMv_tmp Mass of vapor in the bubble !! @param fdPbdt_tmp Rate of change of the internal bubble pressure !! @param fdMvdt_tmp Rate of change of the mass of vapor in the bubble - pure elemental subroutine s_advance_EL(fR_tmp, fV_tmp, fPb_tmp, fMv_tmp, bub_id, & - fmass_n, fbeta_c, fbeta_t, fdPbdt_tmp, advance_EL) + elemental subroutine s_advance_EL(fR_tmp, fV_tmp, fPb_tmp, fMv_tmp, bub_id, & + fmass_n, fbeta_c, fbeta_t, fdPbdt_tmp, advance_EL) $:GPU_ROUTINE(parallelism='[seq]') real(wp), intent(IN) :: fR_tmp, fV_tmp, fPb_tmp, fMv_tmp real(wp), intent(IN) :: fmass_n, fbeta_c, fbeta_t diff --git a/src/simulation/m_bubbles_EE.fpp b/src/simulation/m_bubbles_EE.fpp index b43a89e2e5..f198d2e78c 100644 --- a/src/simulation/m_bubbles_EE.fpp +++ b/src/simulation/m_bubbles_EE.fpp @@ -71,28 +71,29 @@ contains ! Compute the bubble volume fraction alpha from the bubble number density n !! @param q_cons_vf is the conservative variable - pure subroutine s_comp_alpha_from_n(q_cons_vf) + subroutine s_comp_alpha_from_n(q_cons_vf) type(scalar_field), dimension(sys_size), intent(inout) :: q_cons_vf real(wp) :: nR3bar integer(wp) :: i, j, k, l - $:GPU_PARALLEL_LOOP(collapse=3) - do l = 0, p - do k = 0, n - do j = 0, m - nR3bar = 0._wp - $:GPU_LOOP(parallelism='[seq]') - do i = 1, nb - nR3bar = nR3bar + weight(i)*(q_cons_vf(rs(i))%sf(j, k, l))**3._wp + #:call GPU_PARALLEL_LOOP(collapse=3) + do l = 0, p + do k = 0, n + do j = 0, m + nR3bar = 0._wp + $:GPU_LOOP(parallelism='[seq]') + do i = 1, nb + nR3bar = nR3bar + weight(i)*(q_cons_vf(rs(i))%sf(j, k, l))**3._wp + end do + q_cons_vf(alf_idx)%sf(j, k, l) = (4._wp*pi*nR3bar)/(3._wp*q_cons_vf(n_idx)%sf(j, k, l)**2._wp) end do - q_cons_vf(alf_idx)%sf(j, k, l) = (4._wp*pi*nR3bar)/(3._wp*q_cons_vf(n_idx)%sf(j, k, l)**2._wp) end do end do - end do + #:endcall GPU_PARALLEL_LOOP end subroutine s_comp_alpha_from_n - pure subroutine s_compute_bubbles_EE_rhs(idir, q_prim_vf, divu_in) + subroutine s_compute_bubbles_EE_rhs(idir, q_prim_vf, divu_in) integer, intent(in) :: idir type(scalar_field), dimension(sys_size), intent(in) :: q_prim_vf @@ -103,47 +104,50 @@ contains if (idir == 1) then if (.not. qbmm) then - $:GPU_PARALLEL_LOOP(collapse=3) - do l = 0, p - do k = 0, n - do j = 0, m - divu_in%sf(j, k, l) = 0._wp - divu_in%sf(j, k, l) = & - 5.e-1_wp/dx(j)*(q_prim_vf(contxe + idir)%sf(j + 1, k, l) - & - q_prim_vf(contxe + idir)%sf(j - 1, k, l)) + #:call GPU_PARALLEL_LOOP(collapse=3) + do l = 0, p + do k = 0, n + do j = 0, m + divu_in%sf(j, k, l) = 0._wp + divu_in%sf(j, k, l) = & + 5.e-1_wp/dx(j)*(q_prim_vf(contxe + idir)%sf(j + 1, k, l) - & + q_prim_vf(contxe + idir)%sf(j - 1, k, l)) + end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP end if elseif (idir == 2) then - $:GPU_PARALLEL_LOOP(collapse=3) - do l = 0, p - do k = 0, n - do j = 0, m - divu_in%sf(j, k, l) = divu_in%sf(j, k, l) + & - 5.e-1_wp/dy(k)*(q_prim_vf(contxe + idir)%sf(j, k + 1, l) - & - q_prim_vf(contxe + idir)%sf(j, k - 1, l)) + #:call GPU_PARALLEL_LOOP(collapse=3) + do l = 0, p + do k = 0, n + do j = 0, m + divu_in%sf(j, k, l) = divu_in%sf(j, k, l) + & + 5.e-1_wp/dy(k)*(q_prim_vf(contxe + idir)%sf(j, k + 1, l) - & + q_prim_vf(contxe + idir)%sf(j, k - 1, l)) + end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP elseif (idir == 3) then - $:GPU_PARALLEL_LOOP(collapse=3) - do l = 0, p - do k = 0, n - do j = 0, m - divu_in%sf(j, k, l) = divu_in%sf(j, k, l) + & - 5.e-1_wp/dz(l)*(q_prim_vf(contxe + idir)%sf(j, k, l + 1) - & - q_prim_vf(contxe + idir)%sf(j, k, l - 1)) + #:call GPU_PARALLEL_LOOP(collapse=3) + do l = 0, p + do k = 0, n + do j = 0, m + divu_in%sf(j, k, l) = divu_in%sf(j, k, l) + & + 5.e-1_wp/dz(l)*(q_prim_vf(contxe + idir)%sf(j, k, l + 1) - & + q_prim_vf(contxe + idir)%sf(j, k, l - 1)) + end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP end if @@ -173,177 +177,180 @@ contains integer :: dmBub_id !< Dummy variables for unified subgrid bubble subroutines real(wp) :: dmMass_v, dmMass_n, dmBeta_c, dmBeta_t, dmCson - $:GPU_PARALLEL_LOOP(collapse=3) - do l = 0, p - do k = 0, n - do j = 0, m - bub_adv_src(j, k, l) = 0._wp - - $:GPU_LOOP(parallelism='[seq]') - do q = 1, nb - bub_r_src(j, k, l, q) = 0._wp - bub_v_src(j, k, l, q) = 0._wp - bub_p_src(j, k, l, q) = 0._wp - bub_m_src(j, k, l, q) = 0._wp + #:call GPU_PARALLEL_LOOP(collapse=3) + do l = 0, p + do k = 0, n + do j = 0, m + bub_adv_src(j, k, l) = 0._wp + + $:GPU_LOOP(parallelism='[seq]') + do q = 1, nb + bub_r_src(j, k, l, q) = 0._wp + bub_v_src(j, k, l, q) = 0._wp + bub_p_src(j, k, l, q) = 0._wp + bub_m_src(j, k, l, q) = 0._wp + end do end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP adap_dt_stop_max = 0 - $:GPU_PARALLEL_LOOP(collapse=3, private='[Rtmp, Vtmp, myalpha_rho, myalpha]', & + #:call GPU_PARALLEL_LOOP(collapse=3, private='[Rtmp, Vtmp, myalpha_rho, myalpha]', & & reduction='[[adap_dt_stop_max]]', reductionOp='[MAX]', & & copy='[adap_dt_stop_max]') - do l = 0, p - do k = 0, n - do j = 0, m - - if (adv_n) then - nbub = q_prim_vf(n_idx)%sf(j, k, l) - else - $:GPU_LOOP(parallelism='[seq]') - do q = 1, nb - Rtmp(q) = q_prim_vf(rs(q))%sf(j, k, l) - Vtmp(q) = q_prim_vf(vs(q))%sf(j, k, l) - end do + do l = 0, p + do k = 0, n + do j = 0, m - R3 = 0._wp + if (adv_n) then + nbub = q_prim_vf(n_idx)%sf(j, k, l) + else + $:GPU_LOOP(parallelism='[seq]') + do q = 1, nb + Rtmp(q) = q_prim_vf(rs(q))%sf(j, k, l) + Vtmp(q) = q_prim_vf(vs(q))%sf(j, k, l) + end do - $:GPU_LOOP(parallelism='[seq]') - do q = 1, nb - R3 = R3 + weight(q)*Rtmp(q)**3._wp - end do + R3 = 0._wp - nbub = (3._wp/(4._wp*pi))*q_prim_vf(alf_idx)%sf(j, k, l)/R3 - end if + $:GPU_LOOP(parallelism='[seq]') + do q = 1, nb + R3 = R3 + weight(q)*Rtmp(q)**3._wp + end do - if (.not. adap_dt) then - R2Vav = 0._wp + nbub = (3._wp/(4._wp*pi))*q_prim_vf(alf_idx)%sf(j, k, l)/R3 + end if - $:GPU_LOOP(parallelism='[seq]') - do q = 1, nb - R2Vav = R2Vav + weight(q)*Rtmp(q)**2._wp*Vtmp(q) - end do + if (.not. adap_dt) then + R2Vav = 0._wp - bub_adv_src(j, k, l) = 4._wp*pi*nbub*R2Vav - end if + $:GPU_LOOP(parallelism='[seq]') + do q = 1, nb + R2Vav = R2Vav + weight(q)*Rtmp(q)**2._wp*Vtmp(q) + end do - $:GPU_LOOP(parallelism='[seq]') - do q = 1, nb + bub_adv_src(j, k, l) = 4._wp*pi*nbub*R2Vav + end if $:GPU_LOOP(parallelism='[seq]') - do ii = 1, num_fluids - myalpha_rho(ii) = q_cons_vf(ii)%sf(j, k, l) - myalpha(ii) = q_cons_vf(advxb + ii - 1)%sf(j, k, l) - end do - - myRho = 0._wp - n_tait = 0._wp - B_tait = 0._wp + do q = 1, nb - if (mpp_lim .and. (num_fluids > 2)) then $:GPU_LOOP(parallelism='[seq]') do ii = 1, num_fluids - myRho = myRho + myalpha_rho(ii) - n_tait = n_tait + myalpha(ii)*gammas(ii) - B_tait = B_tait + myalpha(ii)*pi_infs(ii) - end do - else if (num_fluids > 2) then - $:GPU_LOOP(parallelism='[seq]') - do ii = 1, num_fluids - 1 - myRho = myRho + myalpha_rho(ii) - n_tait = n_tait + myalpha(ii)*gammas(ii) - B_tait = B_tait + myalpha(ii)*pi_infs(ii) + myalpha_rho(ii) = q_cons_vf(ii)%sf(j, k, l) + myalpha(ii) = q_cons_vf(advxb + ii - 1)%sf(j, k, l) end do - else - myRho = myalpha_rho(1) - n_tait = gammas(1) - B_tait = pi_infs(1)/pi_fac - end if - n_tait = 1._wp/n_tait + 1._wp !make this the usual little 'gamma' - B_tait = B_tait*(n_tait - 1)/n_tait ! make this the usual pi_inf + myRho = 0._wp + n_tait = 0._wp + B_tait = 0._wp + + if (mpp_lim .and. (num_fluids > 2)) then + $:GPU_LOOP(parallelism='[seq]') + do ii = 1, num_fluids + myRho = myRho + myalpha_rho(ii) + n_tait = n_tait + myalpha(ii)*gammas(ii) + B_tait = B_tait + myalpha(ii)*pi_infs(ii) + end do + else if (num_fluids > 2) then + $:GPU_LOOP(parallelism='[seq]') + do ii = 1, num_fluids - 1 + myRho = myRho + myalpha_rho(ii) + n_tait = n_tait + myalpha(ii)*gammas(ii) + B_tait = B_tait + myalpha(ii)*pi_infs(ii) + end do + else + myRho = myalpha_rho(1) + n_tait = gammas(1) + B_tait = pi_infs(1)/pi_fac + end if - myRho = q_prim_vf(1)%sf(j, k, l) - myP = q_prim_vf(E_idx)%sf(j, k, l) - alf = q_prim_vf(alf_idx)%sf(j, k, l) - myR = q_prim_vf(rs(q))%sf(j, k, l) - myV = q_prim_vf(vs(q))%sf(j, k, l) + n_tait = 1._wp/n_tait + 1._wp !make this the usual little 'gamma' + B_tait = B_tait*(n_tait - 1)/n_tait ! make this the usual pi_inf - if (.not. polytropic) then - pb_local = q_prim_vf(ps(q))%sf(j, k, l) - mv_local = q_prim_vf(ms(q))%sf(j, k, l) - call s_bwproperty(pb_local, q, chi_vw, k_mw, rho_mw) - call s_vflux(myR, myV, pb_local, mv_local, q, vflux) - pbdot = f_bpres_dot(vflux, myR, myV, pb_local, mv_local, q) + myRho = q_prim_vf(1)%sf(j, k, l) + myP = q_prim_vf(E_idx)%sf(j, k, l) + alf = q_prim_vf(alf_idx)%sf(j, k, l) + myR = q_prim_vf(rs(q))%sf(j, k, l) + myV = q_prim_vf(vs(q))%sf(j, k, l) - bub_p_src(j, k, l, q) = nbub*pbdot - bub_m_src(j, k, l, q) = nbub*vflux*4._wp*pi*(myR**2._wp) - else - pb_local = 0._wp; mv_local = 0._wp; vflux = 0._wp; pbdot = 0._wp - end if + if (.not. polytropic) then + pb_local = q_prim_vf(ps(q))%sf(j, k, l) + mv_local = q_prim_vf(ms(q))%sf(j, k, l) + call s_bwproperty(pb_local, q, chi_vw, k_mw, rho_mw) + call s_vflux(myR, myV, pb_local, mv_local, q, vflux) + pbdot = f_bpres_dot(vflux, myR, myV, pb_local, mv_local, q) + + bub_p_src(j, k, l, q) = nbub*pbdot + bub_m_src(j, k, l, q) = nbub*vflux*4._wp*pi*(myR**2._wp) + else + pb_local = 0._wp; mv_local = 0._wp; vflux = 0._wp; pbdot = 0._wp + end if - ! Adaptive time stepping - adap_dt_stop = 0 + ! Adaptive time stepping + adap_dt_stop = 0 - if (adap_dt) then + if (adap_dt) then - call s_advance_step(myRho, myP, myR, myV, R0(q), & - pb_local, pbdot, alf, n_tait, B_tait, & - bub_adv_src(j, k, l), divu_in%sf(j, k, l), & - dmBub_id, dmMass_v, dmMass_n, dmBeta_c, & - dmBeta_t, dmCson, adap_dt_stop) + call s_advance_step(myRho, myP, myR, myV, R0(q), & + pb_local, pbdot, alf, n_tait, B_tait, & + bub_adv_src(j, k, l), divu_in%sf(j, k, l), & + dmBub_id, dmMass_v, dmMass_n, dmBeta_c, & + dmBeta_t, dmCson, adap_dt_stop) - q_cons_vf(rs(q))%sf(j, k, l) = nbub*myR - q_cons_vf(vs(q))%sf(j, k, l) = nbub*myV + q_cons_vf(rs(q))%sf(j, k, l) = nbub*myR + q_cons_vf(vs(q))%sf(j, k, l) = nbub*myV - else - rddot = f_rddot(myRho, myP, myR, myV, R0(q), & - pb_local, pbdot, alf, n_tait, B_tait, & - bub_adv_src(j, k, l), divu_in%sf(j, k, l), & - dmCson) - bub_v_src(j, k, l, q) = nbub*rddot - bub_r_src(j, k, l, q) = q_cons_vf(vs(q))%sf(j, k, l) - end if + else + rddot = f_rddot(myRho, myP, myR, myV, R0(q), & + pb_local, pbdot, alf, n_tait, B_tait, & + bub_adv_src(j, k, l), divu_in%sf(j, k, l), & + dmCson) + bub_v_src(j, k, l, q) = nbub*rddot + bub_r_src(j, k, l, q) = q_cons_vf(vs(q))%sf(j, k, l) + end if - adap_dt_stop_max = max(adap_dt_stop_max, adap_dt_stop) + adap_dt_stop_max = max(adap_dt_stop_max, adap_dt_stop) - if (alf < 1.e-11_wp) then - bub_adv_src(j, k, l) = 0._wp - bub_r_src(j, k, l, q) = 0._wp - bub_v_src(j, k, l, q) = 0._wp - if (.not. polytropic) then - bub_p_src(j, k, l, q) = 0._wp - bub_m_src(j, k, l, q) = 0._wp + if (alf < 1.e-11_wp) then + bub_adv_src(j, k, l) = 0._wp + bub_r_src(j, k, l, q) = 0._wp + bub_v_src(j, k, l, q) = 0._wp + if (.not. polytropic) then + bub_p_src(j, k, l, q) = 0._wp + bub_m_src(j, k, l, q) = 0._wp + end if end if - end if + end do end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP if (adap_dt .and. adap_dt_stop_max > 0) call s_mpi_abort("Adaptive time stepping failed to converge.") if (.not. adap_dt) then - $:GPU_PARALLEL_LOOP(collapse=3) - do l = 0, p - do q = 0, n - do i = 0, m - rhs_vf(alf_idx)%sf(i, q, l) = rhs_vf(alf_idx)%sf(i, q, l) + bub_adv_src(i, q, l) - if (num_fluids > 1) rhs_vf(advxb)%sf(i, q, l) = & - rhs_vf(advxb)%sf(i, q, l) - bub_adv_src(i, q, l) - $:GPU_LOOP(parallelism='[seq]') - do k = 1, nb - rhs_vf(rs(k))%sf(i, q, l) = rhs_vf(rs(k))%sf(i, q, l) + bub_r_src(i, q, l, k) - rhs_vf(vs(k))%sf(i, q, l) = rhs_vf(vs(k))%sf(i, q, l) + bub_v_src(i, q, l, k) - if (polytropic .neqv. .true.) then - rhs_vf(ps(k))%sf(i, q, l) = rhs_vf(ps(k))%sf(i, q, l) + bub_p_src(i, q, l, k) - rhs_vf(ms(k))%sf(i, q, l) = rhs_vf(ms(k))%sf(i, q, l) + bub_m_src(i, q, l, k) - end if + #:call GPU_PARALLEL_LOOP(collapse=3) + do l = 0, p + do q = 0, n + do i = 0, m + rhs_vf(alf_idx)%sf(i, q, l) = rhs_vf(alf_idx)%sf(i, q, l) + bub_adv_src(i, q, l) + if (num_fluids > 1) rhs_vf(advxb)%sf(i, q, l) = & + rhs_vf(advxb)%sf(i, q, l) - bub_adv_src(i, q, l) + $:GPU_LOOP(parallelism='[seq]') + do k = 1, nb + rhs_vf(rs(k))%sf(i, q, l) = rhs_vf(rs(k))%sf(i, q, l) + bub_r_src(i, q, l, k) + rhs_vf(vs(k))%sf(i, q, l) = rhs_vf(vs(k))%sf(i, q, l) + bub_v_src(i, q, l, k) + if (polytropic .neqv. .true.) then + rhs_vf(ps(k))%sf(i, q, l) = rhs_vf(ps(k))%sf(i, q, l) + bub_p_src(i, q, l, k) + rhs_vf(ms(k))%sf(i, q, l) = rhs_vf(ms(k))%sf(i, q, l) + bub_m_src(i, q, l, k) + end if + end do end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP end if end subroutine s_compute_bubble_EE_source diff --git a/src/simulation/m_bubbles_EL.fpp b/src/simulation/m_bubbles_EL.fpp index f5dbc1c628..73c7141171 100644 --- a/src/simulation/m_bubbles_EL.fpp +++ b/src/simulation/m_bubbles_EL.fpp @@ -529,104 +529,107 @@ contains ! Subgrid p_inf model based on Maeda and Colonius (2018). if (lag_params%pressure_corrector) then ! Calculate velocity potentials (valid for one bubble per cell) - $:GPU_PARALLEL_LOOP(private='[k,cell]') - do k = 1, nBubs - call s_get_pinf(k, q_prim_vf, 2, paux, cell, preterm1, term2, Romega) - myR0 = bub_R0(k) - myR = intfc_rad(k, 2) - myV = intfc_vel(k, 2) - myPb = gas_p(k, 2) - pint = f_cpbw_KM(myR0, myR, myV, myPb) - pint = pint + 0.5_wp*myV**2._wp - if (lag_params%cluster_type == 2) then - bub_dphidt(k) = (paux - pint) + term2 - ! Accounting for the potential induced by the bubble averaged over the control volume - ! Note that this is based on the incompressible flow assumption near the bubble. - term1_fac = 3._wp/2._wp*(myR*(Romega**2._wp - myR**2._wp))/(Romega**3._wp - myR**3._wp) - bub_dphidt(k) = bub_dphidt(k)/(1._wp - term1_fac) - end if - end do + #:call GPU_PARALLEL_LOOP(private='[k,cell]') + do k = 1, nBubs + call s_get_pinf(k, q_prim_vf, 2, paux, cell, preterm1, term2, Romega) + myR0 = bub_R0(k) + myR = intfc_rad(k, 2) + myV = intfc_vel(k, 2) + myPb = gas_p(k, 2) + pint = f_cpbw_KM(myR0, myR, myV, myPb) + pint = pint + 0.5_wp*myV**2._wp + if (lag_params%cluster_type == 2) then + bub_dphidt(k) = (paux - pint) + term2 + ! Accounting for the potential induced by the bubble averaged over the control volume + ! Note that this is based on the incompressible flow assumption near the bubble. + term1_fac = 3._wp/2._wp*(myR*(Romega**2._wp - myR**2._wp))/(Romega**3._wp - myR**3._wp) + bub_dphidt(k) = bub_dphidt(k)/(1._wp - term1_fac) + end if + end do + #:endcall GPU_PARALLEL_LOOP end if ! Radial motion model adap_dt_stop_max = 0 - $:GPU_PARALLEL_LOOP(private='[k,myalpha_rho,myalpha,Re,cell]', & + #:call GPU_PARALLEL_LOOP(private='[k,myalpha_rho,myalpha,Re,cell]', & & reduction='[[adap_dt_stop_max]]',reductionOp='[MAX]', & & copy='[adap_dt_stop_max]',copyin='[stage]') - do k = 1, nBubs - ! Keller-Miksis model - - ! Current bubble state - myPb = gas_p(k, 2) - myMass_n = gas_mg(k) - myMass_v = gas_mv(k, 2) - myR = intfc_rad(k, 2) - myV = intfc_vel(k, 2) - myBeta_c = gas_betaC(k) - myBeta_t = gas_betaT(k) - myR0 = bub_R0(k) - - ! Vapor and heat fluxes - call s_vflux(myR, myV, myPb, myMass_v, k, myVapFlux, myMass_n, myBeta_c, myR_m, mygamma_m) - myPbdot = f_bpres_dot(myVapFlux, myR, myV, myPb, myMass_v, k, myBeta_t, myR_m, mygamma_m) - myMvdot = 4._wp*pi*myR**2._wp*myVapFlux - - ! Obtaining driving pressure - call s_get_pinf(k, q_prim_vf, 1, myPinf, cell, aux1, aux2) - - ! Obtain liquid density and computing speed of sound from pinf - $:GPU_LOOP(parallelism='[seq]') - do i = 1, num_fluids - myalpha_rho(i) = q_prim_vf(i)%sf(cell(1), cell(2), cell(3)) - myalpha(i) = q_prim_vf(E_idx + i)%sf(cell(1), cell(2), cell(3)) - end do - call s_convert_species_to_mixture_variables_acc(myRho, gamma, pi_inf, qv, myalpha, & - myalpha_rho, Re) - call s_compute_cson_from_pinf(q_prim_vf, myPinf, cell, myRho, gamma, pi_inf, myCson) + do k = 1, nBubs + ! Keller-Miksis model + + ! Current bubble state + myPb = gas_p(k, 2) + myMass_n = gas_mg(k) + myMass_v = gas_mv(k, 2) + myR = intfc_rad(k, 2) + myV = intfc_vel(k, 2) + myBeta_c = gas_betaC(k) + myBeta_t = gas_betaT(k) + myR0 = bub_R0(k) - ! Adaptive time stepping - adap_dt_stop = 0 + ! Vapor and heat fluxes + call s_vflux(myR, myV, myPb, myMass_v, k, myVapFlux, myMass_n, myBeta_c, myR_m, mygamma_m) + myPbdot = f_bpres_dot(myVapFlux, myR, myV, myPb, myMass_v, k, myBeta_t, myR_m, mygamma_m) + myMvdot = 4._wp*pi*myR**2._wp*myVapFlux - if (adap_dt) then + ! Obtaining driving pressure + call s_get_pinf(k, q_prim_vf, 1, myPinf, cell, aux1, aux2) - call s_advance_step(myRho, myPinf, myR, myV, myR0, myPb, myPbdot, dmalf, & - dmntait, dmBtait, dm_bub_adv_src, dm_divu, & - k, myMass_v, myMass_n, myBeta_c, & - myBeta_t, myCson, adap_dt_stop) + ! Obtain liquid density and computing speed of sound from pinf + $:GPU_LOOP(parallelism='[seq]') + do i = 1, num_fluids + myalpha_rho(i) = q_prim_vf(i)%sf(cell(1), cell(2), cell(3)) + myalpha(i) = q_prim_vf(E_idx + i)%sf(cell(1), cell(2), cell(3)) + end do + call s_convert_species_to_mixture_variables_acc(myRho, gamma, pi_inf, qv, myalpha, & + myalpha_rho, Re) + call s_compute_cson_from_pinf(q_prim_vf, myPinf, cell, myRho, gamma, pi_inf, myCson) - ! Update bubble state - intfc_rad(k, 1) = myR - intfc_vel(k, 1) = myV - gas_p(k, 1) = myPb - gas_mv(k, 1) = myMass_v + ! Adaptive time stepping + adap_dt_stop = 0 - else + if (adap_dt) then - ! Radial acceleration from bubble models - intfc_dveldt(k, stage) = f_rddot(myRho, myPinf, myR, myV, myR0, & - myPb, myPbdot, dmalf, dmntait, dmBtait, & - dm_bub_adv_src, dm_divu, & - myCson) - intfc_draddt(k, stage) = myV - gas_dmvdt(k, stage) = myMvdot - gas_dpdt(k, stage) = myPbdot + call s_advance_step(myRho, myPinf, myR, myV, myR0, myPb, myPbdot, dmalf, & + dmntait, dmBtait, dm_bub_adv_src, dm_divu, & + k, myMass_v, myMass_n, myBeta_c, & + myBeta_t, myCson, adap_dt_stop) - end if + ! Update bubble state + intfc_rad(k, 1) = myR + intfc_vel(k, 1) = myV + gas_p(k, 1) = myPb + gas_mv(k, 1) = myMass_v - adap_dt_stop_max = max(adap_dt_stop_max, adap_dt_stop) + else - end do + ! Radial acceleration from bubble models + intfc_dveldt(k, stage) = f_rddot(myRho, myPinf, myR, myV, myR0, & + myPb, myPbdot, dmalf, dmntait, dmBtait, & + dm_bub_adv_src, dm_divu, & + myCson) + intfc_draddt(k, stage) = myV + gas_dmvdt(k, stage) = myMvdot + gas_dpdt(k, stage) = myPbdot + + end if + + adap_dt_stop_max = max(adap_dt_stop_max, adap_dt_stop) + + end do + #:endcall GPU_PARALLEL_LOOP if (adap_dt .and. adap_dt_stop_max > 0) call s_mpi_abort("Adaptive time stepping failed to converge.") ! Bubbles remain in a fixed position - $:GPU_PARALLEL_LOOP(collapse=2, private='[k]', copyin='[stage]') - do k = 1, nBubs - do l = 1, 3 - mtn_dposdt(k, l, stage) = 0._wp - mtn_dveldt(k, l, stage) = 0._wp + #:call GPU_PARALLEL_LOOP(collapse=2, private='[k]', copyin='[stage]') + do k = 1, nBubs + do l = 1, 3 + mtn_dposdt(k, l, stage) = 0._wp + mtn_dveldt(k, l, stage) = 0._wp + end do end do - end do + #:endcall GPU_PARALLEL_LOOP call nvtxEndRange @@ -650,80 +653,85 @@ contains if (lag_params%solver_approach == 2) then if (p == 0) then - $:GPU_PARALLEL_LOOP(collapse=4) - do k = 0, p - do j = 0, n - do i = 0, m - do l = 1, E_idx - if (q_beta%vf(1)%sf(i, j, k) > (1._wp - lag_params%valmaxvoid)) then - rhs_vf(l)%sf(i, j, k) = rhs_vf(l)%sf(i, j, k) + & - q_cons_vf(l)%sf(i, j, k)*(q_beta%vf(2)%sf(i, j, k) + & - q_beta%vf(5)%sf(i, j, k)) - - end if + #:call GPU_PARALLEL_LOOP(collapse=4) + do k = 0, p + do j = 0, n + do i = 0, m + do l = 1, E_idx + if (q_beta%vf(1)%sf(i, j, k) > (1._wp - lag_params%valmaxvoid)) then + rhs_vf(l)%sf(i, j, k) = rhs_vf(l)%sf(i, j, k) + & + q_cons_vf(l)%sf(i, j, k)*(q_beta%vf(2)%sf(i, j, k) + & + q_beta%vf(5)%sf(i, j, k)) + + end if + end do end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP else - $:GPU_PARALLEL_LOOP(collapse=4) - do k = 0, p - do j = 0, n - do i = 0, m - do l = 1, E_idx - if (q_beta%vf(1)%sf(i, j, k) > (1._wp - lag_params%valmaxvoid)) then - rhs_vf(l)%sf(i, j, k) = rhs_vf(l)%sf(i, j, k) + & - q_cons_vf(l)%sf(i, j, k)/q_beta%vf(1)%sf(i, j, k)* & - q_beta%vf(2)%sf(i, j, k) - end if + #:call GPU_PARALLEL_LOOP(collapse=4) + do k = 0, p + do j = 0, n + do i = 0, m + do l = 1, E_idx + if (q_beta%vf(1)%sf(i, j, k) > (1._wp - lag_params%valmaxvoid)) then + rhs_vf(l)%sf(i, j, k) = rhs_vf(l)%sf(i, j, k) + & + q_cons_vf(l)%sf(i, j, k)/q_beta%vf(1)%sf(i, j, k)* & + q_beta%vf(2)%sf(i, j, k) + end if + end do end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP end if do l = 1, num_dims call s_gradient_dir(q_prim_vf(E_idx), q_beta%vf(3), l) - $:GPU_PARALLEL_LOOP(collapse=3) - do k = 0, p - do j = 0, n - do i = 0, m - if (q_beta%vf(1)%sf(i, j, k) > (1._wp - lag_params%valmaxvoid)) then - rhs_vf(contxe + l)%sf(i, j, k) = rhs_vf(contxe + l)%sf(i, j, k) - & - (1._wp - q_beta%vf(1)%sf(i, j, k))/ & - q_beta%vf(1)%sf(i, j, k)* & - q_beta%vf(3)%sf(i, j, k) - end if + #:call GPU_PARALLEL_LOOP(collapse=3) + do k = 0, p + do j = 0, n + do i = 0, m + if (q_beta%vf(1)%sf(i, j, k) > (1._wp - lag_params%valmaxvoid)) then + rhs_vf(contxe + l)%sf(i, j, k) = rhs_vf(contxe + l)%sf(i, j, k) - & + (1._wp - q_beta%vf(1)%sf(i, j, k))/ & + q_beta%vf(1)%sf(i, j, k)* & + q_beta%vf(3)%sf(i, j, k) + end if + end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP !source in energy - $:GPU_PARALLEL_LOOP(collapse=3) - do k = idwbuff(3)%beg, idwbuff(3)%end - do j = idwbuff(2)%beg, idwbuff(2)%end - do i = idwbuff(1)%beg, idwbuff(1)%end - q_beta%vf(3)%sf(i, j, k) = q_prim_vf(E_idx)%sf(i, j, k)*q_prim_vf(contxe + l)%sf(i, j, k) + #:call GPU_PARALLEL_LOOP(collapse=3) + do k = idwbuff(3)%beg, idwbuff(3)%end + do j = idwbuff(2)%beg, idwbuff(2)%end + do i = idwbuff(1)%beg, idwbuff(1)%end + q_beta%vf(3)%sf(i, j, k) = q_prim_vf(E_idx)%sf(i, j, k)*q_prim_vf(contxe + l)%sf(i, j, k) + end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP call s_gradient_dir(q_beta%vf(3), q_beta%vf(4), l) - $:GPU_PARALLEL_LOOP(collapse=3) - do k = 0, p - do j = 0, n - do i = 0, m - if (q_beta%vf(1)%sf(i, j, k) > (1._wp - lag_params%valmaxvoid)) then - rhs_vf(E_idx)%sf(i, j, k) = rhs_vf(E_idx)%sf(i, j, k) - & - q_beta%vf(4)%sf(i, j, k)*(1._wp - q_beta%vf(1)%sf(i, j, k))/ & - q_beta%vf(1)%sf(i, j, k) - end if + #:call GPU_PARALLEL_LOOP(collapse=3) + do k = 0, p + do j = 0, n + do i = 0, m + if (q_beta%vf(1)%sf(i, j, k) > (1._wp - lag_params%valmaxvoid)) then + rhs_vf(E_idx)%sf(i, j, k) = rhs_vf(E_idx)%sf(i, j, k) - & + q_beta%vf(4)%sf(i, j, k)*(1._wp - q_beta%vf(1)%sf(i, j, k))/ & + q_beta%vf(1)%sf(i, j, k) + end if + end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP end do end if @@ -739,7 +747,7 @@ contains !! @param gamma Liquid specific heat ratio !! @param pi_inf Liquid stiffness !! @param cson Calculated speed of sound - pure subroutine s_compute_cson_from_pinf(q_prim_vf, pinf, cell, rhol, gamma, pi_inf, cson) + subroutine s_compute_cson_from_pinf(q_prim_vf, pinf, cell, rhol, gamma, pi_inf, cson) $:GPU_ROUTINE(function_name='s_compute_cson_from_pinf', & & parallelism='[seq]', cray_inline=True) @@ -769,32 +777,34 @@ contains call nvtxStartRange("BUBBLES-LAGRANGE-KERNELS") - $:GPU_PARALLEL_LOOP(collapse=4) - do i = 1, q_beta_idx - do l = idwbuff(3)%beg, idwbuff(3)%end - do k = idwbuff(2)%beg, idwbuff(2)%end - do j = idwbuff(1)%beg, idwbuff(1)%end - q_beta%vf(i)%sf(j, k, l) = 0._wp + #:call GPU_PARALLEL_LOOP(collapse=4) + do i = 1, q_beta_idx + do l = idwbuff(3)%beg, idwbuff(3)%end + do k = idwbuff(2)%beg, idwbuff(2)%end + do j = idwbuff(1)%beg, idwbuff(1)%end + q_beta%vf(i)%sf(j, k, l) = 0._wp + end do end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP call s_smoothfunction(nBubs, intfc_rad, intfc_vel, & mtn_s, mtn_pos, q_beta) !Store 1-beta - $:GPU_PARALLEL_LOOP(collapse=3) - do l = idwbuff(3)%beg, idwbuff(3)%end - do k = idwbuff(2)%beg, idwbuff(2)%end - do j = idwbuff(1)%beg, idwbuff(1)%end - q_beta%vf(1)%sf(j, k, l) = 1._wp - q_beta%vf(1)%sf(j, k, l) - ! Limiting void fraction given max value - q_beta%vf(1)%sf(j, k, l) = max(q_beta%vf(1)%sf(j, k, l), & - 1._wp - lag_params%valmaxvoid) + #:call GPU_PARALLEL_LOOP(collapse=3) + do l = idwbuff(3)%beg, idwbuff(3)%end + do k = idwbuff(2)%beg, idwbuff(2)%end + do j = idwbuff(1)%beg, idwbuff(1)%end + q_beta%vf(1)%sf(j, k, l) = 1._wp - q_beta%vf(1)%sf(j, k, l) + ! Limiting void fraction given max value + q_beta%vf(1)%sf(j, k, l) = max(q_beta%vf(1)%sf(j, k, l), & + 1._wp - lag_params%valmaxvoid) + end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP call nvtxEndRange @@ -807,7 +817,7 @@ contains !! @param f_pinfl Driving pressure !! @param cell Bubble cell !! @param Romega Control volume radius - pure subroutine s_get_pinf(bub_id, q_prim_vf, ptype, f_pinfl, cell, preterm1, term2, Romega) + subroutine s_get_pinf(bub_id, q_prim_vf, ptype, f_pinfl, cell, preterm1, term2, Romega) $:GPU_ROUTINE(function_name='s_get_pinf',parallelism='[seq]', & & cray_inline=True) @@ -1025,16 +1035,17 @@ contains integer :: k if (time_stepper == 1) then ! 1st order TVD RK - $:GPU_PARALLEL_LOOP(private='[k]') - do k = 1, nBubs - !u{1} = u{n} + dt * RHS{n} - intfc_rad(k, 1) = intfc_rad(k, 1) + dt*intfc_draddt(k, 1) - intfc_vel(k, 1) = intfc_vel(k, 1) + dt*intfc_dveldt(k, 1) - mtn_pos(k, 1:3, 1) = mtn_pos(k, 1:3, 1) + dt*mtn_dposdt(k, 1:3, 1) - mtn_vel(k, 1:3, 1) = mtn_vel(k, 1:3, 1) + dt*mtn_dveldt(k, 1:3, 1) - gas_p(k, 1) = gas_p(k, 1) + dt*gas_dpdt(k, 1) - gas_mv(k, 1) = gas_mv(k, 1) + dt*gas_dmvdt(k, 1) - end do + #:call GPU_PARALLEL_LOOP(private='[k]') + do k = 1, nBubs + !u{1} = u{n} + dt * RHS{n} + intfc_rad(k, 1) = intfc_rad(k, 1) + dt*intfc_draddt(k, 1) + intfc_vel(k, 1) = intfc_vel(k, 1) + dt*intfc_dveldt(k, 1) + mtn_pos(k, 1:3, 1) = mtn_pos(k, 1:3, 1) + dt*mtn_dposdt(k, 1:3, 1) + mtn_vel(k, 1:3, 1) = mtn_vel(k, 1:3, 1) + dt*mtn_dveldt(k, 1:3, 1) + gas_p(k, 1) = gas_p(k, 1) + dt*gas_dpdt(k, 1) + gas_mv(k, 1) = gas_mv(k, 1) + dt*gas_dmvdt(k, 1) + end do + #:endcall GPU_PARALLEL_LOOP call s_transfer_data_to_tmp() call s_write_void_evol(mytime) @@ -1047,28 +1058,30 @@ contains elseif (time_stepper == 2) then ! 2nd order TVD RK if (stage == 1) then - $:GPU_PARALLEL_LOOP(private='[k]') - do k = 1, nBubs - !u{1} = u{n} + dt * RHS{n} - intfc_rad(k, 2) = intfc_rad(k, 1) + dt*intfc_draddt(k, 1) - intfc_vel(k, 2) = intfc_vel(k, 1) + dt*intfc_dveldt(k, 1) - mtn_pos(k, 1:3, 2) = mtn_pos(k, 1:3, 1) + dt*mtn_dposdt(k, 1:3, 1) - mtn_vel(k, 1:3, 2) = mtn_vel(k, 1:3, 1) + dt*mtn_dveldt(k, 1:3, 1) - gas_p(k, 2) = gas_p(k, 1) + dt*gas_dpdt(k, 1) - gas_mv(k, 2) = gas_mv(k, 1) + dt*gas_dmvdt(k, 1) - end do + #:call GPU_PARALLEL_LOOP(private='[k]') + do k = 1, nBubs + !u{1} = u{n} + dt * RHS{n} + intfc_rad(k, 2) = intfc_rad(k, 1) + dt*intfc_draddt(k, 1) + intfc_vel(k, 2) = intfc_vel(k, 1) + dt*intfc_dveldt(k, 1) + mtn_pos(k, 1:3, 2) = mtn_pos(k, 1:3, 1) + dt*mtn_dposdt(k, 1:3, 1) + mtn_vel(k, 1:3, 2) = mtn_vel(k, 1:3, 1) + dt*mtn_dveldt(k, 1:3, 1) + gas_p(k, 2) = gas_p(k, 1) + dt*gas_dpdt(k, 1) + gas_mv(k, 2) = gas_mv(k, 1) + dt*gas_dmvdt(k, 1) + end do + #:endcall GPU_PARALLEL_LOOP elseif (stage == 2) then - $:GPU_PARALLEL_LOOP(private='[k]') - do k = 1, nBubs - !u{1} = u{n} + (1/2) * dt * (RHS{n} + RHS{1}) - intfc_rad(k, 1) = intfc_rad(k, 1) + dt*(intfc_draddt(k, 1) + intfc_draddt(k, 2))/2._wp - intfc_vel(k, 1) = intfc_vel(k, 1) + dt*(intfc_dveldt(k, 1) + intfc_dveldt(k, 2))/2._wp - mtn_pos(k, 1:3, 1) = mtn_pos(k, 1:3, 1) + dt*(mtn_dposdt(k, 1:3, 1) + mtn_dposdt(k, 1:3, 2))/2._wp - mtn_vel(k, 1:3, 1) = mtn_vel(k, 1:3, 1) + dt*(mtn_dveldt(k, 1:3, 1) + mtn_dveldt(k, 1:3, 2))/2._wp - gas_p(k, 1) = gas_p(k, 1) + dt*(gas_dpdt(k, 1) + gas_dpdt(k, 2))/2._wp - gas_mv(k, 1) = gas_mv(k, 1) + dt*(gas_dmvdt(k, 1) + gas_dmvdt(k, 2))/2._wp - end do + #:call GPU_PARALLEL_LOOP(private='[k]') + do k = 1, nBubs + !u{1} = u{n} + (1/2) * dt * (RHS{n} + RHS{1}) + intfc_rad(k, 1) = intfc_rad(k, 1) + dt*(intfc_draddt(k, 1) + intfc_draddt(k, 2))/2._wp + intfc_vel(k, 1) = intfc_vel(k, 1) + dt*(intfc_dveldt(k, 1) + intfc_dveldt(k, 2))/2._wp + mtn_pos(k, 1:3, 1) = mtn_pos(k, 1:3, 1) + dt*(mtn_dposdt(k, 1:3, 1) + mtn_dposdt(k, 1:3, 2))/2._wp + mtn_vel(k, 1:3, 1) = mtn_vel(k, 1:3, 1) + dt*(mtn_dveldt(k, 1:3, 1) + mtn_dveldt(k, 1:3, 2))/2._wp + gas_p(k, 1) = gas_p(k, 1) + dt*(gas_dpdt(k, 1) + gas_dpdt(k, 2))/2._wp + gas_mv(k, 1) = gas_mv(k, 1) + dt*(gas_dmvdt(k, 1) + gas_dmvdt(k, 2))/2._wp + end do + #:endcall GPU_PARALLEL_LOOP call s_transfer_data_to_tmp() call s_write_void_evol(mytime) @@ -1083,39 +1096,42 @@ contains elseif (time_stepper == 3) then ! 3rd order TVD RK if (stage == 1) then - $:GPU_PARALLEL_LOOP(private='[k]') - do k = 1, nBubs - !u{1} = u{n} + dt * RHS{n} - intfc_rad(k, 2) = intfc_rad(k, 1) + dt*intfc_draddt(k, 1) - intfc_vel(k, 2) = intfc_vel(k, 1) + dt*intfc_dveldt(k, 1) - mtn_pos(k, 1:3, 2) = mtn_pos(k, 1:3, 1) + dt*mtn_dposdt(k, 1:3, 1) - mtn_vel(k, 1:3, 2) = mtn_vel(k, 1:3, 1) + dt*mtn_dveldt(k, 1:3, 1) - gas_p(k, 2) = gas_p(k, 1) + dt*gas_dpdt(k, 1) - gas_mv(k, 2) = gas_mv(k, 1) + dt*gas_dmvdt(k, 1) - end do + #:call GPU_PARALLEL_LOOP(private='[k]') + do k = 1, nBubs + !u{1} = u{n} + dt * RHS{n} + intfc_rad(k, 2) = intfc_rad(k, 1) + dt*intfc_draddt(k, 1) + intfc_vel(k, 2) = intfc_vel(k, 1) + dt*intfc_dveldt(k, 1) + mtn_pos(k, 1:3, 2) = mtn_pos(k, 1:3, 1) + dt*mtn_dposdt(k, 1:3, 1) + mtn_vel(k, 1:3, 2) = mtn_vel(k, 1:3, 1) + dt*mtn_dveldt(k, 1:3, 1) + gas_p(k, 2) = gas_p(k, 1) + dt*gas_dpdt(k, 1) + gas_mv(k, 2) = gas_mv(k, 1) + dt*gas_dmvdt(k, 1) + end do + #:endcall GPU_PARALLEL_LOOP elseif (stage == 2) then - $:GPU_PARALLEL_LOOP(private='[k]') - do k = 1, nBubs - !u{2} = u{n} + (1/4) * dt * [RHS{n} + RHS{1}] - intfc_rad(k, 2) = intfc_rad(k, 1) + dt*(intfc_draddt(k, 1) + intfc_draddt(k, 2))/4._wp - intfc_vel(k, 2) = intfc_vel(k, 1) + dt*(intfc_dveldt(k, 1) + intfc_dveldt(k, 2))/4._wp - mtn_pos(k, 1:3, 2) = mtn_pos(k, 1:3, 1) + dt*(mtn_dposdt(k, 1:3, 1) + mtn_dposdt(k, 1:3, 2))/4._wp - mtn_vel(k, 1:3, 2) = mtn_vel(k, 1:3, 1) + dt*(mtn_dveldt(k, 1:3, 1) + mtn_dveldt(k, 1:3, 2))/4._wp - gas_p(k, 2) = gas_p(k, 1) + dt*(gas_dpdt(k, 1) + gas_dpdt(k, 2))/4._wp - gas_mv(k, 2) = gas_mv(k, 1) + dt*(gas_dmvdt(k, 1) + gas_dmvdt(k, 2))/4._wp - end do + #:call GPU_PARALLEL_LOOP(private='[k]') + do k = 1, nBubs + !u{2} = u{n} + (1/4) * dt * [RHS{n} + RHS{1}] + intfc_rad(k, 2) = intfc_rad(k, 1) + dt*(intfc_draddt(k, 1) + intfc_draddt(k, 2))/4._wp + intfc_vel(k, 2) = intfc_vel(k, 1) + dt*(intfc_dveldt(k, 1) + intfc_dveldt(k, 2))/4._wp + mtn_pos(k, 1:3, 2) = mtn_pos(k, 1:3, 1) + dt*(mtn_dposdt(k, 1:3, 1) + mtn_dposdt(k, 1:3, 2))/4._wp + mtn_vel(k, 1:3, 2) = mtn_vel(k, 1:3, 1) + dt*(mtn_dveldt(k, 1:3, 1) + mtn_dveldt(k, 1:3, 2))/4._wp + gas_p(k, 2) = gas_p(k, 1) + dt*(gas_dpdt(k, 1) + gas_dpdt(k, 2))/4._wp + gas_mv(k, 2) = gas_mv(k, 1) + dt*(gas_dmvdt(k, 1) + gas_dmvdt(k, 2))/4._wp + end do + #:endcall GPU_PARALLEL_LOOP elseif (stage == 3) then - $:GPU_PARALLEL_LOOP(private='[k]') - do k = 1, nBubs - !u{n+1} = u{n} + (2/3) * dt * [(1/4)* RHS{n} + (1/4)* RHS{1} + RHS{2}] - intfc_rad(k, 1) = intfc_rad(k, 1) + (2._wp/3._wp)*dt*(intfc_draddt(k, 1)/4._wp + intfc_draddt(k, 2)/4._wp + intfc_draddt(k, 3)) - intfc_vel(k, 1) = intfc_vel(k, 1) + (2._wp/3._wp)*dt*(intfc_dveldt(k, 1)/4._wp + intfc_dveldt(k, 2)/4._wp + intfc_dveldt(k, 3)) - mtn_pos(k, 1:3, 1) = mtn_pos(k, 1:3, 1) + (2._wp/3._wp)*dt*(mtn_dposdt(k, 1:3, 1)/4._wp + mtn_dposdt(k, 1:3, 2)/4._wp + mtn_dposdt(k, 1:3, 3)) - mtn_vel(k, 1:3, 1) = mtn_vel(k, 1:3, 1) + (2._wp/3._wp)*dt*(mtn_dveldt(k, 1:3, 1)/4._wp + mtn_dveldt(k, 1:3, 2)/4._wp + mtn_dveldt(k, 1:3, 3)) - gas_p(k, 1) = gas_p(k, 1) + (2._wp/3._wp)*dt*(gas_dpdt(k, 1)/4._wp + gas_dpdt(k, 2)/4._wp + gas_dpdt(k, 3)) - gas_mv(k, 1) = gas_mv(k, 1) + (2._wp/3._wp)*dt*(gas_dmvdt(k, 1)/4._wp + gas_dmvdt(k, 2)/4._wp + gas_dmvdt(k, 3)) - end do + #:call GPU_PARALLEL_LOOP(private='[k]') + do k = 1, nBubs + !u{n+1} = u{n} + (2/3) * dt * [(1/4)* RHS{n} + (1/4)* RHS{1} + RHS{2}] + intfc_rad(k, 1) = intfc_rad(k, 1) + (2._wp/3._wp)*dt*(intfc_draddt(k, 1)/4._wp + intfc_draddt(k, 2)/4._wp + intfc_draddt(k, 3)) + intfc_vel(k, 1) = intfc_vel(k, 1) + (2._wp/3._wp)*dt*(intfc_dveldt(k, 1)/4._wp + intfc_dveldt(k, 2)/4._wp + intfc_dveldt(k, 3)) + mtn_pos(k, 1:3, 1) = mtn_pos(k, 1:3, 1) + (2._wp/3._wp)*dt*(mtn_dposdt(k, 1:3, 1)/4._wp + mtn_dposdt(k, 1:3, 2)/4._wp + mtn_dposdt(k, 1:3, 3)) + mtn_vel(k, 1:3, 1) = mtn_vel(k, 1:3, 1) + (2._wp/3._wp)*dt*(mtn_dveldt(k, 1:3, 1)/4._wp + mtn_dveldt(k, 1:3, 2)/4._wp + mtn_dveldt(k, 1:3, 3)) + gas_p(k, 1) = gas_p(k, 1) + (2._wp/3._wp)*dt*(gas_dpdt(k, 1)/4._wp + gas_dpdt(k, 2)/4._wp + gas_dpdt(k, 3)) + gas_mv(k, 1) = gas_mv(k, 1) + (2._wp/3._wp)*dt*(gas_dmvdt(k, 1)/4._wp + gas_dmvdt(k, 2)/4._wp + gas_dmvdt(k, 3)) + end do + #:endcall GPU_PARALLEL_LOOP call s_transfer_data_to_tmp() call s_write_void_evol(mytime) @@ -1136,7 +1152,7 @@ contains !! @param pos Input coordinates !! @param cell Computational coordinate of the cell !! @param scoord Calculated particle coordinates - pure subroutine s_locate_cell(pos, cell, scoord) + subroutine s_locate_cell(pos, cell, scoord) real(wp), dimension(3), intent(in) :: pos real(wp), dimension(3), intent(out) :: scoord @@ -1192,24 +1208,25 @@ contains integer :: k - $:GPU_PARALLEL_LOOP(private='[k]') - do k = 1, nBubs - gas_p(k, 2) = gas_p(k, 1) - gas_mv(k, 2) = gas_mv(k, 1) - intfc_rad(k, 2) = intfc_rad(k, 1) - intfc_vel(k, 2) = intfc_vel(k, 1) - mtn_pos(k, 1:3, 2) = mtn_pos(k, 1:3, 1) - mtn_posPrev(k, 1:3, 2) = mtn_posPrev(k, 1:3, 1) - mtn_vel(k, 1:3, 2) = mtn_vel(k, 1:3, 1) - mtn_s(k, 1:3, 2) = mtn_s(k, 1:3, 1) - end do + #:call GPU_PARALLEL_LOOP(private='[k]') + do k = 1, nBubs + gas_p(k, 2) = gas_p(k, 1) + gas_mv(k, 2) = gas_mv(k, 1) + intfc_rad(k, 2) = intfc_rad(k, 1) + intfc_vel(k, 2) = intfc_vel(k, 1) + mtn_pos(k, 1:3, 2) = mtn_pos(k, 1:3, 1) + mtn_posPrev(k, 1:3, 2) = mtn_posPrev(k, 1:3, 1) + mtn_vel(k, 1:3, 2) = mtn_vel(k, 1:3, 1) + mtn_s(k, 1:3, 2) = mtn_s(k, 1:3, 1) + end do + #:endcall GPU_PARALLEL_LOOP end subroutine s_transfer_data_to_tmp !> The purpose of this procedure is to determine if the global coordinates of the bubbles !! are present in the current MPI processor (including ghost cells). !! @param pos_part Spatial coordinates of the bubble - pure function particle_in_domain(pos_part) + function particle_in_domain(pos_part) logical :: particle_in_domain real(wp), dimension(3), intent(in) :: pos_part @@ -1262,7 +1279,7 @@ contains !> The purpose of this procedure is to determine if the lagrangian bubble is located in the !! physical domain. The ghost cells are not part of the physical domain. !! @param pos_part Spatial coordinates of the bubble - pure function particle_in_domain_physical(pos_part) + function particle_in_domain_physical(pos_part) logical :: particle_in_domain_physical real(wp), dimension(3), intent(in) :: pos_part @@ -1281,7 +1298,7 @@ contains !! @param q Input scalar field !! @param dq Output gradient of q !! @param dir Gradient spatial direction - pure subroutine s_gradient_dir(q, dq, dir) + subroutine s_gradient_dir(q, dq, dir) type(scalar_field), intent(inout) :: q type(scalar_field), intent(inout) :: dq @@ -1291,47 +1308,50 @@ contains if (dir == 1) then ! Gradient in x dir. - $:GPU_PARALLEL_LOOP(collapse=3) - do k = 0, p - do j = 0, n - do i = 0, m - dq%sf(i, j, k) = q%sf(i, j, k)*(dx(i + 1) - dx(i - 1)) & - + q%sf(i + 1, j, k)*(dx(i) + dx(i - 1)) & - - q%sf(i - 1, j, k)*(dx(i) + dx(i + 1)) - dq%sf(i, j, k) = dq%sf(i, j, k)/ & - ((dx(i) + dx(i - 1))*(dx(i) + dx(i + 1))) - end do - end do - end do - else - if (dir == 2) then - ! Gradient in y dir. - $:GPU_PARALLEL_LOOP(collapse=3) + #:call GPU_PARALLEL_LOOP(collapse=3) do k = 0, p do j = 0, n do i = 0, m - dq%sf(i, j, k) = q%sf(i, j, k)*(dy(j + 1) - dy(j - 1)) & - + q%sf(i, j + 1, k)*(dy(j) + dy(j - 1)) & - - q%sf(i, j - 1, k)*(dy(j) + dy(j + 1)) + dq%sf(i, j, k) = q%sf(i, j, k)*(dx(i + 1) - dx(i - 1)) & + + q%sf(i + 1, j, k)*(dx(i) + dx(i - 1)) & + - q%sf(i - 1, j, k)*(dx(i) + dx(i + 1)) dq%sf(i, j, k) = dq%sf(i, j, k)/ & - ((dy(j) + dy(j - 1))*(dy(j) + dy(j + 1))) + ((dx(i) + dx(i - 1))*(dx(i) + dx(i + 1))) end do end do end do + #:endcall GPU_PARALLEL_LOOP + else + if (dir == 2) then + ! Gradient in y dir. + #:call GPU_PARALLEL_LOOP(collapse=3) + do k = 0, p + do j = 0, n + do i = 0, m + dq%sf(i, j, k) = q%sf(i, j, k)*(dy(j + 1) - dy(j - 1)) & + + q%sf(i, j + 1, k)*(dy(j) + dy(j - 1)) & + - q%sf(i, j - 1, k)*(dy(j) + dy(j + 1)) + dq%sf(i, j, k) = dq%sf(i, j, k)/ & + ((dy(j) + dy(j - 1))*(dy(j) + dy(j + 1))) + end do + end do + end do + #:endcall GPU_PARALLEL_LOOP else ! Gradient in z dir. - $:GPU_PARALLEL_LOOP(collapse=3) - do k = 0, p - do j = 0, n - do i = 0, m - dq%sf(i, j, k) = q%sf(i, j, k)*(dz(k + 1) - dz(k - 1)) & - + q%sf(i, j, k + 1)*(dz(k) + dz(k - 1)) & - - q%sf(i, j, k - 1)*(dz(k) + dz(k + 1)) - dq%sf(i, j, k) = dq%sf(i, j, k)/ & - ((dz(k) + dz(k - 1))*(dz(k) + dz(k + 1))) + #:call GPU_PARALLEL_LOOP(collapse=3) + do k = 0, p + do j = 0, n + do i = 0, m + dq%sf(i, j, k) = q%sf(i, j, k)*(dz(k + 1) - dz(k - 1)) & + + q%sf(i, j, k + 1)*(dz(k) + dz(k - 1)) & + - q%sf(i, j, k - 1)*(dz(k) + dz(k + 1)) + dq%sf(i, j, k) = dq%sf(i, j, k)/ & + ((dz(k) + dz(k - 1))*(dz(k) + dz(k + 1))) + end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP end if end if @@ -1414,21 +1434,20 @@ contains lag_void_max = 0._wp lag_void_avg = 0._wp lag_vol = 0._wp - $:GPU_PARALLEL_LOOP(collapse=3, reduction='[[lag_vol, lag_void_avg], & - & [lag_void_max]]', reductionOp='[+, MAX]', & - & copy='[lag_vol, lag_void_avg, lag_void_max]') - do k = 0, p - do j = 0, n - do i = 0, m - lag_void_max = max(lag_void_max, 1._wp - q_beta%vf(1)%sf(i, j, k)) - call s_get_char_vol(i, j, k, volcell) - if ((1._wp - q_beta%vf(1)%sf(i, j, k)) > 5.0d-11) then - lag_void_avg = lag_void_avg + (1._wp - q_beta%vf(1)%sf(i, j, k))*volcell - lag_vol = lag_vol + volcell - end if + #:call GPU_PARALLEL_LOOP(collapse=3, reduction='[[lag_vol, lag_void_avg], [lag_void_max]]', reductionOp='[+, MAX]', copy='[lag_vol, lag_void_avg, lag_void_max]') + do k = 0, p + do j = 0, n + do i = 0, m + lag_void_max = max(lag_void_max, 1._wp - q_beta%vf(1)%sf(i, j, k)) + call s_get_char_vol(i, j, k, volcell) + if ((1._wp - q_beta%vf(1)%sf(i, j, k)) > 5.0d-11) then + lag_void_avg = lag_void_avg + (1._wp - q_beta%vf(1)%sf(i, j, k))*volcell + lag_vol = lag_vol + volcell + end if + end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP #ifdef MFC_MPI if (num_procs > 1) then @@ -1600,14 +1619,15 @@ contains integer :: k - $:GPU_PARALLEL_LOOP(reduction='[[Rmax_glb], [Rmin_glb]]', & + #:call GPU_PARALLEL_LOOP(reduction='[[Rmax_glb], [Rmin_glb]]', & & reductionOp='[MAX, MIN]', copy='[Rmax_glb,Rmin_glb]') - do k = 1, nBubs - Rmax_glb = max(Rmax_glb, intfc_rad(k, 1)/bub_R0(k)) - Rmin_glb = min(Rmin_glb, intfc_rad(k, 1)/bub_R0(k)) - Rmax_stats(k) = max(Rmax_stats(k), intfc_rad(k, 1)/bub_R0(k)) - Rmin_stats(k) = min(Rmin_stats(k), intfc_rad(k, 1)/bub_R0(k)) - end do + do k = 1, nBubs + Rmax_glb = max(Rmax_glb, intfc_rad(k, 1)/bub_R0(k)) + Rmin_glb = min(Rmin_glb, intfc_rad(k, 1)/bub_R0(k)) + Rmax_stats(k) = max(Rmax_stats(k), intfc_rad(k, 1)/bub_R0(k)) + Rmin_stats(k) = min(Rmin_stats(k), intfc_rad(k, 1)/bub_R0(k)) + end do + #:endcall GPU_PARALLEL_LOOP end subroutine s_calculate_lag_bubble_stats diff --git a/src/simulation/m_bubbles_EL_kernels.fpp b/src/simulation/m_bubbles_EL_kernels.fpp index 48ea3bad9a..218eaa6ea6 100644 --- a/src/simulation/m_bubbles_EL_kernels.fpp +++ b/src/simulation/m_bubbles_EL_kernels.fpp @@ -22,7 +22,7 @@ contains !! @param lbk_s Computational coordinates of the bubbles !! @param lbk_pos Spatial coordinates of the bubbles !! @param updatedvar Eulerian variable to be updated - pure subroutine s_smoothfunction(nBubs, lbk_rad, lbk_vel, lbk_s, lbk_pos, updatedvar) + subroutine s_smoothfunction(nBubs, lbk_rad, lbk_vel, lbk_s, lbk_pos, updatedvar) integer, intent(in) :: nBubs real(wp), dimension(1:lag_params%nBubs_glb, 1:3, 1:2), intent(in) :: lbk_s, lbk_pos @@ -40,7 +40,7 @@ contains !> The purpose of this procedure contains the algorithm to use the delta kernel function to map the effect of the bubbles. !! The effect of the bubbles only affects the cell where the bubble is located. - pure subroutine s_deltafunc(nBubs, lbk_rad, lbk_vel, lbk_s, updatedvar) + subroutine s_deltafunc(nBubs, lbk_rad, lbk_vel, lbk_s, updatedvar) integer, intent(in) :: nBubs real(wp), dimension(1:lag_params%nBubs_glb, 1:3, 1:2), intent(in) :: lbk_s @@ -55,47 +55,48 @@ contains real(wp), dimension(3) :: s_coord integer :: l - $:GPU_PARALLEL_LOOP(private='[l,s_coord,cell]') - do l = 1, nBubs + #:call GPU_PARALLEL_LOOP(private='[l,s_coord,cell]') + do l = 1, nBubs - volpart = 4._wp/3._wp*pi*lbk_rad(l, 2)**3._wp - s_coord(1:3) = lbk_s(l, 1:3, 2) - call s_get_cell(s_coord, cell) + volpart = 4._wp/3._wp*pi*lbk_rad(l, 2)**3._wp + s_coord(1:3) = lbk_s(l, 1:3, 2) + call s_get_cell(s_coord, cell) - strength_vol = volpart - strength_vel = 4._wp*pi*lbk_rad(l, 2)**2._wp*lbk_vel(l, 2) + strength_vol = volpart + strength_vel = 4._wp*pi*lbk_rad(l, 2)**2._wp*lbk_vel(l, 2) - if (num_dims == 2) then - Vol = dx(cell(1))*dy(cell(2))*lag_params%charwidth - if (cyl_coord) Vol = dx(cell(1))*dy(cell(2))*y_cc(cell(2))*2._wp*pi - else - Vol = dx(cell(1))*dy(cell(2))*dz(cell(3)) - end if + if (num_dims == 2) then + Vol = dx(cell(1))*dy(cell(2))*lag_params%charwidth + if (cyl_coord) Vol = dx(cell(1))*dy(cell(2))*y_cc(cell(2))*2._wp*pi + else + Vol = dx(cell(1))*dy(cell(2))*dz(cell(3)) + end if - !Update void fraction field - addFun1 = strength_vol/Vol - $:GPU_ATOMIC(atomic='update') - updatedvar%vf(1)%sf(cell(1), cell(2), cell(3)) = updatedvar%vf(1)%sf(cell(1), cell(2), cell(3)) + addFun1 - - !Update time derivative of void fraction - addFun2 = strength_vel/Vol - $:GPU_ATOMIC(atomic='update') - updatedvar%vf(2)%sf(cell(1), cell(2), cell(3)) = updatedvar%vf(2)%sf(cell(1), cell(2), cell(3)) + addFun2 + !Update void fraction field + addFun1 = strength_vol/Vol + $:GPU_ATOMIC(atomic='update') + updatedvar%vf(1)%sf(cell(1), cell(2), cell(3)) = updatedvar%vf(1)%sf(cell(1), cell(2), cell(3)) + addFun1 - !Product of two smeared functions - !Update void fraction * time derivative of void fraction - if (lag_params%cluster_type >= 4) then - addFun3 = (strength_vol*strength_vel)/Vol + !Update time derivative of void fraction + addFun2 = strength_vel/Vol $:GPU_ATOMIC(atomic='update') - updatedvar%vf(5)%sf(cell(1), cell(2), cell(3)) = updatedvar%vf(5)%sf(cell(1), cell(2), cell(3)) + addFun3 - end if - end do + updatedvar%vf(2)%sf(cell(1), cell(2), cell(3)) = updatedvar%vf(2)%sf(cell(1), cell(2), cell(3)) + addFun2 + + !Product of two smeared functions + !Update void fraction * time derivative of void fraction + if (lag_params%cluster_type >= 4) then + addFun3 = (strength_vol*strength_vel)/Vol + $:GPU_ATOMIC(atomic='update') + updatedvar%vf(5)%sf(cell(1), cell(2), cell(3)) = updatedvar%vf(5)%sf(cell(1), cell(2), cell(3)) + addFun3 + end if + end do + #:endcall GPU_PARALLEL_LOOP end subroutine s_deltafunc !> The purpose of this procedure contains the algorithm to use the gaussian kernel function to map the effect of the bubbles. !! The effect of the bubbles affects the 3X3x3 cells that surround the bubble. - pure subroutine s_gaussian(nBubs, lbk_rad, lbk_vel, lbk_s, lbk_pos, updatedvar) + subroutine s_gaussian(nBubs, lbk_rad, lbk_vel, lbk_s, lbk_pos, updatedvar) integer, intent(in) :: nBubs real(wp), dimension(1:lag_params%nBubs_glb, 1:3, 1:2), intent(in) :: lbk_s, lbk_pos @@ -120,86 +121,87 @@ contains smearGridz = smearGrid if (p == 0) smearGridz = 1 - $:GPU_PARALLEL_LOOP(private='[nodecoord,l,s_coord,cell,center]', copyin='[smearGrid,smearGridz]') - do l = 1, nBubs - nodecoord(1:3) = 0 - center(1:3) = 0._wp - volpart = 4._wp/3._wp*pi*lbk_rad(l, 2)**3._wp - s_coord(1:3) = lbk_s(l, 1:3, 2) - center(1:2) = lbk_pos(l, 1:2, 2) - if (p > 0) center(3) = lbk_pos(l, 3, 2) - call s_get_cell(s_coord, cell) - call s_compute_stddsv(cell, volpart, stddsv) - - strength_vol = volpart - strength_vel = 4._wp*pi*lbk_rad(l, 2)**2._wp*lbk_vel(l, 2) - - $:GPU_LOOP(collapse=3,private='[cellaux,nodecoord]') - do i = 1, smearGrid - do j = 1, smearGrid - do k = 1, smearGridz - cellaux(1) = cell(1) + i - (mapCells + 1) - cellaux(2) = cell(2) + j - (mapCells + 1) - cellaux(3) = cell(3) + k - (mapCells + 1) - if (p == 0) cellaux(3) = 0 - - !Check if the cells intended to smear the bubbles in are in the computational domain - !and redefine the cells for symmetric boundary - call s_check_celloutside(cellaux, celloutside) - - if (.not. celloutside) then - - nodecoord(1) = x_cc(cellaux(1)) - nodecoord(2) = y_cc(cellaux(2)) - if (p > 0) nodecoord(3) = z_cc(cellaux(3)) - call s_applygaussian(center, cellaux, nodecoord, stddsv, 0._wp, func) - if (lag_params%cluster_type >= 4) call s_applygaussian(center, cellaux, nodecoord, stddsv, 1._wp, func2) - - ! Relocate cells for bubbles intersecting symmetric boundaries - if (any((/bc_x%beg, bc_x%end, bc_y%beg, bc_y%end, bc_z%beg, bc_z%end/) == BC_REFLECTIVE)) then - call s_shift_cell_symmetric_bc(cellaux, cell) - end if - else - func = 0._wp - func2 = 0._wp - cellaux(1) = cell(1) - cellaux(2) = cell(2) - cellaux(3) = cell(3) + #:call GPU_PARALLEL_LOOP(private='[nodecoord,l,s_coord,cell,center]', copyin='[smearGrid,smearGridz]') + do l = 1, nBubs + nodecoord(1:3) = 0 + center(1:3) = 0._wp + volpart = 4._wp/3._wp*pi*lbk_rad(l, 2)**3._wp + s_coord(1:3) = lbk_s(l, 1:3, 2) + center(1:2) = lbk_pos(l, 1:2, 2) + if (p > 0) center(3) = lbk_pos(l, 3, 2) + call s_get_cell(s_coord, cell) + call s_compute_stddsv(cell, volpart, stddsv) + + strength_vol = volpart + strength_vel = 4._wp*pi*lbk_rad(l, 2)**2._wp*lbk_vel(l, 2) + + $:GPU_LOOP(collapse=3,private='[cellaux,nodecoord]') + do i = 1, smearGrid + do j = 1, smearGrid + do k = 1, smearGridz + cellaux(1) = cell(1) + i - (mapCells + 1) + cellaux(2) = cell(2) + j - (mapCells + 1) + cellaux(3) = cell(3) + k - (mapCells + 1) if (p == 0) cellaux(3) = 0 - end if - - !Update void fraction field - addFun1 = func*strength_vol - $:GPU_ATOMIC(atomic='update') - updatedvar%vf(1)%sf(cellaux(1), cellaux(2), cellaux(3)) = & - updatedvar%vf(1)%sf(cellaux(1), cellaux(2), cellaux(3)) & - + addFun1 - - !Update time derivative of void fraction - addFun2 = func*strength_vel - $:GPU_ATOMIC(atomic='update') - updatedvar%vf(2)%sf(cellaux(1), cellaux(2), cellaux(3)) = & - updatedvar%vf(2)%sf(cellaux(1), cellaux(2), cellaux(3)) & - + addFun2 - - !Product of two smeared functions - !Update void fraction * time derivative of void fraction - if (lag_params%cluster_type >= 4) then - addFun3 = func2*strength_vol*strength_vel + + !Check if the cells intended to smear the bubbles in are in the computational domain + !and redefine the cells for symmetric boundary + call s_check_celloutside(cellaux, celloutside) + + if (.not. celloutside) then + + nodecoord(1) = x_cc(cellaux(1)) + nodecoord(2) = y_cc(cellaux(2)) + if (p > 0) nodecoord(3) = z_cc(cellaux(3)) + call s_applygaussian(center, cellaux, nodecoord, stddsv, 0._wp, func) + if (lag_params%cluster_type >= 4) call s_applygaussian(center, cellaux, nodecoord, stddsv, 1._wp, func2) + + ! Relocate cells for bubbles intersecting symmetric boundaries + if (any((/bc_x%beg, bc_x%end, bc_y%beg, bc_y%end, bc_z%beg, bc_z%end/) == BC_REFLECTIVE)) then + call s_shift_cell_symmetric_bc(cellaux, cell) + end if + else + func = 0._wp + func2 = 0._wp + cellaux(1) = cell(1) + cellaux(2) = cell(2) + cellaux(3) = cell(3) + if (p == 0) cellaux(3) = 0 + end if + + !Update void fraction field + addFun1 = func*strength_vol $:GPU_ATOMIC(atomic='update') - updatedvar%vf(5)%sf(cellaux(1), cellaux(2), cellaux(3)) = & - updatedvar%vf(5)%sf(cellaux(1), cellaux(2), cellaux(3)) & - + addFun3 - end if + updatedvar%vf(1)%sf(cellaux(1), cellaux(2), cellaux(3)) = & + updatedvar%vf(1)%sf(cellaux(1), cellaux(2), cellaux(3)) & + + addFun1 + + !Update time derivative of void fraction + addFun2 = func*strength_vel + $:GPU_ATOMIC(atomic='update') + updatedvar%vf(2)%sf(cellaux(1), cellaux(2), cellaux(3)) = & + updatedvar%vf(2)%sf(cellaux(1), cellaux(2), cellaux(3)) & + + addFun2 + + !Product of two smeared functions + !Update void fraction * time derivative of void fraction + if (lag_params%cluster_type >= 4) then + addFun3 = func2*strength_vol*strength_vel + $:GPU_ATOMIC(atomic='update') + updatedvar%vf(5)%sf(cellaux(1), cellaux(2), cellaux(3)) = & + updatedvar%vf(5)%sf(cellaux(1), cellaux(2), cellaux(3)) & + + addFun3 + end if + end do end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP end subroutine s_gaussian !> The purpose of this subroutine is to apply the gaussian kernel function for each bubble (Maeda and Colonius, 2018)). - pure subroutine s_applygaussian(center, cellaux, nodecoord, stddsv, strength_idx, func) + subroutine s_applygaussian(center, cellaux, nodecoord, stddsv, strength_idx, func) $:GPU_ROUTINE(function_name='s_applygaussian',parallelism='[seq]', & & cray_inline=True) @@ -267,7 +269,7 @@ contains !> The purpose of this subroutine is to check if the current cell is outside the computational domain or not (including ghost cells). !! @param cellaux Tested cell to smear the bubble effect in. !! @param celloutside If true, then cellaux is outside the computational domain. - pure subroutine s_check_celloutside(cellaux, celloutside) + subroutine s_check_celloutside(cellaux, celloutside) $:GPU_ROUTINE(function_name='s_check_celloutside',parallelism='[seq]', & & cray_inline=True) @@ -301,7 +303,7 @@ contains !> This subroutine relocates the current cell, if it intersects a symmetric boundary. !! @param cell Cell of the current bubble !! @param cellaux Cell to map the bubble effect in. - pure subroutine s_shift_cell_symmetric_bc(cellaux, cell) + subroutine s_shift_cell_symmetric_bc(cellaux, cell) $:GPU_ROUTINE(function_name='s_shift_cell_symmetric_bc', & & parallelism='[seq]', cray_inline=True) @@ -340,7 +342,7 @@ contains !! @param cell Cell where the bubble is located !! @param volpart Volume of the bubble !! @param stddsv Standard deviaton - pure subroutine s_compute_stddsv(cell, volpart, stddsv) + subroutine s_compute_stddsv(cell, volpart, stddsv) $:GPU_ROUTINE(function_name='s_compute_stddsv',parallelism='[seq]', & & cray_inline=True) @@ -379,7 +381,7 @@ contains !> The purpose of this procedure is to calculate the characteristic cell volume !! @param cell Computational coordinates (x, y, z) !! @param Charvol Characteristic volume - pure elemental subroutine s_get_char_vol(cellx, celly, cellz, Charvol) + elemental subroutine s_get_char_vol(cellx, celly, cellz, Charvol) $:GPU_ROUTINE(function_name='s_get_char_vol',parallelism='[seq]', & & cray_inline=True) @@ -402,7 +404,7 @@ contains !! real type into integer. !! @param s Computational coordinates of the bubble, real type !! @param get_cell Computational coordinates of the bubble, integer type - pure subroutine s_get_cell(s_cell, get_cell) + subroutine s_get_cell(s_cell, get_cell) $:GPU_ROUTINE(function_name='s_get_cell',parallelism='[seq]', & & cray_inline=True) diff --git a/src/simulation/m_cbc.fpp b/src/simulation/m_cbc.fpp index fb775b6c04..4df1c4fcf0 100644 --- a/src/simulation/m_cbc.fpp +++ b/src/simulation/m_cbc.fpp @@ -37,6 +37,9 @@ module m_cbc molecular_weights, get_species_specific_heats_r, & get_mole_fractions, get_species_specific_heats_r + #:block DEF_AMD + use m_chemistry, only: molecular_weights_nonparameter + #:endblock DEF_AMD implicit none private; public :: s_initialize_cbc_module, s_cbc, s_finalize_cbc_module @@ -707,29 +710,31 @@ contains F_src_rs${XYZ}$_vf, & is1, is2, is3, idwbuff(2)%beg, idwbuff(3)%beg) - $:GPU_PARALLEL_LOOP(collapse=3) - do i = 1, flux_cbc_index - do r = is3%beg, is3%end - do k = is2%beg, is2%end - flux_rs${XYZ}$_vf_l(0, k, r, i) = F_rs${XYZ}$_vf(0, k, r, i) & - + pi_coef_${XYZ}$ (0, 0, cbc_loc)* & - (F_rs${XYZ}$_vf(1, k, r, i) - & - F_rs${XYZ}$_vf(0, k, r, i)) + #:call GPU_PARALLEL_LOOP(collapse=3) + do i = 1, flux_cbc_index + do r = is3%beg, is3%end + do k = is2%beg, is2%end + flux_rs${XYZ}$_vf_l(0, k, r, i) = F_rs${XYZ}$_vf(0, k, r, i) & + + pi_coef_${XYZ}$ (0, 0, cbc_loc)* & + (F_rs${XYZ}$_vf(1, k, r, i) - & + F_rs${XYZ}$_vf(0, k, r, i)) + end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP - $:GPU_PARALLEL_LOOP(collapse=3) - do i = advxb, advxe - do r = is3%beg, is3%end - do k = is2%beg, is2%end - flux_src_rs${XYZ}$_vf_l(0, k, r, i) = F_src_rs${XYZ}$_vf(0, k, r, i) + & - (F_src_rs${XYZ}$_vf(1, k, r, i) - & - F_src_rs${XYZ}$_vf(0, k, r, i)) & - *pi_coef_${XYZ}$ (0, 0, cbc_loc) + #:call GPU_PARALLEL_LOOP(collapse=3) + do i = advxb, advxe + do r = is3%beg, is3%end + do k = is2%beg, is2%end + flux_src_rs${XYZ}$_vf_l(0, k, r, i) = F_src_rs${XYZ}$_vf(0, k, r, i) + & + (F_src_rs${XYZ}$_vf(1, k, r, i) - & + F_src_rs${XYZ}$_vf(0, k, r, i)) & + *pi_coef_${XYZ}$ (0, 0, cbc_loc) + end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP ! PI4 of flux_rs_vf and flux_src_rs_vf at j = 1/2, 3/2 else @@ -738,383 +743,390 @@ contains F_src_rs${XYZ}$_vf, & is1, is2, is3, idwbuff(2)%beg, idwbuff(3)%beg) - $:GPU_PARALLEL_LOOP(collapse=4) - do i = 1, flux_cbc_index - do j = 0, 1 - do r = is3%beg, is3%end - do k = is2%beg, is2%end - flux_rs${XYZ}$_vf_l(j, k, r, i) = F_rs${XYZ}$_vf(j, k, r, i) & - + pi_coef_${XYZ}$ (j, 0, cbc_loc)* & - (F_rs${XYZ}$_vf(3, k, r, i) - & - F_rs${XYZ}$_vf(2, k, r, i)) & - + pi_coef_${XYZ}$ (j, 1, cbc_loc)* & - (F_rs${XYZ}$_vf(2, k, r, i) - & - F_rs${XYZ}$_vf(1, k, r, i)) & - + pi_coef_${XYZ}$ (j, 2, cbc_loc)* & - (F_rs${XYZ}$_vf(1, k, r, i) - & - F_rs${XYZ}$_vf(0, k, r, i)) + #:call GPU_PARALLEL_LOOP(collapse=4) + do i = 1, flux_cbc_index + do j = 0, 1 + do r = is3%beg, is3%end + do k = is2%beg, is2%end + flux_rs${XYZ}$_vf_l(j, k, r, i) = F_rs${XYZ}$_vf(j, k, r, i) & + + pi_coef_${XYZ}$ (j, 0, cbc_loc)* & + (F_rs${XYZ}$_vf(3, k, r, i) - & + F_rs${XYZ}$_vf(2, k, r, i)) & + + pi_coef_${XYZ}$ (j, 1, cbc_loc)* & + (F_rs${XYZ}$_vf(2, k, r, i) - & + F_rs${XYZ}$_vf(1, k, r, i)) & + + pi_coef_${XYZ}$ (j, 2, cbc_loc)* & + (F_rs${XYZ}$_vf(1, k, r, i) - & + F_rs${XYZ}$_vf(0, k, r, i)) + end do end do end do end do - end do - - $:GPU_PARALLEL_LOOP(collapse=4) - do i = advxb, advxe - do j = 0, 1 - do r = is3%beg, is3%end - do k = is2%beg, is2%end - flux_src_rs${XYZ}$_vf_l(j, k, r, i) = F_src_rs${XYZ}$_vf(j, k, r, i) + & - (F_src_rs${XYZ}$_vf(3, k, r, i) - & - F_src_rs${XYZ}$_vf(2, k, r, i)) & - *pi_coef_${XYZ}$ (j, 0, cbc_loc) + & - (F_src_rs${XYZ}$_vf(2, k, r, i) - & - F_src_rs${XYZ}$_vf(1, k, r, i)) & - *pi_coef_${XYZ}$ (j, 1, cbc_loc) + & - (F_src_rs${XYZ}$_vf(1, k, r, i) - & - F_src_rs${XYZ}$_vf(0, k, r, i)) & - *pi_coef_${XYZ}$ (j, 2, cbc_loc) + #:endcall GPU_PARALLEL_LOOP + + #:call GPU_PARALLEL_LOOP(collapse=4) + do i = advxb, advxe + do j = 0, 1 + do r = is3%beg, is3%end + do k = is2%beg, is2%end + flux_src_rs${XYZ}$_vf_l(j, k, r, i) = F_src_rs${XYZ}$_vf(j, k, r, i) + & + (F_src_rs${XYZ}$_vf(3, k, r, i) - & + F_src_rs${XYZ}$_vf(2, k, r, i)) & + *pi_coef_${XYZ}$ (j, 0, cbc_loc) + & + (F_src_rs${XYZ}$_vf(2, k, r, i) - & + F_src_rs${XYZ}$_vf(1, k, r, i)) & + *pi_coef_${XYZ}$ (j, 1, cbc_loc) + & + (F_src_rs${XYZ}$_vf(1, k, r, i) - & + F_src_rs${XYZ}$_vf(0, k, r, i)) & + *pi_coef_${XYZ}$ (j, 2, cbc_loc) + end do end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP end if ! FD2 or FD4 of RHS at j = 0 - $:GPU_PARALLEL_LOOP(collapse=2, private='[alpha_rho, vel, adv_local, & - & mf, dvel_ds, dadv_ds, Re_cbc, dalpha_rho_ds,dvel_dt, & - & dadv_dt, dalpha_rho_dt, L, lambda, Ys, dYs_dt, & - & dYs_ds, h_k, Cp_i, Gamma_i, Xs]') - do r = is3%beg, is3%end - do k = is2%beg, is2%end - - ! Transferring the Primitive Variables - $:GPU_LOOP(parallelism='[seq]') - do i = 1, contxe - alpha_rho(i) = q_prim_rs${XYZ}$_vf(0, k, r, i) - end do - - $:GPU_LOOP(parallelism='[seq]') - do i = 1, num_dims - vel(i) = q_prim_rs${XYZ}$_vf(0, k, r, contxe + i) - end do - - vel_K_sum = 0._wp - $:GPU_LOOP(parallelism='[seq]') - do i = 1, num_dims - vel_K_sum = vel_K_sum + vel(i)**2._wp - end do + #:call GPU_PARALLEL_LOOP(collapse=2, private='[alpha_rho, vel, adv_local, mf, dvel_ds, dadv_ds, Re_cbc, dalpha_rho_ds,dvel_dt, dadv_dt, dalpha_rho_dt, L, lambda, Ys, dYs_dt, dYs_ds, h_k, Cp_i, Gamma_i, Xs]') + do r = is3%beg, is3%end + do k = is2%beg, is2%end - pres = q_prim_rs${XYZ}$_vf(0, k, r, E_idx) + ! Transferring the Primitive Variables + $:GPU_LOOP(parallelism='[seq]') + do i = 1, contxe + alpha_rho(i) = q_prim_rs${XYZ}$_vf(0, k, r, i) + end do - $:GPU_LOOP(parallelism='[seq]') - do i = 1, advxe - E_idx - adv_local(i) = q_prim_rs${XYZ}$_vf(0, k, r, E_idx + i) - end do + $:GPU_LOOP(parallelism='[seq]') + do i = 1, num_dims + vel(i) = q_prim_rs${XYZ}$_vf(0, k, r, contxe + i) + end do - if (bubbles_euler) then - call s_convert_species_to_mixture_variables_bubbles_acc(rho, gamma, pi_inf, qv, adv_local, alpha_rho, Re_cbc) - else - call s_convert_species_to_mixture_variables_acc(rho, gamma, pi_inf, qv, adv_local, alpha_rho, Re_cbc) - end if + vel_K_sum = 0._wp + $:GPU_LOOP(parallelism='[seq]') + do i = 1, num_dims + vel_K_sum = vel_K_sum + vel(i)**2._wp + end do - $:GPU_LOOP(parallelism='[seq]') - do i = 1, contxe - mf(i) = alpha_rho(i)/rho - end do + pres = q_prim_rs${XYZ}$_vf(0, k, r, E_idx) - if (chemistry) then $:GPU_LOOP(parallelism='[seq]') - do i = chemxb, chemxe - Ys(i - chemxb + 1) = q_prim_rs${XYZ}$_vf(0, k, r, i) + do i = 1, advxe - E_idx + adv_local(i) = q_prim_rs${XYZ}$_vf(0, k, r, E_idx + i) end do - call get_mixture_molecular_weight(Ys, Mw) - R_gas = gas_constant/Mw - T = pres/rho/R_gas - call get_mixture_specific_heat_cp_mass(T, Ys, Cp) - call get_mixture_energy_mass(T, Ys, e_mix) - E = rho*e_mix + 5.e-1_wp*rho*vel_K_sum - if (chem_params%gamma_method == 1) then - !> gamma_method = 1: Ref. Section 2.3.1 Formulation of doi:10.7907/ZKW8-ES97. - call get_mole_fractions(Mw, Ys, Xs) - call get_species_specific_heats_r(T, Cp_i) - Gamma_i = Cp_i/(Cp_i - 1.0_wp) - gamma = sum(Xs(:)/(Gamma_i(:) - 1.0_wp)) - else if (chem_params%gamma_method == 2) then - !> gamma_method = 2: c_p / c_v where c_p, c_v are specific heats. - call get_mixture_specific_heat_cv_mass(T, Ys, Cv) - gamma = 1.0_wp/(Cp/Cv - 1.0_wp) + if (bubbles_euler) then + call s_convert_species_to_mixture_variables_bubbles_acc(rho, gamma, pi_inf, qv, adv_local, alpha_rho, Re_cbc) + else + call s_convert_species_to_mixture_variables_acc(rho, gamma, pi_inf, qv, adv_local, alpha_rho, Re_cbc) end if - else - E = gamma*pres + pi_inf + 5.e-1_wp*rho*vel_K_sum - end if - H = (E + pres)/rho - - ! Compute mixture sound speed - call s_compute_speed_of_sound(pres, rho, gamma, pi_inf, H, adv_local, vel_K_sum, 0._wp, c) - - ! First-Order Spatial Derivatives of Primitive Variables + $:GPU_LOOP(parallelism='[seq]') + do i = 1, contxe + mf(i) = alpha_rho(i)/rho + end do - $:GPU_LOOP(parallelism='[seq]') - do i = 1, contxe - dalpha_rho_ds(i) = 0._wp - end do + if (chemistry) then + $:GPU_LOOP(parallelism='[seq]') + do i = chemxb, chemxe + Ys(i - chemxb + 1) = q_prim_rs${XYZ}$_vf(0, k, r, i) + end do - $:GPU_LOOP(parallelism='[seq]') - do i = 1, num_dims - dvel_ds(i) = 0._wp - end do + call get_mixture_molecular_weight(Ys, Mw) + R_gas = gas_constant/Mw + T = pres/rho/R_gas + call get_mixture_specific_heat_cp_mass(T, Ys, Cp) + call get_mixture_energy_mass(T, Ys, e_mix) + E = rho*e_mix + 5.e-1_wp*rho*vel_K_sum + if (chem_params%gamma_method == 1) then + !> gamma_method = 1: Ref. Section 2.3.1 Formulation of doi:10.7907/ZKW8-ES97. + call get_mole_fractions(Mw, Ys, Xs) + call get_species_specific_heats_r(T, Cp_i) + Gamma_i = Cp_i/(Cp_i - 1.0_wp) + gamma = sum(Xs(:)/(Gamma_i(:) - 1.0_wp)) + else if (chem_params%gamma_method == 2) then + !> gamma_method = 2: c_p / c_v where c_p, c_v are specific heats. + call get_mixture_specific_heat_cv_mass(T, Ys, Cv) + gamma = 1.0_wp/(Cp/Cv - 1.0_wp) + end if + else + E = gamma*pres + pi_inf + 5.e-1_wp*rho*vel_K_sum + end if - dpres_ds = 0._wp - $:GPU_LOOP(parallelism='[seq]') - do i = 1, advxe - E_idx - dadv_ds(i) = 0._wp - end do + H = (E + pres)/rho - if (chemistry) then - $:GPU_LOOP(parallelism='[seq]') - do i = 1, num_species - dYs_ds(i) = 0._wp - end do - end if + ! Compute mixture sound speed + call s_compute_speed_of_sound(pres, rho, gamma, pi_inf, H, adv_local, vel_K_sum, 0._wp, c) - $:GPU_LOOP(parallelism='[seq]') - do j = 0, buff_size + ! First-Order Spatial Derivatives of Primitive Variables $:GPU_LOOP(parallelism='[seq]') do i = 1, contxe - dalpha_rho_ds(i) = q_prim_rs${XYZ}$_vf(j, k, r, i)* & - fd_coef_${XYZ}$ (j, cbc_loc) + & - dalpha_rho_ds(i) + dalpha_rho_ds(i) = 0._wp end do + $:GPU_LOOP(parallelism='[seq]') do i = 1, num_dims - dvel_ds(i) = q_prim_rs${XYZ}$_vf(j, k, r, contxe + i)* & - fd_coef_${XYZ}$ (j, cbc_loc) + & - dvel_ds(i) + dvel_ds(i) = 0._wp end do - dpres_ds = q_prim_rs${XYZ}$_vf(j, k, r, E_idx)* & - fd_coef_${XYZ}$ (j, cbc_loc) + & - dpres_ds + dpres_ds = 0._wp $:GPU_LOOP(parallelism='[seq]') do i = 1, advxe - E_idx - dadv_ds(i) = q_prim_rs${XYZ}$_vf(j, k, r, E_idx + i)* & - fd_coef_${XYZ}$ (j, cbc_loc) + & - dadv_ds(i) + dadv_ds(i) = 0._wp end do if (chemistry) then $:GPU_LOOP(parallelism='[seq]') do i = 1, num_species - dYs_ds(i) = q_prim_rs${XYZ}$_vf(j, k, r, chemxb - 1 + i)* & - fd_coef_${XYZ}$ (j, cbc_loc) + & - dYs_ds(i) + dYs_ds(i) = 0._wp end do end if - end do - ! First-Order Temporal Derivatives of Primitive Variables - lambda(1) = vel(dir_idx(1)) - c - lambda(2) = vel(dir_idx(1)) - lambda(3) = vel(dir_idx(1)) + c - - Ma = vel(dir_idx(1))/c - - if ((cbc_loc == -1 .and. bc${XYZ}$b == BC_CHAR_SLIP_WALL) .or. & - (cbc_loc == 1 .and. bc${XYZ}$e == BC_CHAR_SLIP_WALL)) then - call s_compute_slip_wall_L(lambda, L, rho, c, dpres_ds, dvel_ds) - else if ((cbc_loc == -1 .and. bc${XYZ}$b == BC_CHAR_NR_SUB_BUFFER) .or. & - (cbc_loc == 1 .and. bc${XYZ}$e == BC_CHAR_NR_SUB_BUFFER)) then - call s_compute_nonreflecting_subsonic_buffer_L(lambda, L, rho, c, mf, dalpha_rho_ds, dpres_ds, dvel_ds, dadv_ds, dYs_ds) - else if ((cbc_loc == -1 .and. bc${XYZ}$b == BC_CHAR_NR_SUB_INFLOW) .or. & - (cbc_loc == 1 .and. bc${XYZ}$e == BC_CHAR_NR_SUB_INFLOW)) then - call s_compute_nonreflecting_subsonic_inflow_L(lambda, L, rho, c, dpres_ds, dvel_ds) - ! Add GRCBC for Subsonic Inflow - if (bc_${XYZ}$%grcbc_in) then + $:GPU_LOOP(parallelism='[seq]') + do j = 0, buff_size + $:GPU_LOOP(parallelism='[seq]') - do i = 2, momxb - L(2) = c**3._wp*Ma*(alpha_rho(i - 1) - alpha_rho_in(i - 1, ${CBC_DIR}$))/Del_in(${CBC_DIR}$) - c*Ma*(pres - pres_in(${CBC_DIR}$))/Del_in(${CBC_DIR}$) + do i = 1, contxe + dalpha_rho_ds(i) = q_prim_rs${XYZ}$_vf(j, k, r, i)* & + fd_coef_${XYZ}$ (j, cbc_loc) + & + dalpha_rho_ds(i) end do - if (n > 0) then - L(momxb + 1) = c*Ma*(vel(dir_idx(2)) - vel_in(${CBC_DIR}$, dir_idx(2)))/Del_in(${CBC_DIR}$) - if (p > 0) then - L(momxb + 2) = c*Ma*(vel(dir_idx(3)) - vel_in(${CBC_DIR}$, dir_idx(3)))/Del_in(${CBC_DIR}$) - end if - end if $:GPU_LOOP(parallelism='[seq]') - do i = E_idx, advxe - 1 - L(i) = c*Ma*(adv_local(i + 1 - E_idx) - alpha_in(i + 1 - E_idx, ${CBC_DIR}$))/Del_in(${CBC_DIR}$) + do i = 1, num_dims + dvel_ds(i) = q_prim_rs${XYZ}$_vf(j, k, r, contxe + i)* & + fd_coef_${XYZ}$ (j, cbc_loc) + & + dvel_ds(i) end do - L(advxe) = rho*c**2._wp*(1._wp + Ma)*(vel(dir_idx(1)) + vel_in(${CBC_DIR}$, dir_idx(1))*sign(1, cbc_loc))/Del_in(${CBC_DIR}$) + c*(1._wp + Ma)*(pres - pres_in(${CBC_DIR}$))/Del_in(${CBC_DIR}$) - end if - else if ((cbc_loc == -1 .and. bc${XYZ}$b == BC_CHAR_NR_SUB_OUTFLOW) .or. & - (cbc_loc == 1 .and. bc${XYZ}$e == BC_CHAR_NR_SUB_OUTFLOW)) then - call s_compute_nonreflecting_subsonic_outflow_L(lambda, L, rho, c, mf, dalpha_rho_ds, dpres_ds, dvel_ds, dadv_ds, dYs_ds) - ! Add GRCBC for Subsonic Outflow (Pressure) - if (bc_${XYZ}$%grcbc_out) then - L(advxe) = c*(1._wp - Ma)*(pres - pres_out(${CBC_DIR}$))/Del_out(${CBC_DIR}$) - - ! Add GRCBC for Subsonic Outflow (Normal Velocity) - if (bc_${XYZ}$%grcbc_vel_out) then - L(advxe) = L(advxe) + rho*c**2._wp*(1._wp - Ma)*(vel(dir_idx(1)) + vel_out(${CBC_DIR}$, dir_idx(1))*sign(1, cbc_loc))/Del_out(${CBC_DIR}$) + + dpres_ds = q_prim_rs${XYZ}$_vf(j, k, r, E_idx)* & + fd_coef_${XYZ}$ (j, cbc_loc) + & + dpres_ds + $:GPU_LOOP(parallelism='[seq]') + do i = 1, advxe - E_idx + dadv_ds(i) = q_prim_rs${XYZ}$_vf(j, k, r, E_idx + i)* & + fd_coef_${XYZ}$ (j, cbc_loc) + & + dadv_ds(i) + end do + + if (chemistry) then + $:GPU_LOOP(parallelism='[seq]') + do i = 1, num_species + dYs_ds(i) = q_prim_rs${XYZ}$_vf(j, k, r, chemxb - 1 + i)* & + fd_coef_${XYZ}$ (j, cbc_loc) + & + dYs_ds(i) + end do end if - end if - else if ((cbc_loc == -1 .and. bc${XYZ}$b == BC_CHAR_FF_SUB_OUTFLOW) .or. & - (cbc_loc == 1 .and. bc${XYZ}$e == BC_CHAR_FF_SUB_OUTFLOW)) then - call s_compute_force_free_subsonic_outflow_L(lambda, L, rho, c, mf, dalpha_rho_ds, dpres_ds, dvel_ds, dadv_ds) - else if ((cbc_loc == -1 .and. bc${XYZ}$b == BC_CHAR_CP_SUB_OUTFLOW) .or. & - (cbc_loc == 1 .and. bc${XYZ}$e == BC_CHAR_CP_SUB_OUTFLOW)) then - call s_compute_constant_pressure_subsonic_outflow_L(lambda, L, rho, c, mf, dalpha_rho_ds, dpres_ds, dvel_ds, dadv_ds) - else if ((cbc_loc == -1 .and. bc${XYZ}$b == BC_CHAR_SUP_INFLOW) .or. & - (cbc_loc == 1 .and. bc${XYZ}$e == BC_CHAR_SUP_INFLOW)) then - call s_compute_supersonic_inflow_L(L) - else if ((cbc_loc == -1 .and. bc${XYZ}$b == BC_CHAR_SUP_OUTFLOW) .or. & - (cbc_loc == 1 .and. bc${XYZ}$e == BC_CHAR_SUP_OUTFLOW)) then - call s_compute_supersonic_outflow_L(lambda, L, rho, c, mf, dalpha_rho_ds, dpres_ds, dvel_ds, dadv_ds, dYs_ds) - end if - - ! Be careful about the cylindrical coordinate! - if (cyl_coord .and. cbc_dir == 2 .and. cbc_loc == 1) then - dpres_dt = -5.e-1_wp*(L(advxe) + L(1)) + rho*c*c*vel(dir_idx(1)) & - /y_cc(n) - else - dpres_dt = -5.e-1_wp*(L(advxe) + L(1)) - end if - - $:GPU_LOOP(parallelism='[seq]') - do i = 1, contxe - dalpha_rho_dt(i) = & - -(L(i + 1) - mf(i)*dpres_dt)/(c*c) - end do + end do - $:GPU_LOOP(parallelism='[seq]') - do i = 1, num_dims - dvel_dt(dir_idx(i)) = dir_flg(dir_idx(i))* & - (L(1) - L(advxe))/(2._wp*rho*c) + & - (dir_flg(dir_idx(i)) - 1._wp)* & - L(momxb + i - 1) - end do + ! First-Order Temporal Derivatives of Primitive Variables + lambda(1) = vel(dir_idx(1)) - c + lambda(2) = vel(dir_idx(1)) + lambda(3) = vel(dir_idx(1)) + c + + Ma = vel(dir_idx(1))/c + + if ((cbc_loc == -1 .and. bc${XYZ}$b == BC_CHAR_SLIP_WALL) .or. & + (cbc_loc == 1 .and. bc${XYZ}$e == BC_CHAR_SLIP_WALL)) then + call s_compute_slip_wall_L(lambda, L, rho, c, dpres_ds, dvel_ds) + else if ((cbc_loc == -1 .and. bc${XYZ}$b == BC_CHAR_NR_SUB_BUFFER) .or. & + (cbc_loc == 1 .and. bc${XYZ}$e == BC_CHAR_NR_SUB_BUFFER)) then + call s_compute_nonreflecting_subsonic_buffer_L(lambda, L, rho, c, mf, dalpha_rho_ds, dpres_ds, dvel_ds, dadv_ds, dYs_ds) + else if ((cbc_loc == -1 .and. bc${XYZ}$b == BC_CHAR_NR_SUB_INFLOW) .or. & + (cbc_loc == 1 .and. bc${XYZ}$e == BC_CHAR_NR_SUB_INFLOW)) then + call s_compute_nonreflecting_subsonic_inflow_L(lambda, L, rho, c, dpres_ds, dvel_ds) + ! Add GRCBC for Subsonic Inflow + if (bc_${XYZ}$%grcbc_in) then + $:GPU_LOOP(parallelism='[seq]') + do i = 2, momxb + L(2) = c**3._wp*Ma*(alpha_rho(i - 1) - alpha_rho_in(i - 1, ${CBC_DIR}$))/Del_in(${CBC_DIR}$) - c*Ma*(pres - pres_in(${CBC_DIR}$))/Del_in(${CBC_DIR}$) + end do + if (n > 0) then + L(momxb + 1) = c*Ma*(vel(dir_idx(2)) - vel_in(${CBC_DIR}$, dir_idx(2)))/Del_in(${CBC_DIR}$) + if (p > 0) then + L(momxb + 2) = c*Ma*(vel(dir_idx(3)) - vel_in(${CBC_DIR}$, dir_idx(3)))/Del_in(${CBC_DIR}$) + end if + end if + $:GPU_LOOP(parallelism='[seq]') + do i = E_idx, advxe - 1 + L(i) = c*Ma*(adv_local(i + 1 - E_idx) - alpha_in(i + 1 - E_idx, ${CBC_DIR}$))/Del_in(${CBC_DIR}$) + end do + L(advxe) = rho*c**2._wp*(1._wp + Ma)*(vel(dir_idx(1)) + vel_in(${CBC_DIR}$, dir_idx(1))*sign(1, cbc_loc))/Del_in(${CBC_DIR}$) + c*(1._wp + Ma)*(pres - pres_in(${CBC_DIR}$))/Del_in(${CBC_DIR}$) + end if + else if ((cbc_loc == -1 .and. bc${XYZ}$b == BC_CHAR_NR_SUB_OUTFLOW) .or. & + (cbc_loc == 1 .and. bc${XYZ}$e == BC_CHAR_NR_SUB_OUTFLOW)) then + call s_compute_nonreflecting_subsonic_outflow_L(lambda, L, rho, c, mf, dalpha_rho_ds, dpres_ds, dvel_ds, dadv_ds, dYs_ds) + ! Add GRCBC for Subsonic Outflow (Pressure) + if (bc_${XYZ}$%grcbc_out) then + L(advxe) = c*(1._wp - Ma)*(pres - pres_out(${CBC_DIR}$))/Del_out(${CBC_DIR}$) + + ! Add GRCBC for Subsonic Outflow (Normal Velocity) + if (bc_${XYZ}$%grcbc_vel_out) then + L(advxe) = L(advxe) + rho*c**2._wp*(1._wp - Ma)*(vel(dir_idx(1)) + vel_out(${CBC_DIR}$, dir_idx(1))*sign(1, cbc_loc))/Del_out(${CBC_DIR}$) + end if + end if + else if ((cbc_loc == -1 .and. bc${XYZ}$b == BC_CHAR_FF_SUB_OUTFLOW) .or. & + (cbc_loc == 1 .and. bc${XYZ}$e == BC_CHAR_FF_SUB_OUTFLOW)) then + call s_compute_force_free_subsonic_outflow_L(lambda, L, rho, c, mf, dalpha_rho_ds, dpres_ds, dvel_ds, dadv_ds) + else if ((cbc_loc == -1 .and. bc${XYZ}$b == BC_CHAR_CP_SUB_OUTFLOW) .or. & + (cbc_loc == 1 .and. bc${XYZ}$e == BC_CHAR_CP_SUB_OUTFLOW)) then + call s_compute_constant_pressure_subsonic_outflow_L(lambda, L, rho, c, mf, dalpha_rho_ds, dpres_ds, dvel_ds, dadv_ds) + else if ((cbc_loc == -1 .and. bc${XYZ}$b == BC_CHAR_SUP_INFLOW) .or. & + (cbc_loc == 1 .and. bc${XYZ}$e == BC_CHAR_SUP_INFLOW)) then + call s_compute_supersonic_inflow_L(L) + else if ((cbc_loc == -1 .and. bc${XYZ}$b == BC_CHAR_SUP_OUTFLOW) .or. & + (cbc_loc == 1 .and. bc${XYZ}$e == BC_CHAR_SUP_OUTFLOW)) then + call s_compute_supersonic_outflow_L(lambda, L, rho, c, mf, dalpha_rho_ds, dpres_ds, dvel_ds, dadv_ds, dYs_ds) + end if - vel_dv_dt_sum = 0._wp - $:GPU_LOOP(parallelism='[seq]') - do i = 1, num_dims - vel_dv_dt_sum = vel_dv_dt_sum + vel(i)*dvel_dt(i) - end do + ! Be careful about the cylindrical coordinate! + if (cyl_coord .and. cbc_dir == 2 .and. cbc_loc == 1) then + dpres_dt = -5.e-1_wp*(L(advxe) + L(1)) + rho*c*c*vel(dir_idx(1)) & + /y_cc(n) + else + dpres_dt = -5.e-1_wp*(L(advxe) + L(1)) + end if - if (chemistry) then $:GPU_LOOP(parallelism='[seq]') - do i = 1, num_species - dYs_dt(i) = -1._wp*L(chemxb + i - 1) + do i = 1, contxe + dalpha_rho_dt(i) = & + -(L(i + 1) - mf(i)*dpres_dt)/(c*c) end do - end if - ! The treatment of void fraction source is unclear - if (cyl_coord .and. cbc_dir == 2 .and. cbc_loc == 1) then $:GPU_LOOP(parallelism='[seq]') - do i = 1, advxe - E_idx - dadv_dt(i) = -L(momxe + i) !+ adv_local(i) * vel(dir_idx(1))/y_cc(n) + do i = 1, num_dims + dvel_dt(dir_idx(i)) = dir_flg(dir_idx(i))* & + (L(1) - L(advxe))/(2._wp*rho*c) + & + (dir_flg(dir_idx(i)) - 1._wp)* & + L(momxb + i - 1) end do - else + + vel_dv_dt_sum = 0._wp $:GPU_LOOP(parallelism='[seq]') - do i = 1, advxe - E_idx - dadv_dt(i) = -L(momxe + i) + do i = 1, num_dims + vel_dv_dt_sum = vel_dv_dt_sum + vel(i)*dvel_dt(i) end do - end if - drho_dt = 0._wp; dgamma_dt = 0._wp; dpi_inf_dt = 0._wp; dqv_dt = 0._wp + if (chemistry) then + $:GPU_LOOP(parallelism='[seq]') + do i = 1, num_species + dYs_dt(i) = -1._wp*L(chemxb + i - 1) + end do + end if - if (model_eqns == 1) then - drho_dt = dalpha_rho_dt(1) - dgamma_dt = dadv_dt(1) - dpi_inf_dt = dadv_dt(2) - else - $:GPU_LOOP(parallelism='[seq]') - do i = 1, num_fluids - drho_dt = drho_dt + dalpha_rho_dt(i) - dgamma_dt = dgamma_dt + dadv_dt(i)*gammas(i) - dpi_inf_dt = dpi_inf_dt + dadv_dt(i)*pi_infs(i) - dqv_dt = dqv_dt + dalpha_rho_dt(i)*qvs(i) - end do - end if + ! The treatment of void fraction source is unclear + if (cyl_coord .and. cbc_dir == 2 .and. cbc_loc == 1) then + $:GPU_LOOP(parallelism='[seq]') + do i = 1, advxe - E_idx + dadv_dt(i) = -L(momxe + i) !+ adv_local(i) * vel(dir_idx(1))/y_cc(n) + end do + else + $:GPU_LOOP(parallelism='[seq]') + do i = 1, advxe - E_idx + dadv_dt(i) = -L(momxe + i) + end do + end if - ! flux_rs_vf_l and flux_src_rs_vf_l at j = -1/2 - $:GPU_LOOP(parallelism='[seq]') - do i = 1, contxe - flux_rs${XYZ}$_vf_l(-1, k, r, i) = flux_rs${XYZ}$_vf_l(0, k, r, i) & - + ds(0)*dalpha_rho_dt(i) - end do + drho_dt = 0._wp; dgamma_dt = 0._wp; dpi_inf_dt = 0._wp; dqv_dt = 0._wp - $:GPU_LOOP(parallelism='[seq]') - do i = momxb, momxe - flux_rs${XYZ}$_vf_l(-1, k, r, i) = flux_rs${XYZ}$_vf_l(0, k, r, i) & - + ds(0)*(vel(i - contxe)*drho_dt & - + rho*dvel_dt(i - contxe)) - end do + if (model_eqns == 1) then + drho_dt = dalpha_rho_dt(1) + dgamma_dt = dadv_dt(1) + dpi_inf_dt = dadv_dt(2) + else + $:GPU_LOOP(parallelism='[seq]') + do i = 1, num_fluids + drho_dt = drho_dt + dalpha_rho_dt(i) + dgamma_dt = dgamma_dt + dadv_dt(i)*gammas(i) + dpi_inf_dt = dpi_inf_dt + dadv_dt(i)*pi_infs(i) + dqv_dt = dqv_dt + dalpha_rho_dt(i)*qvs(i) + end do + end if - if (chemistry) then - ! Evolution of LODI equation of energy for real gases adjusted to perfect gas, doi:10.1006/jcph.2002.6990 - call get_species_enthalpies_rt(T, h_k) - sum_Enthalpies = 0._wp - $:GPU_LOOP(parallelism='[seq]') - do i = 1, num_species - h_k(i) = h_k(i)*gas_constant/molecular_weights(i)*T - sum_Enthalpies = sum_Enthalpies + (rho*h_k(i) - pres*Mw/molecular_weights(i)*Cp/R_gas)*dYs_dt(i) - end do - flux_rs${XYZ}$_vf_l(-1, k, r, E_idx) = flux_rs${XYZ}$_vf_l(0, k, r, E_idx) & - + ds(0)*((E/rho + pres/rho)*drho_dt + rho*vel_dv_dt_sum + Cp*T*L(2)/(c*c) + sum_Enthalpies) - $:GPU_LOOP(parallelism='[seq]') - do i = 1, num_species - flux_rs${XYZ}$_vf_l(-1, k, r, i - 1 + chemxb) = flux_rs${XYZ}$_vf_l(0, k, r, chemxb + i - 1) & - + ds(0)*(drho_dt*Ys(i) + rho*dYs_dt(i)) - end do - else - flux_rs${XYZ}$_vf_l(-1, k, r, E_idx) = flux_rs${XYZ}$_vf_l(0, k, r, E_idx) & - + ds(0)*(pres*dgamma_dt & - + gamma*dpres_dt & - + dpi_inf_dt & - + dqv_dt & - + rho*vel_dv_dt_sum & - + 5.e-1_wp*drho_dt*vel_K_sum) - end if - - if (riemann_solver == 1) then + ! flux_rs_vf_l and flux_src_rs_vf_l at j = -1/2 $:GPU_LOOP(parallelism='[seq]') - do i = advxb, advxe - flux_rs${XYZ}$_vf_l(-1, k, r, i) = 0._wp + do i = 1, contxe + flux_rs${XYZ}$_vf_l(-1, k, r, i) = flux_rs${XYZ}$_vf_l(0, k, r, i) & + + ds(0)*dalpha_rho_dt(i) end do $:GPU_LOOP(parallelism='[seq]') - do i = advxb, advxe - flux_src_rs${XYZ}$_vf_l(-1, k, r, i) = & - 1._wp/max(abs(vel(dir_idx(1))), sgm_eps) & - *sign(1._wp, vel(dir_idx(1))) & - *(flux_rs${XYZ}$_vf_l(0, k, r, i) & - + vel(dir_idx(1)) & - *flux_src_rs${XYZ}$_vf_l(0, k, r, i) & - + ds(0)*dadv_dt(i - E_idx)) + do i = momxb, momxe + flux_rs${XYZ}$_vf_l(-1, k, r, i) = flux_rs${XYZ}$_vf_l(0, k, r, i) & + + ds(0)*(vel(i - contxe)*drho_dt & + + rho*dvel_dt(i - contxe)) end do - else + if (chemistry) then + ! Evolution of LODI equation of energy for real gases adjusted to perfect gas, doi:10.1006/jcph.2002.6990 + call get_species_enthalpies_rt(T, h_k) + sum_Enthalpies = 0._wp + $:GPU_LOOP(parallelism='[seq]') + do i = 1, num_species + #:block UNDEF_AMD + h_k(i) = h_k(i)*gas_constant/molecular_weights(i)*T + sum_Enthalpies = sum_Enthalpies + (rho*h_k(i) - pres*Mw/molecular_weights(i)*Cp/R_gas)*dYs_dt(i) + #:endblock UNDEF_AMD + + #:block DEF_AMD + h_k(i) = h_k(i)*gas_constant/molecular_weights_nonparameter(i)*T + sum_Enthalpies = sum_Enthalpies + (rho*h_k(i) - pres*Mw/molecular_weights_nonparameter(i)*Cp/R_gas)*dYs_dt(i) + #:endblock DEF_AMD + end do + flux_rs${XYZ}$_vf_l(-1, k, r, E_idx) = flux_rs${XYZ}$_vf_l(0, k, r, E_idx) & + + ds(0)*((E/rho + pres/rho)*drho_dt + rho*vel_dv_dt_sum + Cp*T*L(2)/(c*c) + sum_Enthalpies) + $:GPU_LOOP(parallelism='[seq]') + do i = 1, num_species + flux_rs${XYZ}$_vf_l(-1, k, r, i - 1 + chemxb) = flux_rs${XYZ}$_vf_l(0, k, r, chemxb + i - 1) & + + ds(0)*(drho_dt*Ys(i) + rho*dYs_dt(i)) + end do + else + flux_rs${XYZ}$_vf_l(-1, k, r, E_idx) = flux_rs${XYZ}$_vf_l(0, k, r, E_idx) & + + ds(0)*(pres*dgamma_dt & + + gamma*dpres_dt & + + dpi_inf_dt & + + dqv_dt & + + rho*vel_dv_dt_sum & + + 5.e-1_wp*drho_dt*vel_K_sum) + end if + + if (riemann_solver == 1) then + $:GPU_LOOP(parallelism='[seq]') + do i = advxb, advxe + flux_rs${XYZ}$_vf_l(-1, k, r, i) = 0._wp + end do - $:GPU_LOOP(parallelism='[seq]') - do i = advxb, advxe - flux_rs${XYZ}$_vf_l(-1, k, r, i) = flux_rs${XYZ}$_vf_l(0, k, r, i) + & - ds(0)*dadv_dt(i - E_idx) - end do + $:GPU_LOOP(parallelism='[seq]') + do i = advxb, advxe + flux_src_rs${XYZ}$_vf_l(-1, k, r, i) = & + 1._wp/max(abs(vel(dir_idx(1))), sgm_eps) & + *sign(1._wp, vel(dir_idx(1))) & + *(flux_rs${XYZ}$_vf_l(0, k, r, i) & + + vel(dir_idx(1)) & + *flux_src_rs${XYZ}$_vf_l(0, k, r, i) & + + ds(0)*dadv_dt(i - E_idx)) + end do - $:GPU_LOOP(parallelism='[seq]') - do i = advxb, advxe - flux_src_rs${XYZ}$_vf_l(-1, k, r, i) = flux_src_rs${XYZ}$_vf_l(0, k, r, i) - end do + else + + $:GPU_LOOP(parallelism='[seq]') + do i = advxb, advxe + flux_rs${XYZ}$_vf_l(-1, k, r, i) = flux_rs${XYZ}$_vf_l(0, k, r, i) + & + ds(0)*dadv_dt(i - E_idx) + end do - end if - ! END: flux_rs_vf_l and flux_src_rs_vf_l at j = -1/2 + $:GPU_LOOP(parallelism='[seq]') + do i = advxb, advxe + flux_src_rs${XYZ}$_vf_l(-1, k, r, i) = flux_src_rs${XYZ}$_vf_l(0, k, r, i) + end do + + end if + ! END: flux_rs_vf_l and flux_src_rs_vf_l at j = -1/2 + end do end do - end do + #:endcall GPU_PARALLEL_LOOP end if #:endfor @@ -1175,75 +1187,81 @@ contains ! Reshaping Inputted Data in x-direction if (cbc_dir == 1) then - $:GPU_PARALLEL_LOOP(collapse=4) - do i = 1, sys_size - do r = is3%beg, is3%end - do k = is2%beg, is2%end - do j = 0, buff_size - q_prim_rsx_vf(j, k, r, i) = & - q_prim_vf(i)%sf(dj*(m - 2*j) + j, k, r) + #:call GPU_PARALLEL_LOOP(collapse=4) + do i = 1, sys_size + do r = is3%beg, is3%end + do k = is2%beg, is2%end + do j = 0, buff_size + q_prim_rsx_vf(j, k, r, i) = & + q_prim_vf(i)%sf(dj*(m - 2*j) + j, k, r) + end do end do end do end do - end do - - $:GPU_PARALLEL_LOOP(collapse=3) - do r = is3%beg, is3%end - do k = is2%beg, is2%end - do j = 0, buff_size - q_prim_rsx_vf(j, k, r, momxb) = & - q_prim_vf(momxb)%sf(dj*(m - 2*j) + j, k, r)* & - sign(1._wp, -1._wp*cbc_loc) - end do - end do - end do + #:endcall GPU_PARALLEL_LOOP - $:GPU_PARALLEL_LOOP(collapse=4) - do i = 1, flux_cbc_index + #:call GPU_PARALLEL_LOOP(collapse=3) do r = is3%beg, is3%end do k = is2%beg, is2%end - do j = -1, buff_size - flux_rsx_vf_l(j, k, r, i) = & - flux_vf(i)%sf(dj*((m - 1) - 2*j) + j, k, r)* & + do j = 0, buff_size + q_prim_rsx_vf(j, k, r, momxb) = & + q_prim_vf(momxb)%sf(dj*(m - 2*j) + j, k, r)* & sign(1._wp, -1._wp*cbc_loc) end do end do end do - end do - - $:GPU_PARALLEL_LOOP(collapse=3) - do r = is3%beg, is3%end - do k = is2%beg, is2%end - do j = -1, buff_size - flux_rsx_vf_l(j, k, r, momxb) = & - flux_vf(momxb)%sf(dj*((m - 1) - 2*j) + j, k, r) - end do - end do - end do + #:endcall GPU_PARALLEL_LOOP - if (riemann_solver == 1) then - $:GPU_PARALLEL_LOOP(collapse=4) - do i = advxb, advxe + #:call GPU_PARALLEL_LOOP(collapse=4) + do i = 1, flux_cbc_index do r = is3%beg, is3%end do k = is2%beg, is2%end do j = -1, buff_size - flux_src_rsx_vf_l(j, k, r, i) = & - flux_src_vf(i)%sf(dj*((m - 1) - 2*j) + j, k, r) + flux_rsx_vf_l(j, k, r, i) = & + flux_vf(i)%sf(dj*((m - 1) - 2*j) + j, k, r)* & + sign(1._wp, -1._wp*cbc_loc) end do end do end do end do - else - $:GPU_PARALLEL_LOOP(collapse=3) + #:endcall GPU_PARALLEL_LOOP + + #:call GPU_PARALLEL_LOOP(collapse=3) do r = is3%beg, is3%end do k = is2%beg, is2%end do j = -1, buff_size - flux_src_rsx_vf_l(j, k, r, advxb) = & - flux_src_vf(advxb)%sf(dj*((m - 1) - 2*j) + j, k, r)* & - sign(1._wp, -1._wp*cbc_loc) + flux_rsx_vf_l(j, k, r, momxb) = & + flux_vf(momxb)%sf(dj*((m - 1) - 2*j) + j, k, r) end do end do end do + #:endcall GPU_PARALLEL_LOOP + + if (riemann_solver == 1) then + #:call GPU_PARALLEL_LOOP(collapse=4) + do i = advxb, advxe + do r = is3%beg, is3%end + do k = is2%beg, is2%end + do j = -1, buff_size + flux_src_rsx_vf_l(j, k, r, i) = & + flux_src_vf(i)%sf(dj*((m - 1) - 2*j) + j, k, r) + end do + end do + end do + end do + #:endcall GPU_PARALLEL_LOOP + else + #:call GPU_PARALLEL_LOOP(collapse=3) + do r = is3%beg, is3%end + do k = is2%beg, is2%end + do j = -1, buff_size + flux_src_rsx_vf_l(j, k, r, advxb) = & + flux_src_vf(advxb)%sf(dj*((m - 1) - 2*j) + j, k, r)* & + sign(1._wp, -1._wp*cbc_loc) + end do + end do + end do + #:endcall GPU_PARALLEL_LOOP end if ! END: Reshaping Inputted Data in x-direction @@ -1251,75 +1269,81 @@ contains ! Reshaping Inputted Data in y-direction elseif (cbc_dir == 2) then - $:GPU_PARALLEL_LOOP(collapse=4) - do i = 1, sys_size - do r = is3%beg, is3%end - do k = is2%beg, is2%end - do j = 0, buff_size - q_prim_rsy_vf(j, k, r, i) = & - q_prim_vf(i)%sf(k, dj*(n - 2*j) + j, r) + #:call GPU_PARALLEL_LOOP(collapse=4) + do i = 1, sys_size + do r = is3%beg, is3%end + do k = is2%beg, is2%end + do j = 0, buff_size + q_prim_rsy_vf(j, k, r, i) = & + q_prim_vf(i)%sf(k, dj*(n - 2*j) + j, r) + end do end do end do end do - end do - - $:GPU_PARALLEL_LOOP(collapse=3) - do r = is3%beg, is3%end - do k = is2%beg, is2%end - do j = 0, buff_size - q_prim_rsy_vf(j, k, r, momxb + 1) = & - q_prim_vf(momxb + 1)%sf(k, dj*(n - 2*j) + j, r)* & - sign(1._wp, -1._wp*cbc_loc) - end do - end do - end do + #:endcall GPU_PARALLEL_LOOP - $:GPU_PARALLEL_LOOP(collapse=4) - do i = 1, flux_cbc_index + #:call GPU_PARALLEL_LOOP(collapse=3) do r = is3%beg, is3%end do k = is2%beg, is2%end - do j = -1, buff_size - flux_rsy_vf_l(j, k, r, i) = & - flux_vf(i)%sf(k, dj*((n - 1) - 2*j) + j, r)* & + do j = 0, buff_size + q_prim_rsy_vf(j, k, r, momxb + 1) = & + q_prim_vf(momxb + 1)%sf(k, dj*(n - 2*j) + j, r)* & sign(1._wp, -1._wp*cbc_loc) end do end do end do - end do - - $:GPU_PARALLEL_LOOP(collapse=3) - do r = is3%beg, is3%end - do k = is2%beg, is2%end - do j = -1, buff_size - flux_rsy_vf_l(j, k, r, momxb + 1) = & - flux_vf(momxb + 1)%sf(k, dj*((n - 1) - 2*j) + j, r) - end do - end do - end do + #:endcall GPU_PARALLEL_LOOP - if (riemann_solver == 1) then - $:GPU_PARALLEL_LOOP(collapse=4) - do i = advxb, advxe + #:call GPU_PARALLEL_LOOP(collapse=4) + do i = 1, flux_cbc_index do r = is3%beg, is3%end do k = is2%beg, is2%end do j = -1, buff_size - flux_src_rsy_vf_l(j, k, r, i) = & - flux_src_vf(i)%sf(k, dj*((n - 1) - 2*j) + j, r) + flux_rsy_vf_l(j, k, r, i) = & + flux_vf(i)%sf(k, dj*((n - 1) - 2*j) + j, r)* & + sign(1._wp, -1._wp*cbc_loc) end do end do end do end do - else - $:GPU_PARALLEL_LOOP(collapse=3) + #:endcall GPU_PARALLEL_LOOP + + #:call GPU_PARALLEL_LOOP(collapse=3) do r = is3%beg, is3%end do k = is2%beg, is2%end do j = -1, buff_size - flux_src_rsy_vf_l(j, k, r, advxb) = & - flux_src_vf(advxb)%sf(k, dj*((n - 1) - 2*j) + j, r)* & - sign(1._wp, -1._wp*cbc_loc) + flux_rsy_vf_l(j, k, r, momxb + 1) = & + flux_vf(momxb + 1)%sf(k, dj*((n - 1) - 2*j) + j, r) end do end do end do + #:endcall GPU_PARALLEL_LOOP + + if (riemann_solver == 1) then + #:call GPU_PARALLEL_LOOP(collapse=4) + do i = advxb, advxe + do r = is3%beg, is3%end + do k = is2%beg, is2%end + do j = -1, buff_size + flux_src_rsy_vf_l(j, k, r, i) = & + flux_src_vf(i)%sf(k, dj*((n - 1) - 2*j) + j, r) + end do + end do + end do + end do + #:endcall GPU_PARALLEL_LOOP + else + #:call GPU_PARALLEL_LOOP(collapse=3) + do r = is3%beg, is3%end + do k = is2%beg, is2%end + do j = -1, buff_size + flux_src_rsy_vf_l(j, k, r, advxb) = & + flux_src_vf(advxb)%sf(k, dj*((n - 1) - 2*j) + j, r)* & + sign(1._wp, -1._wp*cbc_loc) + end do + end do + end do + #:endcall GPU_PARALLEL_LOOP end if ! END: Reshaping Inputted Data in y-direction @@ -1327,75 +1351,81 @@ contains ! Reshaping Inputted Data in z-direction else - $:GPU_PARALLEL_LOOP(collapse=4) - do i = 1, sys_size - do r = is3%beg, is3%end - do k = is2%beg, is2%end - do j = 0, buff_size - q_prim_rsz_vf(j, k, r, i) = & - q_prim_vf(i)%sf(r, k, dj*(p - 2*j) + j) + #:call GPU_PARALLEL_LOOP(collapse=4) + do i = 1, sys_size + do r = is3%beg, is3%end + do k = is2%beg, is2%end + do j = 0, buff_size + q_prim_rsz_vf(j, k, r, i) = & + q_prim_vf(i)%sf(r, k, dj*(p - 2*j) + j) + end do end do end do end do - end do - - $:GPU_PARALLEL_LOOP(collapse=3) - do r = is3%beg, is3%end - do k = is2%beg, is2%end - do j = 0, buff_size - q_prim_rsz_vf(j, k, r, momxe) = & - q_prim_vf(momxe)%sf(r, k, dj*(p - 2*j) + j)* & - sign(1._wp, -1._wp*cbc_loc) - end do - end do - end do + #:endcall GPU_PARALLEL_LOOP - $:GPU_PARALLEL_LOOP(collapse=4) - do i = 1, flux_cbc_index + #:call GPU_PARALLEL_LOOP(collapse=3) do r = is3%beg, is3%end do k = is2%beg, is2%end - do j = -1, buff_size - flux_rsz_vf_l(j, k, r, i) = & - flux_vf(i)%sf(r, k, dj*((p - 1) - 2*j) + j)* & + do j = 0, buff_size + q_prim_rsz_vf(j, k, r, momxe) = & + q_prim_vf(momxe)%sf(r, k, dj*(p - 2*j) + j)* & sign(1._wp, -1._wp*cbc_loc) end do end do end do - end do - - $:GPU_PARALLEL_LOOP(collapse=3) - do r = is3%beg, is3%end - do k = is2%beg, is2%end - do j = -1, buff_size - flux_rsz_vf_l(j, k, r, momxe) = & - flux_vf(momxe)%sf(r, k, dj*((p - 1) - 2*j) + j) - end do - end do - end do + #:endcall GPU_PARALLEL_LOOP - if (riemann_solver == 1) then - $:GPU_PARALLEL_LOOP(collapse=4) - do i = advxb, advxe + #:call GPU_PARALLEL_LOOP(collapse=4) + do i = 1, flux_cbc_index do r = is3%beg, is3%end do k = is2%beg, is2%end do j = -1, buff_size - flux_src_rsz_vf_l(j, k, r, i) = & - flux_src_vf(i)%sf(r, k, dj*((p - 1) - 2*j) + j) + flux_rsz_vf_l(j, k, r, i) = & + flux_vf(i)%sf(r, k, dj*((p - 1) - 2*j) + j)* & + sign(1._wp, -1._wp*cbc_loc) end do end do end do end do - else - $:GPU_PARALLEL_LOOP(collapse=3) + #:endcall GPU_PARALLEL_LOOP + + #:call GPU_PARALLEL_LOOP(collapse=3) do r = is3%beg, is3%end do k = is2%beg, is2%end do j = -1, buff_size - flux_src_rsz_vf_l(j, k, r, advxb) = & - flux_src_vf(advxb)%sf(r, k, dj*((p - 1) - 2*j) + j)* & - sign(1._wp, -1._wp*cbc_loc) + flux_rsz_vf_l(j, k, r, momxe) = & + flux_vf(momxe)%sf(r, k, dj*((p - 1) - 2*j) + j) end do end do end do + #:endcall GPU_PARALLEL_LOOP + + if (riemann_solver == 1) then + #:call GPU_PARALLEL_LOOP(collapse=4) + do i = advxb, advxe + do r = is3%beg, is3%end + do k = is2%beg, is2%end + do j = -1, buff_size + flux_src_rsz_vf_l(j, k, r, i) = & + flux_src_vf(i)%sf(r, k, dj*((p - 1) - 2*j) + j) + end do + end do + end do + end do + #:endcall GPU_PARALLEL_LOOP + else + #:call GPU_PARALLEL_LOOP(collapse=3) + do r = is3%beg, is3%end + do k = is2%beg, is2%end + do j = -1, buff_size + flux_src_rsz_vf_l(j, k, r, advxb) = & + flux_src_vf(advxb)%sf(r, k, dj*((p - 1) - 2*j) + j)* & + sign(1._wp, -1._wp*cbc_loc) + end do + end do + end do + #:endcall GPU_PARALLEL_LOOP end if end if @@ -1425,103 +1455,111 @@ contains ! Reshaping Outputted Data in x-direction if (cbc_dir == 1) then - $:GPU_PARALLEL_LOOP(collapse=4) - do i = 1, flux_cbc_index + #:call GPU_PARALLEL_LOOP(collapse=4) + do i = 1, flux_cbc_index + do r = is3%beg, is3%end + do k = is2%beg, is2%end + do j = -1, buff_size + flux_vf(i)%sf(dj*((m - 1) - 2*j) + j, k, r) = & + flux_rsx_vf_l(j, k, r, i)* & + sign(1._wp, -1._wp*cbc_loc) + end do + end do + end do + end do + #:endcall GPU_PARALLEL_LOOP + #:call GPU_PARALLEL_LOOP(collapse=3) do r = is3%beg, is3%end do k = is2%beg, is2%end do j = -1, buff_size - flux_vf(i)%sf(dj*((m - 1) - 2*j) + j, k, r) = & - flux_rsx_vf_l(j, k, r, i)* & - sign(1._wp, -1._wp*cbc_loc) + flux_vf(momxb)%sf(dj*((m - 1) - 2*j) + j, k, r) = & + flux_rsx_vf_l(j, k, r, momxb) end do end do end do - end do - $:GPU_PARALLEL_LOOP(collapse=3) - do r = is3%beg, is3%end - do k = is2%beg, is2%end - do j = -1, buff_size - flux_vf(momxb)%sf(dj*((m - 1) - 2*j) + j, k, r) = & - flux_rsx_vf_l(j, k, r, momxb) - end do - end do - end do + #:endcall GPU_PARALLEL_LOOP if (riemann_solver == 1) then - $:GPU_PARALLEL_LOOP(collapse=4) - do i = advxb, advxe - do r = is3%beg, is3%end - do k = is2%beg, is2%end - do j = -1, buff_size - flux_src_vf(i)%sf(dj*((m - 1) - 2*j) + j, k, r) = & - flux_src_rsx_vf_l(j, k, r, i) + #:call GPU_PARALLEL_LOOP(collapse=4) + do i = advxb, advxe + do r = is3%beg, is3%end + do k = is2%beg, is2%end + do j = -1, buff_size + flux_src_vf(i)%sf(dj*((m - 1) - 2*j) + j, k, r) = & + flux_src_rsx_vf_l(j, k, r, i) + end do end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP else - $:GPU_PARALLEL_LOOP(collapse=3) - do r = is3%beg, is3%end - do k = is2%beg, is2%end - do j = -1, buff_size - flux_src_vf(advxb)%sf(dj*((m - 1) - 2*j) + j, k, r) = & - flux_src_rsx_vf_l(j, k, r, advxb)* & - sign(1._wp, -1._wp*cbc_loc) + #:call GPU_PARALLEL_LOOP(collapse=3) + do r = is3%beg, is3%end + do k = is2%beg, is2%end + do j = -1, buff_size + flux_src_vf(advxb)%sf(dj*((m - 1) - 2*j) + j, k, r) = & + flux_src_rsx_vf_l(j, k, r, advxb)* & + sign(1._wp, -1._wp*cbc_loc) + end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP end if ! END: Reshaping Outputted Data in x-direction ! Reshaping Outputted Data in y-direction elseif (cbc_dir == 2) then - $:GPU_PARALLEL_LOOP(collapse=4) - do i = 1, flux_cbc_index + #:call GPU_PARALLEL_LOOP(collapse=4) + do i = 1, flux_cbc_index + do r = is3%beg, is3%end + do k = is2%beg, is2%end + do j = -1, buff_size + flux_vf(i)%sf(k, dj*((n - 1) - 2*j) + j, r) = & + flux_rsy_vf_l(j, k, r, i)* & + sign(1._wp, -1._wp*cbc_loc) + end do + end do + end do + end do + #:endcall GPU_PARALLEL_LOOP + + #:call GPU_PARALLEL_LOOP(collapse=3) do r = is3%beg, is3%end do k = is2%beg, is2%end do j = -1, buff_size - flux_vf(i)%sf(k, dj*((n - 1) - 2*j) + j, r) = & - flux_rsy_vf_l(j, k, r, i)* & - sign(1._wp, -1._wp*cbc_loc) + flux_vf(momxb + 1)%sf(k, dj*((n - 1) - 2*j) + j, r) = & + flux_rsy_vf_l(j, k, r, momxb + 1) end do end do end do - end do - - $:GPU_PARALLEL_LOOP(collapse=3) - do r = is3%beg, is3%end - do k = is2%beg, is2%end - do j = -1, buff_size - flux_vf(momxb + 1)%sf(k, dj*((n - 1) - 2*j) + j, r) = & - flux_rsy_vf_l(j, k, r, momxb + 1) - end do - end do - end do + #:endcall GPU_PARALLEL_LOOP if (riemann_solver == 1) then - $:GPU_PARALLEL_LOOP(collapse=4) - do i = advxb, advxe - do r = is3%beg, is3%end - do k = is2%beg, is2%end - do j = -1, buff_size - flux_src_vf(i)%sf(k, dj*((n - 1) - 2*j) + j, r) = & - flux_src_rsy_vf_l(j, k, r, i) + #:call GPU_PARALLEL_LOOP(collapse=4) + do i = advxb, advxe + do r = is3%beg, is3%end + do k = is2%beg, is2%end + do j = -1, buff_size + flux_src_vf(i)%sf(k, dj*((n - 1) - 2*j) + j, r) = & + flux_src_rsy_vf_l(j, k, r, i) + end do end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP else - $:GPU_PARALLEL_LOOP(collapse=3) - do r = is3%beg, is3%end - do k = is2%beg, is2%end - do j = -1, buff_size - flux_src_vf(advxb)%sf(k, dj*((n - 1) - 2*j) + j, r) = & - flux_src_rsy_vf_l(j, k, r, advxb)* & - sign(1._wp, -1._wp*cbc_loc) + #:call GPU_PARALLEL_LOOP(collapse=3) + do r = is3%beg, is3%end + do k = is2%beg, is2%end + do j = -1, buff_size + flux_src_vf(advxb)%sf(k, dj*((n - 1) - 2*j) + j, r) = & + flux_src_rsy_vf_l(j, k, r, advxb)* & + sign(1._wp, -1._wp*cbc_loc) + end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP end if ! END: Reshaping Outputted Data in y-direction @@ -1529,52 +1567,56 @@ contains ! Reshaping Outputted Data in z-direction else - $:GPU_PARALLEL_LOOP(collapse=4) - do i = 1, flux_cbc_index + #:call GPU_PARALLEL_LOOP(collapse=4) + do i = 1, flux_cbc_index + do r = is3%beg, is3%end + do k = is2%beg, is2%end + do j = -1, buff_size + flux_vf(i)%sf(r, k, dj*((p - 1) - 2*j) + j) = & + flux_rsz_vf_l(j, k, r, i)* & + sign(1._wp, -1._wp*cbc_loc) + end do + end do + end do + end do + #:endcall GPU_PARALLEL_LOOP + + #:call GPU_PARALLEL_LOOP(collapse=3) do r = is3%beg, is3%end do k = is2%beg, is2%end do j = -1, buff_size - flux_vf(i)%sf(r, k, dj*((p - 1) - 2*j) + j) = & - flux_rsz_vf_l(j, k, r, i)* & - sign(1._wp, -1._wp*cbc_loc) + flux_vf(momxe)%sf(r, k, dj*((p - 1) - 2*j) + j) = & + flux_rsz_vf_l(j, k, r, momxe) end do end do end do - end do - - $:GPU_PARALLEL_LOOP(collapse=3) - do r = is3%beg, is3%end - do k = is2%beg, is2%end - do j = -1, buff_size - flux_vf(momxe)%sf(r, k, dj*((p - 1) - 2*j) + j) = & - flux_rsz_vf_l(j, k, r, momxe) - end do - end do - end do + #:endcall GPU_PARALLEL_LOOP if (riemann_solver == 1) then - $:GPU_PARALLEL_LOOP(collapse=4) - do i = advxb, advxe - do r = is3%beg, is3%end - do k = is2%beg, is2%end - do j = -1, buff_size - flux_src_vf(i)%sf(r, k, dj*((p - 1) - 2*j) + j) = & - flux_src_rsz_vf_l(j, k, r, i) + #:call GPU_PARALLEL_LOOP(collapse=4) + do i = advxb, advxe + do r = is3%beg, is3%end + do k = is2%beg, is2%end + do j = -1, buff_size + flux_src_vf(i)%sf(r, k, dj*((p - 1) - 2*j) + j) = & + flux_src_rsz_vf_l(j, k, r, i) + end do end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP else - $:GPU_PARALLEL_LOOP(collapse=3) - do r = is3%beg, is3%end - do k = is2%beg, is2%end - do j = -1, buff_size - flux_src_vf(advxb)%sf(r, k, dj*((p - 1) - 2*j) + j) = & - flux_src_rsz_vf_l(j, k, r, advxb)* & - sign(1._wp, -1._wp*cbc_loc) + #:call GPU_PARALLEL_LOOP(collapse=3) + do r = is3%beg, is3%end + do k = is2%beg, is2%end + do j = -1, buff_size + flux_src_vf(advxb)%sf(r, k, dj*((p - 1) - 2*j) + j) = & + flux_src_rsz_vf_l(j, k, r, advxb)* & + sign(1._wp, -1._wp*cbc_loc) + end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP end if end if @@ -1583,7 +1625,7 @@ contains end subroutine s_finalize_cbc ! Detext if the problem has any characteristic boundary conditions - pure elemental subroutine s_any_cbc_boundaries(toggle) + elemental subroutine s_any_cbc_boundaries(toggle) logical, intent(inout) :: toggle diff --git a/src/simulation/m_compute_cbc.fpp b/src/simulation/m_compute_cbc.fpp index 694f6735b2..5f361ee61b 100644 --- a/src/simulation/m_compute_cbc.fpp +++ b/src/simulation/m_compute_cbc.fpp @@ -19,7 +19,7 @@ module m_compute_cbc contains !> Base L1 calculation - pure function f_base_L1(lambda, rho, c, dpres_ds, dvel_ds) result(L1) + function f_base_L1(lambda, rho, c, dpres_ds, dvel_ds) result(L1) $:GPU_ROUTINE(parallelism='[seq]') real(wp), dimension(3), intent(in) :: lambda real(wp), intent(in) :: rho, c, dpres_ds @@ -29,7 +29,7 @@ contains end function f_base_L1 !> Fill density L variables - pure subroutine s_fill_density_L(L, lambda_factor, lambda2, c, mf, dalpha_rho_ds, dpres_ds) + subroutine s_fill_density_L(L, lambda_factor, lambda2, c, mf, dalpha_rho_ds, dpres_ds) $:GPU_ROUTINE(parallelism='[seq]') real(wp), dimension(sys_size), intent(inout) :: L real(wp), intent(in) :: lambda_factor, lambda2, c @@ -37,39 +37,42 @@ contains real(wp), intent(in) :: dpres_ds integer :: i + $:GPU_LOOP(parallelism='[seq]') do i = 2, momxb L(i) = lambda_factor*lambda2*(c*c*dalpha_rho_ds(i - 1) - mf(i - 1)*dpres_ds) end do end subroutine s_fill_density_L !> Fill velocity L variables - pure subroutine s_fill_velocity_L(L, lambda_factor, lambda2, dvel_ds) + subroutine s_fill_velocity_L(L, lambda_factor, lambda2, dvel_ds) $:GPU_ROUTINE(parallelism='[seq]') real(wp), dimension(sys_size), intent(inout) :: L real(wp), intent(in) :: lambda_factor, lambda2 real(wp), dimension(num_dims), intent(in) :: dvel_ds integer :: i + $:GPU_LOOP(parallelism='[seq]') do i = momxb + 1, momxe L(i) = lambda_factor*lambda2*dvel_ds(dir_idx(i - contxe)) end do end subroutine s_fill_velocity_L !> Fill advection L variables - pure subroutine s_fill_advection_L(L, lambda_factor, lambda2, dadv_ds) + subroutine s_fill_advection_L(L, lambda_factor, lambda2, dadv_ds) $:GPU_ROUTINE(parallelism='[seq]') real(wp), dimension(sys_size), intent(inout) :: L real(wp), intent(in) :: lambda_factor, lambda2 real(wp), dimension(num_fluids), intent(in) :: dadv_ds integer :: i + $:GPU_LOOP(parallelism='[seq]') do i = E_idx, advxe - 1 L(i) = lambda_factor*lambda2*dadv_ds(i - momxe) end do end subroutine s_fill_advection_L !> Fill chemistry L variables - pure subroutine s_fill_chemistry_L(L, lambda_factor, lambda2, dYs_ds) + subroutine s_fill_chemistry_L(L, lambda_factor, lambda2, dYs_ds) $:GPU_ROUTINE(parallelism='[seq]') real(wp), dimension(sys_size), intent(inout) :: L real(wp), intent(in) :: lambda_factor, lambda2 @@ -78,13 +81,14 @@ contains if (.not. chemistry) return + $:GPU_LOOP(parallelism='[seq]') do i = chemxb, chemxe L(i) = lambda_factor*lambda2*dYs_ds(i - chemxb + 1) end do end subroutine s_fill_chemistry_L !> Slip wall CBC (Thompson 1990, pg. 451) - pure subroutine s_compute_slip_wall_L(lambda, L, rho, c, dpres_ds, dvel_ds) + subroutine s_compute_slip_wall_L(lambda, L, rho, c, dpres_ds, dvel_ds) $:GPU_ROUTINE(function_name='s_compute_slip_wall_L',parallelism='[seq]', & & cray_inline=True) @@ -100,7 +104,7 @@ contains end subroutine s_compute_slip_wall_L !> Nonreflecting subsonic buffer CBC (Thompson 1987, pg. 13) - pure subroutine s_compute_nonreflecting_subsonic_buffer_L(lambda, L, rho, c, mf, dalpha_rho_ds, dpres_ds, dvel_ds, dadv_ds, dYs_ds) + subroutine s_compute_nonreflecting_subsonic_buffer_L(lambda, L, rho, c, mf, dalpha_rho_ds, dpres_ds, dvel_ds, dadv_ds, dYs_ds) $:GPU_ROUTINE(function_name='s_compute_nonreflecting_subsonic_buffer_L', & & parallelism='[seq]', cray_inline=True) @@ -128,7 +132,7 @@ contains end subroutine s_compute_nonreflecting_subsonic_buffer_L !> Nonreflecting subsonic inflow CBC (Thompson 1990, pg. 455) - pure subroutine s_compute_nonreflecting_subsonic_inflow_L(lambda, L, rho, c, dpres_ds, dvel_ds) + subroutine s_compute_nonreflecting_subsonic_inflow_L(lambda, L, rho, c, dpres_ds, dvel_ds) $:GPU_ROUTINE(function_name='s_compute_nonreflecting_subsonic_inflow_L', & & parallelism='[seq]', cray_inline=True) @@ -143,7 +147,7 @@ contains end subroutine s_compute_nonreflecting_subsonic_inflow_L !> Nonreflecting subsonic outflow CBC (Thompson 1990, pg. 454) - pure subroutine s_compute_nonreflecting_subsonic_outflow_L(lambda, L, rho, c, mf, dalpha_rho_ds, dpres_ds, dvel_ds, dadv_ds, dYs_ds) + subroutine s_compute_nonreflecting_subsonic_outflow_L(lambda, L, rho, c, mf, dalpha_rho_ds, dpres_ds, dvel_ds, dadv_ds, dYs_ds) $:GPU_ROUTINE(function_name='s_compute_nonreflecting_subsonic_outflow_L', & & parallelism='[seq]', cray_inline=True) @@ -165,7 +169,7 @@ contains end subroutine s_compute_nonreflecting_subsonic_outflow_L !> Force-free subsonic outflow CBC (Thompson 1990, pg. 454) - pure subroutine s_compute_force_free_subsonic_outflow_L(lambda, L, rho, c, mf, dalpha_rho_ds, dpres_ds, dvel_ds, dadv_ds) + subroutine s_compute_force_free_subsonic_outflow_L(lambda, L, rho, c, mf, dalpha_rho_ds, dpres_ds, dvel_ds, dadv_ds) $:GPU_ROUTINE(function_name='s_compute_force_free_subsonic_outflow_L', & & parallelism='[seq]', cray_inline=True) @@ -185,7 +189,7 @@ contains end subroutine s_compute_force_free_subsonic_outflow_L !> Constant pressure subsonic outflow CBC (Thompson 1990, pg. 455) - pure subroutine s_compute_constant_pressure_subsonic_outflow_L(lambda, L, rho, c, mf, dalpha_rho_ds, dpres_ds, dvel_ds, dadv_ds) + subroutine s_compute_constant_pressure_subsonic_outflow_L(lambda, L, rho, c, mf, dalpha_rho_ds, dpres_ds, dvel_ds, dadv_ds) $:GPU_ROUTINE(function_name='s_compute_constant_pressure_subsonic_outflow_L', & & parallelism='[seq]', cray_inline=True) @@ -205,7 +209,7 @@ contains end subroutine s_compute_constant_pressure_subsonic_outflow_L !> Supersonic inflow CBC (Thompson 1990, pg. 453) - pure subroutine s_compute_supersonic_inflow_L(L) + subroutine s_compute_supersonic_inflow_L(L) $:GPU_ROUTINE(function_name='s_compute_supersonic_inflow_L', & & parallelism='[seq]', cray_inline=True) @@ -215,7 +219,7 @@ contains end subroutine s_compute_supersonic_inflow_L !> Supersonic outflow CBC (Thompson 1990, pg. 453) - pure subroutine s_compute_supersonic_outflow_L(lambda, L, rho, c, mf, dalpha_rho_ds, dpres_ds, dvel_ds, dadv_ds, dYs_ds) + subroutine s_compute_supersonic_outflow_L(lambda, L, rho, c, mf, dalpha_rho_ds, dpres_ds, dvel_ds, dadv_ds, dYs_ds) $:GPU_ROUTINE(function_name='s_compute_supersonic_outflow_L', & & parallelism='[seq]', cray_inline=True) diff --git a/src/simulation/m_data_output.fpp b/src/simulation/m_data_output.fpp index 5f5b79c481..1884852f39 100644 --- a/src/simulation/m_data_output.fpp +++ b/src/simulation/m_data_output.fpp @@ -287,23 +287,24 @@ contains integer :: j, k, l ! Computing Stability Criteria at Current Time-step - $:GPU_PARALLEL_LOOP(collapse=3, private='[vel, alpha, Re]') - do l = 0, p - do k = 0, n - do j = 0, m - call s_compute_enthalpy(q_prim_vf, pres, rho, gamma, pi_inf, Re, H, alpha, vel, vel_sum, j, k, l) + #:call GPU_PARALLEL_LOOP(collapse=3, private='[vel, alpha, Re]') + do l = 0, p + do k = 0, n + do j = 0, m + call s_compute_enthalpy(q_prim_vf, pres, rho, gamma, pi_inf, Re, H, alpha, vel, vel_sum, j, k, l) - call s_compute_speed_of_sound(pres, rho, gamma, pi_inf, H, alpha, vel_sum, 0._wp, c) + call s_compute_speed_of_sound(pres, rho, gamma, pi_inf, H, alpha, vel_sum, 0._wp, c) - if (viscous) then - call s_compute_stability_from_dt(vel, c, rho, Re, j, k, l, icfl_sf, vcfl_sf, Rc_sf) - else - call s_compute_stability_from_dt(vel, c, rho, Re, j, k, l, icfl_sf) - end if + if (viscous) then + call s_compute_stability_from_dt(vel, c, rho, Re, j, k, l, icfl_sf, vcfl_sf, Rc_sf) + else + call s_compute_stability_from_dt(vel, c, rho, Re, j, k, l, icfl_sf) + end if + end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP ! end: Computing Stability Criteria at Current Time-step diff --git a/src/simulation/m_derived_variables.fpp b/src/simulation/m_derived_variables.fpp index 8da88a3a91..f4653a9366 100644 --- a/src/simulation/m_derived_variables.fpp +++ b/src/simulation/m_derived_variables.fpp @@ -146,23 +146,24 @@ contains z_accel) end if - $:GPU_PARALLEL_LOOP(collapse=3) - do k = 0, p - do j = 0, n - do i = 0, m - if (p > 0) then - accel_mag(i, j, k) = sqrt(x_accel(i, j, k)**2._wp + & - y_accel(i, j, k)**2._wp + & - z_accel(i, j, k)**2._wp) - elseif (n > 0) then - accel_mag(i, j, k) = sqrt(x_accel(i, j, k)**2._wp + & - y_accel(i, j, k)**2._wp) - else - accel_mag(i, j, k) = x_accel(i, j, k) - end if + #:call GPU_PARALLEL_LOOP(collapse=3) + do k = 0, p + do j = 0, n + do i = 0, m + if (p > 0) then + accel_mag(i, j, k) = sqrt(x_accel(i, j, k)**2._wp + & + y_accel(i, j, k)**2._wp + & + z_accel(i, j, k)**2._wp) + elseif (n > 0) then + accel_mag(i, j, k) = sqrt(x_accel(i, j, k)**2._wp + & + y_accel(i, j, k)**2._wp) + else + accel_mag(i, j, k) = x_accel(i, j, k) + end if + end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP $:GPU_UPDATE(host='[accel_mag]') @@ -187,8 +188,8 @@ contains !! @param q_prim_vf2 Primitive variables !! @param q_prim_vf3 Primitive variables !! @param q_sf Acceleration component - pure subroutine s_derive_acceleration_component(i, q_prim_vf0, q_prim_vf1, & - q_prim_vf2, q_prim_vf3, q_sf) + subroutine s_derive_acceleration_component(i, q_prim_vf0, q_prim_vf1, & + q_prim_vf2, q_prim_vf3, q_sf) integer, intent(in) :: i @@ -203,66 +204,35 @@ contains ! Computing the acceleration component in the x-coordinate direction if (i == 1) then - $:GPU_PARALLEL_LOOP(collapse=3) - do l = 0, p - do k = 0, n - do j = 0, m - q_sf(j, k, l) = (11._wp*q_prim_vf0(momxb)%sf(j, k, l) & - - 18._wp*q_prim_vf1(momxb)%sf(j, k, l) & - + 9._wp*q_prim_vf2(momxb)%sf(j, k, l) & - - 2._wp*q_prim_vf3(momxb)%sf(j, k, l))/(6._wp*dt) - end do - end do - end do - - if (n == 0) then - $:GPU_PARALLEL_LOOP(collapse=4) - do l = 0, p - do k = 0, n - do j = 0, m - do r = -fd_number, fd_number - q_sf(j, k, l) = q_sf(j, k, l) & - + q_prim_vf0(momxb)%sf(j, k, l)*fd_coeff_x(r, j)* & - q_prim_vf0(momxb)%sf(r + j, k, l) - end do - end do - end do - end do - elseif (p == 0) then - $:GPU_PARALLEL_LOOP(collapse=4) + #:call GPU_PARALLEL_LOOP(collapse=3) do l = 0, p do k = 0, n do j = 0, m - do r = -fd_number, fd_number - q_sf(j, k, l) = q_sf(j, k, l) & - + q_prim_vf0(momxb)%sf(j, k, l)*fd_coeff_x(r, j)* & - q_prim_vf0(momxb)%sf(r + j, k, l) & - + q_prim_vf0(momxb + 1)%sf(j, k, l)*fd_coeff_y(r, k)* & - q_prim_vf0(momxb)%sf(j, r + k, l) - end do + q_sf(j, k, l) = (11._wp*q_prim_vf0(momxb)%sf(j, k, l) & + - 18._wp*q_prim_vf1(momxb)%sf(j, k, l) & + + 9._wp*q_prim_vf2(momxb)%sf(j, k, l) & + - 2._wp*q_prim_vf3(momxb)%sf(j, k, l))/(6._wp*dt) end do end do end do - else - if (grid_geometry == 3) then - $:GPU_PARALLEL_LOOP(collapse=4) + #:endcall GPU_PARALLEL_LOOP + + if (n == 0) then + #:call GPU_PARALLEL_LOOP(collapse=4) do l = 0, p do k = 0, n do j = 0, m do r = -fd_number, fd_number q_sf(j, k, l) = q_sf(j, k, l) & + q_prim_vf0(momxb)%sf(j, k, l)*fd_coeff_x(r, j)* & - q_prim_vf0(momxb)%sf(r + j, k, l) & - + q_prim_vf0(momxb + 1)%sf(j, k, l)*fd_coeff_y(r, k)* & - q_prim_vf0(momxb)%sf(j, r + k, l) & - + q_prim_vf0(momxe)%sf(j, k, l)*fd_coeff_z(r, l)* & - q_prim_vf0(momxb)%sf(j, k, r + l)/y_cc(k) + q_prim_vf0(momxb)%sf(r + j, k, l) end do end do end do end do - else - $:GPU_PARALLEL_LOOP(collapse=4) + #:endcall GPU_PARALLEL_LOOP + elseif (p == 0) then + #:call GPU_PARALLEL_LOOP(collapse=4) do l = 0, p do k = 0, n do j = 0, m @@ -271,47 +241,68 @@ contains + q_prim_vf0(momxb)%sf(j, k, l)*fd_coeff_x(r, j)* & q_prim_vf0(momxb)%sf(r + j, k, l) & + q_prim_vf0(momxb + 1)%sf(j, k, l)*fd_coeff_y(r, k)* & - q_prim_vf0(momxb)%sf(j, r + k, l) & - + q_prim_vf0(momxe)%sf(j, k, l)*fd_coeff_z(r, l)* & - q_prim_vf0(momxb)%sf(j, k, r + l) + q_prim_vf0(momxb)%sf(j, r + k, l) end do end do end do end do + #:endcall GPU_PARALLEL_LOOP + else + if (grid_geometry == 3) then + #:call GPU_PARALLEL_LOOP(collapse=4) + do l = 0, p + do k = 0, n + do j = 0, m + do r = -fd_number, fd_number + q_sf(j, k, l) = q_sf(j, k, l) & + + q_prim_vf0(momxb)%sf(j, k, l)*fd_coeff_x(r, j)* & + q_prim_vf0(momxb)%sf(r + j, k, l) & + + q_prim_vf0(momxb + 1)%sf(j, k, l)*fd_coeff_y(r, k)* & + q_prim_vf0(momxb)%sf(j, r + k, l) & + + q_prim_vf0(momxe)%sf(j, k, l)*fd_coeff_z(r, l)* & + q_prim_vf0(momxb)%sf(j, k, r + l)/y_cc(k) + end do + end do + end do + end do + #:endcall GPU_PARALLEL_LOOP + else + #:call GPU_PARALLEL_LOOP(collapse=4) + do l = 0, p + do k = 0, n + do j = 0, m + do r = -fd_number, fd_number + q_sf(j, k, l) = q_sf(j, k, l) & + + q_prim_vf0(momxb)%sf(j, k, l)*fd_coeff_x(r, j)* & + q_prim_vf0(momxb)%sf(r + j, k, l) & + + q_prim_vf0(momxb + 1)%sf(j, k, l)*fd_coeff_y(r, k)* & + q_prim_vf0(momxb)%sf(j, r + k, l) & + + q_prim_vf0(momxe)%sf(j, k, l)*fd_coeff_z(r, l)* & + q_prim_vf0(momxb)%sf(j, k, r + l) + end do + end do + end do + end do + #:endcall GPU_PARALLEL_LOOP end if end if ! Computing the acceleration component in the y-coordinate direction elseif (i == 2) then - $:GPU_PARALLEL_LOOP(collapse=3) - do l = 0, p - do k = 0, n - do j = 0, m - q_sf(j, k, l) = (11._wp*q_prim_vf0(momxb + 1)%sf(j, k, l) & - - 18._wp*q_prim_vf1(momxb + 1)%sf(j, k, l) & - + 9._wp*q_prim_vf2(momxb + 1)%sf(j, k, l) & - - 2._wp*q_prim_vf3(momxb + 1)%sf(j, k, l))/(6._wp*dt) - end do - end do - end do - - if (p == 0) then - $:GPU_PARALLEL_LOOP(collapse=4) + #:call GPU_PARALLEL_LOOP(collapse=3) do l = 0, p do k = 0, n do j = 0, m - do r = -fd_number, fd_number - q_sf(j, k, l) = q_sf(j, k, l) & - + q_prim_vf0(momxb)%sf(j, k, l)*fd_coeff_x(r, j)* & - q_prim_vf0(momxb + 1)%sf(r + j, k, l) & - + q_prim_vf0(momxb + 1)%sf(j, k, l)*fd_coeff_y(r, k)* & - q_prim_vf0(momxb + 1)%sf(j, r + k, l) - end do + q_sf(j, k, l) = (11._wp*q_prim_vf0(momxb + 1)%sf(j, k, l) & + - 18._wp*q_prim_vf1(momxb + 1)%sf(j, k, l) & + + 9._wp*q_prim_vf2(momxb + 1)%sf(j, k, l) & + - 2._wp*q_prim_vf3(momxb + 1)%sf(j, k, l))/(6._wp*dt) end do end do end do - else - if (grid_geometry == 3) then - $:GPU_PARALLEL_LOOP(collapse=4) + #:endcall GPU_PARALLEL_LOOP + + if (p == 0) then + #:call GPU_PARALLEL_LOOP(collapse=4) do l = 0, p do k = 0, n do j = 0, m @@ -320,83 +311,105 @@ contains + q_prim_vf0(momxb)%sf(j, k, l)*fd_coeff_x(r, j)* & q_prim_vf0(momxb + 1)%sf(r + j, k, l) & + q_prim_vf0(momxb + 1)%sf(j, k, l)*fd_coeff_y(r, k)* & - q_prim_vf0(momxb + 1)%sf(j, r + k, l) & - + q_prim_vf0(momxe)%sf(j, k, l)*fd_coeff_z(r, l)* & - q_prim_vf0(momxb + 1)%sf(j, k, r + l)/y_cc(k) & - - (q_prim_vf0(momxe)%sf(j, k, l)**2._wp)/y_cc(k) + q_prim_vf0(momxb + 1)%sf(j, r + k, l) end do end do end do end do + #:endcall GPU_PARALLEL_LOOP + else + if (grid_geometry == 3) then + #:call GPU_PARALLEL_LOOP(collapse=4) + do l = 0, p + do k = 0, n + do j = 0, m + do r = -fd_number, fd_number + q_sf(j, k, l) = q_sf(j, k, l) & + + q_prim_vf0(momxb)%sf(j, k, l)*fd_coeff_x(r, j)* & + q_prim_vf0(momxb + 1)%sf(r + j, k, l) & + + q_prim_vf0(momxb + 1)%sf(j, k, l)*fd_coeff_y(r, k)* & + q_prim_vf0(momxb + 1)%sf(j, r + k, l) & + + q_prim_vf0(momxe)%sf(j, k, l)*fd_coeff_z(r, l)* & + q_prim_vf0(momxb + 1)%sf(j, k, r + l)/y_cc(k) & + - (q_prim_vf0(momxe)%sf(j, k, l)**2._wp)/y_cc(k) + end do + end do + end do + end do + #:endcall GPU_PARALLEL_LOOP else - $:GPU_PARALLEL_LOOP(collapse=4) - do l = 0, p - do k = 0, n - do j = 0, m - do r = -fd_number, fd_number - q_sf(j, k, l) = q_sf(j, k, l) & - + q_prim_vf0(momxb)%sf(j, k, l)*fd_coeff_x(r, j)* & - q_prim_vf0(momxb + 1)%sf(r + j, k, l) & - + q_prim_vf0(momxb + 1)%sf(j, k, l)*fd_coeff_y(r, k)* & - q_prim_vf0(momxb + 1)%sf(j, r + k, l) & - + q_prim_vf0(momxe)%sf(j, k, l)*fd_coeff_z(r, l)* & - q_prim_vf0(momxb + 1)%sf(j, k, r + l) + #:call GPU_PARALLEL_LOOP(collapse=4) + do l = 0, p + do k = 0, n + do j = 0, m + do r = -fd_number, fd_number + q_sf(j, k, l) = q_sf(j, k, l) & + + q_prim_vf0(momxb)%sf(j, k, l)*fd_coeff_x(r, j)* & + q_prim_vf0(momxb + 1)%sf(r + j, k, l) & + + q_prim_vf0(momxb + 1)%sf(j, k, l)*fd_coeff_y(r, k)* & + q_prim_vf0(momxb + 1)%sf(j, r + k, l) & + + q_prim_vf0(momxe)%sf(j, k, l)*fd_coeff_z(r, l)* & + q_prim_vf0(momxb + 1)%sf(j, k, r + l) + end do end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP end if end if ! Computing the acceleration component in the z-coordinate direction else - $:GPU_PARALLEL_LOOP(collapse=3) - do l = 0, p - do k = 0, n - do j = 0, m - q_sf(j, k, l) = (11._wp*q_prim_vf0(momxe)%sf(j, k, l) & - - 18._wp*q_prim_vf1(momxe)%sf(j, k, l) & - + 9._wp*q_prim_vf2(momxe)%sf(j, k, l) & - - 2._wp*q_prim_vf3(momxe)%sf(j, k, l))/(6._wp*dt) + #:call GPU_PARALLEL_LOOP(collapse=3) + do l = 0, p + do k = 0, n + do j = 0, m + q_sf(j, k, l) = (11._wp*q_prim_vf0(momxe)%sf(j, k, l) & + - 18._wp*q_prim_vf1(momxe)%sf(j, k, l) & + + 9._wp*q_prim_vf2(momxe)%sf(j, k, l) & + - 2._wp*q_prim_vf3(momxe)%sf(j, k, l))/(6._wp*dt) + end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP if (grid_geometry == 3) then - $:GPU_PARALLEL_LOOP(collapse=4) - do l = 0, p - do k = 0, n - do j = 0, m - do r = -fd_number, fd_number - q_sf(j, k, l) = q_sf(j, k, l) & - + q_prim_vf0(momxb)%sf(j, k, l)*fd_coeff_x(r, j)* & - q_prim_vf0(momxe)%sf(r + j, k, l) & - + q_prim_vf0(momxb + 1)%sf(j, k, l)*fd_coeff_y(r, k)* & - q_prim_vf0(momxe)%sf(j, r + k, l) & - + q_prim_vf0(momxe)%sf(j, k, l)*fd_coeff_z(r, l)* & - q_prim_vf0(momxe)%sf(j, k, r + l)/y_cc(k) & - + (q_prim_vf0(momxe)%sf(j, k, l)* & - q_prim_vf0(momxb + 1)%sf(j, k, l))/y_cc(k) + #:call GPU_PARALLEL_LOOP(collapse=4) + do l = 0, p + do k = 0, n + do j = 0, m + do r = -fd_number, fd_number + q_sf(j, k, l) = q_sf(j, k, l) & + + q_prim_vf0(momxb)%sf(j, k, l)*fd_coeff_x(r, j)* & + q_prim_vf0(momxe)%sf(r + j, k, l) & + + q_prim_vf0(momxb + 1)%sf(j, k, l)*fd_coeff_y(r, k)* & + q_prim_vf0(momxe)%sf(j, r + k, l) & + + q_prim_vf0(momxe)%sf(j, k, l)*fd_coeff_z(r, l)* & + q_prim_vf0(momxe)%sf(j, k, r + l)/y_cc(k) & + + (q_prim_vf0(momxe)%sf(j, k, l)* & + q_prim_vf0(momxb + 1)%sf(j, k, l))/y_cc(k) + end do end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP else - $:GPU_PARALLEL_LOOP(collapse=4) - do l = 0, p - do k = 0, n - do j = 0, m - do r = -fd_number, fd_number - q_sf(j, k, l) = q_sf(j, k, l) & - + q_prim_vf0(momxb)%sf(j, k, l)*fd_coeff_x(r, j)* & - q_prim_vf0(momxe)%sf(r + j, k, l) & - + q_prim_vf0(momxb + 1)%sf(j, k, l)*fd_coeff_y(r, k)* & - q_prim_vf0(momxe)%sf(j, r + k, l) & - + q_prim_vf0(momxe)%sf(j, k, l)*fd_coeff_z(r, l)* & - q_prim_vf0(momxe)%sf(j, k, r + l) + #:call GPU_PARALLEL_LOOP(collapse=4) + do l = 0, p + do k = 0, n + do j = 0, m + do r = -fd_number, fd_number + q_sf(j, k, l) = q_sf(j, k, l) & + + q_prim_vf0(momxb)%sf(j, k, l)*fd_coeff_x(r, j)* & + q_prim_vf0(momxe)%sf(r + j, k, l) & + + q_prim_vf0(momxb + 1)%sf(j, k, l)*fd_coeff_y(r, k)* & + q_prim_vf0(momxe)%sf(j, r + k, l) & + + q_prim_vf0(momxe)%sf(j, k, l)*fd_coeff_z(r, l)* & + q_prim_vf0(momxe)%sf(j, k, r + l) + end do end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP end if end if @@ -425,78 +438,81 @@ contains end do if (n == 0) then !1D simulation - $:GPU_PARALLEL_LOOP(collapse=3,private='[dV]') - do l = 0, p !Loop over grid - do k = 0, n - do j = 0, m - $:GPU_LOOP(parallelism='[seq]') - do i = 1, num_fluids !Loop over individual fluids - dV = dx(j) - ! Mass - $:GPU_ATOMIC(atomic='update') - c_m(i, 1) = c_m(i, 1) + q_vf(i)%sf(j, k, l)*dV - ! x-location weighted - $:GPU_ATOMIC(atomic='update') - c_m(i, 2) = c_m(i, 2) + q_vf(i)%sf(j, k, l)*dV*x_cc(j) - ! Volume fraction - $:GPU_ATOMIC(atomic='update') - c_m(i, 5) = c_m(i, 5) + q_vf(i + advxb - 1)%sf(j, k, l)*dV + #:call GPU_PARALLEL_LOOP(collapse=3,private='[dV]') + do l = 0, p !Loop over grid + do k = 0, n + do j = 0, m + $:GPU_LOOP(parallelism='[seq]') + do i = 1, num_fluids !Loop over individual fluids + dV = dx(j) + ! Mass + $:GPU_ATOMIC(atomic='update') + c_m(i, 1) = c_m(i, 1) + q_vf(i)%sf(j, k, l)*dV + ! x-location weighted + $:GPU_ATOMIC(atomic='update') + c_m(i, 2) = c_m(i, 2) + q_vf(i)%sf(j, k, l)*dV*x_cc(j) + ! Volume fraction + $:GPU_ATOMIC(atomic='update') + c_m(i, 5) = c_m(i, 5) + q_vf(i + advxb - 1)%sf(j, k, l)*dV + end do end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP elseif (p == 0) then !2D simulation - $:GPU_PARALLEL_LOOP(collapse=3,private='[dV]') - do l = 0, p !Loop over grid - do k = 0, n - do j = 0, m - $:GPU_LOOP(parallelism='[seq]') - do i = 1, num_fluids !Loop over individual fluids - dV = dx(j)*dy(k) - ! Mass - $:GPU_ATOMIC(atomic='update') - c_m(i, 1) = c_m(i, 1) + q_vf(i)%sf(j, k, l)*dV - ! x-location weighted - $:GPU_ATOMIC(atomic='update') - c_m(i, 2) = c_m(i, 2) + q_vf(i)%sf(j, k, l)*dV*x_cc(j) - ! y-location weighted - $:GPU_ATOMIC(atomic='update') - c_m(i, 3) = c_m(i, 3) + q_vf(i)%sf(j, k, l)*dV*y_cc(k) - ! Volume fraction - $:GPU_ATOMIC(atomic='update') - c_m(i, 5) = c_m(i, 5) + q_vf(i + advxb - 1)%sf(j, k, l)*dV + #:call GPU_PARALLEL_LOOP(collapse=3,private='[dV]') + do l = 0, p !Loop over grid + do k = 0, n + do j = 0, m + $:GPU_LOOP(parallelism='[seq]') + do i = 1, num_fluids !Loop over individual fluids + dV = dx(j)*dy(k) + ! Mass + $:GPU_ATOMIC(atomic='update') + c_m(i, 1) = c_m(i, 1) + q_vf(i)%sf(j, k, l)*dV + ! x-location weighted + $:GPU_ATOMIC(atomic='update') + c_m(i, 2) = c_m(i, 2) + q_vf(i)%sf(j, k, l)*dV*x_cc(j) + ! y-location weighted + $:GPU_ATOMIC(atomic='update') + c_m(i, 3) = c_m(i, 3) + q_vf(i)%sf(j, k, l)*dV*y_cc(k) + ! Volume fraction + $:GPU_ATOMIC(atomic='update') + c_m(i, 5) = c_m(i, 5) + q_vf(i + advxb - 1)%sf(j, k, l)*dV + end do end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP else !3D simulation - $:GPU_PARALLEL_LOOP(collapse=3,private='[dV]') - do l = 0, p !Loop over grid - do k = 0, n - do j = 0, m - $:GPU_LOOP(parallelism='[seq]') - do i = 1, num_fluids !Loop over individual fluids - - dV = dx(j)*dy(k)*dz(l) - ! Mass - $:GPU_ATOMIC(atomic='update') - c_m(i, 1) = c_m(i, 1) + q_vf(i)%sf(j, k, l)*dV - ! x-location weighted - $:GPU_ATOMIC(atomic='update') - c_m(i, 2) = c_m(i, 2) + q_vf(i)%sf(j, k, l)*dV*x_cc(j) - ! y-location weighted - $:GPU_ATOMIC(atomic='update') - c_m(i, 3) = c_m(i, 3) + q_vf(i)%sf(j, k, l)*dV*y_cc(k) - ! z-location weighted - $:GPU_ATOMIC(atomic='update') - c_m(i, 4) = c_m(i, 4) + q_vf(i)%sf(j, k, l)*dV*z_cc(l) - ! Volume fraction - $:GPU_ATOMIC(atomic='update') - c_m(i, 5) = c_m(i, 5) + q_vf(i + advxb - 1)%sf(j, k, l)*dV + #:call GPU_PARALLEL_LOOP(collapse=3,private='[dV]') + do l = 0, p !Loop over grid + do k = 0, n + do j = 0, m + $:GPU_LOOP(parallelism='[seq]') + do i = 1, num_fluids !Loop over individual fluids + + dV = dx(j)*dy(k)*dz(l) + ! Mass + $:GPU_ATOMIC(atomic='update') + c_m(i, 1) = c_m(i, 1) + q_vf(i)%sf(j, k, l)*dV + ! x-location weighted + $:GPU_ATOMIC(atomic='update') + c_m(i, 2) = c_m(i, 2) + q_vf(i)%sf(j, k, l)*dV*x_cc(j) + ! y-location weighted + $:GPU_ATOMIC(atomic='update') + c_m(i, 3) = c_m(i, 3) + q_vf(i)%sf(j, k, l)*dV*y_cc(k) + ! z-location weighted + $:GPU_ATOMIC(atomic='update') + c_m(i, 4) = c_m(i, 4) + q_vf(i)%sf(j, k, l)*dV*z_cc(l) + ! Volume fraction + $:GPU_ATOMIC(atomic='update') + c_m(i, 5) = c_m(i, 5) + q_vf(i + advxb - 1)%sf(j, k, l)*dV + end do end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP end if $:GPU_UPDATE(host='[c_m]') diff --git a/src/simulation/m_fftw.fpp b/src/simulation/m_fftw.fpp index 852fb90290..39c8bd493e 100644 --- a/src/simulation/m_fftw.fpp +++ b/src/simulation/m_fftw.fpp @@ -15,9 +15,9 @@ module m_fftw use m_mpi_proxy !< Message passing interface (MPI) module proxy -#if defined(MFC_OpenACC) && defined(__PGI) +#if defined(MFC_GPU) && defined(__PGI) use cufft -#elif defined(MFC_OpenACC) +#elif defined(MFC_GPU) use hipfort use hipfort_check use hipfort_hipfft @@ -29,7 +29,7 @@ module m_fftw s_apply_fourier_filter, & s_finalize_fftw_module -#if !defined(MFC_OpenACC) +#if !defined(MFC_GPU) include 'fftw3.f03' #endif @@ -45,7 +45,7 @@ module m_fftw complex(c_double_complex), pointer :: data_fltr_cmplx(:) !< !! Filtered complex data in Fourier space -#if defined(MFC_OpenACC) +#if defined(MFC_GPU) $:GPU_DECLARE(create='[real_size,cmplx_size,x_size,batch_size,Nfq]') real(dp), allocatable, target :: data_real_gpu(:) @@ -81,8 +81,7 @@ contains x_size = m + 1 batch_size = x_size*sys_size -#if defined(MFC_OpenACC) - +#if defined(MFC_GPU) rank = 1; istride = 1; ostride = 1 allocate (gpu_fft_size(1:rank), iembed(1:rank), oembed(1:rank)) @@ -107,7 +106,7 @@ contains bwd_plan = fftw_plan_dft_c2r_1d(real_size, data_fltr_cmplx, data_real, FFTW_ESTIMATE) #endif -#if defined(MFC_OpenACC) +#if defined(MFC_GPU) @:ALLOCATE(data_real_gpu(1:real_size*x_size*sys_size)) @:ALLOCATE(data_cmplx_gpu(1:cmplx_size*x_size*sys_size)) @:ALLOCATE(data_fltr_cmplx_gpu(1:cmplx_size*x_size*sys_size)) @@ -137,74 +136,12 @@ contains integer :: i, j, k, l !< Generic loop iterators integer :: ierr !< Generic flag used to identify and report GPU errors - ! Restrict filter to processors that have cells adjacent to axis - if (bc_y%beg >= 0) return -#if defined(MFC_OpenACC) - $:GPU_PARALLEL_LOOP(collapse=3) - do k = 1, sys_size - do j = 0, m - do l = 1, cmplx_size - data_fltr_cmplx_gpu(l + j*cmplx_size + (k - 1)*cmplx_size*x_size) = (0_dp, 0_dp) - end do - end do - end do - - $:GPU_PARALLEL_LOOP(collapse=3) - do k = 1, sys_size - do j = 0, m - do l = 0, p - data_real_gpu(l + j*real_size + 1 + (k - 1)*real_size*x_size) = q_cons_vf(k)%sf(j, 0, l) - end do - end do - end do - - p_real => data_real_gpu - p_cmplx => data_cmplx_gpu - p_fltr_cmplx => data_fltr_cmplx_gpu - - #:call GPU_DATA(attach='[p_real, p_cmplx, p_fltr_cmplx]') - #:call GPU_HOST_DATA(use_device='[p_real, p_cmplx, p_fltr_cmplx]') -#if defined(__PGI) - ierr = cufftExecD2Z(fwd_plan_gpu, data_real_gpu, data_cmplx_gpu) -#else - ierr = hipfftExecD2Z(fwd_plan_gpu, c_loc(p_real), c_loc(p_cmplx)) - call hipCheck(hipDeviceSynchronize()) -#endif - #:endcall GPU_HOST_DATA - Nfq = 3 - $:GPU_UPDATE(device='[Nfq]') - - $:GPU_PARALLEL_LOOP(collapse=3) - do k = 1, sys_size - do j = 0, m - do l = 1, Nfq - data_fltr_cmplx_gpu(l + j*cmplx_size + (k - 1)*cmplx_size*x_size) = data_cmplx_gpu(l + j*cmplx_size + (k - 1)*cmplx_size*x_size) - end do - end do - end do - - #:call GPU_HOST_DATA(use_device='[p_real, p_fltr_cmplx]') -#if defined(__PGI) - ierr = cufftExecZ2D(bwd_plan_gpu, data_fltr_cmplx_gpu, data_real_gpu) -#else - ierr = hipfftExecZ2D(bwd_plan_gpu, c_loc(p_fltr_cmplx), c_loc(p_real)) - call hipCheck(hipDeviceSynchronize()) -#endif - #:endcall GPU_HOST_DATA - - $:GPU_PARALLEL_LOOP(collapse=3) - do k = 1, sys_size - do j = 0, m - do l = 0, p - data_real_gpu(l + j*real_size + 1 + (k - 1)*real_size*x_size) = data_real_gpu(l + j*real_size + 1 + (k - 1)*real_size*x_size)/real(real_size, dp) - q_cons_vf(k)%sf(j, 0, l) = data_real_gpu(l + j*real_size + 1 + (k - 1)*real_size*x_size) - end do - end do - end do - - do i = 1, fourier_rings + #:block UNDEF_CCE + ! Restrict filter to processors that have cells adjacent to axis + if (bc_y%beg >= 0) return +#if defined(MFC_GPU) - $:GPU_PARALLEL_LOOP(collapse=3) + #:call GPU_PARALLEL_LOOP(collapse=3) do k = 1, sys_size do j = 0, m do l = 1, cmplx_size @@ -212,17 +149,26 @@ contains end do end do end do + #:endcall GPU_PARALLEL_LOOP - $:GPU_PARALLEL_LOOP(collapse=3, firstprivate='[i]') + #:call GPU_PARALLEL_LOOP(collapse=3) do k = 1, sys_size do j = 0, m do l = 0, p - data_real_gpu(l + j*real_size + 1 + (k - 1)*real_size*x_size) = q_cons_vf(k)%sf(j, i, l) + data_real_gpu(l + j*real_size + 1 + (k - 1)*real_size*x_size) = q_cons_vf(k)%sf(j, 0, l) end do end do end do + #:endcall GPU_PARALLEL_LOOP - #:call GPU_HOST_DATA(use_device='[p_real, p_cmplx]') + #:if not USING_NVHPC + p_real => data_real_gpu + p_cmplx => data_cmplx_gpu + p_fltr_cmplx => data_fltr_cmplx_gpu + #:endif + + #:call GPU_DATA(attach='[p_real, p_cmplx, p_fltr_cmplx]') + #:call GPU_HOST_DATA(use_device_ptr='[p_real, p_cmplx, p_fltr_cmplx]') #if defined(__PGI) ierr = cufftExecD2Z(fwd_plan_gpu, data_real_gpu, data_cmplx_gpu) #else @@ -230,20 +176,20 @@ contains call hipCheck(hipDeviceSynchronize()) #endif #:endcall GPU_HOST_DATA - - Nfq = min(floor(2_dp*real(i, dp)*pi), cmplx_size) + Nfq = 3 $:GPU_UPDATE(device='[Nfq]') - $:GPU_PARALLEL_LOOP(collapse=3) - do k = 1, sys_size - do j = 0, m - do l = 1, Nfq - data_fltr_cmplx_gpu(l + j*cmplx_size + (k - 1)*cmplx_size*x_size) = data_cmplx_gpu(l + j*cmplx_size + (k - 1)*cmplx_size*x_size) + #:call GPU_PARALLEL_LOOP(collapse=3) + do k = 1, sys_size + do j = 0, m + do l = 1, Nfq + data_fltr_cmplx_gpu(l + j*cmplx_size + (k - 1)*cmplx_size*x_size) = data_cmplx_gpu(l + j*cmplx_size + (k - 1)*cmplx_size*x_size) + end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP - #:call GPU_HOST_DATA(use_device='[p_real, p_fltr_cmplx]') + #:call GPU_HOST_DATA(use_device_ptr='[p_real, p_fltr_cmplx]') #if defined(__PGI) ierr = cufftExecZ2D(bwd_plan_gpu, data_fltr_cmplx_gpu, data_real_gpu) #else @@ -252,17 +198,83 @@ contains #endif #:endcall GPU_HOST_DATA - $:GPU_PARALLEL_LOOP(collapse=3, firstprivate='[i]') - do k = 1, sys_size - do j = 0, m - do l = 0, p - data_real_gpu(l + j*real_size + 1 + (k - 1)*real_size*x_size) = data_real_gpu(l + j*real_size + 1 + (k - 1)*real_size*x_size)/real(real_size, dp) - q_cons_vf(k)%sf(j, i, l) = data_real_gpu(l + j*real_size + 1 + (k - 1)*real_size*x_size) + #:call GPU_PARALLEL_LOOP(collapse=3) + do k = 1, sys_size + do j = 0, m + do l = 0, p + data_real_gpu(l + j*real_size + 1 + (k - 1)*real_size*x_size) = data_real_gpu(l + j*real_size + 1 + (k - 1)*real_size*x_size)/real(real_size, dp) + q_cons_vf(k)%sf(j, 0, l) = data_real_gpu(l + j*real_size + 1 + (k - 1)*real_size*x_size) + end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP - end do + do i = 1, fourier_rings + + #:call GPU_PARALLEL_LOOP(collapse=3) + do k = 1, sys_size + do j = 0, m + do l = 1, cmplx_size + data_fltr_cmplx_gpu(l + j*cmplx_size + (k - 1)*cmplx_size*x_size) = (0_dp, 0_dp) + end do + end do + end do + #:endcall GPU_PARALLEL_LOOP + + #:call GPU_PARALLEL_LOOP(collapse=3, firstprivate='[i]') + do k = 1, sys_size + do j = 0, m + do l = 0, p + data_real_gpu(l + j*real_size + 1 + (k - 1)*real_size*x_size) = q_cons_vf(k)%sf(j, i, l) + end do + end do + end do + #:endcall GPU_PARALLEL_LOOP + + #:call GPU_HOST_DATA(use_device_ptr='[p_real, p_cmplx]') +#if defined(__PGI) + ierr = cufftExecD2Z(fwd_plan_gpu, data_real_gpu, data_cmplx_gpu) +#else + ierr = hipfftExecD2Z(fwd_plan_gpu, c_loc(p_real), c_loc(p_cmplx)) + call hipCheck(hipDeviceSynchronize()) +#endif + #:endcall GPU_HOST_DATA + + Nfq = min(floor(2_dp*real(i, dp)*pi), cmplx_size) + $:GPU_UPDATE(device='[Nfq]') + + #:call GPU_PARALLEL_LOOP(collapse=3) + do k = 1, sys_size + do j = 0, m + do l = 1, Nfq + data_fltr_cmplx_gpu(l + j*cmplx_size + (k - 1)*cmplx_size*x_size) = data_cmplx_gpu(l + j*cmplx_size + (k - 1)*cmplx_size*x_size) + end do + end do + end do + #:endcall GPU_PARALLEL_LOOP + + #:call GPU_HOST_DATA(use_device_ptr='[p_real, p_fltr_cmplx]') +#if defined(__PGI) + ierr = cufftExecZ2D(bwd_plan_gpu, data_fltr_cmplx_gpu, data_real_gpu) +#else + ierr = hipfftExecZ2D(bwd_plan_gpu, c_loc(p_fltr_cmplx), c_loc(p_real)) + call hipCheck(hipDeviceSynchronize()) +#endif + #:endcall GPU_HOST_DATA + + #:call GPU_PARALLEL_LOOP(collapse=3, firstprivate='[i]') + do k = 1, sys_size + do j = 0, m + do l = 0, p + data_real_gpu(l + j*real_size + 1 + (k - 1)*real_size*x_size) = data_real_gpu(l + j*real_size + 1 + (k - 1)*real_size*x_size)/real(real_size, dp) + q_cons_vf(k)%sf(j, i, l) = data_real_gpu(l + j*real_size + 1 + (k - 1)*real_size*x_size) + end do + end do + end do + #:endcall GPU_PARALLEL_LOOP + + end do + #:endcall GPU_DATA #else Nfq = 3 @@ -294,7 +306,8 @@ contains end do end do #endif - #:endcall GPU_DATA + #:endblock UNDEF_CCE + end subroutine s_apply_fourier_filter !> The purpose of this subroutine is to destroy the fftw plan @@ -302,7 +315,7 @@ contains !! applying the Fourier filter in the azimuthal direction. impure subroutine s_finalize_fftw_module -#if defined(MFC_OpenACC) +#if defined(MFC_GPU) integer :: ierr !< Generic flag used to identify and report GPU errors @:DEALLOCATE(data_real_gpu, data_fltr_cmplx_gpu, data_cmplx_gpu) #if defined(__PGI) diff --git a/src/simulation/m_global_parameters.fpp b/src/simulation/m_global_parameters.fpp index 37075a3c4d..6f2ec0641c 100644 --- a/src/simulation/m_global_parameters.fpp +++ b/src/simulation/m_global_parameters.fpp @@ -21,9 +21,7 @@ module m_global_parameters use m_helper_basic !< Functions to compare floating point numbers -#ifdef MFC_OpenACC - use openacc -#endif + ! $:USE_GPU_MODULE() implicit none @@ -194,8 +192,6 @@ module m_global_parameters integer :: num_igr_warm_start_iters !< number of warm start iterations for elliptic solve real(wp) :: alf_factor !< alpha factor for IGR - $:GPU_DECLARE(create='[chemistry]') - logical :: bodyForces logical :: bf_x, bf_y, bf_z !< body force toggle in three directions !< amplitude, frequency, and phase shift sinusoid in each direction @@ -206,6 +202,7 @@ module m_global_parameters #:endfor real(wp), dimension(3) :: accel_bf $:GPU_DECLARE(create='[accel_bf]') + ! $:GPU_DECLARE(create='[k_x,w_x,p_x,g_x,k_y,w_y,p_y,g_y,k_z,w_z,p_z,g_z]') integer :: cpu_start, cpu_end, cpu_rate @@ -235,10 +232,13 @@ module m_global_parameters !> @{ type(int_bounds_info) :: bc_x, bc_y, bc_z !> @} +#if defined(MFC_OpenACC) $:GPU_DECLARE(create='[bc_x%vb1, bc_x%vb2, bc_x%vb3, bc_x%ve1, bc_x%ve2, bc_x%ve3]') $:GPU_DECLARE(create='[bc_y%vb1, bc_y%vb2, bc_y%vb3, bc_y%ve1, bc_y%ve2, bc_y%ve3]') $:GPU_DECLARE(create='[bc_z%vb1, bc_z%vb2, bc_z%vb3, bc_z%ve1, bc_z%ve2, bc_z%ve3]') - +#elif defined(MFC_OpenMP) + $:GPU_DECLARE(create='[bc_x, bc_y, bc_z]') +#endif type(bounds_info) :: x_domain, y_domain, z_domain real(wp) :: x_a, y_a, z_a real(wp) :: x_b, y_b, z_b diff --git a/src/simulation/m_hyperelastic.fpp b/src/simulation/m_hyperelastic.fpp index f4a24fba7a..2b171016cb 100644 --- a/src/simulation/m_hyperelastic.fpp +++ b/src/simulation/m_hyperelastic.fpp @@ -28,12 +28,12 @@ module m_hyperelastic type(vector_field) :: btensor !< $:GPU_DECLARE(create='[btensor]') - real(wp), allocatable, dimension(:, :) :: fd_coeff_x - real(wp), allocatable, dimension(:, :) :: fd_coeff_y - real(wp), allocatable, dimension(:, :) :: fd_coeff_z - $:GPU_DECLARE(create='[fd_coeff_x,fd_coeff_y, fd_coeff_z]') - real(wp), allocatable, dimension(:) :: Gs - $:GPU_DECLARE(create='[Gs]') + real(wp), allocatable, dimension(:, :) :: fd_coeff_x_hyper + real(wp), allocatable, dimension(:, :) :: fd_coeff_y_hyper + real(wp), allocatable, dimension(:, :) :: fd_coeff_z_hyper + $:GPU_DECLARE(create='[fd_coeff_x_hyper,fd_coeff_y_hyper, fd_coeff_z_hyper]') + real(wp), allocatable, dimension(:) :: Gs_hyper + $:GPU_DECLARE(create='[Gs_hyper]') contains @@ -54,34 +54,34 @@ contains end do @:ACC_SETUP_VFs(btensor) - @:ALLOCATE(Gs(1:num_fluids)) + @:ALLOCATE(Gs_hyper(1:num_fluids)) $:GPU_LOOP(parallelism='[seq]') do i = 1, num_fluids - Gs(i) = fluid_pp(i)%G + Gs_hyper(i) = fluid_pp(i)%G end do - $:GPU_UPDATE(device='[Gs]') + $:GPU_UPDATE(device='[Gs_hyper]') - @:ALLOCATE(fd_coeff_x(-fd_number:fd_number, 0:m)) + @:ALLOCATE(fd_coeff_x_hyper(-fd_number:fd_number, 0:m)) if (n > 0) then - @:ALLOCATE(fd_coeff_y(-fd_number:fd_number, 0:n)) + @:ALLOCATE(fd_coeff_y_hyper(-fd_number:fd_number, 0:n)) end if if (p > 0) then - @:ALLOCATE(fd_coeff_z(-fd_number:fd_number, 0:p)) + @:ALLOCATE(fd_coeff_z_hyper(-fd_number:fd_number, 0:p)) end if ! Computing centered finite difference coefficients - call s_compute_finite_difference_coefficients(m, x_cc, fd_coeff_x, buff_size, & + call s_compute_finite_difference_coefficients(m, x_cc, fd_coeff_x_hyper, buff_size, & fd_number, fd_order) - $:GPU_UPDATE(device='[fd_coeff_x]') + $:GPU_UPDATE(device='[fd_coeff_x_hyper]') if (n > 0) then - call s_compute_finite_difference_coefficients(n, y_cc, fd_coeff_y, buff_size, & + call s_compute_finite_difference_coefficients(n, y_cc, fd_coeff_y_hyper, buff_size, & fd_number, fd_order) - $:GPU_UPDATE(device='[fd_coeff_y]') + $:GPU_UPDATE(device='[fd_coeff_y_hyper]') end if if (p > 0) then - call s_compute_finite_difference_coefficients(p, z_cc, fd_coeff_z, buff_size, & + call s_compute_finite_difference_coefficients(p, z_cc, fd_coeff_z_hyper, buff_size, & fd_number, fd_order) - $:GPU_UPDATE(device='[fd_coeff_z]') + $:GPU_UPDATE(device='[fd_coeff_z_hyper]') end if end subroutine s_initialize_hyperelastic_module @@ -106,108 +106,108 @@ contains real(wp) :: G_local integer :: j, k, l, i, r - $:GPU_PARALLEL_LOOP(collapse=3, private='[alpha_K, alpha_rho_K, rho, & - & gamma, pi_inf, qv, G_local, Re, tensora, tensorb]') - do l = 0, p - do k = 0, n - do j = 0, m - $:GPU_LOOP(parallelism='[seq]') - do i = 1, num_fluids - alpha_rho_k(i) = q_cons_vf(i)%sf(j, k, l) - alpha_k(i) = q_cons_vf(advxb + i - 1)%sf(j, k, l) - end do - ! If in simulation, use acc mixture subroutines - call s_convert_species_to_mixture_variables_acc(rho, gamma, pi_inf, qv, alpha_k, & - alpha_rho_k, Re, G_local, Gs) - rho = max(rho, sgm_eps) - G_local = max(G_local, sgm_eps) - !if ( G_local <= verysmall ) G_K = 0._wp - - if (G_local > verysmall) then - $:GPU_LOOP(parallelism='[seq]') - do i = 1, tensor_size - tensora(i) = 0._wp - end do - ! STEP 1: computing the grad_xi tensor using finite differences - ! grad_xi definition / organization - ! number for the tensor 1-3: dxix_dx, dxiy_dx, dxiz_dx - ! 4-6 : dxix_dy, dxiy_dy, dxiz_dy - ! 7-9 : dxix_dz, dxiy_dz, dxiz_dz + #:call GPU_PARALLEL_LOOP(collapse=3, private='[alpha_K, alpha_rho_K, rho, gamma, pi_inf, qv, G_local, Re, tensora, tensorb]') + do l = 0, p + do k = 0, n + do j = 0, m $:GPU_LOOP(parallelism='[seq]') - do r = -fd_number, fd_number - ! derivatives in the x-direction - tensora(1) = tensora(1) + q_prim_vf(xibeg)%sf(j + r, k, l)*fd_coeff_x(r, j) - tensora(2) = tensora(2) + q_prim_vf(xibeg + 1)%sf(j + r, k, l)*fd_coeff_x(r, j) - tensora(3) = tensora(3) + q_prim_vf(xiend)%sf(j + r, k, l)*fd_coeff_x(r, j) - ! derivatives in the y-direction - tensora(4) = tensora(4) + q_prim_vf(xibeg)%sf(j, k + r, l)*fd_coeff_y(r, k) - tensora(5) = tensora(5) + q_prim_vf(xibeg + 1)%sf(j, k + r, l)*fd_coeff_y(r, k) - tensora(6) = tensora(6) + q_prim_vf(xiend)%sf(j, k + r, l)*fd_coeff_y(r, k) - ! derivatives in the z-direction - tensora(7) = tensora(7) + q_prim_vf(xibeg)%sf(j, k, l + r)*fd_coeff_z(r, l) - tensora(8) = tensora(8) + q_prim_vf(xibeg + 1)%sf(j, k, l + r)*fd_coeff_z(r, l) - tensora(9) = tensora(9) + q_prim_vf(xiend)%sf(j, k, l + r)*fd_coeff_z(r, l) + do i = 1, num_fluids + alpha_rho_k(i) = q_cons_vf(i)%sf(j, k, l) + alpha_k(i) = q_cons_vf(advxb + i - 1)%sf(j, k, l) end do - ! STEP 2a: computing the adjoint of the grad_xi tensor for the inverse - tensorb(1) = tensora(5)*tensora(9) - tensora(6)*tensora(8) - tensorb(2) = -(tensora(2)*tensora(9) - tensora(3)*tensora(8)) - tensorb(3) = tensora(2)*tensora(6) - tensora(3)*tensora(5) - tensorb(4) = -(tensora(4)*tensora(9) - tensora(6)*tensora(7)) - tensorb(5) = tensora(1)*tensora(9) - tensora(3)*tensora(7) - tensorb(6) = -(tensora(1)*tensora(6) - tensora(4)*tensora(3)) - tensorb(7) = tensora(4)*tensora(8) - tensora(5)*tensora(7) - tensorb(8) = -(tensora(1)*tensora(8) - tensora(2)*tensora(7)) - tensorb(9) = tensora(1)*tensora(5) - tensora(2)*tensora(4) - - ! STEP 2b: computing the determinant of the grad_xi tensor - tensorb(tensor_size) = tensora(1)*(tensora(5)*tensora(9) - tensora(6)*tensora(8)) & - - tensora(2)*(tensora(4)*tensora(9) - tensora(6)*tensora(7)) & - + tensora(3)*(tensora(4)*tensora(8) - tensora(5)*tensora(7)) - - if (tensorb(tensor_size) > verysmall) then - ! STEP 2c: computing the inverse of grad_xi tensor = F - ! tensorb is the adjoint, tensora becomes F + ! If in simulation, use acc mixture subroutines + call s_convert_species_to_mixture_variables_acc(rho, gamma, pi_inf, qv, alpha_k, & + alpha_rho_k, Re, G_local, Gs_hyper) + rho = max(rho, sgm_eps) + G_local = max(G_local, sgm_eps) + !if ( G_local <= verysmall ) G_K = 0._wp + + if (G_local > verysmall) then $:GPU_LOOP(parallelism='[seq]') - do i = 1, tensor_size - 1 - tensora(i) = tensorb(i)/tensorb(tensor_size) + do i = 1, tensor_size + tensora(i) = 0._wp end do - - ! STEP 2d: computing the J = det(F) = 1/det(\grad{\xi}) - tensorb(tensor_size) = 1._wp/tensorb(tensor_size) - - ! STEP 3: computing F transpose F - tensorb(1) = tensora(1)**2 + tensora(2)**2 + tensora(3)**2 - tensorb(5) = tensora(4)**2 + tensora(5)**2 + tensora(6)**2 - tensorb(9) = tensora(7)**2 + tensora(8)**2 + tensora(9)**2 - tensorb(2) = tensora(1)*tensora(4) + tensora(2)*tensora(5) + tensora(3)*tensora(6) - tensorb(3) = tensora(1)*tensora(7) + tensora(2)*tensora(8) + tensora(3)*tensora(9) - tensorb(6) = tensora(4)*tensora(7) + tensora(5)*tensora(8) + tensora(6)*tensora(9) - ! STEP 4: update the btensor, this is consistent with Riemann solvers - #:for BIJ, TXY in [(1,1),(2,2),(3,5),(4,3),(5,6),(6,9)] - btensor%vf(${BIJ}$)%sf(j, k, l) = tensorb(${TXY}$) - #:endfor - ! store the determinant at the last entry of the btensor - btensor%vf(b_size)%sf(j, k, l) = tensorb(tensor_size) - ! STEP 5a: updating the Cauchy stress primitive scalar field - if (hyper_model == 1) then - call s_neoHookean_cauchy_solver(btensor%vf, q_prim_vf, G_local, j, k, l) - elseif (hyper_model == 2) then - call s_Mooney_Rivlin_cauchy_solver(btensor%vf, q_prim_vf, G_local, j, k, l) - end if - ! STEP 5b: updating the pressure field - q_prim_vf(E_idx)%sf(j, k, l) = q_prim_vf(E_idx)%sf(j, k, l) - & - G_local*q_prim_vf(xiend + 1)%sf(j, k, l)/gamma - ! STEP 5c: updating the Cauchy stress conservative scalar field + ! STEP 1: computing the grad_xi tensor using finite differences + ! grad_xi definition / organization + ! number for the tensor 1-3: dxix_dx, dxiy_dx, dxiz_dx + ! 4-6 : dxix_dy, dxiy_dy, dxiz_dy + ! 7-9 : dxix_dz, dxiy_dz, dxiz_dz $:GPU_LOOP(parallelism='[seq]') - do i = 1, b_size - 1 - q_cons_vf(strxb + i - 1)%sf(j, k, l) = & - rho*q_prim_vf(strxb + i - 1)%sf(j, k, l) + do r = -fd_number, fd_number + ! derivatives in the x-direction + tensora(1) = tensora(1) + q_prim_vf(xibeg)%sf(j + r, k, l)*fd_coeff_x_hyper(r, j) + tensora(2) = tensora(2) + q_prim_vf(xibeg + 1)%sf(j + r, k, l)*fd_coeff_x_hyper(r, j) + tensora(3) = tensora(3) + q_prim_vf(xiend)%sf(j + r, k, l)*fd_coeff_x_hyper(r, j) + ! derivatives in the y-direction + tensora(4) = tensora(4) + q_prim_vf(xibeg)%sf(j, k + r, l)*fd_coeff_y_hyper(r, k) + tensora(5) = tensora(5) + q_prim_vf(xibeg + 1)%sf(j, k + r, l)*fd_coeff_y_hyper(r, k) + tensora(6) = tensora(6) + q_prim_vf(xiend)%sf(j, k + r, l)*fd_coeff_y_hyper(r, k) + ! derivatives in the z-direction + tensora(7) = tensora(7) + q_prim_vf(xibeg)%sf(j, k, l + r)*fd_coeff_z_hyper(r, l) + tensora(8) = tensora(8) + q_prim_vf(xibeg + 1)%sf(j, k, l + r)*fd_coeff_z_hyper(r, l) + tensora(9) = tensora(9) + q_prim_vf(xiend)%sf(j, k, l + r)*fd_coeff_z_hyper(r, l) end do + ! STEP 2a: computing the adjoint of the grad_xi tensor for the inverse + tensorb(1) = tensora(5)*tensora(9) - tensora(6)*tensora(8) + tensorb(2) = -(tensora(2)*tensora(9) - tensora(3)*tensora(8)) + tensorb(3) = tensora(2)*tensora(6) - tensora(3)*tensora(5) + tensorb(4) = -(tensora(4)*tensora(9) - tensora(6)*tensora(7)) + tensorb(5) = tensora(1)*tensora(9) - tensora(3)*tensora(7) + tensorb(6) = -(tensora(1)*tensora(6) - tensora(4)*tensora(3)) + tensorb(7) = tensora(4)*tensora(8) - tensora(5)*tensora(7) + tensorb(8) = -(tensora(1)*tensora(8) - tensora(2)*tensora(7)) + tensorb(9) = tensora(1)*tensora(5) - tensora(2)*tensora(4) + + ! STEP 2b: computing the determinant of the grad_xi tensor + tensorb(tensor_size) = tensora(1)*(tensora(5)*tensora(9) - tensora(6)*tensora(8)) & + - tensora(2)*(tensora(4)*tensora(9) - tensora(6)*tensora(7)) & + + tensora(3)*(tensora(4)*tensora(8) - tensora(5)*tensora(7)) + + if (tensorb(tensor_size) > verysmall) then + ! STEP 2c: computing the inverse of grad_xi tensor = F + ! tensorb is the adjoint, tensora becomes F + $:GPU_LOOP(parallelism='[seq]') + do i = 1, tensor_size - 1 + tensora(i) = tensorb(i)/tensorb(tensor_size) + end do + + ! STEP 2d: computing the J = det(F) = 1/det(\grad{\xi}) + tensorb(tensor_size) = 1._wp/tensorb(tensor_size) + + ! STEP 3: computing F transpose F + tensorb(1) = tensora(1)**2 + tensora(2)**2 + tensora(3)**2 + tensorb(5) = tensora(4)**2 + tensora(5)**2 + tensora(6)**2 + tensorb(9) = tensora(7)**2 + tensora(8)**2 + tensora(9)**2 + tensorb(2) = tensora(1)*tensora(4) + tensora(2)*tensora(5) + tensora(3)*tensora(6) + tensorb(3) = tensora(1)*tensora(7) + tensora(2)*tensora(8) + tensora(3)*tensora(9) + tensorb(6) = tensora(4)*tensora(7) + tensora(5)*tensora(8) + tensora(6)*tensora(9) + ! STEP 4: update the btensor, this is consistent with Riemann solvers + #:for BIJ, TXY in [(1,1),(2,2),(3,5),(4,3),(5,6),(6,9)] + btensor%vf(${BIJ}$)%sf(j, k, l) = tensorb(${TXY}$) + #:endfor + ! store the determinant at the last entry of the btensor + btensor%vf(b_size)%sf(j, k, l) = tensorb(tensor_size) + ! STEP 5a: updating the Cauchy stress primitive scalar field + if (hyper_model == 1) then + call s_neoHookean_cauchy_solver(btensor%vf, q_prim_vf, G_local, j, k, l) + elseif (hyper_model == 2) then + call s_Mooney_Rivlin_cauchy_solver(btensor%vf, q_prim_vf, G_local, j, k, l) + end if + ! STEP 5b: updating the pressure field + q_prim_vf(E_idx)%sf(j, k, l) = q_prim_vf(E_idx)%sf(j, k, l) - & + G_local*q_prim_vf(xiend + 1)%sf(j, k, l)/gamma + ! STEP 5c: updating the Cauchy stress conservative scalar field + $:GPU_LOOP(parallelism='[seq]') + do i = 1, b_size - 1 + q_cons_vf(strxb + i - 1)%sf(j, k, l) = & + rho*q_prim_vf(strxb + i - 1)%sf(j, k, l) + end do + end if end if - end if + end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP end subroutine s_hyperelastic_rmt_stress_update !> The following subroutine handles the calculation of the btensor. @@ -218,7 +218,7 @@ contains !! calculate the inverse of grad_xi to obtain F, F is a nxn tensor !! calculate the FFtranspose to obtain the btensor, btensor is nxn tensor !! btensor is symmetric, save the data space - pure subroutine s_neoHookean_cauchy_solver(btensor_in, q_prim_vf, G_param, j, k, l) + subroutine s_neoHookean_cauchy_solver(btensor_in, q_prim_vf, G_param, j, k, l) $:GPU_ROUTINE(parallelism='[seq]') type(scalar_field), dimension(sys_size), intent(inout) :: q_prim_vf type(scalar_field), dimension(b_size), intent(inout) :: btensor_in @@ -257,7 +257,7 @@ contains !! calculate the inverse of grad_xi to obtain F, F is a nxn tensor !! calculate the FFtranspose to obtain the btensor, btensor is nxn tensor !! btensor is symmetric, save the data space - pure subroutine s_Mooney_Rivlin_cauchy_solver(btensor_in, q_prim_vf, G_param, j, k, l) + subroutine s_Mooney_Rivlin_cauchy_solver(btensor_in, q_prim_vf, G_param, j, k, l) $:GPU_ROUTINE(parallelism='[seq]') type(scalar_field), dimension(sys_size), intent(inout) :: q_prim_vf type(scalar_field), dimension(b_size), intent(inout) :: btensor_in @@ -298,11 +298,11 @@ contains do i = 1, b_size @:DEALLOCATE(btensor%vf(i)%sf) end do - @:DEALLOCATE(fd_coeff_x) + @:DEALLOCATE(fd_coeff_x_hyper) if (n > 0) then - @:DEALLOCATE(fd_coeff_y) + @:DEALLOCATE(fd_coeff_y_hyper) if (p > 0) then - @:DEALLOCATE(fd_coeff_z) + @:DEALLOCATE(fd_coeff_z_hyper) end if end if diff --git a/src/simulation/m_hypoelastic.fpp b/src/simulation/m_hypoelastic.fpp index 3f736b0b0b..312c2343b6 100644 --- a/src/simulation/m_hypoelastic.fpp +++ b/src/simulation/m_hypoelastic.fpp @@ -19,21 +19,21 @@ module m_hypoelastic s_compute_hypoelastic_rhs, & s_compute_damage_state - real(wp), allocatable, dimension(:) :: Gs - $:GPU_DECLARE(create='[Gs]') + real(wp), allocatable, dimension(:) :: Gs_hypo + $:GPU_DECLARE(create='[Gs_hypo]') - real(wp), allocatable, dimension(:, :, :) :: du_dx, du_dy, du_dz - real(wp), allocatable, dimension(:, :, :) :: dv_dx, dv_dy, dv_dz - real(wp), allocatable, dimension(:, :, :) :: dw_dx, dw_dy, dw_dz - $:GPU_DECLARE(create='[du_dx,du_dy,du_dz,dv_dx,dv_dy,dv_dz,dw_dx,dw_dy,dw_dz]') + real(wp), allocatable, dimension(:, :, :) :: du_dx_hypo, du_dy_hypo, du_dz_hypo + real(wp), allocatable, dimension(:, :, :) :: dv_dx_hypo, dv_dy_hypo, dv_dz_hypo + real(wp), allocatable, dimension(:, :, :) :: dw_dx_hypo, dw_dy_hypo, dw_dz_hypo + $:GPU_DECLARE(create='[du_dx_hypo,du_dy_hypo,du_dz_hypo,dv_dx_hypo,dv_dy_hypo,dv_dz_hypo,dw_dx_hypo,dw_dy_hypo,dw_dz_hypo]') real(wp), allocatable, dimension(:, :, :) :: rho_K_field, G_K_field $:GPU_DECLARE(create='[rho_K_field,G_K_field]') - real(wp), allocatable, dimension(:, :) :: fd_coeff_x_h - real(wp), allocatable, dimension(:, :) :: fd_coeff_y_h - real(wp), allocatable, dimension(:, :) :: fd_coeff_z_h - $:GPU_DECLARE(create='[fd_coeff_x_h,fd_coeff_y_h,fd_coeff_z_h]') + real(wp), allocatable, dimension(:, :) :: fd_coeff_x_hypo + real(wp), allocatable, dimension(:, :) :: fd_coeff_y_hypo + real(wp), allocatable, dimension(:, :) :: fd_coeff_z_hypo + $:GPU_DECLARE(create='[fd_coeff_x_hypo,fd_coeff_y_hypo,fd_coeff_z_hypo]') contains @@ -41,43 +41,43 @@ contains integer :: i - @:ALLOCATE(Gs(1:num_fluids)) + @:ALLOCATE(Gs_hypo(1:num_fluids)) @:ALLOCATE(rho_K_field(0:m,0:n,0:p), G_K_field(0:m,0:n,0:p)) - @:ALLOCATE(du_dx(0:m,0:n,0:p)) + @:ALLOCATE(du_dx_hypo(0:m,0:n,0:p)) if (n > 0) then - @:ALLOCATE(du_dy(0:m,0:n,0:p), dv_dx(0:m,0:n,0:p), dv_dy(0:m,0:n,0:p)) + @:ALLOCATE(du_dy_hypo(0:m,0:n,0:p), dv_dx_hypo(0:m,0:n,0:p), dv_dy_hypo(0:m,0:n,0:p)) if (p > 0) then - @:ALLOCATE(du_dz(0:m,0:n,0:p), dv_dz(0:m,0:n,0:p)) - @:ALLOCATE(dw_dx(0:m,0:n,0:p), dw_dy(0:m,0:n,0:p), dw_dz(0:m,0:n,0:p)) + @:ALLOCATE(du_dz_hypo(0:m,0:n,0:p), dv_dz_hypo(0:m,0:n,0:p)) + @:ALLOCATE(dw_dx_hypo(0:m,0:n,0:p), dw_dy_hypo(0:m,0:n,0:p), dw_dz_hypo(0:m,0:n,0:p)) end if end if do i = 1, num_fluids - Gs(i) = fluid_pp(i)%G + Gs_hypo(i) = fluid_pp(i)%G end do - $:GPU_UPDATE(device='[Gs]') + $:GPU_UPDATE(device='[Gs_hypo]') - @:ALLOCATE(fd_coeff_x_h(-fd_number:fd_number, 0:m)) + @:ALLOCATE(fd_coeff_x_hypo(-fd_number:fd_number, 0:m)) if (n > 0) then - @:ALLOCATE(fd_coeff_y_h(-fd_number:fd_number, 0:n)) + @:ALLOCATE(fd_coeff_y_hypo(-fd_number:fd_number, 0:n)) end if if (p > 0) then - @:ALLOCATE(fd_coeff_z_h(-fd_number:fd_number, 0:p)) + @:ALLOCATE(fd_coeff_z_hypo(-fd_number:fd_number, 0:p)) end if ! Computing centered finite difference coefficients - call s_compute_finite_difference_coefficients(m, x_cc, fd_coeff_x_h, buff_size, & + call s_compute_finite_difference_coefficients(m, x_cc, fd_coeff_x_hypo, buff_size, & fd_number, fd_order) - $:GPU_UPDATE(device='[fd_coeff_x_h]') + $:GPU_UPDATE(device='[fd_coeff_x_hypo]') if (n > 0) then - call s_compute_finite_difference_coefficients(n, y_cc, fd_coeff_y_h, buff_size, & + call s_compute_finite_difference_coefficients(n, y_cc, fd_coeff_y_hypo, buff_size, & fd_number, fd_order) - $:GPU_UPDATE(device='[fd_coeff_y_h]') + $:GPU_UPDATE(device='[fd_coeff_y_hypo]') end if if (p > 0) then - call s_compute_finite_difference_coefficients(p, z_cc, fd_coeff_z_h, buff_size, & + call s_compute_finite_difference_coefficients(p, z_cc, fd_coeff_z_hypo, buff_size, & fd_number, fd_order) - $:GPU_UPDATE(device='[fd_coeff_z_h]') + $:GPU_UPDATE(device='[fd_coeff_z_hypo]') end if end subroutine s_initialize_hypoelastic_module @@ -104,261 +104,272 @@ contains ! calculate velocity gradients + rho_K and G_K ! TODO: re-organize these loops one by one for GPU efficiency if possible? - $:GPU_PARALLEL_LOOP(collapse=3) - do q = 0, p - do l = 0, n - do k = 0, m - du_dx(k, l, q) = 0._wp - end do - end do - end do - - $:GPU_PARALLEL_LOOP(collapse=3) - do q = 0, p - do l = 0, n - do k = 0, m - $:GPU_LOOP(parallelism='[seq]') - do r = -fd_number, fd_number - du_dx(k, l, q) = du_dx(k, l, q) & - + q_prim_vf(momxb)%sf(k + r, l, q)*fd_coeff_x_h(r, k) - end do - - end do - end do - end do - - if (ndirs > 1) then - $:GPU_PARALLEL_LOOP(collapse=3) + #:call GPU_PARALLEL_LOOP(collapse=3) do q = 0, p do l = 0, n do k = 0, m - du_dy(k, l, q) = 0._wp; dv_dx(k, l, q) = 0._wp; dv_dy(k, l, q) = 0._wp + du_dx_hypo(k, l, q) = 0._wp end do end do end do + #:endcall GPU_PARALLEL_LOOP - $:GPU_PARALLEL_LOOP(collapse=3) + #:call GPU_PARALLEL_LOOP(collapse=3) do q = 0, p do l = 0, n do k = 0, m $:GPU_LOOP(parallelism='[seq]') do r = -fd_number, fd_number - du_dy(k, l, q) = du_dy(k, l, q) & - + q_prim_vf(momxb)%sf(k, l + r, q)*fd_coeff_y_h(r, l) - dv_dx(k, l, q) = dv_dx(k, l, q) & - + q_prim_vf(momxb + 1)%sf(k + r, l, q)*fd_coeff_x_h(r, k) - dv_dy(k, l, q) = dv_dy(k, l, q) & - + q_prim_vf(momxb + 1)%sf(k, l + r, q)*fd_coeff_y_h(r, l) + du_dx_hypo(k, l, q) = du_dx_hypo(k, l, q) & + + q_prim_vf(momxb)%sf(k + r, l, q)*fd_coeff_x_hypo(r, k) end do + end do end do end do + #:endcall GPU_PARALLEL_LOOP - ! 3D - if (ndirs == 3) then - - $:GPU_PARALLEL_LOOP(collapse=3) + if (ndirs > 1) then + #:call GPU_PARALLEL_LOOP(collapse=3) do q = 0, p do l = 0, n do k = 0, m - du_dz(k, l, q) = 0._wp; dv_dz(k, l, q) = 0._wp; dw_dx(k, l, q) = 0._wp; - dw_dy(k, l, q) = 0._wp; dw_dz(k, l, q) = 0._wp; + du_dy_hypo(k, l, q) = 0._wp; dv_dx_hypo(k, l, q) = 0._wp; dv_dy_hypo(k, l, q) = 0._wp end do end do end do + #:endcall GPU_PARALLEL_LOOP - $:GPU_PARALLEL_LOOP(collapse=3) + #:call GPU_PARALLEL_LOOP(collapse=3) do q = 0, p do l = 0, n do k = 0, m $:GPU_LOOP(parallelism='[seq]') do r = -fd_number, fd_number - du_dz(k, l, q) = du_dz(k, l, q) & - + q_prim_vf(momxb)%sf(k, l, q + r)*fd_coeff_z_h(r, q) - dv_dz(k, l, q) = dv_dz(k, l, q) & - + q_prim_vf(momxb + 1)%sf(k, l, q + r)*fd_coeff_z_h(r, q) - dw_dx(k, l, q) = dw_dx(k, l, q) & - + q_prim_vf(momxe)%sf(k + r, l, q)*fd_coeff_x_h(r, k) - dw_dy(k, l, q) = dw_dy(k, l, q) & - + q_prim_vf(momxe)%sf(k, l + r, q)*fd_coeff_y_h(r, l) - dw_dz(k, l, q) = dw_dz(k, l, q) & - + q_prim_vf(momxe)%sf(k, l, q + r)*fd_coeff_z_h(r, q) + du_dy_hypo(k, l, q) = du_dy_hypo(k, l, q) & + + q_prim_vf(momxb)%sf(k, l + r, q)*fd_coeff_y_hypo(r, l) + dv_dx_hypo(k, l, q) = dv_dx_hypo(k, l, q) & + + q_prim_vf(momxb + 1)%sf(k + r, l, q)*fd_coeff_x_hypo(r, k) + dv_dy_hypo(k, l, q) = dv_dy_hypo(k, l, q) & + + q_prim_vf(momxb + 1)%sf(k, l + r, q)*fd_coeff_y_hypo(r, l) end do end do end do end do + #:endcall GPU_PARALLEL_LOOP + + ! 3D + if (ndirs == 3) then + + #:call GPU_PARALLEL_LOOP(collapse=3) + do q = 0, p + do l = 0, n + do k = 0, m + du_dz_hypo(k, l, q) = 0._wp; dv_dz_hypo(k, l, q) = 0._wp; dw_dx_hypo(k, l, q) = 0._wp; + dw_dy_hypo(k, l, q) = 0._wp; dw_dz_hypo(k, l, q) = 0._wp; + end do + end do + end do + #:endcall GPU_PARALLEL_LOOP + + #:call GPU_PARALLEL_LOOP(collapse=3) + do q = 0, p + do l = 0, n + do k = 0, m + $:GPU_LOOP(parallelism='[seq]') + do r = -fd_number, fd_number + du_dz_hypo(k, l, q) = du_dz_hypo(k, l, q) & + + q_prim_vf(momxb)%sf(k, l, q + r)*fd_coeff_z_hypo(r, q) + dv_dz_hypo(k, l, q) = dv_dz_hypo(k, l, q) & + + q_prim_vf(momxb + 1)%sf(k, l, q + r)*fd_coeff_z_hypo(r, q) + dw_dx_hypo(k, l, q) = dw_dx_hypo(k, l, q) & + + q_prim_vf(momxe)%sf(k + r, l, q)*fd_coeff_x_hypo(r, k) + dw_dy_hypo(k, l, q) = dw_dy_hypo(k, l, q) & + + q_prim_vf(momxe)%sf(k, l + r, q)*fd_coeff_y_hypo(r, l) + dw_dz_hypo(k, l, q) = dw_dz_hypo(k, l, q) & + + q_prim_vf(momxe)%sf(k, l, q + r)*fd_coeff_z_hypo(r, q) + end do + end do + end do + end do + #:endcall GPU_PARALLEL_LOOP end if end if - $:GPU_PARALLEL_LOOP(collapse=3) - do q = 0, p - do l = 0, n - do k = 0, m - rho_K = 0._wp; G_K = 0._wp - do i = 1, num_fluids - rho_K = rho_K + q_prim_vf(i)%sf(k, l, q) !alpha_rho_K(1) - G_K = G_K + q_prim_vf(advxb - 1 + i)%sf(k, l, q)*Gs(i) !alpha_K(1) * Gs(1) - end do + #:call GPU_PARALLEL_LOOP(collapse=3) + do q = 0, p + do l = 0, n + do k = 0, m + rho_K = 0._wp; G_K = 0._wp + do i = 1, num_fluids + rho_K = rho_K + q_prim_vf(i)%sf(k, l, q) !alpha_rho_K(1) + G_K = G_K + q_prim_vf(advxb - 1 + i)%sf(k, l, q)*Gs_hypo(i) !alpha_K(1) * Gs_hypo(1) + end do - if (cont_damage) G_K = G_K*max((1._wp - q_prim_vf(damage_idx)%sf(k, l, q)), 0._wp) + if (cont_damage) G_K = G_K*max((1._wp - q_prim_vf(damage_idx)%sf(k, l, q)), 0._wp) - rho_K_field(k, l, q) = rho_K - G_K_field(k, l, q) = G_K + rho_K_field(k, l, q) = rho_K + G_K_field(k, l, q) = G_K - !TODO: take this out if not needed - if (G_K < verysmall) then - G_K_field(k, l, q) = 0 - end if + !TODO: take this out if not needed + if (G_K < verysmall) then + G_K_field(k, l, q) = 0 + end if + end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP ! apply rhs source term to elastic stress equation - $:GPU_PARALLEL_LOOP(collapse=3) - do q = 0, p - do l = 0, n - do k = 0, m - rhs_vf(strxb)%sf(k, l, q) = & - rhs_vf(strxb)%sf(k, l, q) + rho_K_field(k, l, q)* & - ((4._wp*G_K_field(k, l, q)/3._wp) + & - q_prim_vf(strxb)%sf(k, l, q))* & - du_dx(k, l, q) + #:call GPU_PARALLEL_LOOP(collapse=3) + do q = 0, p + do l = 0, n + do k = 0, m + rhs_vf(strxb)%sf(k, l, q) = & + rhs_vf(strxb)%sf(k, l, q) + rho_K_field(k, l, q)* & + ((4._wp*G_K_field(k, l, q)/3._wp) + & + q_prim_vf(strxb)%sf(k, l, q))* & + du_dx_hypo(k, l, q) + end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP elseif (idir == 2) then - $:GPU_PARALLEL_LOOP(collapse=3) - do q = 0, p - do l = 0, n - do k = 0, m - rhs_vf(strxb)%sf(k, l, q) = rhs_vf(strxb)%sf(k, l, q) + rho_K_field(k, l, q)* & - (q_prim_vf(strxb + 1)%sf(k, l, q)*du_dy(k, l, q) + & - q_prim_vf(strxb + 1)%sf(k, l, q)*du_dy(k, l, q) - & - q_prim_vf(strxb)%sf(k, l, q)*dv_dy(k, l, q) - & - 2._wp*G_K_field(k, l, q)*(1._wp/3._wp)*dv_dy(k, l, q)) - - rhs_vf(strxb + 1)%sf(k, l, q) = rhs_vf(strxb + 1)%sf(k, l, q) + rho_K_field(k, l, q)* & - (q_prim_vf(strxb + 1)%sf(k, l, q)*du_dx(k, l, q) + & - q_prim_vf(strxb)%sf(k, l, q)*dv_dx(k, l, q) - & - q_prim_vf(strxb + 1)%sf(k, l, q)*du_dx(k, l, q) + & - q_prim_vf(strxb + 2)%sf(k, l, q)*du_dy(k, l, q) + & - q_prim_vf(strxb + 1)%sf(k, l, q)*dv_dy(k, l, q) - & - q_prim_vf(strxb + 1)%sf(k, l, q)*dv_dy(k, l, q) + & - 2._wp*G_K_field(k, l, q)*(1._wp/2._wp)*(du_dy(k, l, q) + & - dv_dx(k, l, q))) - - rhs_vf(strxb + 2)%sf(k, l, q) = rhs_vf(strxb + 2)%sf(k, l, q) + rho_K_field(k, l, q)* & - (q_prim_vf(strxb + 1)%sf(k, l, q)*dv_dx(k, l, q) + & - q_prim_vf(strxb + 1)%sf(k, l, q)*dv_dx(k, l, q) - & - q_prim_vf(strxb + 2)%sf(k, l, q)*du_dx(k, l, q) + & - q_prim_vf(strxb + 2)%sf(k, l, q)*dv_dy(k, l, q) + & - q_prim_vf(strxb + 2)%sf(k, l, q)*dv_dy(k, l, q) - & - q_prim_vf(strxb + 2)%sf(k, l, q)*dv_dy(k, l, q) + & - 2._wp*G_K_field(k, l, q)*(dv_dy(k, l, q) - (1._wp/3._wp)* & - (du_dx(k, l, q) + & - dv_dy(k, l, q)))) + #:call GPU_PARALLEL_LOOP(collapse=3) + do q = 0, p + do l = 0, n + do k = 0, m + rhs_vf(strxb)%sf(k, l, q) = rhs_vf(strxb)%sf(k, l, q) + rho_K_field(k, l, q)* & + (q_prim_vf(strxb + 1)%sf(k, l, q)*du_dy_hypo(k, l, q) + & + q_prim_vf(strxb + 1)%sf(k, l, q)*du_dy_hypo(k, l, q) - & + q_prim_vf(strxb)%sf(k, l, q)*dv_dy_hypo(k, l, q) - & + 2._wp*G_K_field(k, l, q)*(1._wp/3._wp)*dv_dy_hypo(k, l, q)) + + rhs_vf(strxb + 1)%sf(k, l, q) = rhs_vf(strxb + 1)%sf(k, l, q) + rho_K_field(k, l, q)* & + (q_prim_vf(strxb + 1)%sf(k, l, q)*du_dx_hypo(k, l, q) + & + q_prim_vf(strxb)%sf(k, l, q)*dv_dx_hypo(k, l, q) - & + q_prim_vf(strxb + 1)%sf(k, l, q)*du_dx_hypo(k, l, q) + & + q_prim_vf(strxb + 2)%sf(k, l, q)*du_dy_hypo(k, l, q) + & + q_prim_vf(strxb + 1)%sf(k, l, q)*dv_dy_hypo(k, l, q) - & + q_prim_vf(strxb + 1)%sf(k, l, q)*dv_dy_hypo(k, l, q) + & + 2._wp*G_K_field(k, l, q)*(1._wp/2._wp)*(du_dy_hypo(k, l, q) + & + dv_dx_hypo(k, l, q))) + + rhs_vf(strxb + 2)%sf(k, l, q) = rhs_vf(strxb + 2)%sf(k, l, q) + rho_K_field(k, l, q)* & + (q_prim_vf(strxb + 1)%sf(k, l, q)*dv_dx_hypo(k, l, q) + & + q_prim_vf(strxb + 1)%sf(k, l, q)*dv_dx_hypo(k, l, q) - & + q_prim_vf(strxb + 2)%sf(k, l, q)*du_dx_hypo(k, l, q) + & + q_prim_vf(strxb + 2)%sf(k, l, q)*dv_dy_hypo(k, l, q) + & + q_prim_vf(strxb + 2)%sf(k, l, q)*dv_dy_hypo(k, l, q) - & + q_prim_vf(strxb + 2)%sf(k, l, q)*dv_dy_hypo(k, l, q) + & + 2._wp*G_K_field(k, l, q)*(dv_dy_hypo(k, l, q) - (1._wp/3._wp)* & + (du_dx_hypo(k, l, q) + & + dv_dy_hypo(k, l, q)))) + end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP elseif (idir == 3) then - $:GPU_PARALLEL_LOOP(collapse=3) - do q = 0, p - do l = 0, n - do k = 0, m - rhs_vf(strxb)%sf(k, l, q) = rhs_vf(strxb)%sf(k, l, q) + rho_K_field(k, l, q)* & - (q_prim_vf(strxb + 3)%sf(k, l, q)*du_dz(k, l, q) + & - q_prim_vf(strxb + 3)%sf(k, l, q)*du_dz(k, l, q) - & - q_prim_vf(strxb)%sf(k, l, q)*dw_dz(k, l, q) - & - 2._wp*G_K_field(k, l, q)*(1._wp/3._wp)*dw_dz(k, l, q)) - - rhs_vf(strxb + 1)%sf(k, l, q) = rhs_vf(strxb + 1)%sf(k, l, q) + rho_K_field(k, l, q)* & - (q_prim_vf(strxb + 4)%sf(k, l, q)*du_dz(k, l, q) + & - q_prim_vf(strxb + 3)%sf(k, l, q)*dv_dz(k, l, q) - & - q_prim_vf(strxb + 1)%sf(k, l, q)*dw_dz(k, l, q)) - - rhs_vf(strxb + 2)%sf(k, l, q) = rhs_vf(strxb + 2)%sf(k, l, q) + rho_K_field(k, l, q)* & - (q_prim_vf(strxb + 4)%sf(k, l, q)*dv_dz(k, l, q) + & - q_prim_vf(strxb + 4)%sf(k, l, q)*dv_dz(k, l, q) - & - q_prim_vf(strxb + 2)%sf(k, l, q)*dw_dz(k, l, q) - & - 2._wp*G_K_field(k, l, q)*(1._wp/3._wp)*dw_dz(k, l, q)) - - rhs_vf(strxb + 3)%sf(k, l, q) = rhs_vf(strxb + 3)%sf(k, l, q) + rho_K_field(k, l, q)* & - (q_prim_vf(strxb + 3)%sf(k, l, q)*du_dx(k, l, q) + & - q_prim_vf(strxb)%sf(k, l, q)*dw_dx(k, l, q) - & - q_prim_vf(strxb + 3)%sf(k, l, q)*du_dx(k, l, q) + & - q_prim_vf(strxb + 4)%sf(k, l, q)*du_dy(k, l, q) + & - q_prim_vf(strxb + 1)%sf(k, l, q)*dw_dy(k, l, q) - & - q_prim_vf(strxb + 3)%sf(k, l, q)*dv_dy(k, l, q) + & - q_prim_vf(strxb + 5)%sf(k, l, q)*du_dz(k, l, q) + & - q_prim_vf(strxb + 3)%sf(k, l, q)*dw_dz(k, l, q) - & - q_prim_vf(strxb + 3)%sf(k, l, q)*dw_dz(k, l, q) + & - 2._wp*G_K_field(k, l, q)*(1._wp/2._wp)*(du_dz(k, l, q) + & - dw_dx(k, l, q))) - - rhs_vf(strxb + 4)%sf(k, l, q) = rhs_vf(strxb + 4)%sf(k, l, q) + rho_K_field(k, l, q)* & - (q_prim_vf(strxb + 3)%sf(k, l, q)*dv_dx(k, l, q) + & - q_prim_vf(strxb + 1)%sf(k, l, q)*dw_dx(k, l, q) - & - q_prim_vf(strxb + 4)%sf(k, l, q)*du_dx(k, l, q) + & - q_prim_vf(strxb + 4)%sf(k, l, q)*dv_dy(k, l, q) + & - q_prim_vf(strxb + 2)%sf(k, l, q)*dw_dy(k, l, q) - & - q_prim_vf(strxb + 4)%sf(k, l, q)*dv_dy(k, l, q) + & - q_prim_vf(strxb + 5)%sf(k, l, q)*dv_dz(k, l, q) + & - q_prim_vf(strxb + 4)%sf(k, l, q)*dw_dz(k, l, q) - & - q_prim_vf(strxb + 4)%sf(k, l, q)*dw_dz(k, l, q) + & - 2._wp*G_K_field(k, l, q)*(1._wp/2._wp)*(dv_dz(k, l, q) + & - dw_dy(k, l, q))) - - rhs_vf(strxe)%sf(k, l, q) = rhs_vf(strxe)%sf(k, l, q) + rho_K_field(k, l, q)* & - (q_prim_vf(strxe - 2)%sf(k, l, q)*dw_dx(k, l, q) + & - q_prim_vf(strxe - 2)%sf(k, l, q)*dw_dx(k, l, q) - & - q_prim_vf(strxe)%sf(k, l, q)*du_dx(k, l, q) + & - q_prim_vf(strxe - 1)%sf(k, l, q)*dw_dy(k, l, q) + & - q_prim_vf(strxe - 1)%sf(k, l, q)*dw_dy(k, l, q) - & - q_prim_vf(strxe)%sf(k, l, q)*dv_dy(k, l, q) + & - q_prim_vf(strxe)%sf(k, l, q)*dw_dz(k, l, q) + & - q_prim_vf(strxe)%sf(k, l, q)*dw_dz(k, l, q) - & - q_prim_vf(strxe)%sf(k, l, q)*dw_dz(k, l, q) + & - 2._wp*G_K_field(k, l, q)*(dw_dz(k, l, q) - (1._wp/3._wp)* & - (du_dx(k, l, q) + & - dv_dy(k, l, q) + & - dw_dz(k, l, q)))) + #:call GPU_PARALLEL_LOOP(collapse=3) + do q = 0, p + do l = 0, n + do k = 0, m + rhs_vf(strxb)%sf(k, l, q) = rhs_vf(strxb)%sf(k, l, q) + rho_K_field(k, l, q)* & + (q_prim_vf(strxb + 3)%sf(k, l, q)*du_dz_hypo(k, l, q) + & + q_prim_vf(strxb + 3)%sf(k, l, q)*du_dz_hypo(k, l, q) - & + q_prim_vf(strxb)%sf(k, l, q)*dw_dz_hypo(k, l, q) - & + 2._wp*G_K_field(k, l, q)*(1._wp/3._wp)*dw_dz_hypo(k, l, q)) + + rhs_vf(strxb + 1)%sf(k, l, q) = rhs_vf(strxb + 1)%sf(k, l, q) + rho_K_field(k, l, q)* & + (q_prim_vf(strxb + 4)%sf(k, l, q)*du_dz_hypo(k, l, q) + & + q_prim_vf(strxb + 3)%sf(k, l, q)*dv_dz_hypo(k, l, q) - & + q_prim_vf(strxb + 1)%sf(k, l, q)*dw_dz_hypo(k, l, q)) + + rhs_vf(strxb + 2)%sf(k, l, q) = rhs_vf(strxb + 2)%sf(k, l, q) + rho_K_field(k, l, q)* & + (q_prim_vf(strxb + 4)%sf(k, l, q)*dv_dz_hypo(k, l, q) + & + q_prim_vf(strxb + 4)%sf(k, l, q)*dv_dz_hypo(k, l, q) - & + q_prim_vf(strxb + 2)%sf(k, l, q)*dw_dz_hypo(k, l, q) - & + 2._wp*G_K_field(k, l, q)*(1._wp/3._wp)*dw_dz_hypo(k, l, q)) + + rhs_vf(strxb + 3)%sf(k, l, q) = rhs_vf(strxb + 3)%sf(k, l, q) + rho_K_field(k, l, q)* & + (q_prim_vf(strxb + 3)%sf(k, l, q)*du_dx_hypo(k, l, q) + & + q_prim_vf(strxb)%sf(k, l, q)*dw_dx_hypo(k, l, q) - & + q_prim_vf(strxb + 3)%sf(k, l, q)*du_dx_hypo(k, l, q) + & + q_prim_vf(strxb + 4)%sf(k, l, q)*du_dy_hypo(k, l, q) + & + q_prim_vf(strxb + 1)%sf(k, l, q)*dw_dy_hypo(k, l, q) - & + q_prim_vf(strxb + 3)%sf(k, l, q)*dv_dy_hypo(k, l, q) + & + q_prim_vf(strxb + 5)%sf(k, l, q)*du_dz_hypo(k, l, q) + & + q_prim_vf(strxb + 3)%sf(k, l, q)*dw_dz_hypo(k, l, q) - & + q_prim_vf(strxb + 3)%sf(k, l, q)*dw_dz_hypo(k, l, q) + & + 2._wp*G_K_field(k, l, q)*(1._wp/2._wp)*(du_dz_hypo(k, l, q) + & + dw_dx_hypo(k, l, q))) + + rhs_vf(strxb + 4)%sf(k, l, q) = rhs_vf(strxb + 4)%sf(k, l, q) + rho_K_field(k, l, q)* & + (q_prim_vf(strxb + 3)%sf(k, l, q)*dv_dx_hypo(k, l, q) + & + q_prim_vf(strxb + 1)%sf(k, l, q)*dw_dx_hypo(k, l, q) - & + q_prim_vf(strxb + 4)%sf(k, l, q)*du_dx_hypo(k, l, q) + & + q_prim_vf(strxb + 4)%sf(k, l, q)*dv_dy_hypo(k, l, q) + & + q_prim_vf(strxb + 2)%sf(k, l, q)*dw_dy_hypo(k, l, q) - & + q_prim_vf(strxb + 4)%sf(k, l, q)*dv_dy_hypo(k, l, q) + & + q_prim_vf(strxb + 5)%sf(k, l, q)*dv_dz_hypo(k, l, q) + & + q_prim_vf(strxb + 4)%sf(k, l, q)*dw_dz_hypo(k, l, q) - & + q_prim_vf(strxb + 4)%sf(k, l, q)*dw_dz_hypo(k, l, q) + & + 2._wp*G_K_field(k, l, q)*(1._wp/2._wp)*(dv_dz_hypo(k, l, q) + & + dw_dy_hypo(k, l, q))) + + rhs_vf(strxe)%sf(k, l, q) = rhs_vf(strxe)%sf(k, l, q) + rho_K_field(k, l, q)* & + (q_prim_vf(strxe - 2)%sf(k, l, q)*dw_dx_hypo(k, l, q) + & + q_prim_vf(strxe - 2)%sf(k, l, q)*dw_dx_hypo(k, l, q) - & + q_prim_vf(strxe)%sf(k, l, q)*du_dx_hypo(k, l, q) + & + q_prim_vf(strxe - 1)%sf(k, l, q)*dw_dy_hypo(k, l, q) + & + q_prim_vf(strxe - 1)%sf(k, l, q)*dw_dy_hypo(k, l, q) - & + q_prim_vf(strxe)%sf(k, l, q)*dv_dy_hypo(k, l, q) + & + q_prim_vf(strxe)%sf(k, l, q)*dw_dz_hypo(k, l, q) + & + q_prim_vf(strxe)%sf(k, l, q)*dw_dz_hypo(k, l, q) - & + q_prim_vf(strxe)%sf(k, l, q)*dw_dz_hypo(k, l, q) + & + 2._wp*G_K_field(k, l, q)*(dw_dz_hypo(k, l, q) - (1._wp/3._wp)* & + (du_dx_hypo(k, l, q) + & + dv_dy_hypo(k, l, q) + & + dw_dz_hypo(k, l, q)))) + end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP end if if (cyl_coord .and. idir == 2) then - $:GPU_PARALLEL_LOOP(collapse=3) - do q = 0, p - do l = 0, n - do k = 0, m - ! S_xx -= rho * v/r * (tau_xx + 2/3*G) - rhs_vf(strxb)%sf(k, l, q) = rhs_vf(strxb)%sf(k, l, q) - & - rho_K_field(k, l, q)*q_prim_vf(momxb + 1)%sf(k, l, q)/y_cc(l)* & - (q_prim_vf(strxb)%sf(k, l, q) + (2._wp/3._wp)*G_K_field(k, l, q)) ! tau_xx + 2/3*G - - ! S_xr -= rho * v/r * tau_xr - rhs_vf(strxb + 1)%sf(k, l, q) = rhs_vf(strxb + 1)%sf(k, l, q) - & - rho_K_field(k, l, q)*q_prim_vf(momxb + 1)%sf(k, l, q)/y_cc(l)* & - q_prim_vf(strxb + 1)%sf(k, l, q) ! tau_xx - - ! S_rr -= rho * v/r * (tau_rr + 2/3*G) - rhs_vf(strxb + 2)%sf(k, l, q) = rhs_vf(strxb + 2)%sf(k, l, q) - & + #:call GPU_PARALLEL_LOOP(collapse=3) + do q = 0, p + do l = 0, n + do k = 0, m + ! S_xx -= rho * v/r * (tau_xx + 2/3*G) + rhs_vf(strxb)%sf(k, l, q) = rhs_vf(strxb)%sf(k, l, q) - & rho_K_field(k, l, q)*q_prim_vf(momxb + 1)%sf(k, l, q)/y_cc(l)* & - (q_prim_vf(strxb + 2)%sf(k, l, q) + (2._wp/3._wp)*G_K_field(k, l, q)) ! tau_rr + 2/3*G - - ! S_thetatheta += rho * ( -(tau_thetatheta + 2/3*G)*(du/dx + dv/dr + v/r) + 2*(tau_thetatheta + G)*v/r ) - rhs_vf(strxb + 3)%sf(k, l, q) = rhs_vf(strxb + 3)%sf(k, l, q) + & - rho_K_field(k, l, q)*( & - -(q_prim_vf(strxb + 3)%sf(k, l, q) + (2._wp/3._wp)*G_K_field(k, l, q))* & - (du_dx(k, l, q) + dv_dy(k, l, q) + q_prim_vf(momxb + 1)%sf(k, l, q)/y_cc(l)) & - + 2._wp*(q_prim_vf(strxb + 3)%sf(k, l, q) + G_K_field(k, l, q))*q_prim_vf(momxb + 1)%sf(k, l, q)/y_cc(l)) + (q_prim_vf(strxb)%sf(k, l, q) + (2._wp/3._wp)*G_K_field(k, l, q)) ! tau_xx + 2/3*G + + ! S_xr -= rho * v/r * tau_xr + rhs_vf(strxb + 1)%sf(k, l, q) = rhs_vf(strxb + 1)%sf(k, l, q) - & + rho_K_field(k, l, q)*q_prim_vf(momxb + 1)%sf(k, l, q)/y_cc(l)* & + q_prim_vf(strxb + 1)%sf(k, l, q) ! tau_xx + + ! S_rr -= rho * v/r * (tau_rr + 2/3*G) + rhs_vf(strxb + 2)%sf(k, l, q) = rhs_vf(strxb + 2)%sf(k, l, q) - & + rho_K_field(k, l, q)*q_prim_vf(momxb + 1)%sf(k, l, q)/y_cc(l)* & + (q_prim_vf(strxb + 2)%sf(k, l, q) + (2._wp/3._wp)*G_K_field(k, l, q)) ! tau_rr + 2/3*G + + ! S_thetatheta += rho * ( -(tau_thetatheta + 2/3*G)*(du/dx + dv/dr + v/r) + 2*(tau_thetatheta + G)*v/r ) + rhs_vf(strxb + 3)%sf(k, l, q) = rhs_vf(strxb + 3)%sf(k, l, q) + & + rho_K_field(k, l, q)*( & + -(q_prim_vf(strxb + 3)%sf(k, l, q) + (2._wp/3._wp)*G_K_field(k, l, q))* & + (du_dx_hypo(k, l, q) + dv_dy_hypo(k, l, q) + q_prim_vf(momxb + 1)%sf(k, l, q)/y_cc(l)) & + + 2._wp*(q_prim_vf(strxb + 3)%sf(k, l, q) + G_K_field(k, l, q))*q_prim_vf(momxb + 1)%sf(k, l, q)/y_cc(l)) + end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP end if @@ -366,22 +377,22 @@ contains impure subroutine s_finalize_hypoelastic_module() - @:DEALLOCATE(Gs) + @:DEALLOCATE(Gs_hypo) @:DEALLOCATE(rho_K_field, G_K_field) - @:DEALLOCATE(du_dx) - @:DEALLOCATE(fd_coeff_x_h) + @:DEALLOCATE(du_dx_hypo) + @:DEALLOCATE(fd_coeff_x_hypo) if (n > 0) then - @:DEALLOCATE(du_dy,dv_dx,dv_dy) - @:DEALLOCATE(fd_coeff_y_h) + @:DEALLOCATE(du_dy_hypo,dv_dx_hypo,dv_dy_hypo) + @:DEALLOCATE(fd_coeff_y_hypo) if (p > 0) then - @:DEALLOCATE(du_dz, dv_dz, dw_dx, dw_dy, dw_dz) - @:DEALLOCATE(fd_coeff_z_h) + @:DEALLOCATE(du_dz_hypo, dv_dz_hypo, dw_dx_hypo, dw_dy_hypo, dw_dz_hypo) + @:DEALLOCATE(fd_coeff_z_hypo) end if end if end subroutine s_finalize_hypoelastic_module - pure subroutine s_compute_damage_state(q_cons_vf, rhs_vf) + subroutine s_compute_damage_state(q_cons_vf, rhs_vf) type(scalar_field), dimension(sys_size), intent(in) :: q_cons_vf type(scalar_field), dimension(sys_size), intent(inout) :: rhs_vf @@ -393,63 +404,66 @@ contains if (n == 0) then l = 0; q = 0 - $:GPU_PARALLEL_LOOP() - do k = 0, m - rhs_vf(damage_idx)%sf(k, l, q) = (alpha_bar*max(abs(q_cons_vf(stress_idx%beg)%sf(k, l, q)) - tau_star, 0._wp))**cont_damage_s - end do - elseif (p == 0) then - q = 0 - $:GPU_PARALLEL_LOOP(collapse=2) - do l = 0, n + #:call GPU_PARALLEL_LOOP() do k = 0, m - ! Maximum principal stress - tau_p = 0.5_wp*(q_cons_vf(stress_idx%beg)%sf(k, l, q) + & - q_cons_vf(stress_idx%beg + 2)%sf(k, l, q)) + & - sqrt((q_cons_vf(stress_idx%beg)%sf(k, l, q) - & - q_cons_vf(stress_idx%beg + 2)%sf(k, l, q))**2.0_wp + & - 4._wp*q_cons_vf(stress_idx%beg + 1)%sf(k, l, q)**2.0_wp)/2._wp - - rhs_vf(damage_idx)%sf(k, l, q) = (alpha_bar*max(tau_p - tau_star, 0._wp))**cont_damage_s + rhs_vf(damage_idx)%sf(k, l, q) = (alpha_bar*max(abs(q_cons_vf(stress_idx%beg)%sf(k, l, q)) - tau_star, 0._wp))**cont_damage_s end do - end do - else - $:GPU_PARALLEL_LOOP(collapse=3) - do q = 0, p + #:endcall GPU_PARALLEL_LOOP + elseif (p == 0) then + q = 0 + #:call GPU_PARALLEL_LOOP(collapse=2) do l = 0, n do k = 0, m - tau_xx = q_cons_vf(stress_idx%beg)%sf(k, l, q) - tau_xy = q_cons_vf(stress_idx%beg + 1)%sf(k, l, q) - tau_yy = q_cons_vf(stress_idx%beg + 2)%sf(k, l, q) - tau_xz = q_cons_vf(stress_idx%beg + 3)%sf(k, l, q) - tau_yz = q_cons_vf(stress_idx%beg + 4)%sf(k, l, q) - tau_zz = q_cons_vf(stress_idx%beg + 5)%sf(k, l, q) - - ! Invariants of the stress tensor - I1 = tau_xx + tau_yy + tau_zz - I2 = tau_xx*tau_yy + tau_xx*tau_zz + tau_yy*tau_zz - & - (tau_xy**2.0_wp + tau_xz**2.0_wp + tau_yz**2.0_wp) - I3 = tau_xx*tau_yy*tau_zz + 2.0_wp*tau_xy*tau_xz*tau_yz - & - tau_xx*tau_yz**2.0_wp - tau_yy*tau_xz**2.0_wp - tau_zz*tau_xy**2.0_wp - ! Maximum principal stress - temp = I1**2.0_wp - 3.0_wp*I2 - sqrt_term_1 = sqrt(max(temp, 0.0_wp)) - if (sqrt_term_1 > verysmall) then ! Avoid 0/0 - argument = (2.0_wp*I1*I1*I1 - 9.0_wp*I1*I2 + 27.0_wp*I3)/ & - (2.0_wp*sqrt_term_1*sqrt_term_1*sqrt_term_1) - if (argument > 1.0_wp) argument = 1.0_wp - if (argument < -1.0_wp) argument = -1.0_wp - phi = acos(argument) - sqrt_term_2 = sqrt(max(I1**2.0_wp - 3.0_wp*I2, 0.0_wp)) - tau_p = I1/3.0_wp + 2.0_wp/sqrt(3.0_wp)*sqrt_term_2*cos(phi/3.0_wp) - else - tau_p = I1/3.0_wp - end if + tau_p = 0.5_wp*(q_cons_vf(stress_idx%beg)%sf(k, l, q) + & + q_cons_vf(stress_idx%beg + 2)%sf(k, l, q)) + & + sqrt((q_cons_vf(stress_idx%beg)%sf(k, l, q) - & + q_cons_vf(stress_idx%beg + 2)%sf(k, l, q))**2.0_wp + & + 4._wp*q_cons_vf(stress_idx%beg + 1)%sf(k, l, q)**2.0_wp)/2._wp rhs_vf(damage_idx)%sf(k, l, q) = (alpha_bar*max(tau_p - tau_star, 0._wp))**cont_damage_s end do end do - end do + #:endcall GPU_PARALLEL_LOOP + else + #:call GPU_PARALLEL_LOOP(collapse=3) + do q = 0, p + do l = 0, n + do k = 0, m + tau_xx = q_cons_vf(stress_idx%beg)%sf(k, l, q) + tau_xy = q_cons_vf(stress_idx%beg + 1)%sf(k, l, q) + tau_yy = q_cons_vf(stress_idx%beg + 2)%sf(k, l, q) + tau_xz = q_cons_vf(stress_idx%beg + 3)%sf(k, l, q) + tau_yz = q_cons_vf(stress_idx%beg + 4)%sf(k, l, q) + tau_zz = q_cons_vf(stress_idx%beg + 5)%sf(k, l, q) + + ! Invariants of the stress tensor + I1 = tau_xx + tau_yy + tau_zz + I2 = tau_xx*tau_yy + tau_xx*tau_zz + tau_yy*tau_zz - & + (tau_xy**2.0_wp + tau_xz**2.0_wp + tau_yz**2.0_wp) + I3 = tau_xx*tau_yy*tau_zz + 2.0_wp*tau_xy*tau_xz*tau_yz - & + tau_xx*tau_yz**2.0_wp - tau_yy*tau_xz**2.0_wp - tau_zz*tau_xy**2.0_wp + + ! Maximum principal stress + temp = I1**2.0_wp - 3.0_wp*I2 + sqrt_term_1 = sqrt(max(temp, 0.0_wp)) + if (sqrt_term_1 > verysmall) then ! Avoid 0/0 + argument = (2.0_wp*I1*I1*I1 - 9.0_wp*I1*I2 + 27.0_wp*I3)/ & + (2.0_wp*sqrt_term_1*sqrt_term_1*sqrt_term_1) + if (argument > 1.0_wp) argument = 1.0_wp + if (argument < -1.0_wp) argument = -1.0_wp + phi = acos(argument) + sqrt_term_2 = sqrt(max(I1**2.0_wp - 3.0_wp*I2, 0.0_wp)) + tau_p = I1/3.0_wp + 2.0_wp/sqrt(3.0_wp)*sqrt_term_2*cos(phi/3.0_wp) + else + tau_p = I1/3.0_wp + end if + + rhs_vf(damage_idx)%sf(k, l, q) = (alpha_bar*max(tau_p - tau_star, 0._wp))**cont_damage_s + end do + end do + end do + #:endcall GPU_PARALLEL_LOOP end if end subroutine s_compute_damage_state diff --git a/src/simulation/m_ibm.fpp b/src/simulation/m_ibm.fpp index 7c99144a77..5353aed125 100644 --- a/src/simulation/m_ibm.fpp +++ b/src/simulation/m_ibm.fpp @@ -50,8 +50,11 @@ module m_ibm integer :: num_gps !< Number of ghost points integer :: num_inner_gps !< Number of ghost points +#if defined(MFC_OpenACC) $:GPU_DECLARE(create='[gp_layers,num_gps,num_inner_gps]') - +#elif defined(MFC_OpenMP) + $:GPU_DECLARE(create='[num_gps,num_inner_gps]') +#endif logical :: moving_immersed_boundary_flag contains @@ -149,7 +152,7 @@ contains !! @param q_prim_vf Primitive variables !! @param pb Internal bubble pressure !! @param mv Mass of vapor in bubble - pure subroutine s_ibm_correct_state(q_cons_vf, q_prim_vf, pb_in, mv_in) + subroutine s_ibm_correct_state(q_cons_vf, q_prim_vf, pb_in, mv_in) type(scalar_field), & dimension(sys_size), & @@ -187,178 +190,180 @@ contains real(wp) :: buf type(ghost_point) :: gp type(ghost_point) :: innerp + if (num_gps > 0) then + #:call GPU_PARALLEL_LOOP(private='[physical_loc,dyn_pres,alpha_rho_IP, alpha_IP,pres_IP,vel_IP,vel_g,vel_norm_IP,r_IP, v_IP,pb_IP,mv_IP,nmom_IP,presb_IP,massv_IP,rho, gamma,pi_inf,Re_K,G_K,Gs,gp,innerp,norm,buf, j,k,l,q]') + do i = 1, num_gps + + gp = ghost_points(i) + j = gp%loc(1) + k = gp%loc(2) + l = gp%loc(3) + patch_id = ghost_points(i)%ib_patch_id + + ! Calculate physical location of GP + if (p > 0) then + physical_loc = [x_cc(j), y_cc(k), z_cc(l)] + else + physical_loc = [x_cc(j), y_cc(k), 0._wp] + end if - $:GPU_PARALLEL_LOOP(private='[physical_loc,dyn_pres,alpha_rho_IP, & - & alpha_IP,pres_IP,vel_IP,vel_g,vel_norm_IP,r_IP, & - & v_IP,pb_IP,mv_IP,nmom_IP,presb_IP,massv_IP,rho, & - & gamma,pi_inf,Re_K,G_K,Gs,gp,innerp,norm,buf, & - & j,k,l,q]') - do i = 1, num_gps - - gp = ghost_points(i) - j = gp%loc(1) - k = gp%loc(2) - l = gp%loc(3) - patch_id = ghost_points(i)%ib_patch_id - - ! Calculate physical location of GP - if (p > 0) then - physical_loc = [x_cc(j), y_cc(k), z_cc(l)] - else - physical_loc = [x_cc(j), y_cc(k), 0._wp] - end if - - !Interpolate primitive variables at image point associated w/ GP - if (bubbles_euler .and. .not. qbmm) then - call s_interpolate_image_point(q_prim_vf, gp, & - alpha_rho_IP, alpha_IP, pres_IP, vel_IP, c_IP, & - r_IP, v_IP, pb_IP, mv_IP) - else if (qbmm .and. polytropic) then - call s_interpolate_image_point(q_prim_vf, gp, & - alpha_rho_IP, alpha_IP, pres_IP, vel_IP, c_IP, & - r_IP, v_IP, pb_IP, mv_IP, nmom_IP) - else if (qbmm .and. .not. polytropic) then - call s_interpolate_image_point(q_prim_vf, gp, & - alpha_rho_IP, alpha_IP, pres_IP, vel_IP, c_IP, & - r_IP, v_IP, pb_IP, mv_IP, nmom_IP, pb_in, mv_in, presb_IP, massv_IP) - else - call s_interpolate_image_point(q_prim_vf, gp, & - alpha_rho_IP, alpha_IP, pres_IP, vel_IP, c_IP) - end if - - dyn_pres = 0._wp - - ! Set q_prim_vf params at GP so that mixture vars calculated properly - $:GPU_LOOP(parallelism='[seq]') - do q = 1, num_fluids - q_prim_vf(q)%sf(j, k, l) = alpha_rho_IP(q) - q_prim_vf(advxb + q - 1)%sf(j, k, l) = alpha_IP(q) - end do + !Interpolate primitive variables at image point associated w/ GP + if (bubbles_euler .and. .not. qbmm) then + call s_interpolate_image_point(q_prim_vf, gp, & + alpha_rho_IP, alpha_IP, pres_IP, vel_IP, c_IP, & + r_IP, v_IP, pb_IP, mv_IP) + else if (qbmm .and. polytropic) then + call s_interpolate_image_point(q_prim_vf, gp, & + alpha_rho_IP, alpha_IP, pres_IP, vel_IP, c_IP, & + r_IP, v_IP, pb_IP, mv_IP, nmom_IP) + else if (qbmm .and. .not. polytropic) then + call s_interpolate_image_point(q_prim_vf, gp, & + alpha_rho_IP, alpha_IP, pres_IP, vel_IP, c_IP, & + r_IP, v_IP, pb_IP, mv_IP, nmom_IP, pb_in, mv_in, presb_IP, massv_IP) + else + call s_interpolate_image_point(q_prim_vf, gp, & + alpha_rho_IP, alpha_IP, pres_IP, vel_IP, c_IP) + end if + dyn_pres = 0._wp - if (surface_tension) then - q_prim_vf(c_idx)%sf(j, k, l) = c_IP - end if + ! Set q_prim_vf params at GP so that mixture vars calculated properly + $:GPU_LOOP(parallelism='[seq]') + do q = 1, num_fluids + q_prim_vf(q)%sf(j, k, l) = alpha_rho_IP(q) + q_prim_vf(advxb + q - 1)%sf(j, k, l) = alpha_IP(q) + end do - if (model_eqns /= 4) then - ! If in simulation, use acc mixture subroutines - if (elasticity) then - call s_convert_species_to_mixture_variables_acc(rho, gamma, pi_inf, qv_K, alpha_IP, & - alpha_rho_IP, Re_K, G_K, Gs) - else if (bubbles_euler) then - call s_convert_species_to_mixture_variables_bubbles_acc(rho, gamma, pi_inf, qv_K, alpha_IP, & + if (surface_tension) then + q_prim_vf(c_idx)%sf(j, k, l) = c_IP + end if + if (model_eqns /= 4) then + ! If in simulation, use acc mixture subroutines + if (elasticity) then + call s_convert_species_to_mixture_variables_acc(rho, gamma, pi_inf, qv_K, alpha_IP, & + alpha_rho_IP, Re_K, G_K, Gs) + else if (bubbles_euler) then + call s_convert_species_to_mixture_variables_bubbles_acc(rho, gamma, pi_inf, qv_K, alpha_IP, & + alpha_rho_IP, Re_K) + else + call s_convert_species_to_mixture_variables_acc(rho, gamma, pi_inf, qv_K, alpha_IP, & alpha_rho_IP, Re_K) - else - call s_convert_species_to_mixture_variables_acc(rho, gamma, pi_inf, qv_K, alpha_IP, & - alpha_rho_IP, Re_K) - end if - end if - - ! Calculate velocity of ghost cell - if (gp%slip) then - norm(1:3) = levelset_norm%sf(gp%loc(1), gp%loc(2), gp%loc(3), gp%ib_patch_id, 1:3) - buf = sqrt(sum(norm**2)) - norm = norm/buf - vel_norm_IP = sum(vel_IP*norm)*norm - vel_g = vel_IP - vel_norm_IP - else - if (patch_ib(patch_id)%moving_ibm == 0) then - ! we know the object is not moving if moving_ibm is 0 (false) - vel_g = 0._wp - else - do q = 1, 3 - ! if mibm is 1 or 2, then the boundary may be moving - vel_g(q) = patch_ib(patch_id)%vel(q) - end do - end if - end if + end if + end if - ! Set momentum - $:GPU_LOOP(parallelism='[seq]') - do q = momxb, momxe - q_cons_vf(q)%sf(j, k, l) = rho*vel_g(q - momxb + 1) - dyn_pres = dyn_pres + q_cons_vf(q)%sf(j, k, l)* & - vel_g(q - momxb + 1)/2._wp - end do + ! Calculate velocity of ghost cell + if (gp%slip) then + norm(1:3) = levelset_norm%sf(gp%loc(1), gp%loc(2), gp%loc(3), gp%ib_patch_id, 1:3) + buf = sqrt(sum(norm**2)) + norm = norm/buf + vel_norm_IP = sum(vel_IP*norm)*norm + vel_g = vel_IP - vel_norm_IP + else + if (patch_ib(patch_id)%moving_ibm == 0) then + ! we know the object is not moving if moving_ibm is 0 (false) + vel_g = 0._wp + else + do q = 1, 3 + ! if mibm is 1 or 2, then the boundary may be moving + vel_g(q) = patch_ib(patch_id)%vel(q) + end do + end if + end if - ! Set continuity and adv vars - $:GPU_LOOP(parallelism='[seq]') - do q = 1, num_fluids - q_cons_vf(q)%sf(j, k, l) = alpha_rho_IP(q) - q_cons_vf(advxb + q - 1)%sf(j, k, l) = alpha_IP(q) - end do + ! Set momentum + $:GPU_LOOP(parallelism='[seq]') + do q = momxb, momxe + q_cons_vf(q)%sf(j, k, l) = rho*vel_g(q - momxb + 1) + dyn_pres = dyn_pres + q_cons_vf(q)%sf(j, k, l)* & + vel_g(q - momxb + 1)/2._wp + end do - ! Set color function - if (surface_tension) then - q_cons_vf(c_idx)%sf(j, k, l) = c_IP - end if + ! Set continuity and adv vars + $:GPU_LOOP(parallelism='[seq]') + do q = 1, num_fluids + q_cons_vf(q)%sf(j, k, l) = alpha_rho_IP(q) + q_cons_vf(advxb + q - 1)%sf(j, k, l) = alpha_IP(q) + end do - ! Set Energy - if (bubbles_euler) then - q_cons_vf(E_idx)%sf(j, k, l) = (1 - alpha_IP(1))*(gamma*pres_IP + pi_inf + dyn_pres) - else - q_cons_vf(E_idx)%sf(j, k, l) = gamma*pres_IP + pi_inf + dyn_pres - end if + ! Set color function + if (surface_tension) then + q_cons_vf(c_idx)%sf(j, k, l) = c_IP + end if - ! Set bubble vars - if (bubbles_euler .and. .not. qbmm) then - call s_comp_n_from_prim(alpha_IP(1), r_IP, nbub, weight) - do q = 1, nb - q_cons_vf(bubxb + (q - 1)*2)%sf(j, k, l) = nbub*r_IP(q) - q_cons_vf(bubxb + (q - 1)*2 + 1)%sf(j, k, l) = nbub*v_IP(q) - if (.not. polytropic) then - q_cons_vf(bubxb + (q - 1)*4)%sf(j, k, l) = nbub*r_IP(q) - q_cons_vf(bubxb + (q - 1)*4 + 1)%sf(j, k, l) = nbub*v_IP(q) - q_cons_vf(bubxb + (q - 1)*4 + 2)%sf(j, k, l) = nbub*pb_IP(q) - q_cons_vf(bubxb + (q - 1)*4 + 3)%sf(j, k, l) = nbub*mv_IP(q) + ! Set Energy + if (bubbles_euler) then + q_cons_vf(E_idx)%sf(j, k, l) = (1 - alpha_IP(1))*(gamma*pres_IP + pi_inf + dyn_pres) + else + q_cons_vf(E_idx)%sf(j, k, l) = gamma*pres_IP + pi_inf + dyn_pres + end if + ! Set bubble vars + if (bubbles_euler .and. .not. qbmm) then + call s_comp_n_from_prim(alpha_IP(1), r_IP, nbub, weight) + $:GPU_LOOP(parallelism='[seq]') + do q = 1, nb + q_cons_vf(bubxb + (q - 1)*2)%sf(j, k, l) = nbub*r_IP(q) + q_cons_vf(bubxb + (q - 1)*2 + 1)%sf(j, k, l) = nbub*v_IP(q) + if (.not. polytropic) then + q_cons_vf(bubxb + (q - 1)*4)%sf(j, k, l) = nbub*r_IP(q) + q_cons_vf(bubxb + (q - 1)*4 + 1)%sf(j, k, l) = nbub*v_IP(q) + q_cons_vf(bubxb + (q - 1)*4 + 2)%sf(j, k, l) = nbub*pb_IP(q) + q_cons_vf(bubxb + (q - 1)*4 + 3)%sf(j, k, l) = nbub*mv_IP(q) + end if + end do end if - end do - end if - if (qbmm) then + if (qbmm) then - nbub = nmom_IP(1) - do q = 1, nb*nmom - q_cons_vf(bubxb + q - 1)%sf(j, k, l) = nbub*nmom_IP(q) - end do - do q = 1, nb - q_cons_vf(bubxb + (q - 1)*nmom)%sf(j, k, l) = nbub - end do + nbub = nmom_IP(1) + $:GPU_LOOP(parallelism='[seq]') + do q = 1, nb*nmom + q_cons_vf(bubxb + q - 1)%sf(j, k, l) = nbub*nmom_IP(q) + end do - if (.not. polytropic) then - do q = 1, nb - do r = 1, nnode - pb_in(j, k, l, r, q) = presb_IP((q - 1)*nnode + r) - mv_in(j, k, l, r, q) = massv_IP((q - 1)*nnode + r) + $:GPU_LOOP(parallelism='[seq]') + do q = 1, nb + q_cons_vf(bubxb + (q - 1)*nmom)%sf(j, k, l) = nbub end do - end do - end if - end if - if (model_eqns == 3) then - $:GPU_LOOP(parallelism='[seq]') - do q = intxb, intxe - q_cons_vf(q)%sf(j, k, l) = alpha_IP(q - intxb + 1)*(gammas(q - intxb + 1)*pres_IP & - + pi_infs(q - intxb + 1)) + if (.not. polytropic) then + $:GPU_LOOP(parallelism='[seq]') + do q = 1, nb + $:GPU_LOOP(parallelism='[seq]') + do r = 1, nnode + pb_in(j, k, l, r, q) = presb_IP((q - 1)*nnode + r) + mv_in(j, k, l, r, q) = massv_IP((q - 1)*nnode + r) + end do + end do + end if + end if + + if (model_eqns == 3) then + $:GPU_LOOP(parallelism='[seq]') + do q = intxb, intxe + q_cons_vf(q)%sf(j, k, l) = alpha_IP(q - intxb + 1)*(gammas(q - intxb + 1)*pres_IP & + + pi_infs(q - intxb + 1)) + end do + end if end do - end if - end do + #:endcall GPU_PARALLEL_LOOP + end if !Correct the state of the inner points in IBs - $:GPU_PARALLEL_LOOP(private='[physical_loc,dyn_pres,alpha_rho_IP, & - & alpha_IP,vel_g,rho,gamma,pi_inf,Re_K,innerp, & - & j,k,l,q]') - do i = 1, num_inner_gps + if (num_inner_gps > 0) then + #:call GPU_PARALLEL_LOOP(private='[physical_loc,dyn_pres,alpha_rho_IP, alpha_IP,vel_g,rho,gamma,pi_inf,Re_K,innerp,j,k,l,q]') + do i = 1, num_inner_gps - innerp = inner_points(i) - j = innerp%loc(1) - k = innerp%loc(2) - l = innerp%loc(3) + innerp = inner_points(i) + j = innerp%loc(1) + k = innerp%loc(2) + l = innerp%loc(3) - $:GPU_LOOP(parallelism='[seq]') - do q = momxb, momxe - q_cons_vf(q)%sf(j, k, l) = 0._wp - end do - end do + $:GPU_LOOP(parallelism='[seq]') + do q = momxb, momxe + q_cons_vf(q)%sf(j, k, l) = 0._wp + end do + end do + #:endcall GPU_PARALLEL_LOOP + end if end subroutine s_ibm_correct_state @@ -456,7 +461,7 @@ contains !> Function that finds the number of ghost points, used for allocating !! memory. - pure subroutine s_find_num_ghost_points(num_gps_out, num_inner_gps_out) + subroutine s_find_num_ghost_points(num_gps_out, num_inner_gps_out) integer, intent(out) :: num_gps_out integer, intent(out) :: num_inner_gps_out @@ -622,7 +627,7 @@ contains end subroutine s_find_ghost_points !> Function that computes the interpolation coefficients of image points - pure subroutine s_compute_interpolation_coeffs(ghost_points_in) + subroutine s_compute_interpolation_coeffs(ghost_points_in) type(ghost_point), dimension(num_gps), intent(INOUT) :: ghost_points_in @@ -776,7 +781,7 @@ contains !> Function that uses the interpolation coefficients and the current state !! at the cell centers in order to estimate the state at the image point - pure subroutine s_interpolate_image_point(q_prim_vf, gp, alpha_rho_IP, alpha_IP, pres_IP, vel_IP, c_IP, r_IP, v_IP, pb_IP, mv_IP, nmom_IP, pb_in, mv_in, presb_IP, massv_IP) + subroutine s_interpolate_image_point(q_prim_vf, gp, alpha_rho_IP, alpha_IP, pres_IP, vel_IP, c_IP, r_IP, v_IP, pb_IP, mv_IP, nmom_IP, pb_in, mv_in, presb_IP, massv_IP) $:GPU_ROUTINE(parallelism='[seq]') type(scalar_field), & dimension(sys_size), & diff --git a/src/simulation/m_igr.fpp b/src/simulation/m_igr.fpp index 0fbc76346f..599fe2cc7b 100644 --- a/src/simulation/m_igr.fpp +++ b/src/simulation/m_igr.fpp @@ -31,12 +31,15 @@ module m_igr real(wp), allocatable, dimension(:, :, :), pinned, target :: jac_rhs_host real(wp), allocatable, dimension(:, :, :), pinned, target :: jac_old_host #else - real(wp), allocatable, dimension(:, :, :) :: jac, jac_rhs, jac_old + real(wp), allocatable, target, dimension(:, :, :) :: jac + real(wp), allocatable, dimension(:, :, :) :: jac_rhs, jac_old $:GPU_DECLARE(create='[jac, jac_rhs, jac_old]') #endif + type(scalar_field), dimension(1) :: jac_sf + $:GPU_DECLARE(create='[jac_sf]') - real(wp), allocatable, dimension(:, :) :: Res - $:GPU_DECLARE(create='[Res]') + real(wp), allocatable, dimension(:, :) :: Res_igr + $:GPU_DECLARE(create='[Res_igr]') real(wp) :: alf_igr $:GPU_DECLARE(create='[alf_igr]') @@ -91,13 +94,13 @@ contains subroutine s_initialize_igr_module() if (viscous) then - @:ALLOCATE(Res(1:2, 1:maxval(Re_size))) + @:ALLOCATE(Res_igr(1:2, 1:maxval(Re_size))) do i = 1, 2 do j = 1, Re_size(i) - Res(i, j) = fluid_pp(Re_idx(i, j))%Re(i) + Res_igr(i, j) = fluid_pp(Re_idx(i, j))%Re(i) end do end do - $:GPU_UPDATE(device='[Res, Re_idx, Re_size]') + $:GPU_UPDATE(device='[Res_igr, Re_idx, Re_size]') @:PREFER_GPU(Res) @:PREFER_GPU(Re_idx) end if @@ -159,15 +162,16 @@ contains end if #endif - $:GPU_PARALLEL_LOOP(collapse=3) - do l = idwbuff(3)%beg, idwbuff(3)%end - do k = idwbuff(2)%beg, idwbuff(2)%end - do j = idwbuff(1)%beg, idwbuff(1)%end - jac(j, k, l) = 0._wp - if (igr_iter_solver == 1) jac_old(j, k, l) = 0._wp + #:call GPU_PARALLEL_LOOP(collapse=3) + do l = idwbuff(3)%beg, idwbuff(3)%end + do k = idwbuff(2)%beg, idwbuff(2)%end + do j = idwbuff(1)%beg, idwbuff(1)%end + jac(j, k, l) = 0._wp + if (igr_iter_solver == 1) jac_old(j, k, l) = 0._wp + end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP if (p == 0) then alf_igr = alf_factor*max(dx(1), dy(1))**2._wp @@ -215,6 +219,10 @@ contains end if #:endif + jac_sf(1)%sf => jac + $:GPU_ENTER_DATA(copyin='[jac_sf(1)%sf]') + $:GPU_ENTER_DATA(attach='[jac_sf(1)%sf]') + end subroutine s_initialize_igr_module subroutine s_igr_iterative_solve(q_cons_vf, bc_type, t_step) @@ -236,82 +244,83 @@ contains end if do q = 1, num_iters - $:GPU_PARALLEL_LOOP(collapse=3, private='[rho_lx, rho_rx, rho_ly, & - & rho_ry, rho_lz, rho_rz, fd_coeff]') - do l = 0, p - do k = 0, n - do j = 0, m - rho_lx = 0._wp - rho_rx = 0._wp - rho_ly = 0._wp - rho_ry = 0._wp - rho_lz = 0._wp - rho_rz = 0._wp - fd_coeff = 0._wp - - $:GPU_LOOP(parallelism='[seq]') - do i = 1, num_fluids - rho_lx = rho_lx + (q_cons_vf(i)%sf(j, k, l) + q_cons_vf(i)%sf(j - 1, k, l))/2._wp - rho_rx = rho_rx + (q_cons_vf(i)%sf(j, k, l) + q_cons_vf(i)%sf(j + 1, k, l))/2._wp - rho_ly = rho_ly + (q_cons_vf(i)%sf(j, k, l) + q_cons_vf(i)%sf(j, k - 1, l))/2._wp - rho_ry = rho_ry + (q_cons_vf(i)%sf(j, k, l) + q_cons_vf(i)%sf(j, k + 1, l))/2._wp - if (p > 0) then - rho_lz = rho_lz + (q_cons_vf(i)%sf(j, k, l) + q_cons_vf(i)%sf(j, k, l - 1))/2._wp - rho_rz = rho_rz + (q_cons_vf(i)%sf(j, k, l) + q_cons_vf(i)%sf(j, k, l + 1))/2._wp - end if - fd_coeff = fd_coeff + q_cons_vf(i)%sf(j, k, l) - end do + #:call GPU_PARALLEL_LOOP(collapse=3, private='[rho_lx, rho_rx, rho_ly, rho_ry, rho_lz, rho_rz, fd_coeff]') + do l = 0, p + do k = 0, n + do j = 0, m + rho_lx = 0._wp + rho_rx = 0._wp + rho_ly = 0._wp + rho_ry = 0._wp + rho_lz = 0._wp + rho_rz = 0._wp + fd_coeff = 0._wp - fd_coeff = 1._wp/fd_coeff + alf_igr* & - ((1._wp/dx(j)**2._wp)*(1._wp/rho_lx + 1._wp/rho_rx) + & - (1._wp/dy(k)**2._wp)*(1._wp/rho_ly + 1._wp/rho_ry)) + $:GPU_LOOP(parallelism='[seq]') + do i = 1, num_fluids + rho_lx = rho_lx + (q_cons_vf(i)%sf(j, k, l) + q_cons_vf(i)%sf(j - 1, k, l))/2._wp + rho_rx = rho_rx + (q_cons_vf(i)%sf(j, k, l) + q_cons_vf(i)%sf(j + 1, k, l))/2._wp + rho_ly = rho_ly + (q_cons_vf(i)%sf(j, k, l) + q_cons_vf(i)%sf(j, k - 1, l))/2._wp + rho_ry = rho_ry + (q_cons_vf(i)%sf(j, k, l) + q_cons_vf(i)%sf(j, k + 1, l))/2._wp + if (p > 0) then + rho_lz = rho_lz + (q_cons_vf(i)%sf(j, k, l) + q_cons_vf(i)%sf(j, k, l - 1))/2._wp + rho_rz = rho_rz + (q_cons_vf(i)%sf(j, k, l) + q_cons_vf(i)%sf(j, k, l + 1))/2._wp + end if + fd_coeff = fd_coeff + q_cons_vf(i)%sf(j, k, l) + end do - if (num_dims == 3) then - fd_coeff = fd_coeff + alf_igr*(1._wp/dz(l)**2._wp)*(1._wp/rho_lz + 1._wp/rho_rz) - end if + fd_coeff = 1._wp/fd_coeff + alf_igr* & + ((1._wp/dx(j)**2._wp)*(1._wp/rho_lx + 1._wp/rho_rx) + & + (1._wp/dy(k)**2._wp)*(1._wp/rho_ly + 1._wp/rho_ry)) - if (igr_iter_solver == 1) then ! Jacobi iteration if (num_dims == 3) then - jac(j, k, l) = (alf_igr/fd_coeff)* & - ((1._wp/dx(j)**2._wp)*(jac_old(j - 1, k, l)/rho_lx + jac_old(j + 1, k, l)/rho_rx) + & - (1._wp/dy(k)**2._wp)*(jac_old(j, k - 1, l)/rho_ly + jac_old(j, k + 1, l)/rho_ry) + & - (1._wp/dz(l)**2._wp)*(jac_old(j, k, l - 1)/rho_lz + jac_old(j, k, l + 1)/rho_rz)) + & - jac_rhs(j, k, l)/fd_coeff - else - jac(j, k, l) = (alf_igr/fd_coeff)* & - ((1._wp/dx(j)**2._wp)*(jac_old(j - 1, k, l)/rho_lx + jac_old(j + 1, k, l)/rho_rx) + & - (1._wp/dy(k)**2._wp)*(jac_old(j, k - 1, l)/rho_ly + jac_old(j, k + 1, l)/rho_ry)) + & - jac_rhs(j, k, l)/fd_coeff + fd_coeff = fd_coeff + alf_igr*(1._wp/dz(l)**2._wp)*(1._wp/rho_lz + 1._wp/rho_rz) end if - else ! Gauss Seidel iteration - if (num_dims == 3) then - jac(j, k, l) = (alf_igr/fd_coeff)* & - ((1._wp/dx(j)**2._wp)*(jac(j - 1, k, l)/rho_lx + jac(j + 1, k, l)/rho_rx) + & - (1._wp/dy(k)**2._wp)*(jac(j, k - 1, l)/rho_ly + jac(j, k + 1, l)/rho_ry) + & - (1._wp/dz(l)**2._wp)*(jac(j, k, l - 1)/rho_lz + jac(j, k, l + 1)/rho_rz)) + & - jac_rhs(j, k, l)/fd_coeff - else - jac(j, k, l) = (alf_igr/fd_coeff)* & - ((1._wp/dx(j)**2._wp)*(jac(j - 1, k, l)/rho_lx + jac(j + 1, k, l)/rho_rx) + & - (1._wp/dy(k)**2._wp)*(jac(j, k - 1, l)/rho_ly + jac(j, k + 1, l)/rho_ry)) + & - jac_rhs(j, k, l)/fd_coeff + + if (igr_iter_solver == 1) then ! Jacobi iteration + if (num_dims == 3) then + jac(j, k, l) = (alf_igr/fd_coeff)* & + ((1._wp/dx(j)**2._wp)*(jac_old(j - 1, k, l)/rho_lx + jac_old(j + 1, k, l)/rho_rx) + & + (1._wp/dy(k)**2._wp)*(jac_old(j, k - 1, l)/rho_ly + jac_old(j, k + 1, l)/rho_ry) + & + (1._wp/dz(l)**2._wp)*(jac_old(j, k, l - 1)/rho_lz + jac_old(j, k, l + 1)/rho_rz)) + & + jac_rhs(j, k, l)/fd_coeff + else + jac(j, k, l) = (alf_igr/fd_coeff)* & + ((1._wp/dx(j)**2._wp)*(jac_old(j - 1, k, l)/rho_lx + jac_old(j + 1, k, l)/rho_rx) + & + (1._wp/dy(k)**2._wp)*(jac_old(j, k - 1, l)/rho_ly + jac_old(j, k + 1, l)/rho_ry)) + & + jac_rhs(j, k, l)/fd_coeff + end if + else ! Gauss Seidel iteration + if (num_dims == 3) then + jac(j, k, l) = (alf_igr/fd_coeff)* & + ((1._wp/dx(j)**2._wp)*(jac(j - 1, k, l)/rho_lx + jac(j + 1, k, l)/rho_rx) + & + (1._wp/dy(k)**2._wp)*(jac(j, k - 1, l)/rho_ly + jac(j, k + 1, l)/rho_ry) + & + (1._wp/dz(l)**2._wp)*(jac(j, k, l - 1)/rho_lz + jac(j, k, l + 1)/rho_rz)) + & + jac_rhs(j, k, l)/fd_coeff + else + jac(j, k, l) = (alf_igr/fd_coeff)* & + ((1._wp/dx(j)**2._wp)*(jac(j - 1, k, l)/rho_lx + jac(j + 1, k, l)/rho_rx) + & + (1._wp/dy(k)**2._wp)*(jac(j, k - 1, l)/rho_ly + jac(j, k + 1, l)/rho_ry)) + & + jac_rhs(j, k, l)/fd_coeff + end if end if - end if + end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP - call s_populate_F_igr_buffers(bc_type, jac) + call s_populate_F_igr_buffers(bc_type, jac_sf) if (igr_iter_solver == 1) then ! Jacobi iteration - $:GPU_PARALLEL_LOOP(collapse=3) - do l = idwbuff(3)%beg, idwbuff(3)%end - do k = idwbuff(2)%beg, idwbuff(2)%end - do j = idwbuff(1)%beg, idwbuff(1)%end - jac_old(j, k, l) = jac(j, k, l) + #:call GPU_PARALLEL_LOOP(collapse=3) + do l = idwbuff(3)%beg, idwbuff(3)%end + do k = idwbuff(2)%beg, idwbuff(2)%end + do j = idwbuff(1)%beg, idwbuff(1)%end + jac_old(j, k, l) = jac(j, k, l) + end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP end if end do @@ -331,58 +340,58 @@ contains real(wp) :: F_L, vel_L, rho_L, F_R, vel_R, rho_R real(wp), dimension(num_fluids) :: alpha_rho_L, alpha_rho_R - $:GPU_PARALLEL_LOOP(collapse=3, private='[F_L, vel_L, alpha_rho_L, & - & F_R, vel_R, alpha_rho_R]') - do l = 0, p - do k = 0, n - do j = -1, m + #:call GPU_PARALLEL_LOOP(collapse=3, private='[F_L, vel_L, alpha_rho_L, F_R, vel_R, alpha_rho_R]') + do l = 0, p + do k = 0, n + do j = -1, m - F_L = 0._wp; F_R = 0._wp - alpha_rho_L = 0._wp; alpha_rho_R = 0._wp - vel_L = 0._wp; vel_R = 0._wp + F_L = 0._wp; F_R = 0._wp + alpha_rho_L = 0._wp; alpha_rho_R = 0._wp + vel_L = 0._wp; vel_R = 0._wp - $:GPU_LOOP(parallelism='[seq]') - do q = vidxb + 1, vidxe $:GPU_LOOP(parallelism='[seq]') - do i = 1, num_fluids - alpha_rho_L(i) = alpha_rho_L(i) + coeff_L(q)*q_cons_vf(i)%sf(j + q, k, l) - end do + do q = vidxb + 1, vidxe + $:GPU_LOOP(parallelism='[seq]') + do i = 1, num_fluids + alpha_rho_L(i) = alpha_rho_L(i) + coeff_L(q)*q_cons_vf(i)%sf(j + q, k, l) + end do - vel_L = vel_L + coeff_L(q)*q_cons_vf(momxb)%sf(j + q, k, l) - F_L = F_L + coeff_L(q)*jac(j + q, k, l) - end do + vel_L = vel_L + coeff_L(q)*q_cons_vf(momxb)%sf(j + q, k, l) + F_L = F_L + coeff_L(q)*jac(j + q, k, l) + end do - $:GPU_LOOP(parallelism='[seq]') - do q = vidxb, vidxe - 1 $:GPU_LOOP(parallelism='[seq]') - do i = 1, num_fluids - alpha_rho_R(i) = alpha_rho_R(i) + coeff_R(q)*q_cons_vf(i)%sf(j + q, k, l) + do q = vidxb, vidxe - 1 + $:GPU_LOOP(parallelism='[seq]') + do i = 1, num_fluids + alpha_rho_R(i) = alpha_rho_R(i) + coeff_R(q)*q_cons_vf(i)%sf(j + q, k, l) + end do + + vel_R = vel_R + coeff_R(q)*q_cons_vf(momxb)%sf(j + q, k, l) + F_R = F_R + coeff_R(q)*jac(j + q, k, l) end do - vel_R = vel_R + coeff_R(q)*q_cons_vf(momxb)%sf(j + q, k, l) - F_R = F_R + coeff_R(q)*jac(j + q, k, l) - end do + vel_L = vel_L/sum(alpha_rho_L) + vel_R = vel_R/sum(alpha_rho_R) - vel_L = vel_L/sum(alpha_rho_L) - vel_R = vel_R/sum(alpha_rho_R) - - #:for LR in ['L', 'R'] - $:GPU_ATOMIC(atomic='update') - rhs_vf(momxb)%sf(j + 1, k, l) = rhs_vf(momxb)%sf(j + 1, k, l) + & - 0.5_wp*F_${LR}$*(1._wp/dx(j + 1)) - $:GPU_ATOMIC(atomic='update') - rhs_vf(E_idx)%sf(j + 1, k, l) = rhs_vf(E_idx)%sf(j + 1, k, l) + & - 0.5_wp*vel_${LR}$*F_${LR}$*(1._wp/dx(j + 1)) - $:GPU_ATOMIC(atomic='update') - rhs_vf(momxb)%sf(j, k, l) = rhs_vf(momxb)%sf(j, k, l) - & - 0.5_wp*F_${LR}$*(1._wp/dx(j)) - $:GPU_ATOMIC(atomic='update') - rhs_vf(E_idx)%sf(j, k, l) = rhs_vf(E_idx)%sf(j, k, l) - & - 0.5_wp*vel_${LR}$*F_${LR}$*(1._wp/dx(j)) - #:endfor + #:for LR in ['L', 'R'] + $:GPU_ATOMIC(atomic='update') + rhs_vf(momxb)%sf(j + 1, k, l) = rhs_vf(momxb)%sf(j + 1, k, l) + & + 0.5_wp*F_${LR}$*(1._wp/dx(j + 1)) + $:GPU_ATOMIC(atomic='update') + rhs_vf(E_idx)%sf(j + 1, k, l) = rhs_vf(E_idx)%sf(j + 1, k, l) + & + 0.5_wp*vel_${LR}$*F_${LR}$*(1._wp/dx(j + 1)) + $:GPU_ATOMIC(atomic='update') + rhs_vf(momxb)%sf(j, k, l) = rhs_vf(momxb)%sf(j, k, l) - & + 0.5_wp*F_${LR}$*(1._wp/dx(j)) + $:GPU_ATOMIC(atomic='update') + rhs_vf(E_idx)%sf(j, k, l) = rhs_vf(E_idx)%sf(j, k, l) - & + 0.5_wp*vel_${LR}$*F_${LR}$*(1._wp/dx(j)) + #:endfor + end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP end subroutine s_igr_sigma_x @@ -410,1244 +419,1688 @@ contains if (idir == 1) then if (p == 0) then - $:GPU_PARALLEL_LOOP(collapse=3, private='[rho_L, rho_R, gamma_L, & - & gamma_R, pi_inf_L, pi_inf_R, mu_L, mu_R, vel_L, vel_R, & - & pres_L, pres_R, alpha_L, alpha_R, alpha_rho_L, alpha_rho_R, & - & F_L, F_R, E_L, E_R, cfl, dvel, dvel_small, rho_sf_small, & - & vflux_L_arr, vflux_R_arr]') - do l = 0, p - do k = 0, n - do j = -1, m + #:call GPU_PARALLEL_LOOP(collapse=3, private='[rho_L, rho_R, gamma_L, gamma_R, pi_inf_L, pi_inf_R, mu_L, mu_R, vel_L, vel_R, pres_L, pres_R, alpha_L, alpha_R, alpha_rho_L, alpha_rho_R, F_L, F_R, E_L, E_R, cfl, dvel, dvel_small, rho_sf_small, vflux_L_arr, vflux_R_arr]') + do l = 0, p + do k = 0, n + do j = -1, m - dvel = 0._wp - vflux_L_arr = 0._wp - vflux_R_arr = 0._wp + dvel = 0._wp + vflux_L_arr = 0._wp + vflux_R_arr = 0._wp - #:if MFC_CASE_OPTIMIZATION - #:if igr_order == 5 - !DIR$ unroll 6 - #:elif igr_order == 3 - !DIR$ unroll 4 + #:if MFC_CASE_OPTIMIZATION + #:if igr_order == 5 + !DIR$ unroll 6 + #:elif igr_order == 3 + !DIR$ unroll 4 + #:endif #:endif - #:endif - $:GPU_LOOP(parallelism='[seq]') - do q = vidxb, vidxe - dvel_small = 0._wp - !x-direction contributions $:GPU_LOOP(parallelism='[seq]') - do i = -1, 1 - rho_L = 0._wp + do q = vidxb, vidxe + dvel_small = 0._wp + !x-direction contributions + $:GPU_LOOP(parallelism='[seq]') + do i = -1, 1 + rho_L = 0._wp + $:GPU_LOOP(parallelism='[seq]') + do r = 1, num_fluids + rho_L = rho_L + q_cons_vf(r)%sf(j + i + q, k, l) + end do + rho_sf_small(i) = rho_L + end do + + dvel_small(1) = (1/(2._wp*dx(j)))*( & + 1._wp*q_cons_vf(momxb)%sf(j + 1 + q, k, l)/rho_sf_small(1) - & + 1._wp*q_cons_vf(momxb)%sf(j - 1 + q, k, l)/rho_sf_small(-1)) + dvel_small(2) = (1/(2._wp*dx(j)))*( & + q_cons_vf(momxb + 1)%sf(j + 1 + q, k, l)/rho_sf_small(1) - & + q_cons_vf(momxb + 1)%sf(j - 1 + q, k, l)/rho_sf_small(-1)) + + if (q == 0) dvel(:, 1) = dvel_small + if (q > vidxb) then + vflux_L_arr(1) = vflux_L_arr(1) + coeff_L(q)*(dvel_small(2)) + vflux_L_arr(3) = vflux_L_arr(3) + coeff_L(q)*(4._wp*dvel_small(1))/3._wp + end if + if (q < vidxe) then + vflux_R_arr(1) = vflux_R_arr(1) + coeff_R(q)*(dvel_small(2)) + vflux_R_arr(3) = vflux_R_arr(3) + coeff_R(q)*(4._wp*dvel_small(1))/3._wp + end if + + !y-direction contributions $:GPU_LOOP(parallelism='[seq]') - do r = 1, num_fluids - rho_L = rho_L + q_cons_vf(r)%sf(j + i + q, k, l) + do i = -1, 1 + rho_L = 0._wp + $:GPU_LOOP(parallelism='[seq]') + do r = 1, num_fluids + rho_L = rho_L + q_cons_vf(r)%sf(j + q, k + i, l) + end do + rho_sf_small(i) = rho_L end do - rho_sf_small(i) = rho_L + + dvel_small(1) = (1/(2._wp*dy(k)))*( & + q_cons_vf(momxb)%sf(j + q, k + 1, l)/rho_sf_small(1) - & + q_cons_vf(momxb)%sf(j + q, k - 1, l)/rho_sf_small(-1)) + dvel_small(2) = (1/(2._wp*dy(k)))*( & + q_cons_vf(momxb + 1)%sf(j + q, k + 1, l)/rho_sf_small(1) - & + q_cons_vf(momxb + 1)%sf(j + q, k - 1, l)/rho_sf_small(-1)) + + if (q == 0) dvel(:, 2) = dvel_small + + if (q > vidxb) then + vflux_L_arr(1) = vflux_L_arr(1) + coeff_L(q)*(dvel_small(1)) + vflux_L_arr(3) = vflux_L_arr(3) + coeff_L(q)*(-2._wp*dvel_small(2))/3._wp + end if + if (q < vidxe) then + vflux_R_arr(1) = vflux_R_arr(1) + coeff_R(q)*(dvel_small(1)) + vflux_R_arr(3) = vflux_R_arr(3) + coeff_R(q)*(-2._wp*dvel_small(2))/3._wp + end if + + if (q == 0) then + jac_rhs(j, k, l) = alf_igr*(2._wp*(dvel(1, 2)*dvel(2, 1)) & + + dvel(1, 1)**2._wp + dvel(2, 2)**2._wp & + + (dvel(1, 1) + dvel(2, 2))**2._wp) + end if end do - dvel_small(1) = (1/(2._wp*dx(j)))*( & - 1._wp*q_cons_vf(momxb)%sf(j + 1 + q, k, l)/rho_sf_small(1) - & - 1._wp*q_cons_vf(momxb)%sf(j - 1 + q, k, l)/rho_sf_small(-1)) - dvel_small(2) = (1/(2._wp*dx(j)))*( & - q_cons_vf(momxb + 1)%sf(j + 1 + q, k, l)/rho_sf_small(1) - & - q_cons_vf(momxb + 1)%sf(j - 1 + q, k, l)/rho_sf_small(-1)) - - if (q == 0) dvel(:, 1) = dvel_small - if (q > vidxb) then - vflux_L_arr(1) = vflux_L_arr(1) + coeff_L(q)*(dvel_small(2)) - vflux_L_arr(3) = vflux_L_arr(3) + coeff_L(q)*(4._wp*dvel_small(1))/3._wp - end if - if (q < vidxe) then - vflux_R_arr(1) = vflux_R_arr(1) + coeff_R(q)*(dvel_small(2)) - vflux_R_arr(3) = vflux_R_arr(3) + coeff_R(q)*(4._wp*dvel_small(1))/3._wp - end if + alpha_rho_L = 0._wp; alpha_rho_R = 0._wp + alpha_L = 0._wp; alpha_R = 0._wp + vel_L = 0._wp; vel_R = 0._wp - !y-direction contributions $:GPU_LOOP(parallelism='[seq]') - do i = -1, 1 - rho_L = 0._wp + do q = vidxb + 1, vidxe + $:GPU_LOOP(parallelism='[seq]') + do i = 1, num_fluids + alpha_rho_L(i) = alpha_rho_L(i) + coeff_L(q)*q_cons_vf(i)%sf(j + q, k, l) + end do + + if (num_fluids > 1) then + $:GPU_LOOP(parallelism='[seq]') + do i = 1, num_fluids - 1 + alpha_L(i) = alpha_L(i) + coeff_L(q)*q_cons_vf(E_idx + i)%sf(j + q, k, l) + end do + else + alpha_L(1) = 1._wp + end if + $:GPU_LOOP(parallelism='[seq]') - do r = 1, num_fluids - rho_L = rho_L + q_cons_vf(r)%sf(j + q, k + i, l) + do i = 1, num_dims + vel_L(i) = vel_L(i) + coeff_L(q)*q_cons_vf(momxb + i - 1)%sf(j + q, k, l) end do - rho_sf_small(i) = rho_L end do - dvel_small(1) = (1/(2._wp*dy(k)))*( & - q_cons_vf(momxb)%sf(j + q, k + 1, l)/rho_sf_small(1) - & - q_cons_vf(momxb)%sf(j + q, k - 1, l)/rho_sf_small(-1)) - dvel_small(2) = (1/(2._wp*dy(k)))*( & - q_cons_vf(momxb + 1)%sf(j + q, k + 1, l)/rho_sf_small(1) - & - q_cons_vf(momxb + 1)%sf(j + q, k - 1, l)/rho_sf_small(-1)) + $:GPU_LOOP(parallelism='[seq]') + do q = vidxb, vidxe - 1 + $:GPU_LOOP(parallelism='[seq]') + do i = 1, num_fluids + alpha_rho_R(i) = alpha_rho_R(i) + coeff_R(q)*q_cons_vf(i)%sf(j + q, k, l) + end do - if (q == 0) dvel(:, 2) = dvel_small + if (num_fluids > 1) then + $:GPU_LOOP(parallelism='[seq]') + do i = 1, num_fluids - 1 + alpha_R(i) = alpha_R(i) + coeff_R(q)*q_cons_vf(E_idx + i)%sf(j + q, k, l) + end do + else + alpha_R(1) = 1._wp + end if - if (q > vidxb) then - vflux_L_arr(1) = vflux_L_arr(1) + coeff_L(q)*(dvel_small(1)) - vflux_L_arr(3) = vflux_L_arr(3) + coeff_L(q)*(-2._wp*dvel_small(2))/3._wp - end if - if (q < vidxe) then - vflux_R_arr(1) = vflux_R_arr(1) + coeff_R(q)*(dvel_small(1)) - vflux_R_arr(3) = vflux_R_arr(3) + coeff_R(q)*(-2._wp*dvel_small(2))/3._wp - end if + $:GPU_LOOP(parallelism='[seq]') + do i = 1, num_dims + vel_R(i) = vel_R(i) + coeff_R(q)*q_cons_vf(momxb + i - 1)%sf(j + q, k, l) + end do + end do - if (q == 0) then - jac_rhs(j, k, l) = alf_igr*(2._wp*(dvel(1, 2)*dvel(2, 1)) & - + dvel(1, 1)**2._wp + dvel(2, 2)**2._wp & - + (dvel(1, 1) + dvel(2, 2))**2._wp) + if (num_fluids > 1) then + alpha_L(num_fluids) = 1._wp - sum(alpha_L(1:num_fluids - 1)) + alpha_R(num_fluids) = 1._wp - sum(alpha_R(1:num_fluids - 1)) end if - end do - alpha_rho_L = 0._wp; alpha_rho_R = 0._wp - alpha_L = 0._wp; alpha_R = 0._wp - vel_L = 0._wp; vel_R = 0._wp + rho_L = sum(alpha_rho_L) + gamma_L = sum(alpha_L*gammas) + pi_inf_L = sum(alpha_L*pi_infs) - $:GPU_LOOP(parallelism='[seq]') - do q = vidxb + 1, vidxe - $:GPU_LOOP(parallelism='[seq]') - do i = 1, num_fluids - alpha_rho_L(i) = alpha_rho_L(i) + coeff_L(q)*q_cons_vf(i)%sf(j + q, k, l) - end do + rho_R = sum(alpha_rho_R) + gamma_R = sum(alpha_R*gammas) + pi_inf_R = sum(alpha_R*pi_infs) - if (num_fluids > 1) then + vel_L = vel_L/rho_L + vel_R = vel_R/rho_R + + if (viscous) then + mu_L = 0._wp; mu_R = 0._wp $:GPU_LOOP(parallelism='[seq]') - do i = 1, num_fluids - 1 - alpha_L(i) = alpha_L(i) + coeff_L(q)*q_cons_vf(E_idx + i)%sf(j + q, k, l) + do i = 1, num_fluids + mu_L = alpha_L(i)/Res_igr(1, i) + mu_L + mu_R = alpha_R(i)/Res_igr(1, i) + mu_R end do - else - alpha_L(1) = 1._wp + + $:GPU_ATOMIC(atomic='update') + rhs_vf(momxb + 1)%sf(j + 1, k, l) = rhs_vf(momxb + 1)%sf(j + 1, k, l) - & + 0.5_wp*mu_L*vflux_L_arr(1)*(1._wp/dx(j + 1)) + $:GPU_ATOMIC(atomic='update') + rhs_vf(E_idx)%sf(j + 1, k, l) = rhs_vf(E_idx)%sf(j + 1, k, l) - & + 0.5_wp*mu_L*vflux_L_arr(1)*vel_L(2)*(1._wp/dx(j + 1)) + + $:GPU_ATOMIC(atomic='update') + rhs_vf(momxb + 1)%sf(j, k, l) = rhs_vf(momxb + 1)%sf(j, k, l) + & + 0.5_wp*mu_L*vflux_L_arr(1)*(1._wp/dx(j)) + $:GPU_ATOMIC(atomic='update') + rhs_vf(E_idx)%sf(j, k, l) = rhs_vf(E_idx)%sf(j, k, l) + & + 0.5_wp*mu_L*vflux_L_arr(1)*vel_L(2)*(1._wp/dx(j)) + + $:GPU_ATOMIC(atomic='update') + rhs_vf(momxb + 1)%sf(j + 1, k, l) = rhs_vf(momxb + 1)%sf(j + 1, k, l) - & + 0.5_wp*mu_R*vflux_R_arr(1)*(1._wp/dx(j + 1)) + $:GPU_ATOMIC(atomic='update') + rhs_vf(E_idx)%sf(j + 1, k, l) = rhs_vf(E_idx)%sf(j + 1, k, l) - & + 0.5_wp*mu_R*vflux_R_arr(1)*vel_R(2)*(1._wp/dx(j + 1)) + + $:GPU_ATOMIC(atomic='update') + rhs_vf(momxb + 1)%sf(j, k, l) = rhs_vf(momxb + 1)%sf(j, k, l) + & + 0.5_wp*mu_R*vflux_R_arr(1)*(1._wp/dx(j)) + $:GPU_ATOMIC(atomic='update') + rhs_vf(E_idx)%sf(j, k, l) = rhs_vf(E_idx)%sf(j, k, l) + & + 0.5_wp*mu_R*vflux_R_arr(1)*vel_R(2)*(1._wp/dx(j)) + + $:GPU_ATOMIC(atomic='update') + rhs_vf(momxb)%sf(j + 1, k, l) = rhs_vf(momxb)%sf(j + 1, k, l) - & + 0.5_wp*mu_L*vflux_L_arr(3)*(1._wp/dx(j + 1)) + $:GPU_ATOMIC(atomic='update') + rhs_vf(E_idx)%sf(j + 1, k, l) = rhs_vf(E_idx)%sf(j + 1, k, l) - & + 0.5_wp*mu_L*vflux_L_arr(3)*vel_L(1)*(1._wp/dx(j + 1)) + + $:GPU_ATOMIC(atomic='update') + rhs_vf(momxb)%sf(j, k, l) = rhs_vf(momxb)%sf(j, k, l) + & + 0.5_wp*mu_L*vflux_L_arr(3)*(1._wp/dx(j)) + $:GPU_ATOMIC(atomic='update') + rhs_vf(E_idx)%sf(j, k, l) = rhs_vf(E_idx)%sf(j, k, l) + & + 0.5_wp*mu_L*vflux_L_arr(3)*vel_L(1)*(1._wp/dx(j)) + + $:GPU_ATOMIC(atomic='update') + rhs_vf(momxb)%sf(j + 1, k, l) = rhs_vf(momxb)%sf(j + 1, k, l) - & + 0.5_wp*mu_R*vflux_R_arr(3)*(1._wp/dx(j + 1)) + $:GPU_ATOMIC(atomic='update') + rhs_vf(E_idx)%sf(j + 1, k, l) = rhs_vf(E_idx)%sf(j + 1, k, l) - & + 0.5_wp*mu_R*vflux_R_arr(3)*vel_R(1)*(1._wp/dx(j + 1)) + + $:GPU_ATOMIC(atomic='update') + rhs_vf(momxb)%sf(j, k, l) = rhs_vf(momxb)%sf(j, k, l) + & + 0.5_wp*mu_R*vflux_R_arr(3)*(1._wp/dx(j)) + $:GPU_ATOMIC(atomic='update') + rhs_vf(E_idx)%sf(j, k, l) = rhs_vf(E_idx)%sf(j, k, l) + & + 0.5_wp*mu_R*vflux_R_arr(3)*vel_R(1)*(1._wp/dx(j)) end if + E_L = 0._wp; E_R = 0._wp + $:GPU_LOOP(parallelism='[seq]') - do i = 1, num_dims - vel_L(i) = vel_L(i) + coeff_L(q)*q_cons_vf(momxb + i - 1)%sf(j + q, k, l) + do q = vidxb + 1, vidxe + E_L = E_L + coeff_L(q)*q_cons_vf(E_idx)%sf(j + q, k, l) end do - end do - $:GPU_LOOP(parallelism='[seq]') - do q = vidxb, vidxe - 1 $:GPU_LOOP(parallelism='[seq]') + do q = vidxb, vidxe - 1 + E_R = E_R + coeff_R(q)*q_cons_vf(E_idx)%sf(j + q, k, l) + end do + + call s_get_derived_states(E_L, gamma_L, pi_inf_L, rho_L, vel_L, & + E_R, gamma_R, pi_inf_R, rho_R, vel_R, & + pres_L, pres_R, cfl) + do i = 1, num_fluids - alpha_rho_R(i) = alpha_rho_R(i) + coeff_R(q)*q_cons_vf(i)%sf(j + q, k, l) + $:GPU_ATOMIC(atomic='update') + rhs_vf(i)%sf(j + 1, k, l) = rhs_vf(i)%sf(j + 1, k, l) + & + (0.5_wp*(alpha_rho_L(i)* & + vel_L(1))*(1._wp/dx(j + 1)) - & + 0.5_wp*cfl*(alpha_rho_L(i))*(1._wp/dx(j + 1))) + + $:GPU_ATOMIC(atomic='update') + rhs_vf(i)%sf(j, k, l) = rhs_vf(i)%sf(j, k, l) - & + (0.5_wp*(alpha_rho_L(i)* & + vel_L(1))*(1._wp/dx(j)) - & + 0.5_wp*cfl*(alpha_rho_L(i))*(1._wp/dx(j))) end do if (num_fluids > 1) then $:GPU_LOOP(parallelism='[seq]') do i = 1, num_fluids - 1 - alpha_R(i) = alpha_R(i) + coeff_R(q)*q_cons_vf(E_idx + i)%sf(j + q, k, l) + $:GPU_ATOMIC(atomic='update') + rhs_vf(advxb + i - 1)%sf(j + 1, k, l) = rhs_vf(advxb + i - 1)%sf(j + 1, k, l) + & + (0.5_wp*(alpha_L(i)* & + vel_L(1))*(1._wp/dx(j + 1)) - & + 0.5_wp*cfl*(alpha_L(i))*(1._wp/dx(j + 1))) + + $:GPU_ATOMIC(atomic='update') + rhs_vf(advxb + i - 1)%sf(j + 1, k, l) = rhs_vf(advxb + i - 1)%sf(j + 1, k, l) & + - (0.5_wp*q_cons_vf(advxb + i - 1)%sf(j + 1, k, l)*vel_L(1)*(1._wp/dx(j + 1))) + + $:GPU_ATOMIC(atomic='update') + rhs_vf(advxb + i - 1)%sf(j, k, l) = rhs_vf(advxb + i - 1)%sf(j, k, l) - & + (0.5_wp*(alpha_L(i)* & + vel_L(1))*(1._wp/dx(j)) - & + 0.5_wp*cfl*(alpha_L(i))*(1._wp/dx(j))) + + $:GPU_ATOMIC(atomic='update') + rhs_vf(advxb + i - 1)%sf(j, k, l) = rhs_vf(advxb + i - 1)%sf(j, k, l) & + + (0.5_wp*q_cons_vf(advxb + i - 1)%sf(j, k, l)*vel_L(1)*(1._wp/dx(j))) end do - else - alpha_R(1) = 1._wp end if - $:GPU_LOOP(parallelism='[seq]') - do i = 1, num_dims - vel_R(i) = vel_R(i) + coeff_R(q)*q_cons_vf(momxb + i - 1)%sf(j + q, k, l) - end do - end do + $:GPU_ATOMIC(atomic='update') + rhs_vf(momxb)%sf(j + 1, k, l) = rhs_vf(momxb)%sf(j + 1, k, l) + & + (0.5_wp*(rho_L*(vel_L(1))**2.0 + & + pres_L)*(1._wp/dx(j + 1)) - & + 0.5_wp*cfl*(rho_L*vel_L(1))*(1._wp/dx(j + 1))) - if (num_fluids > 1) then - alpha_L(num_fluids) = 1._wp - sum(alpha_L(1:num_fluids - 1)) - alpha_R(num_fluids) = 1._wp - sum(alpha_R(1:num_fluids - 1)) - end if + $:GPU_ATOMIC(atomic='update') + rhs_vf(momxb + 1)%sf(j + 1, k, l) = rhs_vf(momxb + 1)%sf(j + 1, k, l) + & + (0.5_wp*rho_L*vel_L(1)*vel_L(2)*(1._wp/dx(j + 1)) - & + 0.5_wp*cfl*(rho_L*vel_L(2))*(1._wp/dx(j + 1))) - rho_L = sum(alpha_rho_L) - gamma_L = sum(alpha_L*gammas) - pi_inf_L = sum(alpha_L*pi_infs) + $:GPU_ATOMIC(atomic='update') + rhs_vf(E_idx)%sf(j + 1, k, l) = rhs_vf(E_idx)%sf(j + 1, k, l) + & + (0.5_wp*(vel_L(1)*(E_L + & + pres_L))*(1._wp/dx(j + 1)) - & + 0.5_wp*cfl*(E_L)*(1._wp/dx(j + 1))) - rho_R = sum(alpha_rho_R) - gamma_R = sum(alpha_R*gammas) - pi_inf_R = sum(alpha_R*pi_infs) + $:GPU_ATOMIC(atomic='update') + rhs_vf(momxb)%sf(j, k, l) = rhs_vf(momxb)%sf(j, k, l) - & + (0.5_wp*(rho_L*(vel_L(1))**2.0 + & + pres_L)*(1._wp/dx(j)) - & + 0.5_wp*cfl*(rho_L*vel_L(1))*(1._wp/dx(j))) - vel_L = vel_L/rho_L - vel_R = vel_R/rho_R + $:GPU_ATOMIC(atomic='update') + rhs_vf(momxb + 1)%sf(j, k, l) = rhs_vf(momxb + 1)%sf(j, k, l) - & + (0.5_wp*rho_L*vel_L(1)*vel_L(2)*(1._wp/dx(j)) - & + 0.5_wp*cfl*(rho_L*vel_L(2))*(1._wp/dx(j))) + + $:GPU_ATOMIC(atomic='update') + rhs_vf(E_idx)%sf(j, k, l) = rhs_vf(E_idx)%sf(j, k, l) - & + (0.5_wp*(vel_L(1)*(E_L + & + pres_L))*(1._wp/dx(j)) - & + 0.5_wp*cfl*(E_L)*(1._wp/dx(j))) - if (viscous) then - mu_L = 0._wp; mu_R = 0._wp $:GPU_LOOP(parallelism='[seq]') do i = 1, num_fluids - mu_L = alpha_L(i)/Res(1, i) + mu_L - mu_R = alpha_R(i)/Res(1, i) + mu_R + $:GPU_ATOMIC(atomic='update') + rhs_vf(i)%sf(j + 1, k, l) = rhs_vf(i)%sf(j + 1, k, l) + & + (0.5_wp*(alpha_rho_R(i)* & + vel_R(1))*(1._wp/dx(j + 1)) + & + 0.5_wp*cfl*(alpha_rho_R(i))*(1._wp/dx(j + 1))) + + $:GPU_ATOMIC(atomic='update') + rhs_vf(i)%sf(j, k, l) = rhs_vf(i)%sf(j, k, l) - & + (0.5_wp*(alpha_rho_R(i)* & + vel_R(1))*(1._wp/dx(j)) + & + 0.5_wp*cfl*(alpha_rho_R(i))*(1._wp/dx(j))) end do - $:GPU_ATOMIC(atomic='update') - rhs_vf(momxb + 1)%sf(j + 1, k, l) = rhs_vf(momxb + 1)%sf(j + 1, k, l) - & - 0.5_wp*mu_L*vflux_L_arr(1)*(1._wp/dx(j + 1)) - $:GPU_ATOMIC(atomic='update') - rhs_vf(E_idx)%sf(j + 1, k, l) = rhs_vf(E_idx)%sf(j + 1, k, l) - & - 0.5_wp*mu_L*vflux_L_arr(1)*vel_L(2)*(1._wp/dx(j + 1)) + if (num_fluids > 1) then + $:GPU_LOOP(parallelism='[seq]') + do i = 1, num_fluids - 1 + $:GPU_ATOMIC(atomic='update') + rhs_vf(advxb + i - 1)%sf(j + 1, k, l) = rhs_vf(advxb + i - 1)%sf(j + 1, k, l) + & + (0.5_wp*(alpha_R(i)* & + vel_R(1))*(1._wp/dx(j + 1)) + & + 0.5_wp*cfl*(alpha_R(i))*(1._wp/dx(j + 1))) + + $:GPU_ATOMIC(atomic='update') + rhs_vf(advxb + i - 1)%sf(j + 1, k, l) = rhs_vf(advxb + i - 1)%sf(j + 1, k, l) & + - (0.5_wp*q_cons_vf(advxb + i - 1)%sf(j + 1, k, l)*vel_R(1)*(1._wp/dx(j + 1))) + + $:GPU_ATOMIC(atomic='update') + rhs_vf(advxb + i - 1)%sf(j, k, l) = rhs_vf(advxb + i - 1)%sf(j, k, l) - & + (0.5_wp*(alpha_R(i)* & + vel_R(1))*(1._wp/dx(j)) + & + 0.5_wp*cfl*(alpha_R(i))*(1._wp/dx(j))) - $:GPU_ATOMIC(atomic='update') - rhs_vf(momxb + 1)%sf(j, k, l) = rhs_vf(momxb + 1)%sf(j, k, l) + & - 0.5_wp*mu_L*vflux_L_arr(1)*(1._wp/dx(j)) - $:GPU_ATOMIC(atomic='update') - rhs_vf(E_idx)%sf(j, k, l) = rhs_vf(E_idx)%sf(j, k, l) + & - 0.5_wp*mu_L*vflux_L_arr(1)*vel_L(2)*(1._wp/dx(j)) + $:GPU_ATOMIC(atomic='update') + rhs_vf(advxb + i - 1)%sf(j, k, l) = rhs_vf(advxb + i - 1)%sf(j, k, l) & + + (0.5_wp*q_cons_vf(advxb + i - 1)%sf(j, k, l)*vel_R(1)*(1._wp/dx(j))) + end do + end if $:GPU_ATOMIC(atomic='update') - rhs_vf(momxb + 1)%sf(j + 1, k, l) = rhs_vf(momxb + 1)%sf(j + 1, k, l) - & - 0.5_wp*mu_R*vflux_R_arr(1)*(1._wp/dx(j + 1)) - $:GPU_ATOMIC(atomic='update') - rhs_vf(E_idx)%sf(j + 1, k, l) = rhs_vf(E_idx)%sf(j + 1, k, l) - & - 0.5_wp*mu_R*vflux_R_arr(1)*vel_R(2)*(1._wp/dx(j + 1)) + rhs_vf(momxb)%sf(j + 1, k, l) = rhs_vf(momxb)%sf(j + 1, k, l) + & + (0.5_wp*(rho_R*(vel_R(1))**2.0 + & + pres_R)*(1._wp/dx(j + 1)) + & + 0.5_wp*cfl*(rho_R*vel_R(1))*(1._wp/dx(j + 1))) $:GPU_ATOMIC(atomic='update') - rhs_vf(momxb + 1)%sf(j, k, l) = rhs_vf(momxb + 1)%sf(j, k, l) + & - 0.5_wp*mu_R*vflux_R_arr(1)*(1._wp/dx(j)) - $:GPU_ATOMIC(atomic='update') - rhs_vf(E_idx)%sf(j, k, l) = rhs_vf(E_idx)%sf(j, k, l) + & - 0.5_wp*mu_R*vflux_R_arr(1)*vel_R(2)*(1._wp/dx(j)) + rhs_vf(momxb + 1)%sf(j + 1, k, l) = rhs_vf(momxb + 1)%sf(j + 1, k, l) + & + (0.5_wp*rho_R*vel_R(1)*vel_R(2)*(1._wp/dx(j + 1)) + & + 0.5_wp*cfl*(rho_R*vel_R(2))*(1._wp/dx(j + 1))) $:GPU_ATOMIC(atomic='update') - rhs_vf(momxb)%sf(j + 1, k, l) = rhs_vf(momxb)%sf(j + 1, k, l) - & - 0.5_wp*mu_L*vflux_L_arr(3)*(1._wp/dx(j + 1)) - $:GPU_ATOMIC(atomic='update') - rhs_vf(E_idx)%sf(j + 1, k, l) = rhs_vf(E_idx)%sf(j + 1, k, l) - & - 0.5_wp*mu_L*vflux_L_arr(3)*vel_L(1)*(1._wp/dx(j + 1)) + rhs_vf(E_idx)%sf(j + 1, k, l) = rhs_vf(E_idx)%sf(j + 1, k, l) + & + (0.5_wp*(vel_R(1)*(E_R + & + pres_R))*(1._wp/dx(j + 1)) + & + 0.5_wp*cfl*(E_R)*(1._wp/dx(j + 1))) $:GPU_ATOMIC(atomic='update') - rhs_vf(momxb)%sf(j, k, l) = rhs_vf(momxb)%sf(j, k, l) + & - 0.5_wp*mu_L*vflux_L_arr(3)*(1._wp/dx(j)) - $:GPU_ATOMIC(atomic='update') - rhs_vf(E_idx)%sf(j, k, l) = rhs_vf(E_idx)%sf(j, k, l) + & - 0.5_wp*mu_L*vflux_L_arr(3)*vel_L(1)*(1._wp/dx(j)) + rhs_vf(momxb)%sf(j, k, l) = rhs_vf(momxb)%sf(j, k, l) - & + (0.5_wp*(rho_R*(vel_R(1))**2.0 + & + pres_R)*(1._wp/dx(j)) + & + 0.5_wp*cfl*(rho_R*vel_R(1))*(1._wp/dx(j))) $:GPU_ATOMIC(atomic='update') - rhs_vf(momxb)%sf(j + 1, k, l) = rhs_vf(momxb)%sf(j + 1, k, l) - & - 0.5_wp*mu_R*vflux_R_arr(3)*(1._wp/dx(j + 1)) - $:GPU_ATOMIC(atomic='update') - rhs_vf(E_idx)%sf(j + 1, k, l) = rhs_vf(E_idx)%sf(j + 1, k, l) - & - 0.5_wp*mu_R*vflux_R_arr(3)*vel_R(1)*(1._wp/dx(j + 1)) + rhs_vf(momxb + 1)%sf(j, k, l) = rhs_vf(momxb + 1)%sf(j, k, l) - & + (0.5_wp*rho_R*vel_R(1)*vel_R(2)*(1._wp/dx(j)) + & + 0.5_wp*cfl*(rho_R*vel_R(2))*(1._wp/dx(j))) $:GPU_ATOMIC(atomic='update') - rhs_vf(momxb)%sf(j, k, l) = rhs_vf(momxb)%sf(j, k, l) + & - 0.5_wp*mu_R*vflux_R_arr(3)*(1._wp/dx(j)) - $:GPU_ATOMIC(atomic='update') - rhs_vf(E_idx)%sf(j, k, l) = rhs_vf(E_idx)%sf(j, k, l) + & - 0.5_wp*mu_R*vflux_R_arr(3)*vel_R(1)*(1._wp/dx(j)) - end if - - E_L = 0._wp; E_R = 0._wp - - $:GPU_LOOP(parallelism='[seq]') - do q = vidxb + 1, vidxe - E_L = E_L + coeff_L(q)*q_cons_vf(E_idx)%sf(j + q, k, l) - end do + rhs_vf(E_idx)%sf(j, k, l) = rhs_vf(E_idx)%sf(j, k, l) - & + (0.5_wp*(vel_R(1)*(E_R + & + pres_R))*(1._wp/dx(j)) + & + 0.5_wp*cfl*(E_R)*(1._wp/dx(j))) - $:GPU_LOOP(parallelism='[seq]') - do q = vidxb, vidxe - 1 - E_R = E_R + coeff_R(q)*q_cons_vf(E_idx)%sf(j + q, k, l) end do + end do + end do + #:endcall GPU_PARALLEL_LOOP + else + #:call GPU_PARALLEL_LOOP(collapse=3, private='[rho_L, rho_R, gamma_L, gamma_R, pi_inf_L, pi_inf_R, mu_L, mu_R, vel_L, vel_R, pres_L, pres_R, alpha_L, alpha_R, alpha_rho_L, alpha_rho_R, F_L, F_R, E_L, E_R, cfl, dvel, dvel_small, rho_sf_small, vflux_L_arr, vflux_R_arr]') + do l = 0, p + do k = 0, n + do j = -1, m - call s_get_derived_states(E_L, gamma_L, pi_inf_L, rho_L, vel_L, & - E_R, gamma_R, pi_inf_R, rho_R, vel_R, & - pres_L, pres_R, cfl) + dvel = 0._wp + vflux_L_arr = 0._wp + vflux_R_arr = 0._wp - do i = 1, num_fluids - $:GPU_ATOMIC(atomic='update') - rhs_vf(i)%sf(j + 1, k, l) = rhs_vf(i)%sf(j + 1, k, l) + & - (0.5_wp*(alpha_rho_L(i)* & - vel_L(1))*(1._wp/dx(j + 1)) - & - 0.5_wp*cfl*(alpha_rho_L(i))*(1._wp/dx(j + 1))) + #:if MFC_CASE_OPTIMIZATION + #:if igr_order == 5 + !DIR$ unroll 6 + #:elif igr_order == 3 + !DIR$ unroll 4 + #:endif + #:endif + $:GPU_LOOP(parallelism='[seq]') + do q = vidxb, vidxe + dvel_small = 0._wp + !x-direction contributions + $:GPU_LOOP(parallelism='[seq]') + do i = -1, 1 + rho_L = 0._wp + $:GPU_LOOP(parallelism='[seq]') + do r = 1, num_fluids + rho_L = rho_L + q_cons_vf(r)%sf(j + i + q, k, l) + end do + rho_sf_small(i) = rho_L + end do - $:GPU_ATOMIC(atomic='update') - rhs_vf(i)%sf(j, k, l) = rhs_vf(i)%sf(j, k, l) - & - (0.5_wp*(alpha_rho_L(i)* & - vel_L(1))*(1._wp/dx(j)) - & - 0.5_wp*cfl*(alpha_rho_L(i))*(1._wp/dx(j))) - end do + dvel_small(1) = (1/(2._wp*dx(j)))*( & + q_cons_vf(momxb)%sf(j + 1 + q, k, l)/rho_sf_small(1) - & + q_cons_vf(momxb)%sf(j - 1 + q, k, l)/rho_sf_small(-1)) + dvel_small(2) = (1/(2._wp*dx(j)))*( & + q_cons_vf(momxb + 1)%sf(j + 1 + q, k, l)/rho_sf_small(1) - & + q_cons_vf(momxb + 1)%sf(j - 1 + q, k, l)/rho_sf_small(-1)) + dvel_small(3) = (1/(2._wp*dx(j)))*( & + q_cons_vf(momxb + 2)%sf(j + 1 + q, k, l)/rho_sf_small(1) - & + q_cons_vf(momxb + 2)%sf(j - 1 + q, k, l)/rho_sf_small(-1)) - if (num_fluids > 1) then - $:GPU_LOOP(parallelism='[seq]') - do i = 1, num_fluids - 1 - $:GPU_ATOMIC(atomic='update') - rhs_vf(advxb + i - 1)%sf(j + 1, k, l) = rhs_vf(advxb + i - 1)%sf(j + 1, k, l) + & - (0.5_wp*(alpha_L(i)* & - vel_L(1))*(1._wp/dx(j + 1)) - & - 0.5_wp*cfl*(alpha_L(i))*(1._wp/dx(j + 1))) + if (q == 0) dvel(:, 1) = dvel_small + if (q > vidxb) then + vflux_L_arr(1) = vflux_L_arr(1) + coeff_L(q)*(dvel_small(2)) + vflux_L_arr(2) = vflux_L_arr(2) + coeff_L(q)*(dvel_small(3)) + vflux_L_arr(3) = vflux_L_arr(3) + coeff_L(q)*(4._wp*dvel_small(1))/3._wp + end if + if (q < vidxe) then + vflux_R_arr(1) = vflux_R_arr(1) + coeff_R(q)*(dvel_small(2)) + vflux_R_arr(2) = vflux_R_arr(2) + coeff_R(q)*(dvel_small(3)) + vflux_R_arr(3) = vflux_R_arr(3) + coeff_R(q)*(4._wp*dvel_small(1))/3._wp + end if - $:GPU_ATOMIC(atomic='update') - rhs_vf(advxb + i - 1)%sf(j + 1, k, l) = rhs_vf(advxb + i - 1)%sf(j + 1, k, l) & - - (0.5_wp*q_cons_vf(advxb + i - 1)%sf(j + 1, k, l)*vel_L(1)*(1._wp/dx(j + 1))) + !y-direction contributions + $:GPU_LOOP(parallelism='[seq]') + do i = -1, 1 + rho_L = 0._wp + $:GPU_LOOP(parallelism='[seq]') + do r = 1, num_fluids + rho_L = rho_L + q_cons_vf(r)%sf(j + q, k + i, l) + end do + rho_sf_small(i) = rho_L + end do - $:GPU_ATOMIC(atomic='update') - rhs_vf(advxb + i - 1)%sf(j, k, l) = rhs_vf(advxb + i - 1)%sf(j, k, l) - & - (0.5_wp*(alpha_L(i)* & - vel_L(1))*(1._wp/dx(j)) - & - 0.5_wp*cfl*(alpha_L(i))*(1._wp/dx(j))) + dvel_small(1) = (1/(2._wp*dy(k)))*( & + q_cons_vf(momxb)%sf(j + q, k + 1, l)/rho_sf_small(1) - & + q_cons_vf(momxb)%sf(j + q, k - 1, l)/rho_sf_small(-1)) + dvel_small(2) = (1/(2._wp*dy(k)))*( & + q_cons_vf(momxb + 1)%sf(j + q, k + 1, l)/rho_sf_small(1) - & + q_cons_vf(momxb + 1)%sf(j + q, k - 1, l)/rho_sf_small(-1)) + if (q == 0) dvel_small(3) = (1/(2._wp*dy(k)))*( & + q_cons_vf(momxb + 2)%sf(j + q, k + 1, l)/rho_sf_small(1) - & + q_cons_vf(momxb + 2)%sf(j + q, k - 1, l)/rho_sf_small(-1)) + if (q == 0) dvel(:, 2) = dvel_small - $:GPU_ATOMIC(atomic='update') - rhs_vf(advxb + i - 1)%sf(j, k, l) = rhs_vf(advxb + i - 1)%sf(j, k, l) & - + (0.5_wp*q_cons_vf(advxb + i - 1)%sf(j, k, l)*vel_L(1)*(1._wp/dx(j))) - end do - end if + if (q > vidxb) then + vflux_L_arr(1) = vflux_L_arr(1) + coeff_L(q)*(dvel_small(1)) + vflux_L_arr(3) = vflux_L_arr(3) + coeff_L(q)*(-2._wp*dvel_small(2))/3._wp + end if + if (q < vidxe) then + vflux_R_arr(1) = vflux_R_arr(1) + coeff_R(q)*(dvel_small(1)) + vflux_R_arr(3) = vflux_R_arr(3) + coeff_R(q)*(-2._wp*dvel_small(2))/3._wp + end if - $:GPU_ATOMIC(atomic='update') - rhs_vf(momxb)%sf(j + 1, k, l) = rhs_vf(momxb)%sf(j + 1, k, l) + & - (0.5_wp*(rho_L*(vel_L(1))**2.0 + & - pres_L)*(1._wp/dx(j + 1)) - & - 0.5_wp*cfl*(rho_L*vel_L(1))*(1._wp/dx(j + 1))) + !z-direction contributions + $:GPU_LOOP(parallelism='[seq]') + do i = -1, 1 + rho_L = 0._wp + $:GPU_LOOP(parallelism='[seq]') + do r = 1, num_fluids + rho_L = rho_L + q_cons_vf(r)%sf(j + q, k, l + i) + end do + rho_sf_small(i) = rho_L + end do - $:GPU_ATOMIC(atomic='update') - rhs_vf(momxb + 1)%sf(j + 1, k, l) = rhs_vf(momxb + 1)%sf(j + 1, k, l) + & - (0.5_wp*rho_L*vel_L(1)*vel_L(2)*(1._wp/dx(j + 1)) - & - 0.5_wp*cfl*(rho_L*vel_L(2))*(1._wp/dx(j + 1))) + dvel_small(1) = (1/(2._wp*dz(l)))*( & + q_cons_vf(momxb)%sf(j + q, k, l + 1)/rho_sf_small(1) - & + q_cons_vf(momxb)%sf(j + q, k, l - 1)/rho_sf_small(-1)) + if (q == 0) dvel_small(2) = (1/(2._wp*dz(l)))*( & + q_cons_vf(momxb + 1)%sf(j + q, k, l + 1)/rho_sf_small(1) - & + q_cons_vf(momxb + 1)%sf(j + q, k, l - 1)/rho_sf_small(-1)) + dvel_small(3) = (1/(2._wp*dz(l)))*( & + q_cons_vf(momxb + 2)%sf(j + q, k, l + 1)/rho_sf_small(1) - & + q_cons_vf(momxb + 2)%sf(j + q, k, l - 1)/rho_sf_small(-1)) + if (q == 0) dvel(:, 3) = dvel_small - $:GPU_ATOMIC(atomic='update') - rhs_vf(E_idx)%sf(j + 1, k, l) = rhs_vf(E_idx)%sf(j + 1, k, l) + & - (0.5_wp*(vel_L(1)*(E_L + & - pres_L))*(1._wp/dx(j + 1)) - & - 0.5_wp*cfl*(E_L)*(1._wp/dx(j + 1))) + if (q > vidxb) then + vflux_L_arr(2) = vflux_L_arr(2) + coeff_L(q)*(dvel_small(1)) + vflux_L_arr(3) = vflux_L_arr(3) + coeff_L(q)*(-2._wp*dvel_small(3))/3._wp + end if + if (q < vidxe) then + vflux_R_arr(2) = vflux_R_arr(2) + coeff_R(q)*(dvel_small(1)) + vflux_R_arr(3) = vflux_R_arr(3) + coeff_R(q)*(-2._wp*dvel_small(3))/3._wp + end if - $:GPU_ATOMIC(atomic='update') - rhs_vf(momxb)%sf(j, k, l) = rhs_vf(momxb)%sf(j, k, l) - & - (0.5_wp*(rho_L*(vel_L(1))**2.0 + & - pres_L)*(1._wp/dx(j)) - & - 0.5_wp*cfl*(rho_L*vel_L(1))*(1._wp/dx(j))) + if (q == 0) then + jac_rhs(j, k, l) = alf_igr*(2._wp*(dvel(1, 2)*dvel(2, 1) & + + dvel(1, 3)*dvel(3, 1) & + + dvel(2, 3)*dvel(3, 2)) & + + dvel(1, 1)**2._wp + dvel(2, 2)**2._wp & + + dvel(3, 3)**2._wp & + + (dvel(1, 1) + dvel(2, 2) + dvel(3, 3))**2._wp) + end if + end do - $:GPU_ATOMIC(atomic='update') - rhs_vf(momxb + 1)%sf(j, k, l) = rhs_vf(momxb + 1)%sf(j, k, l) - & - (0.5_wp*rho_L*vel_L(1)*vel_L(2)*(1._wp/dx(j)) - & - 0.5_wp*cfl*(rho_L*vel_L(2))*(1._wp/dx(j))) + alpha_rho_L = 0._wp; alpha_rho_R = 0._wp + alpha_L = 0._wp; alpha_R = 0._wp + vel_L = 0._wp; vel_R = 0._wp - $:GPU_ATOMIC(atomic='update') - rhs_vf(E_idx)%sf(j, k, l) = rhs_vf(E_idx)%sf(j, k, l) - & - (0.5_wp*(vel_L(1)*(E_L + & - pres_L))*(1._wp/dx(j)) - & - 0.5_wp*cfl*(E_L)*(1._wp/dx(j))) + $:GPU_LOOP(parallelism='[seq]') + do q = vidxb + 1, vidxe + $:GPU_LOOP(parallelism='[seq]') + do i = 1, num_fluids + alpha_rho_L(i) = alpha_rho_L(i) + coeff_L(q)*q_cons_vf(i)%sf(j + q, k, l) + end do - $:GPU_LOOP(parallelism='[seq]') - do i = 1, num_fluids - $:GPU_ATOMIC(atomic='update') - rhs_vf(i)%sf(j + 1, k, l) = rhs_vf(i)%sf(j + 1, k, l) + & - (0.5_wp*(alpha_rho_R(i)* & - vel_R(1))*(1._wp/dx(j + 1)) + & - 0.5_wp*cfl*(alpha_rho_R(i))*(1._wp/dx(j + 1))) + if (num_fluids > 1) then + $:GPU_LOOP(parallelism='[seq]') + do i = 1, num_fluids - 1 + alpha_L(i) = alpha_L(i) + coeff_L(q)*q_cons_vf(E_idx + i)%sf(j + q, k, l) + end do + else + alpha_L(1) = 1._wp + end if - $:GPU_ATOMIC(atomic='update') - rhs_vf(i)%sf(j, k, l) = rhs_vf(i)%sf(j, k, l) - & - (0.5_wp*(alpha_rho_R(i)* & - vel_R(1))*(1._wp/dx(j)) + & - 0.5_wp*cfl*(alpha_rho_R(i))*(1._wp/dx(j))) - end do + $:GPU_LOOP(parallelism='[seq]') + do i = 1, num_dims + vel_L(i) = vel_L(i) + coeff_L(q)*q_cons_vf(momxb + i - 1)%sf(j + q, k, l) + end do + end do - if (num_fluids > 1) then $:GPU_LOOP(parallelism='[seq]') - do i = 1, num_fluids - 1 - $:GPU_ATOMIC(atomic='update') - rhs_vf(advxb + i - 1)%sf(j + 1, k, l) = rhs_vf(advxb + i - 1)%sf(j + 1, k, l) + & - (0.5_wp*(alpha_R(i)* & - vel_R(1))*(1._wp/dx(j + 1)) + & - 0.5_wp*cfl*(alpha_R(i))*(1._wp/dx(j + 1))) - - $:GPU_ATOMIC(atomic='update') - rhs_vf(advxb + i - 1)%sf(j + 1, k, l) = rhs_vf(advxb + i - 1)%sf(j + 1, k, l) & - - (0.5_wp*q_cons_vf(advxb + i - 1)%sf(j + 1, k, l)*vel_R(1)*(1._wp/dx(j + 1))) + do q = vidxb, vidxe - 1 + $:GPU_LOOP(parallelism='[seq]') + do i = 1, num_fluids + alpha_rho_R(i) = alpha_rho_R(i) + coeff_R(q)*q_cons_vf(i)%sf(j + q, k, l) + end do - $:GPU_ATOMIC(atomic='update') - rhs_vf(advxb + i - 1)%sf(j, k, l) = rhs_vf(advxb + i - 1)%sf(j, k, l) - & - (0.5_wp*(alpha_R(i)* & - vel_R(1))*(1._wp/dx(j)) + & - 0.5_wp*cfl*(alpha_R(i))*(1._wp/dx(j))) + if (num_fluids > 1) then + $:GPU_LOOP(parallelism='[seq]') + do i = 1, num_fluids - 1 + alpha_R(i) = alpha_R(i) + coeff_R(q)*q_cons_vf(E_idx + i)%sf(j + q, k, l) + end do + else + alpha_R(1) = 1._wp + end if - $:GPU_ATOMIC(atomic='update') - rhs_vf(advxb + i - 1)%sf(j, k, l) = rhs_vf(advxb + i - 1)%sf(j, k, l) & - + (0.5_wp*q_cons_vf(advxb + i - 1)%sf(j, k, l)*vel_R(1)*(1._wp/dx(j))) + $:GPU_LOOP(parallelism='[seq]') + do i = 1, num_dims + vel_R(i) = vel_R(i) + coeff_R(q)*q_cons_vf(momxb + i - 1)%sf(j + q, k, l) + end do end do - end if - $:GPU_ATOMIC(atomic='update') - rhs_vf(momxb)%sf(j + 1, k, l) = rhs_vf(momxb)%sf(j + 1, k, l) + & - (0.5_wp*(rho_R*(vel_R(1))**2.0 + & - pres_R)*(1._wp/dx(j + 1)) + & - 0.5_wp*cfl*(rho_R*vel_R(1))*(1._wp/dx(j + 1))) + if (num_fluids > 1) then + alpha_L(num_fluids) = 1._wp - sum(alpha_L(1:num_fluids - 1)) + alpha_R(num_fluids) = 1._wp - sum(alpha_R(1:num_fluids - 1)) + end if - $:GPU_ATOMIC(atomic='update') - rhs_vf(momxb + 1)%sf(j + 1, k, l) = rhs_vf(momxb + 1)%sf(j + 1, k, l) + & - (0.5_wp*rho_R*vel_R(1)*vel_R(2)*(1._wp/dx(j + 1)) + & - 0.5_wp*cfl*(rho_R*vel_R(2))*(1._wp/dx(j + 1))) + rho_L = sum(alpha_rho_L) + gamma_L = sum(alpha_L*gammas) + pi_inf_L = sum(alpha_L*pi_infs) - $:GPU_ATOMIC(atomic='update') - rhs_vf(E_idx)%sf(j + 1, k, l) = rhs_vf(E_idx)%sf(j + 1, k, l) + & - (0.5_wp*(vel_R(1)*(E_R + & - pres_R))*(1._wp/dx(j + 1)) + & - 0.5_wp*cfl*(E_R)*(1._wp/dx(j + 1))) + rho_R = sum(alpha_rho_R) + gamma_R = sum(alpha_R*gammas) + pi_inf_R = sum(alpha_R*pi_infs) - $:GPU_ATOMIC(atomic='update') - rhs_vf(momxb)%sf(j, k, l) = rhs_vf(momxb)%sf(j, k, l) - & - (0.5_wp*(rho_R*(vel_R(1))**2.0 + & - pres_R)*(1._wp/dx(j)) + & - 0.5_wp*cfl*(rho_R*vel_R(1))*(1._wp/dx(j))) + vel_L = vel_L/rho_L + vel_R = vel_R/rho_R - $:GPU_ATOMIC(atomic='update') - rhs_vf(momxb + 1)%sf(j, k, l) = rhs_vf(momxb + 1)%sf(j, k, l) - & - (0.5_wp*rho_R*vel_R(1)*vel_R(2)*(1._wp/dx(j)) + & - 0.5_wp*cfl*(rho_R*vel_R(2))*(1._wp/dx(j))) + if (viscous) then + mu_L = 0._wp + mu_R = 0._wp + $:GPU_LOOP(parallelism='[seq]') + do i = 1, num_fluids + mu_L = alpha_L(i)/Res_igr(1, i) + mu_L + mu_R = alpha_R(i)/Res_igr(1, i) + mu_R + end do - $:GPU_ATOMIC(atomic='update') - rhs_vf(E_idx)%sf(j, k, l) = rhs_vf(E_idx)%sf(j, k, l) - & - (0.5_wp*(vel_R(1)*(E_R + & - pres_R))*(1._wp/dx(j)) + & - 0.5_wp*cfl*(E_R)*(1._wp/dx(j))) + $:GPU_ATOMIC(atomic='update') + rhs_vf(momxb + 1)%sf(j + 1, k, l) = rhs_vf(momxb + 1)%sf(j + 1, k, l) - & + 0.5_wp*mu_L*vflux_L_arr(1)*(1._wp/dx(j + 1)) + $:GPU_ATOMIC(atomic='update') + rhs_vf(E_idx)%sf(j + 1, k, l) = rhs_vf(E_idx)%sf(j + 1, k, l) - & + 0.5_wp*mu_L*vflux_L_arr(1)*vel_L(2)*(1._wp/dx(j + 1)) - end do - end do - end do - else - $:GPU_PARALLEL_LOOP(collapse=3, private='[rho_L, rho_R, gamma_L, & - & gamma_R, pi_inf_L, pi_inf_R, mu_L, mu_R, vel_L, vel_R, & - & pres_L, pres_R, alpha_L, alpha_R, alpha_rho_L, alpha_rho_R, & - & F_L, F_R, E_L, E_R, cfl, dvel, dvel_small, rho_sf_small, & - & vflux_L_arr, vflux_R_arr]') - do l = 0, p - do k = 0, n - do j = -1, m + $:GPU_ATOMIC(atomic='update') + rhs_vf(momxb + 1)%sf(j, k, l) = rhs_vf(momxb + 1)%sf(j, k, l) + & + 0.5_wp*mu_L*vflux_L_arr(1)*(1._wp/dx(j)) + $:GPU_ATOMIC(atomic='update') + rhs_vf(E_idx)%sf(j, k, l) = rhs_vf(E_idx)%sf(j, k, l) + & + 0.5_wp*mu_L*vflux_L_arr(1)*vel_L(2)*(1._wp/dx(j)) - dvel = 0._wp - vflux_L_arr = 0._wp - vflux_R_arr = 0._wp + $:GPU_ATOMIC(atomic='update') + rhs_vf(momxb + 1)%sf(j + 1, k, l) = rhs_vf(momxb + 1)%sf(j + 1, k, l) - & + 0.5_wp*mu_R*vflux_R_arr(1)*(1._wp/dx(j + 1)) + $:GPU_ATOMIC(atomic='update') + rhs_vf(E_idx)%sf(j + 1, k, l) = rhs_vf(E_idx)%sf(j + 1, k, l) - & + 0.5_wp*mu_R*vflux_R_arr(1)*vel_R(2)*(1._wp/dx(j + 1)) - #:if MFC_CASE_OPTIMIZATION - #:if igr_order == 5 - !DIR$ unroll 6 - #:elif igr_order == 3 - !DIR$ unroll 4 - #:endif - #:endif - $:GPU_LOOP(parallelism='[seq]') - do q = vidxb, vidxe - dvel_small = 0._wp - !x-direction contributions - $:GPU_LOOP(parallelism='[seq]') - do i = -1, 1 - rho_L = 0._wp - $:GPU_LOOP(parallelism='[seq]') - do r = 1, num_fluids - rho_L = rho_L + q_cons_vf(r)%sf(j + i + q, k, l) - end do - rho_sf_small(i) = rho_L - end do + $:GPU_ATOMIC(atomic='update') + rhs_vf(momxb + 1)%sf(j, k, l) = rhs_vf(momxb + 1)%sf(j, k, l) + & + 0.5_wp*mu_R*vflux_R_arr(1)*(1._wp/dx(j)) + $:GPU_ATOMIC(atomic='update') + rhs_vf(E_idx)%sf(j, k, l) = rhs_vf(E_idx)%sf(j, k, l) + & + 0.5_wp*mu_R*vflux_R_arr(1)*vel_R(2)*(1._wp/dx(j)) - dvel_small(1) = (1/(2._wp*dx(j)))*( & - q_cons_vf(momxb)%sf(j + 1 + q, k, l)/rho_sf_small(1) - & - q_cons_vf(momxb)%sf(j - 1 + q, k, l)/rho_sf_small(-1)) - dvel_small(2) = (1/(2._wp*dx(j)))*( & - q_cons_vf(momxb + 1)%sf(j + 1 + q, k, l)/rho_sf_small(1) - & - q_cons_vf(momxb + 1)%sf(j - 1 + q, k, l)/rho_sf_small(-1)) - dvel_small(3) = (1/(2._wp*dx(j)))*( & - q_cons_vf(momxb + 2)%sf(j + 1 + q, k, l)/rho_sf_small(1) - & - q_cons_vf(momxb + 2)%sf(j - 1 + q, k, l)/rho_sf_small(-1)) - - if (q == 0) dvel(:, 1) = dvel_small - if (q > vidxb) then - vflux_L_arr(1) = vflux_L_arr(1) + coeff_L(q)*(dvel_small(2)) - vflux_L_arr(2) = vflux_L_arr(2) + coeff_L(q)*(dvel_small(3)) - vflux_L_arr(3) = vflux_L_arr(3) + coeff_L(q)*(4._wp*dvel_small(1))/3._wp - end if - if (q < vidxe) then - vflux_R_arr(1) = vflux_R_arr(1) + coeff_R(q)*(dvel_small(2)) - vflux_R_arr(2) = vflux_R_arr(2) + coeff_R(q)*(dvel_small(3)) - vflux_R_arr(3) = vflux_R_arr(3) + coeff_R(q)*(4._wp*dvel_small(1))/3._wp - end if + $:GPU_ATOMIC(atomic='update') + rhs_vf(momxb + 2)%sf(j + 1, k, l) = rhs_vf(momxb + 2)%sf(j + 1, k, l) - & + 0.5_wp*mu_L*vflux_L_arr(2)*(1._wp/dx(j + 1)) + $:GPU_ATOMIC(atomic='update') + rhs_vf(E_idx)%sf(j + 1, k, l) = rhs_vf(E_idx)%sf(j + 1, k, l) - & + 0.5_wp*mu_L*vflux_L_arr(2)*vel_L(3)*(1._wp/dx(j + 1)) - !y-direction contributions - $:GPU_LOOP(parallelism='[seq]') - do i = -1, 1 - rho_L = 0._wp - $:GPU_LOOP(parallelism='[seq]') - do r = 1, num_fluids - rho_L = rho_L + q_cons_vf(r)%sf(j + q, k + i, l) - end do - rho_sf_small(i) = rho_L - end do + $:GPU_ATOMIC(atomic='update') + rhs_vf(momxb + 2)%sf(j, k, l) = rhs_vf(momxb + 2)%sf(j, k, l) + & + 0.5_wp*mu_L*vflux_L_arr(2)*(1._wp/dx(j)) + $:GPU_ATOMIC(atomic='update') + rhs_vf(E_idx)%sf(j, k, l) = rhs_vf(E_idx)%sf(j, k, l) + & + 0.5_wp*mu_L*vflux_L_arr(2)*vel_L(3)*(1._wp/dx(j)) - dvel_small(1) = (1/(2._wp*dy(k)))*( & - q_cons_vf(momxb)%sf(j + q, k + 1, l)/rho_sf_small(1) - & - q_cons_vf(momxb)%sf(j + q, k - 1, l)/rho_sf_small(-1)) - dvel_small(2) = (1/(2._wp*dy(k)))*( & - q_cons_vf(momxb + 1)%sf(j + q, k + 1, l)/rho_sf_small(1) - & - q_cons_vf(momxb + 1)%sf(j + q, k - 1, l)/rho_sf_small(-1)) - if (q == 0) dvel_small(3) = (1/(2._wp*dy(k)))*( & - q_cons_vf(momxb + 2)%sf(j + q, k + 1, l)/rho_sf_small(1) - & - q_cons_vf(momxb + 2)%sf(j + q, k - 1, l)/rho_sf_small(-1)) - if (q == 0) dvel(:, 2) = dvel_small - - if (q > vidxb) then - vflux_L_arr(1) = vflux_L_arr(1) + coeff_L(q)*(dvel_small(1)) - vflux_L_arr(3) = vflux_L_arr(3) + coeff_L(q)*(-2._wp*dvel_small(2))/3._wp - end if - if (q < vidxe) then - vflux_R_arr(1) = vflux_R_arr(1) + coeff_R(q)*(dvel_small(1)) - vflux_R_arr(3) = vflux_R_arr(3) + coeff_R(q)*(-2._wp*dvel_small(2))/3._wp - end if + $:GPU_ATOMIC(atomic='update') + rhs_vf(momxb + 2)%sf(j + 1, k, l) = rhs_vf(momxb + 2)%sf(j + 1, k, l) - & + 0.5_wp*mu_R*vflux_R_arr(2)*(1._wp/dx(j + 1)) + $:GPU_ATOMIC(atomic='update') + rhs_vf(E_idx)%sf(j + 1, k, l) = rhs_vf(E_idx)%sf(j + 1, k, l) - & + 0.5_wp*mu_R*vflux_R_arr(2)*vel_R(3)*(1._wp/dx(j + 1)) - !z-direction contributions - $:GPU_LOOP(parallelism='[seq]') - do i = -1, 1 - rho_L = 0._wp - $:GPU_LOOP(parallelism='[seq]') - do r = 1, num_fluids - rho_L = rho_L + q_cons_vf(r)%sf(j + q, k, l + i) - end do - rho_sf_small(i) = rho_L - end do + $:GPU_ATOMIC(atomic='update') + rhs_vf(momxb + 2)%sf(j, k, l) = rhs_vf(momxb + 2)%sf(j, k, l) + & + 0.5_wp*mu_R*vflux_R_arr(2)*(1._wp/dx(j)) + $:GPU_ATOMIC(atomic='update') + rhs_vf(E_idx)%sf(j, k, l) = rhs_vf(E_idx)%sf(j, k, l) + & + 0.5_wp*mu_R*vflux_R_arr(2)*vel_R(3)*(1._wp/dx(j)) - dvel_small(1) = (1/(2._wp*dz(l)))*( & - q_cons_vf(momxb)%sf(j + q, k, l + 1)/rho_sf_small(1) - & - q_cons_vf(momxb)%sf(j + q, k, l - 1)/rho_sf_small(-1)) - if (q == 0) dvel_small(2) = (1/(2._wp*dz(l)))*( & - q_cons_vf(momxb + 1)%sf(j + q, k, l + 1)/rho_sf_small(1) - & - q_cons_vf(momxb + 1)%sf(j + q, k, l - 1)/rho_sf_small(-1)) - dvel_small(3) = (1/(2._wp*dz(l)))*( & - q_cons_vf(momxb + 2)%sf(j + q, k, l + 1)/rho_sf_small(1) - & - q_cons_vf(momxb + 2)%sf(j + q, k, l - 1)/rho_sf_small(-1)) - if (q == 0) dvel(:, 3) = dvel_small - - if (q > vidxb) then - vflux_L_arr(2) = vflux_L_arr(2) + coeff_L(q)*(dvel_small(1)) - vflux_L_arr(3) = vflux_L_arr(3) + coeff_L(q)*(-2._wp*dvel_small(3))/3._wp - end if - if (q < vidxe) then - vflux_R_arr(2) = vflux_R_arr(2) + coeff_R(q)*(dvel_small(1)) - vflux_R_arr(3) = vflux_R_arr(3) + coeff_R(q)*(-2._wp*dvel_small(3))/3._wp - end if + $:GPU_ATOMIC(atomic='update') + rhs_vf(momxb)%sf(j + 1, k, l) = rhs_vf(momxb)%sf(j + 1, k, l) - & + 0.5_wp*mu_L*vflux_L_arr(3)*(1._wp/dx(j + 1)) + $:GPU_ATOMIC(atomic='update') + rhs_vf(E_idx)%sf(j + 1, k, l) = rhs_vf(E_idx)%sf(j + 1, k, l) - & + 0.5_wp*mu_L*vflux_L_arr(3)*vel_L(1)*(1._wp/dx(j + 1)) + + $:GPU_ATOMIC(atomic='update') + rhs_vf(momxb)%sf(j, k, l) = rhs_vf(momxb)%sf(j, k, l) + & + 0.5_wp*mu_L*vflux_L_arr(3)*(1._wp/dx(j)) + $:GPU_ATOMIC(atomic='update') + rhs_vf(E_idx)%sf(j, k, l) = rhs_vf(E_idx)%sf(j, k, l) + & + 0.5_wp*mu_L*vflux_L_arr(3)*vel_L(1)*(1._wp/dx(j)) - if (q == 0) then - jac_rhs(j, k, l) = alf_igr*(2._wp*(dvel(1, 2)*dvel(2, 1) & - + dvel(1, 3)*dvel(3, 1) & - + dvel(2, 3)*dvel(3, 2)) & - + dvel(1, 1)**2._wp + dvel(2, 2)**2._wp & - + dvel(3, 3)**2._wp & - + (dvel(1, 1) + dvel(2, 2) + dvel(3, 3))**2._wp) + $:GPU_ATOMIC(atomic='update') + rhs_vf(momxb)%sf(j + 1, k, l) = rhs_vf(momxb)%sf(j + 1, k, l) - & + 0.5_wp*mu_R*vflux_R_arr(3)*(1._wp/dx(j + 1)) + $:GPU_ATOMIC(atomic='update') + rhs_vf(E_idx)%sf(j + 1, k, l) = rhs_vf(E_idx)%sf(j + 1, k, l) - & + 0.5_wp*mu_R*vflux_R_arr(3)*vel_R(1)*(1._wp/dx(j + 1)) + + $:GPU_ATOMIC(atomic='update') + rhs_vf(momxb)%sf(j, k, l) = rhs_vf(momxb)%sf(j, k, l) + & + 0.5_wp*mu_R*vflux_R_arr(3)*(1._wp/dx(j)) + $:GPU_ATOMIC(atomic='update') + rhs_vf(E_idx)%sf(j, k, l) = rhs_vf(E_idx)%sf(j, k, l) + & + 0.5_wp*mu_R*vflux_R_arr(3)*vel_R(1)*(1._wp/dx(j)) end if - end do - alpha_rho_L = 0._wp; alpha_rho_R = 0._wp - alpha_L = 0._wp; alpha_R = 0._wp - vel_L = 0._wp; vel_R = 0._wp + E_L = 0._wp; E_R = 0._wp - $:GPU_LOOP(parallelism='[seq]') - do q = vidxb + 1, vidxe $:GPU_LOOP(parallelism='[seq]') - do i = 1, num_fluids - alpha_rho_L(i) = alpha_rho_L(i) + coeff_L(q)*q_cons_vf(i)%sf(j + q, k, l) + do q = vidxb + 1, vidxe + E_L = E_L + coeff_L(q)*q_cons_vf(E_idx)%sf(j + q, k, l) end do - if (num_fluids > 1) then - $:GPU_LOOP(parallelism='[seq]') - do i = 1, num_fluids - 1 - alpha_L(i) = alpha_L(i) + coeff_L(q)*q_cons_vf(E_idx + i)%sf(j + q, k, l) - end do - else - alpha_L(1) = 1._wp - end if - $:GPU_LOOP(parallelism='[seq]') - do i = 1, num_dims - vel_L(i) = vel_L(i) + coeff_L(q)*q_cons_vf(momxb + i - 1)%sf(j + q, k, l) + do q = vidxb, vidxe - 1 + E_R = E_R + coeff_R(q)*q_cons_vf(E_idx)%sf(j + q, k, l) end do - end do - $:GPU_LOOP(parallelism='[seq]') - do q = vidxb, vidxe - 1 + call s_get_derived_states(E_L, gamma_L, pi_inf_L, rho_L, vel_L, & + E_R, gamma_R, pi_inf_R, rho_R, vel_R, & + pres_L, pres_R, cfl) + $:GPU_LOOP(parallelism='[seq]') do i = 1, num_fluids - alpha_rho_R(i) = alpha_rho_R(i) + coeff_R(q)*q_cons_vf(i)%sf(j + q, k, l) + $:GPU_ATOMIC(atomic='update') + rhs_vf(i)%sf(j + 1, k, l) = rhs_vf(i)%sf(j + 1, k, l) + & + (0.5_wp*(alpha_rho_L(i)* & + vel_L(1))*(1._wp/dx(j + 1)) - & + 0.5_wp*cfl*(alpha_rho_L(i))*(1._wp/dx(j + 1))) + + $:GPU_ATOMIC(atomic='update') + rhs_vf(i)%sf(j, k, l) = rhs_vf(i)%sf(j, k, l) - & + (0.5_wp*(alpha_rho_L(i)* & + vel_L(1))*(1._wp/dx(j)) - & + 0.5_wp*cfl*(alpha_rho_L(i))*(1._wp/dx(j))) end do if (num_fluids > 1) then $:GPU_LOOP(parallelism='[seq]') do i = 1, num_fluids - 1 - alpha_R(i) = alpha_R(i) + coeff_R(q)*q_cons_vf(E_idx + i)%sf(j + q, k, l) + $:GPU_ATOMIC(atomic='update') + rhs_vf(advxb + i - 1)%sf(j + 1, k, l) = rhs_vf(advxb + i - 1)%sf(j + 1, k, l) + & + (0.5_wp*(alpha_L(i)* & + vel_L(1))*(1._wp/dx(j + 1)) - & + 0.5_wp*cfl*(alpha_L(i))*(1._wp/dx(j + 1))) + + $:GPU_ATOMIC(atomic='update') + rhs_vf(advxb + i - 1)%sf(j + 1, k, l) = rhs_vf(advxb + i - 1)%sf(j + 1, k, l) & + - (0.5_wp*q_cons_vf(advxb + i - 1)%sf(j + 1, k, l)*vel_L(1)*(1._wp/dx(j + 1))) + + $:GPU_ATOMIC(atomic='update') + rhs_vf(advxb + i - 1)%sf(j, k, l) = rhs_vf(advxb + i - 1)%sf(j, k, l) - & + (0.5_wp*(alpha_L(i)* & + vel_L(1))*(1._wp/dx(j)) - & + 0.5_wp*cfl*(alpha_L(i))*(1._wp/dx(j))) + + $:GPU_ATOMIC(atomic='update') + rhs_vf(advxb + i - 1)%sf(j, k, l) = rhs_vf(advxb + i - 1)%sf(j, k, l) & + + (0.5_wp*q_cons_vf(advxb + i - 1)%sf(j, k, l)*vel_L(1)*(1._wp/dx(j))) end do - else - alpha_R(1) = 1._wp end if - $:GPU_LOOP(parallelism='[seq]') - do i = 1, num_dims - vel_R(i) = vel_R(i) + coeff_R(q)*q_cons_vf(momxb + i - 1)%sf(j + q, k, l) - end do - end do - - if (num_fluids > 1) then - alpha_L(num_fluids) = 1._wp - sum(alpha_L(1:num_fluids - 1)) - alpha_R(num_fluids) = 1._wp - sum(alpha_R(1:num_fluids - 1)) - end if - - rho_L = sum(alpha_rho_L) - gamma_L = sum(alpha_L*gammas) - pi_inf_L = sum(alpha_L*pi_infs) - - rho_R = sum(alpha_rho_R) - gamma_R = sum(alpha_R*gammas) - pi_inf_R = sum(alpha_R*pi_infs) - - vel_L = vel_L/rho_L - vel_R = vel_R/rho_R - - if (viscous) then - mu_L = 0._wp - mu_R = 0._wp - $:GPU_LOOP(parallelism='[seq]') - do i = 1, num_fluids - mu_L = alpha_L(i)/Res(1, i) + mu_L - mu_R = alpha_R(i)/Res(1, i) + mu_R - end do - $:GPU_ATOMIC(atomic='update') - rhs_vf(momxb + 1)%sf(j + 1, k, l) = rhs_vf(momxb + 1)%sf(j + 1, k, l) - & - 0.5_wp*mu_L*vflux_L_arr(1)*(1._wp/dx(j + 1)) - $:GPU_ATOMIC(atomic='update') - rhs_vf(E_idx)%sf(j + 1, k, l) = rhs_vf(E_idx)%sf(j + 1, k, l) - & - 0.5_wp*mu_L*vflux_L_arr(1)*vel_L(2)*(1._wp/dx(j + 1)) + rhs_vf(momxb)%sf(j + 1, k, l) = rhs_vf(momxb)%sf(j + 1, k, l) + & + (0.5_wp*(rho_L*(vel_L(1))**2.0 + & + pres_L)*(1._wp/dx(j + 1)) - & + 0.5_wp*cfl*(rho_L*vel_L(1))*(1._wp/dx(j + 1))) $:GPU_ATOMIC(atomic='update') - rhs_vf(momxb + 1)%sf(j, k, l) = rhs_vf(momxb + 1)%sf(j, k, l) + & - 0.5_wp*mu_L*vflux_L_arr(1)*(1._wp/dx(j)) - $:GPU_ATOMIC(atomic='update') - rhs_vf(E_idx)%sf(j, k, l) = rhs_vf(E_idx)%sf(j, k, l) + & - 0.5_wp*mu_L*vflux_L_arr(1)*vel_L(2)*(1._wp/dx(j)) + rhs_vf(momxb + 1)%sf(j + 1, k, l) = rhs_vf(momxb + 1)%sf(j + 1, k, l) + & + (0.5_wp*rho_L*vel_L(1)*vel_L(2)*(1._wp/dx(j + 1)) - & + 0.5_wp*cfl*(rho_L*vel_L(2))*(1._wp/dx(j + 1))) $:GPU_ATOMIC(atomic='update') - rhs_vf(momxb + 1)%sf(j + 1, k, l) = rhs_vf(momxb + 1)%sf(j + 1, k, l) - & - 0.5_wp*mu_R*vflux_R_arr(1)*(1._wp/dx(j + 1)) - $:GPU_ATOMIC(atomic='update') - rhs_vf(E_idx)%sf(j + 1, k, l) = rhs_vf(E_idx)%sf(j + 1, k, l) - & - 0.5_wp*mu_R*vflux_R_arr(1)*vel_R(2)*(1._wp/dx(j + 1)) + rhs_vf(momxb + 2)%sf(j + 1, k, l) = rhs_vf(momxb + 2)%sf(j + 1, k, l) + & + (0.5_wp*rho_L*vel_L(1)*vel_L(3)*(1._wp/dx(j + 1)) - & + 0.5_wp*cfl*(rho_L*vel_L(3))*(1._wp/dx(j + 1))) $:GPU_ATOMIC(atomic='update') - rhs_vf(momxb + 1)%sf(j, k, l) = rhs_vf(momxb + 1)%sf(j, k, l) + & - 0.5_wp*mu_R*vflux_R_arr(1)*(1._wp/dx(j)) - $:GPU_ATOMIC(atomic='update') - rhs_vf(E_idx)%sf(j, k, l) = rhs_vf(E_idx)%sf(j, k, l) + & - 0.5_wp*mu_R*vflux_R_arr(1)*vel_R(2)*(1._wp/dx(j)) + rhs_vf(E_idx)%sf(j + 1, k, l) = rhs_vf(E_idx)%sf(j + 1, k, l) + & + (0.5_wp*(vel_L(1)*(E_L + & + pres_L))*(1._wp/dx(j + 1)) - & + 0.5_wp*cfl*(E_L)*(1._wp/dx(j + 1))) $:GPU_ATOMIC(atomic='update') - rhs_vf(momxb + 2)%sf(j + 1, k, l) = rhs_vf(momxb + 2)%sf(j + 1, k, l) - & - 0.5_wp*mu_L*vflux_L_arr(2)*(1._wp/dx(j + 1)) - $:GPU_ATOMIC(atomic='update') - rhs_vf(E_idx)%sf(j + 1, k, l) = rhs_vf(E_idx)%sf(j + 1, k, l) - & - 0.5_wp*mu_L*vflux_L_arr(2)*vel_L(3)*(1._wp/dx(j + 1)) + rhs_vf(momxb)%sf(j, k, l) = rhs_vf(momxb)%sf(j, k, l) - & + (0.5_wp*(rho_L*(vel_L(1))**2.0 + & + pres_L)*(1._wp/dx(j)) - & + 0.5_wp*cfl*(rho_L*vel_L(1))*(1._wp/dx(j))) $:GPU_ATOMIC(atomic='update') - rhs_vf(momxb + 2)%sf(j, k, l) = rhs_vf(momxb + 2)%sf(j, k, l) + & - 0.5_wp*mu_L*vflux_L_arr(2)*(1._wp/dx(j)) - $:GPU_ATOMIC(atomic='update') - rhs_vf(E_idx)%sf(j, k, l) = rhs_vf(E_idx)%sf(j, k, l) + & - 0.5_wp*mu_L*vflux_L_arr(2)*vel_L(3)*(1._wp/dx(j)) + rhs_vf(momxb + 1)%sf(j, k, l) = rhs_vf(momxb + 1)%sf(j, k, l) - & + (0.5_wp*rho_L*vel_L(1)*vel_L(2)*(1._wp/dx(j)) - & + 0.5_wp*cfl*(rho_L*vel_L(2))*(1._wp/dx(j))) $:GPU_ATOMIC(atomic='update') - rhs_vf(momxb + 2)%sf(j + 1, k, l) = rhs_vf(momxb + 2)%sf(j + 1, k, l) - & - 0.5_wp*mu_R*vflux_R_arr(2)*(1._wp/dx(j + 1)) - $:GPU_ATOMIC(atomic='update') - rhs_vf(E_idx)%sf(j + 1, k, l) = rhs_vf(E_idx)%sf(j + 1, k, l) - & - 0.5_wp*mu_R*vflux_R_arr(2)*vel_R(3)*(1._wp/dx(j + 1)) + rhs_vf(momxb + 2)%sf(j, k, l) = rhs_vf(momxb + 2)%sf(j, k, l) - & + (0.5_wp*rho_L*vel_L(1)*vel_L(3)*(1._wp/dx(j)) - & + 0.5_wp*cfl*(rho_L*vel_L(3))*(1._wp/dx(j))) $:GPU_ATOMIC(atomic='update') - rhs_vf(momxb + 2)%sf(j, k, l) = rhs_vf(momxb + 2)%sf(j, k, l) + & - 0.5_wp*mu_R*vflux_R_arr(2)*(1._wp/dx(j)) - $:GPU_ATOMIC(atomic='update') - rhs_vf(E_idx)%sf(j, k, l) = rhs_vf(E_idx)%sf(j, k, l) + & - 0.5_wp*mu_R*vflux_R_arr(2)*vel_R(3)*(1._wp/dx(j)) + rhs_vf(E_idx)%sf(j, k, l) = rhs_vf(E_idx)%sf(j, k, l) - & + (0.5_wp*(vel_L(1)*(E_L + & + pres_L))*(1._wp/dx(j)) - & + 0.5_wp*cfl*(E_L)*(1._wp/dx(j))) - $:GPU_ATOMIC(atomic='update') - rhs_vf(momxb)%sf(j + 1, k, l) = rhs_vf(momxb)%sf(j + 1, k, l) - & - 0.5_wp*mu_L*vflux_L_arr(3)*(1._wp/dx(j + 1)) - $:GPU_ATOMIC(atomic='update') - rhs_vf(E_idx)%sf(j + 1, k, l) = rhs_vf(E_idx)%sf(j + 1, k, l) - & - 0.5_wp*mu_L*vflux_L_arr(3)*vel_L(1)*(1._wp/dx(j + 1)) + $:GPU_LOOP(parallelism='[seq]') + do i = 1, num_fluids + $:GPU_ATOMIC(atomic='update') + rhs_vf(i)%sf(j + 1, k, l) = rhs_vf(i)%sf(j + 1, k, l) + & + (0.5_wp*(alpha_rho_R(i)* & + vel_R(1))*(1._wp/dx(j + 1)) + & + 0.5_wp*cfl*(alpha_rho_R(i))*(1._wp/dx(j + 1))) + + $:GPU_ATOMIC(atomic='update') + rhs_vf(i)%sf(j, k, l) = rhs_vf(i)%sf(j, k, l) - & + (0.5_wp*(alpha_rho_R(i)* & + vel_R(1))*(1._wp/dx(j)) + & + 0.5_wp*cfl*(alpha_rho_R(i))*(1._wp/dx(j))) + end do + + if (num_fluids > 1) then + $:GPU_LOOP(parallelism='[seq]') + do i = 1, num_fluids - 1 + $:GPU_ATOMIC(atomic='update') + rhs_vf(advxb + i - 1)%sf(j + 1, k, l) = rhs_vf(advxb + i - 1)%sf(j + 1, k, l) + & + (0.5_wp*(alpha_R(i)* & + vel_R(1))*(1._wp/dx(j + 1)) + & + 0.5_wp*cfl*(alpha_R(i))*(1._wp/dx(j + 1))) + + $:GPU_ATOMIC(atomic='update') + rhs_vf(advxb + i - 1)%sf(j + 1, k, l) = rhs_vf(advxb + i - 1)%sf(j + 1, k, l) & + - (0.5_wp*q_cons_vf(advxb + i - 1)%sf(j + 1, k, l)*vel_R(1)*(1._wp/dx(j + 1))) + + $:GPU_ATOMIC(atomic='update') + rhs_vf(advxb + i - 1)%sf(j, k, l) = rhs_vf(advxb + i - 1)%sf(j, k, l) - & + (0.5_wp*(alpha_R(i)* & + vel_R(1))*(1._wp/dx(j)) + & + 0.5_wp*cfl*(alpha_R(i))*(1._wp/dx(j))) + + $:GPU_ATOMIC(atomic='update') + rhs_vf(advxb + i - 1)%sf(j, k, l) = rhs_vf(advxb + i - 1)%sf(j, k, l) & + + (0.5_wp*q_cons_vf(advxb + i - 1)%sf(j, k, l)*vel_R(1)*(1._wp/dx(j))) + end do + end if $:GPU_ATOMIC(atomic='update') - rhs_vf(momxb)%sf(j, k, l) = rhs_vf(momxb)%sf(j, k, l) + & - 0.5_wp*mu_L*vflux_L_arr(3)*(1._wp/dx(j)) + rhs_vf(momxb)%sf(j + 1, k, l) = rhs_vf(momxb)%sf(j + 1, k, l) + & + (0.5_wp*(rho_R*(vel_R(1))**2.0 + & + pres_R)*(1._wp/dx(j + 1)) + & + 0.5_wp*cfl*(rho_R*vel_R(1))*(1._wp/dx(j + 1))) + $:GPU_ATOMIC(atomic='update') - rhs_vf(E_idx)%sf(j, k, l) = rhs_vf(E_idx)%sf(j, k, l) + & - 0.5_wp*mu_L*vflux_L_arr(3)*vel_L(1)*(1._wp/dx(j)) + rhs_vf(momxb + 1)%sf(j + 1, k, l) = rhs_vf(momxb + 1)%sf(j + 1, k, l) + & + (0.5_wp*rho_R*vel_R(1)*vel_R(2)*(1._wp/dx(j + 1)) + & + 0.5_wp*cfl*(rho_R*vel_R(2))*(1._wp/dx(j + 1))) $:GPU_ATOMIC(atomic='update') - rhs_vf(momxb)%sf(j + 1, k, l) = rhs_vf(momxb)%sf(j + 1, k, l) - & - 0.5_wp*mu_R*vflux_R_arr(3)*(1._wp/dx(j + 1)) + rhs_vf(momxb + 2)%sf(j + 1, k, l) = rhs_vf(momxb + 2)%sf(j + 1, k, l) + & + (0.5_wp*rho_R*vel_R(1)*vel_R(3)*(1._wp/dx(j + 1)) + & + 0.5_wp*cfl*(rho_R*vel_R(3))*(1._wp/dx(j + 1))) + $:GPU_ATOMIC(atomic='update') - rhs_vf(E_idx)%sf(j + 1, k, l) = rhs_vf(E_idx)%sf(j + 1, k, l) - & - 0.5_wp*mu_R*vflux_R_arr(3)*vel_R(1)*(1._wp/dx(j + 1)) + rhs_vf(E_idx)%sf(j + 1, k, l) = rhs_vf(E_idx)%sf(j + 1, k, l) + & + (0.5_wp*(vel_R(1)*(E_R + & + pres_R))*(1._wp/dx(j + 1)) + & + 0.5_wp*cfl*(E_R)*(1._wp/dx(j + 1))) $:GPU_ATOMIC(atomic='update') - rhs_vf(momxb)%sf(j, k, l) = rhs_vf(momxb)%sf(j, k, l) + & - 0.5_wp*mu_R*vflux_R_arr(3)*(1._wp/dx(j)) + rhs_vf(momxb)%sf(j, k, l) = rhs_vf(momxb)%sf(j, k, l) - & + (0.5_wp*(rho_R*(vel_R(1))**2.0 + & + pres_R)*(1._wp/dx(j)) + & + 0.5_wp*cfl*(rho_R*vel_R(1))*(1._wp/dx(j))) + $:GPU_ATOMIC(atomic='update') - rhs_vf(E_idx)%sf(j, k, l) = rhs_vf(E_idx)%sf(j, k, l) + & - 0.5_wp*mu_R*vflux_R_arr(3)*vel_R(1)*(1._wp/dx(j)) - end if + rhs_vf(momxb + 1)%sf(j, k, l) = rhs_vf(momxb + 1)%sf(j, k, l) - & + (0.5_wp*rho_R*vel_R(1)*vel_R(2)*(1._wp/dx(j)) + & + 0.5_wp*cfl*(rho_R*vel_R(2))*(1._wp/dx(j))) - E_L = 0._wp; E_R = 0._wp + $:GPU_ATOMIC(atomic='update') + rhs_vf(momxb + 2)%sf(j, k, l) = rhs_vf(momxb + 2)%sf(j, k, l) - & + (0.5_wp*rho_R*vel_R(1)*vel_R(3)*(1._wp/dx(j)) + & + 0.5_wp*cfl*(rho_R*vel_R(3))*(1._wp/dx(j))) - $:GPU_LOOP(parallelism='[seq]') - do q = vidxb + 1, vidxe - E_L = E_L + coeff_L(q)*q_cons_vf(E_idx)%sf(j + q, k, l) - end do + $:GPU_ATOMIC(atomic='update') + rhs_vf(E_idx)%sf(j, k, l) = rhs_vf(E_idx)%sf(j, k, l) - & + (0.5_wp*(vel_R(1)*(E_R + & + pres_R))*(1._wp/dx(j)) + & + 0.5_wp*cfl*(E_R)*(1._wp/dx(j))) - $:GPU_LOOP(parallelism='[seq]') - do q = vidxb, vidxe - 1 - E_R = E_R + coeff_R(q)*q_cons_vf(E_idx)%sf(j + q, k, l) end do + end do + end do + #:endcall GPU_PARALLEL_LOOP + end if + else if (idir == 2) then + if (p == 0) then + #:call GPU_PARALLEL_LOOP(collapse=3, private='[rho_L, rho_R, gamma_L, gamma_R, pi_inf_L, pi_inf_R, mu_L, mu_R, vel_L, vel_R, pres_L, pres_R, alpha_L, alpha_R, alpha_rho_L, alpha_rho_R, F_L, F_R, E_L, E_R, cfl, dvel_small, rho_sf_small, vflux_L_arr, vflux_R_arr]') + do l = 0, p + do k = -1, n + do j = 0, m + + if (viscous) then + vflux_L_arr = 0._wp + vflux_R_arr = 0._wp + + #:if MFC_CASE_OPTIMIZATION + #:if igr_order == 5 + !DIR$ unroll 6 + #:elif igr_order == 3 + !DIR$ unroll 4 + #:endif + #:endif + $:GPU_LOOP(parallelism='[seq]') + do q = vidxb, vidxe + dvel_small = 0._wp + !x-direction contributions + $:GPU_LOOP(parallelism='[seq]') + do i = -1, 1 + rho_L = 0._wp + $:GPU_LOOP(parallelism='[seq]') + do r = 1, num_fluids + rho_L = rho_L + q_cons_vf(r)%sf(j + i, k + q, l) + end do + rho_sf_small(i) = rho_L + end do - call s_get_derived_states(E_L, gamma_L, pi_inf_L, rho_L, vel_L, & - E_R, gamma_R, pi_inf_R, rho_R, vel_R, & - pres_L, pres_R, cfl) + dvel_small(1) = (1/(2._wp*dx(j)))*( & + q_cons_vf(momxb)%sf(j + 1, k + q, l)/rho_sf_small(1) - & + q_cons_vf(momxb)%sf(j - 1, k + q, l)/rho_sf_small(-1)) + dvel_small(2) = (1/(2._wp*dx(j)))*( & + q_cons_vf(momxb + 1)%sf(j + 1, k + q, l)/rho_sf_small(1) - & + q_cons_vf(momxb + 1)%sf(j - 1, k + q, l)/rho_sf_small(-1)) + + if (q > vidxb) then + vflux_L_arr(1) = vflux_L_arr(1) + coeff_L(q)*(dvel_small(2)) + vflux_L_arr(3) = vflux_L_arr(3) + coeff_L(q)*(-2._wp*dvel_small(1))/3._wp + end if + if (q < vidxe) then + vflux_R_arr(1) = vflux_R_arr(1) + coeff_R(q)*(dvel_small(2)) + vflux_R_arr(3) = vflux_R_arr(3) + coeff_R(q)*(-2._wp*dvel_small(1))/3._wp + end if + + !y-direction contributions + $:GPU_LOOP(parallelism='[seq]') + do i = -1, 1 + rho_L = 0._wp + $:GPU_LOOP(parallelism='[seq]') + do r = 1, num_fluids + rho_L = rho_L + q_cons_vf(r)%sf(j, k + i + q, l) + end do + rho_sf_small(i) = rho_L + end do - $:GPU_LOOP(parallelism='[seq]') - do i = 1, num_fluids - $:GPU_ATOMIC(atomic='update') - rhs_vf(i)%sf(j + 1, k, l) = rhs_vf(i)%sf(j + 1, k, l) + & - (0.5_wp*(alpha_rho_L(i)* & - vel_L(1))*(1._wp/dx(j + 1)) - & - 0.5_wp*cfl*(alpha_rho_L(i))*(1._wp/dx(j + 1))) + dvel_small(1) = (1/(2._wp*dy(k)))*( & + q_cons_vf(momxb)%sf(j, k + 1 + q, l)/rho_sf_small(1) - & + q_cons_vf(momxb)%sf(j, k - 1 + q, l)/rho_sf_small(-1)) + dvel_small(2) = (1/(2._wp*dy(k)))*( & + q_cons_vf(momxb + 1)%sf(j, k + 1 + q, l)/rho_sf_small(1) - & + q_cons_vf(momxb + 1)%sf(j, k - 1 + q, l)/rho_sf_small(-1)) + + if (q > vidxb) then + vflux_L_arr(1) = vflux_L_arr(1) + coeff_L(q)*(dvel_small(1)) + vflux_L_arr(3) = vflux_L_arr(3) + coeff_L(q)*(4._wp*dvel_small(2))/3._wp + end if + if (q < vidxe) then + vflux_R_arr(1) = vflux_R_arr(1) + coeff_R(q)*(dvel_small(1)) + vflux_R_arr(3) = vflux_R_arr(3) + coeff_R(q)*(4._wp*dvel_small(2))/3._wp + end if + end do + end if - $:GPU_ATOMIC(atomic='update') - rhs_vf(i)%sf(j, k, l) = rhs_vf(i)%sf(j, k, l) - & - (0.5_wp*(alpha_rho_L(i)* & - vel_L(1))*(1._wp/dx(j)) - & - 0.5_wp*cfl*(alpha_rho_L(i))*(1._wp/dx(j))) - end do + alpha_rho_L = 0._wp; alpha_rho_R = 0._wp + alpha_L = 0._wp; alpha_R = 0._wp + vel_L = 0._wp; vel_R = 0._wp - if (num_fluids > 1) then $:GPU_LOOP(parallelism='[seq]') - do i = 1, num_fluids - 1 - $:GPU_ATOMIC(atomic='update') - rhs_vf(advxb + i - 1)%sf(j + 1, k, l) = rhs_vf(advxb + i - 1)%sf(j + 1, k, l) + & - (0.5_wp*(alpha_L(i)* & - vel_L(1))*(1._wp/dx(j + 1)) - & - 0.5_wp*cfl*(alpha_L(i))*(1._wp/dx(j + 1))) - - $:GPU_ATOMIC(atomic='update') - rhs_vf(advxb + i - 1)%sf(j + 1, k, l) = rhs_vf(advxb + i - 1)%sf(j + 1, k, l) & - - (0.5_wp*q_cons_vf(advxb + i - 1)%sf(j + 1, k, l)*vel_L(1)*(1._wp/dx(j + 1))) + do q = vidxb + 1, vidxe + $:GPU_LOOP(parallelism='[seq]') + do i = 1, num_fluids + alpha_rho_L(i) = alpha_rho_L(i) + coeff_L(q)*q_cons_vf(i)%sf(j, k + q, l) + end do - $:GPU_ATOMIC(atomic='update') - rhs_vf(advxb + i - 1)%sf(j, k, l) = rhs_vf(advxb + i - 1)%sf(j, k, l) - & - (0.5_wp*(alpha_L(i)* & - vel_L(1))*(1._wp/dx(j)) - & - 0.5_wp*cfl*(alpha_L(i))*(1._wp/dx(j))) + if (num_fluids > 1) then + $:GPU_LOOP(parallelism='[seq]') + do i = 1, num_fluids - 1 + alpha_L(i) = alpha_L(i) + coeff_L(q)*q_cons_vf(E_idx + i)%sf(j, k + q, l) + end do + else + alpha_L(1) = 1._wp + end if - $:GPU_ATOMIC(atomic='update') - rhs_vf(advxb + i - 1)%sf(j, k, l) = rhs_vf(advxb + i - 1)%sf(j, k, l) & - + (0.5_wp*q_cons_vf(advxb + i - 1)%sf(j, k, l)*vel_L(1)*(1._wp/dx(j))) + $:GPU_LOOP(parallelism='[seq]') + do i = 1, num_dims + vel_L(i) = vel_L(i) + coeff_L(q)*q_cons_vf(momxb + i - 1)%sf(j, k + q, l) + end do end do - end if - $:GPU_ATOMIC(atomic='update') - rhs_vf(momxb)%sf(j + 1, k, l) = rhs_vf(momxb)%sf(j + 1, k, l) + & - (0.5_wp*(rho_L*(vel_L(1))**2.0 + & - pres_L)*(1._wp/dx(j + 1)) - & - 0.5_wp*cfl*(rho_L*vel_L(1))*(1._wp/dx(j + 1))) - - $:GPU_ATOMIC(atomic='update') - rhs_vf(momxb + 1)%sf(j + 1, k, l) = rhs_vf(momxb + 1)%sf(j + 1, k, l) + & - (0.5_wp*rho_L*vel_L(1)*vel_L(2)*(1._wp/dx(j + 1)) - & - 0.5_wp*cfl*(rho_L*vel_L(2))*(1._wp/dx(j + 1))) - - $:GPU_ATOMIC(atomic='update') - rhs_vf(momxb + 2)%sf(j + 1, k, l) = rhs_vf(momxb + 2)%sf(j + 1, k, l) + & - (0.5_wp*rho_L*vel_L(1)*vel_L(3)*(1._wp/dx(j + 1)) - & - 0.5_wp*cfl*(rho_L*vel_L(3))*(1._wp/dx(j + 1))) + $:GPU_LOOP(parallelism='[seq]') + do q = vidxb, vidxe - 1 + $:GPU_LOOP(parallelism='[seq]') + do i = 1, num_fluids + alpha_rho_R(i) = alpha_rho_R(i) + coeff_R(q)*q_cons_vf(i)%sf(j, k + q, l) + end do - $:GPU_ATOMIC(atomic='update') - rhs_vf(E_idx)%sf(j + 1, k, l) = rhs_vf(E_idx)%sf(j + 1, k, l) + & - (0.5_wp*(vel_L(1)*(E_L + & - pres_L))*(1._wp/dx(j + 1)) - & - 0.5_wp*cfl*(E_L)*(1._wp/dx(j + 1))) + if (num_fluids > 1) then + $:GPU_LOOP(parallelism='[seq]') + do i = 1, num_fluids - 1 + alpha_R(i) = alpha_R(i) + coeff_R(q)*q_cons_vf(E_idx + i)%sf(j, k + q, l) + end do + else + alpha_R(1) = 1._wp + end if - $:GPU_ATOMIC(atomic='update') - rhs_vf(momxb)%sf(j, k, l) = rhs_vf(momxb)%sf(j, k, l) - & - (0.5_wp*(rho_L*(vel_L(1))**2.0 + & - pres_L)*(1._wp/dx(j)) - & - 0.5_wp*cfl*(rho_L*vel_L(1))*(1._wp/dx(j))) + $:GPU_LOOP(parallelism='[seq]') + do i = 1, num_dims + vel_R(i) = vel_R(i) + coeff_R(q)*q_cons_vf(momxb + i - 1)%sf(j, k + q, l) + end do + end do - $:GPU_ATOMIC(atomic='update') - rhs_vf(momxb + 1)%sf(j, k, l) = rhs_vf(momxb + 1)%sf(j, k, l) - & - (0.5_wp*rho_L*vel_L(1)*vel_L(2)*(1._wp/dx(j)) - & - 0.5_wp*cfl*(rho_L*vel_L(2))*(1._wp/dx(j))) + if (num_fluids > 1) then + alpha_L(num_fluids) = 1._wp - sum(alpha_L(1:num_fluids - 1)) + alpha_R(num_fluids) = 1._wp - sum(alpha_R(1:num_fluids - 1)) + end if - $:GPU_ATOMIC(atomic='update') - rhs_vf(momxb + 2)%sf(j, k, l) = rhs_vf(momxb + 2)%sf(j, k, l) - & - (0.5_wp*rho_L*vel_L(1)*vel_L(3)*(1._wp/dx(j)) - & - 0.5_wp*cfl*(rho_L*vel_L(3))*(1._wp/dx(j))) + rho_L = sum(alpha_rho_L) + gamma_L = sum(alpha_L*gammas) + pi_inf_L = sum(alpha_L*pi_infs) - $:GPU_ATOMIC(atomic='update') - rhs_vf(E_idx)%sf(j, k, l) = rhs_vf(E_idx)%sf(j, k, l) - & - (0.5_wp*(vel_L(1)*(E_L + & - pres_L))*(1._wp/dx(j)) - & - 0.5_wp*cfl*(E_L)*(1._wp/dx(j))) + rho_R = sum(alpha_rho_R) + gamma_R = sum(alpha_R*gammas) + pi_inf_R = sum(alpha_R*pi_infs) - $:GPU_LOOP(parallelism='[seq]') - do i = 1, num_fluids - $:GPU_ATOMIC(atomic='update') - rhs_vf(i)%sf(j + 1, k, l) = rhs_vf(i)%sf(j + 1, k, l) + & - (0.5_wp*(alpha_rho_R(i)* & - vel_R(1))*(1._wp/dx(j + 1)) + & - 0.5_wp*cfl*(alpha_rho_R(i))*(1._wp/dx(j + 1))) + vel_L = vel_L/rho_L + vel_R = vel_R/rho_R - $:GPU_ATOMIC(atomic='update') - rhs_vf(i)%sf(j, k, l) = rhs_vf(i)%sf(j, k, l) - & - (0.5_wp*(alpha_rho_R(i)* & - vel_R(1))*(1._wp/dx(j)) + & - 0.5_wp*cfl*(alpha_rho_R(i))*(1._wp/dx(j))) - end do + if (viscous) then + mu_L = 0._wp + mu_R = 0._wp + $:GPU_LOOP(parallelism='[seq]') + do i = 1, num_fluids + mu_L = alpha_L(i)/Res_igr(1, i) + mu_L + mu_R = alpha_R(i)/Res_igr(1, i) + mu_R + end do - if (num_fluids > 1) then - $:GPU_LOOP(parallelism='[seq]') - do i = 1, num_fluids - 1 $:GPU_ATOMIC(atomic='update') - rhs_vf(advxb + i - 1)%sf(j + 1, k, l) = rhs_vf(advxb + i - 1)%sf(j + 1, k, l) + & - (0.5_wp*(alpha_R(i)* & - vel_R(1))*(1._wp/dx(j + 1)) + & - 0.5_wp*cfl*(alpha_R(i))*(1._wp/dx(j + 1))) - + rhs_vf(momxb)%sf(j, k + 1, l) = rhs_vf(momxb)%sf(j, k + 1, l) - & + 0.5_wp*mu_L*vflux_L_arr(1)*(1._wp/dy(k + 1)) $:GPU_ATOMIC(atomic='update') - rhs_vf(advxb + i - 1)%sf(j + 1, k, l) = rhs_vf(advxb + i - 1)%sf(j + 1, k, l) & - - (0.5_wp*q_cons_vf(advxb + i - 1)%sf(j + 1, k, l)*vel_R(1)*(1._wp/dx(j + 1))) + rhs_vf(E_idx)%sf(j, k + 1, l) = rhs_vf(E_idx)%sf(j, k + 1, l) - & + 0.5_wp*mu_L*vflux_L_arr(1)*vel_L(1)*(1._wp/dy(k + 1)) $:GPU_ATOMIC(atomic='update') - rhs_vf(advxb + i - 1)%sf(j, k, l) = rhs_vf(advxb + i - 1)%sf(j, k, l) - & - (0.5_wp*(alpha_R(i)* & - vel_R(1))*(1._wp/dx(j)) + & - 0.5_wp*cfl*(alpha_R(i))*(1._wp/dx(j))) - + rhs_vf(momxb)%sf(j, k, l) = rhs_vf(momxb)%sf(j, k, l) + & + 0.5_wp*mu_L*vflux_L_arr(1)*(1._wp/dy(k)) $:GPU_ATOMIC(atomic='update') - rhs_vf(advxb + i - 1)%sf(j, k, l) = rhs_vf(advxb + i - 1)%sf(j, k, l) & - + (0.5_wp*q_cons_vf(advxb + i - 1)%sf(j, k, l)*vel_R(1)*(1._wp/dx(j))) - end do - end if - - $:GPU_ATOMIC(atomic='update') - rhs_vf(momxb)%sf(j + 1, k, l) = rhs_vf(momxb)%sf(j + 1, k, l) + & - (0.5_wp*(rho_R*(vel_R(1))**2.0 + & - pres_R)*(1._wp/dx(j + 1)) + & - 0.5_wp*cfl*(rho_R*vel_R(1))*(1._wp/dx(j + 1))) - - $:GPU_ATOMIC(atomic='update') - rhs_vf(momxb + 1)%sf(j + 1, k, l) = rhs_vf(momxb + 1)%sf(j + 1, k, l) + & - (0.5_wp*rho_R*vel_R(1)*vel_R(2)*(1._wp/dx(j + 1)) + & - 0.5_wp*cfl*(rho_R*vel_R(2))*(1._wp/dx(j + 1))) - - $:GPU_ATOMIC(atomic='update') - rhs_vf(momxb + 2)%sf(j + 1, k, l) = rhs_vf(momxb + 2)%sf(j + 1, k, l) + & - (0.5_wp*rho_R*vel_R(1)*vel_R(3)*(1._wp/dx(j + 1)) + & - 0.5_wp*cfl*(rho_R*vel_R(3))*(1._wp/dx(j + 1))) - - $:GPU_ATOMIC(atomic='update') - rhs_vf(E_idx)%sf(j + 1, k, l) = rhs_vf(E_idx)%sf(j + 1, k, l) + & - (0.5_wp*(vel_R(1)*(E_R + & - pres_R))*(1._wp/dx(j + 1)) + & - 0.5_wp*cfl*(E_R)*(1._wp/dx(j + 1))) - - $:GPU_ATOMIC(atomic='update') - rhs_vf(momxb)%sf(j, k, l) = rhs_vf(momxb)%sf(j, k, l) - & - (0.5_wp*(rho_R*(vel_R(1))**2.0 + & - pres_R)*(1._wp/dx(j)) + & - 0.5_wp*cfl*(rho_R*vel_R(1))*(1._wp/dx(j))) + rhs_vf(E_idx)%sf(j, k, l) = rhs_vf(E_idx)%sf(j, k, l) + & + 0.5_wp*mu_L*vflux_L_arr(1)*vel_L(1)*(1._wp/dy(k)) - $:GPU_ATOMIC(atomic='update') - rhs_vf(momxb + 1)%sf(j, k, l) = rhs_vf(momxb + 1)%sf(j, k, l) - & - (0.5_wp*rho_R*vel_R(1)*vel_R(2)*(1._wp/dx(j)) + & - 0.5_wp*cfl*(rho_R*vel_R(2))*(1._wp/dx(j))) - - $:GPU_ATOMIC(atomic='update') - rhs_vf(momxb + 2)%sf(j, k, l) = rhs_vf(momxb + 2)%sf(j, k, l) - & - (0.5_wp*rho_R*vel_R(1)*vel_R(3)*(1._wp/dx(j)) + & - 0.5_wp*cfl*(rho_R*vel_R(3))*(1._wp/dx(j))) - - $:GPU_ATOMIC(atomic='update') - rhs_vf(E_idx)%sf(j, k, l) = rhs_vf(E_idx)%sf(j, k, l) - & - (0.5_wp*(vel_R(1)*(E_R + & - pres_R))*(1._wp/dx(j)) + & - 0.5_wp*cfl*(E_R)*(1._wp/dx(j))) + $:GPU_ATOMIC(atomic='update') + rhs_vf(momxb)%sf(j, k + 1, l) = rhs_vf(momxb)%sf(j, k + 1, l) - & + 0.5_wp*mu_R*vflux_R_arr(1)*(1._wp/dy(k + 1)) + $:GPU_ATOMIC(atomic='update') + rhs_vf(E_idx)%sf(j, k + 1, l) = rhs_vf(E_idx)%sf(j, k + 1, l) - & + 0.5_wp*mu_R*vflux_R_arr(1)*vel_R(1)*(1._wp/dy(k + 1)) - end do - end do - end do - end if - else if (idir == 2) then - if (p == 0) then - $:GPU_PARALLEL_LOOP(collapse=3, private='[rho_L, rho_R, gamma_L, & - & gamma_R, pi_inf_L, pi_inf_R, mu_L, mu_R, vel_L, vel_R, & - & pres_L, pres_R, alpha_L, alpha_R, alpha_rho_L, alpha_rho_R, & - & F_L, F_R, E_L, E_R, cfl, dvel_small, rho_sf_small, & - & vflux_L_arr, vflux_R_arr]') - do l = 0, p - do k = -1, n - do j = 0, m + $:GPU_ATOMIC(atomic='update') + rhs_vf(momxb)%sf(j, k, l) = rhs_vf(momxb)%sf(j, k, l) + & + 0.5_wp*mu_R*vflux_R_arr(1)*(1._wp/dy(k)) + $:GPU_ATOMIC(atomic='update') + rhs_vf(E_idx)%sf(j, k, l) = rhs_vf(E_idx)%sf(j, k, l) + & + 0.5_wp*mu_R*vflux_R_arr(1)*vel_R(1)*(1._wp/dy(k)) - if (viscous) then - vflux_L_arr = 0._wp - vflux_R_arr = 0._wp + $:GPU_ATOMIC(atomic='update') + rhs_vf(momxb + 1)%sf(j, k + 1, l) = rhs_vf(momxb + 1)%sf(j, k + 1, l) - & + 0.5_wp*mu_L*vflux_L_arr(3)*(1._wp/dy(k + 1)) + $:GPU_ATOMIC(atomic='update') + rhs_vf(E_idx)%sf(j, k + 1, l) = rhs_vf(E_idx)%sf(j, k + 1, l) - & + 0.5_wp*mu_L*vflux_L_arr(3)*vel_L(2)*(1._wp/dy(k + 1)) - #:if MFC_CASE_OPTIMIZATION - #:if igr_order == 5 - !DIR$ unroll 6 - #:elif igr_order == 3 - !DIR$ unroll 4 - #:endif - #:endif - $:GPU_LOOP(parallelism='[seq]') - do q = vidxb, vidxe - dvel_small = 0._wp - !x-direction contributions - $:GPU_LOOP(parallelism='[seq]') - do i = -1, 1 - rho_L = 0._wp - $:GPU_LOOP(parallelism='[seq]') - do r = 1, num_fluids - rho_L = rho_L + q_cons_vf(r)%sf(j + i, k + q, l) - end do - rho_sf_small(i) = rho_L - end do + $:GPU_ATOMIC(atomic='update') + rhs_vf(momxb + 1)%sf(j, k, l) = rhs_vf(momxb + 1)%sf(j, k, l) + & + 0.5_wp*mu_L*vflux_L_arr(3)*(1._wp/dy(k)) + $:GPU_ATOMIC(atomic='update') + rhs_vf(E_idx)%sf(j, k, l) = rhs_vf(E_idx)%sf(j, k, l) + & + 0.5_wp*mu_L*vflux_L_arr(3)*vel_L(2)*(1._wp/dy(k)) - dvel_small(1) = (1/(2._wp*dx(j)))*( & - q_cons_vf(momxb)%sf(j + 1, k + q, l)/rho_sf_small(1) - & - q_cons_vf(momxb)%sf(j - 1, k + q, l)/rho_sf_small(-1)) - dvel_small(2) = (1/(2._wp*dx(j)))*( & - q_cons_vf(momxb + 1)%sf(j + 1, k + q, l)/rho_sf_small(1) - & - q_cons_vf(momxb + 1)%sf(j - 1, k + q, l)/rho_sf_small(-1)) + $:GPU_ATOMIC(atomic='update') + rhs_vf(momxb + 1)%sf(j, k + 1, l) = rhs_vf(momxb + 1)%sf(j, k + 1, l) - & + 0.5_wp*mu_R*vflux_R_arr(3)*(1._wp/dy(k + 1)) + $:GPU_ATOMIC(atomic='update') + rhs_vf(E_idx)%sf(j, k + 1, l) = rhs_vf(E_idx)%sf(j, k + 1, l) - & + 0.5_wp*mu_R*vflux_R_arr(3)*vel_R(2)*(1._wp/dy(k + 1)) - if (q > vidxb) then - vflux_L_arr(1) = vflux_L_arr(1) + coeff_L(q)*(dvel_small(2)) - vflux_L_arr(3) = vflux_L_arr(3) + coeff_L(q)*(-2._wp*dvel_small(1))/3._wp - end if - if (q < vidxe) then - vflux_R_arr(1) = vflux_R_arr(1) + coeff_R(q)*(dvel_small(2)) - vflux_R_arr(3) = vflux_R_arr(3) + coeff_R(q)*(-2._wp*dvel_small(1))/3._wp - end if + $:GPU_ATOMIC(atomic='update') + rhs_vf(momxb + 1)%sf(j, k, l) = rhs_vf(momxb + 1)%sf(j, k, l) + & + 0.5_wp*mu_R*vflux_R_arr(3)*(1._wp/dy(k)) + $:GPU_ATOMIC(atomic='update') + rhs_vf(E_idx)%sf(j, k, l) = rhs_vf(E_idx)%sf(j, k, l) + & + 0.5_wp*mu_R*vflux_R_arr(3)*vel_R(2)*(1._wp/dy(k)) + end if - !y-direction contributions - $:GPU_LOOP(parallelism='[seq]') - do i = -1, 1 - rho_L = 0._wp - $:GPU_LOOP(parallelism='[seq]') - do r = 1, num_fluids - rho_L = rho_L + q_cons_vf(r)%sf(j, k + i + q, l) - end do - rho_sf_small(i) = rho_L - end do + E_L = 0._wp; E_R = 0._wp + F_L = 0._wp; F_R = 0._wp - dvel_small(1) = (1/(2._wp*dy(k)))*( & - q_cons_vf(momxb)%sf(j, k + 1 + q, l)/rho_sf_small(1) - & - q_cons_vf(momxb)%sf(j, k - 1 + q, l)/rho_sf_small(-1)) - dvel_small(2) = (1/(2._wp*dy(k)))*( & - q_cons_vf(momxb + 1)%sf(j, k + 1 + q, l)/rho_sf_small(1) - & - q_cons_vf(momxb + 1)%sf(j, k - 1 + q, l)/rho_sf_small(-1)) + $:GPU_LOOP(parallelism='[seq]') + do q = vidxb + 1, vidxe + E_L = E_L + coeff_L(q)*q_cons_vf(E_idx)%sf(j, k + q, l) + F_L = F_L + coeff_L(q)*jac(j, k + q, l) + end do - if (q > vidxb) then - vflux_L_arr(1) = vflux_L_arr(1) + coeff_L(q)*(dvel_small(1)) - vflux_L_arr(3) = vflux_L_arr(3) + coeff_L(q)*(4._wp*dvel_small(2))/3._wp - end if - if (q < vidxe) then - vflux_R_arr(1) = vflux_R_arr(1) + coeff_R(q)*(dvel_small(1)) - vflux_R_arr(3) = vflux_R_arr(3) + coeff_R(q)*(4._wp*dvel_small(2))/3._wp - end if + $:GPU_LOOP(parallelism='[seq]') + do q = vidxb, vidxe - 1 + E_R = E_R + coeff_R(q)*q_cons_vf(E_idx)%sf(j, k + q, l) + F_R = F_R + coeff_R(q)*jac(j, k + q, l) end do - end if - alpha_rho_L = 0._wp; alpha_rho_R = 0._wp - alpha_L = 0._wp; alpha_R = 0._wp - vel_L = 0._wp; vel_R = 0._wp + call s_get_derived_states(E_L, gamma_L, pi_inf_L, rho_L, vel_L, & + E_R, gamma_R, pi_inf_R, rho_R, vel_R, & + pres_L, pres_R, cfl) - $:GPU_LOOP(parallelism='[seq]') - do q = vidxb + 1, vidxe $:GPU_LOOP(parallelism='[seq]') do i = 1, num_fluids - alpha_rho_L(i) = alpha_rho_L(i) + coeff_L(q)*q_cons_vf(i)%sf(j, k + q, l) + $:GPU_ATOMIC(atomic='update') + rhs_vf(i)%sf(j, k + 1, l) = rhs_vf(i)%sf(j, k + 1, l) + & + (0.5_wp*(alpha_rho_L(i)* & + vel_L(2))*(1._wp/dy(k + 1)) - & + 0.5_wp*cfl*(alpha_rho_L(i))*(1._wp/dy(k + 1))) + + $:GPU_ATOMIC(atomic='update') + rhs_vf(i)%sf(j, k, l) = rhs_vf(i)%sf(j, k, l) - & + (0.5_wp*(alpha_rho_L(i)* & + vel_L(2))*(1._wp/dy(k)) - & + 0.5_wp*cfl*(alpha_rho_L(i))*(1._wp/dy(k))) end do if (num_fluids > 1) then $:GPU_LOOP(parallelism='[seq]') do i = 1, num_fluids - 1 - alpha_L(i) = alpha_L(i) + coeff_L(q)*q_cons_vf(E_idx + i)%sf(j, k + q, l) + $:GPU_ATOMIC(atomic='update') + rhs_vf(advxb + i - 1)%sf(j, k + 1, l) = rhs_vf(advxb + i - 1)%sf(j, k + 1, l) + & + (0.5_wp*(alpha_L(i)* & + vel_L(2))*(1._wp/dy(k + 1)) - & + 0.5_wp*cfl*(alpha_L(i))*(1._wp/dy(k + 1))) + + $:GPU_ATOMIC(atomic='update') + rhs_vf(advxb + i - 1)%sf(j, k + 1, l) = rhs_vf(advxb + i - 1)%sf(j, k + 1, l) & + - (0.5_wp*q_cons_vf(advxb + i - 1)%sf(j, k + 1, l)*vel_L(2)*(1._wp/dy(k + 1))) + + $:GPU_ATOMIC(atomic='update') + rhs_vf(advxb + i - 1)%sf(j, k, l) = rhs_vf(advxb + i - 1)%sf(j, k, l) - & + (0.5_wp*(alpha_L(i)* & + vel_L(2))*(1._wp/dy(k)) - & + 0.5_wp*cfl*(alpha_L(i))*(1._wp/dy(k))) + + $:GPU_ATOMIC(atomic='update') + rhs_vf(advxb + i - 1)%sf(j, k, l) = rhs_vf(advxb + i - 1)%sf(j, k, l) & + + (0.5_wp*q_cons_vf(advxb + i - 1)%sf(j, k, l)*vel_L(2)*(1._wp/dy(k))) end do - else - alpha_L(1) = 1._wp end if - $:GPU_LOOP(parallelism='[seq]') - do i = 1, num_dims - vel_L(i) = vel_L(i) + coeff_L(q)*q_cons_vf(momxb + i - 1)%sf(j, k + q, l) - end do - end do + $:GPU_ATOMIC(atomic='update') + rhs_vf(momxb + 1)%sf(j, k + 1, l) = rhs_vf(momxb + 1)%sf(j, k + 1, l) + & + (0.5_wp*(rho_L*(vel_L(2))**2.0 + & + pres_L + F_L)*(1._wp/dy(k + 1)) - & + 0.5_wp*cfl*(rho_L*vel_L(2))*(1._wp/dy(k + 1))) + + $:GPU_ATOMIC(atomic='update') + rhs_vf(momxb)%sf(j, k + 1, l) = rhs_vf(momxb)%sf(j, k + 1, l) + & + (0.5_wp*rho_L*vel_L(1)*vel_L(2)*(1._wp/dy(k + 1)) - & + 0.5_wp*cfl*(rho_L*vel_L(1))*(1._wp/dy(k + 1))) + + $:GPU_ATOMIC(atomic='update') + rhs_vf(E_idx)%sf(j, k + 1, l) = rhs_vf(E_idx)%sf(j, k + 1, l) + & + (0.5_wp*(vel_L(2)*(E_L + & + pres_L + F_L))*(1._wp/dy(k + 1)) - & + 0.5_wp*cfl*(E_L)*(1._wp/dy(k + 1))) + + $:GPU_ATOMIC(atomic='update') + rhs_vf(momxb + 1)%sf(j, k, l) = rhs_vf(momxb + 1)%sf(j, k, l) - & + (0.5_wp*(rho_L*(vel_L(2))**2.0 + & + pres_L + F_L)*(1._wp/dy(k)) - & + 0.5_wp*cfl*(rho_L*vel_L(2))*(1._wp/dy(k))) + + $:GPU_ATOMIC(atomic='update') + rhs_vf(momxb)%sf(j, k, l) = rhs_vf(momxb)%sf(j, k, l) - & + (0.5_wp*rho_L*vel_L(1)*vel_L(2)*(1._wp/dy(k)) - & + 0.5_wp*cfl*(rho_L*vel_L(1))*(1._wp/dy(k))) + + $:GPU_ATOMIC(atomic='update') + rhs_vf(E_idx)%sf(j, k, l) = rhs_vf(E_idx)%sf(j, k, l) - & + (0.5_wp*(vel_L(2)*(E_L + & + pres_L + F_L))*(1._wp/dy(k)) - & + 0.5_wp*cfl*(E_L)*(1._wp/dy(k))) - $:GPU_LOOP(parallelism='[seq]') - do q = vidxb, vidxe - 1 $:GPU_LOOP(parallelism='[seq]') do i = 1, num_fluids - alpha_rho_R(i) = alpha_rho_R(i) + coeff_R(q)*q_cons_vf(i)%sf(j, k + q, l) + $:GPU_ATOMIC(atomic='update') + rhs_vf(i)%sf(j, k + 1, l) = rhs_vf(i)%sf(j, k + 1, l) + & + (0.5_wp*(alpha_rho_R(i)* & + vel_R(2))*(1._wp/dy(k + 1)) + & + 0.5_wp*cfl*(alpha_rho_R(i))*(1._wp/dy(k + 1))) + $:GPU_ATOMIC(atomic='update') + rhs_vf(i)%sf(j, k, l) = rhs_vf(i)%sf(j, k, l) - & + (0.5_wp*(alpha_rho_R(i)* & + vel_R(2))*(1._wp/dy(k)) + & + 0.5_wp*cfl*(alpha_rho_R(i))*(1._wp/dy(k))) end do if (num_fluids > 1) then $:GPU_LOOP(parallelism='[seq]') do i = 1, num_fluids - 1 - alpha_R(i) = alpha_R(i) + coeff_R(q)*q_cons_vf(E_idx + i)%sf(j, k + q, l) + $:GPU_ATOMIC(atomic='update') + rhs_vf(advxb + i - 1)%sf(j, k + 1, l) = rhs_vf(advxb + i - 1)%sf(j, k + 1, l) + & + (0.5_wp*(alpha_R(i)* & + vel_R(2))*(1._wp/dy(k + 1)) + & + 0.5_wp*cfl*(alpha_R(i))*(1._wp/dy(k + 1))) + + $:GPU_ATOMIC(atomic='update') + rhs_vf(advxb + i - 1)%sf(j, k + 1, l) = rhs_vf(advxb + i - 1)%sf(j, k + 1, l) & + - (0.5_wp*q_cons_vf(advxb + i - 1)%sf(j, k + 1, l)*vel_R(2)*(1._wp/dy(k + 1))) + + $:GPU_ATOMIC(atomic='update') + rhs_vf(advxb + i - 1)%sf(j, k, l) = rhs_vf(advxb + i - 1)%sf(j, k, l) - & + (0.5_wp*(alpha_R(i)* & + vel_R(2))*(1._wp/dy(k)) + & + 0.5_wp*cfl*(alpha_R(i))*(1._wp/dy(k))) + + $:GPU_ATOMIC(atomic='update') + rhs_vf(advxb + i - 1)%sf(j, k, l) = rhs_vf(advxb + i - 1)%sf(j, k, l) & + + (0.5_wp*q_cons_vf(advxb + i - 1)%sf(j, k, l)*vel_R(2)*(1._wp/dy(k))) end do - else - alpha_R(1) = 1._wp end if - - $:GPU_LOOP(parallelism='[seq]') - do i = 1, num_dims - vel_R(i) = vel_R(i) + coeff_R(q)*q_cons_vf(momxb + i - 1)%sf(j, k + q, l) - end do + $:GPU_ATOMIC(atomic='update') + rhs_vf(momxb + 1)%sf(j, k + 1, l) = rhs_vf(momxb + 1)%sf(j, k + 1, l) + & + (0.5_wp*(rho_R*(vel_R(2))**2.0 + & + pres_R + F_R)*(1._wp/dy(k + 1)) + & + 0.5_wp*cfl*(rho_R*vel_R(2))*(1._wp/dy(k + 1))) + $:GPU_ATOMIC(atomic='update') + rhs_vf(momxb)%sf(j, k + 1, l) = rhs_vf(momxb)%sf(j, k + 1, l) + & + (0.5_wp*rho_R*vel_R(2)*vel_R(1)*(1._wp/dy(k + 1)) + & + 0.5_wp*cfl*(rho_R*vel_R(1))*(1._wp/dy(k + 1))) + $:GPU_ATOMIC(atomic='update') + rhs_vf(E_idx)%sf(j, k + 1, l) = rhs_vf(E_idx)%sf(j, k + 1, l) + & + (0.5_wp*(vel_R(2)*(E_R + & + pres_R + F_R))*(1._wp/dy(k + 1)) + & + 0.5_wp*cfl*(E_R)*(1._wp/dy(k + 1))) + $:GPU_ATOMIC(atomic='update') + rhs_vf(momxb + 1)%sf(j, k, l) = rhs_vf(momxb + 1)%sf(j, k, l) - & + (0.5_wp*(rho_R*(vel_R(2))**2.0 + & + pres_R + F_R)*(1._wp/dy(k)) + & + 0.5_wp*cfl*(rho_R*vel_R(2))*(1._wp/dy(k))) + $:GPU_ATOMIC(atomic='update') + rhs_vf(momxb)%sf(j, k, l) = rhs_vf(momxb)%sf(j, k, l) - & + (0.5_wp*rho_R*vel_R(2)*vel_R(1)*(1._wp/dy(k)) + & + 0.5_wp*cfl*(rho_R*vel_R(1))*(1._wp/dy(k))) + $:GPU_ATOMIC(atomic='update') + rhs_vf(E_idx)%sf(j, k, l) = rhs_vf(E_idx)%sf(j, k, l) - & + (0.5_wp*(vel_R(2)*(E_R + & + pres_R + F_R))*(1._wp/dy(k)) + & + 0.5_wp*cfl*(E_R)*(1._wp/dy(k))) end do + end do + end do + #:endcall GPU_PARALLEL_LOOP + else + #:call GPU_PARALLEL_LOOP(collapse=3, private='[rho_L, rho_R, gamma_L, gamma_R, pi_inf_L, pi_inf_R, mu_L, mu_R, vel_L, vel_R, pres_L, pres_R, alpha_L, alpha_R, alpha_rho_L, alpha_rho_R, F_L, F_R, E_L, E_R, cfl, dvel_small, rho_sf_small, vflux_L_arr, vflux_R_arr]') + do l = 0, p + do k = -1, n + do j = 0, m + + if (viscous) then + vflux_L_arr = 0._wp + vflux_R_arr = 0._wp + + #:if MFC_CASE_OPTIMIZATION + #:if igr_order == 5 + !DIR$ unroll 6 + #:elif igr_order == 3 + !DIR$ unroll 4 + #:endif + #:endif + $:GPU_LOOP(parallelism='[seq]') + do q = vidxb, vidxe + dvel_small = 0._wp + !x-direction contributions + $:GPU_LOOP(parallelism='[seq]') + do i = -1, 1 + rho_L = 0._wp + $:GPU_LOOP(parallelism='[seq]') + do r = 1, num_fluids + rho_L = rho_L + q_cons_vf(r)%sf(j + i, k + q, l) + end do + rho_sf_small(i) = rho_L + end do - if (num_fluids > 1) then - alpha_L(num_fluids) = 1._wp - sum(alpha_L(1:num_fluids - 1)) - alpha_R(num_fluids) = 1._wp - sum(alpha_R(1:num_fluids - 1)) - end if + dvel_small(1) = (1/(2._wp*dx(j)))*( & + q_cons_vf(momxb)%sf(j + 1, k + q, l)/rho_sf_small(1) - & + q_cons_vf(momxb)%sf(j - 1, k + q, l)/rho_sf_small(-1)) + dvel_small(2) = (1/(2._wp*dx(j)))*( & + q_cons_vf(momxb + 1)%sf(j + 1, k + q, l)/rho_sf_small(1) - & + q_cons_vf(momxb + 1)%sf(j - 1, k + q, l)/rho_sf_small(-1)) + + if (q > vidxb) then + vflux_L_arr(1) = vflux_L_arr(1) + coeff_L(q)*(dvel_small(2)) + vflux_L_arr(3) = vflux_L_arr(3) + coeff_L(q)*(-2._wp*dvel_small(1))/3._wp + end if + if (q < vidxe) then + vflux_R_arr(1) = vflux_R_arr(1) + coeff_R(q)*(dvel_small(2)) + vflux_R_arr(3) = vflux_R_arr(3) + coeff_R(q)*(-2._wp*dvel_small(1))/3._wp + end if + + !y-direction contributions + $:GPU_LOOP(parallelism='[seq]') + do i = -1, 1 + rho_L = 0._wp + $:GPU_LOOP(parallelism='[seq]') + do r = 1, num_fluids + rho_L = rho_L + q_cons_vf(r)%sf(j, k + i + q, l) + end do + rho_sf_small(i) = rho_L + end do - rho_L = sum(alpha_rho_L) - gamma_L = sum(alpha_L*gammas) - pi_inf_L = sum(alpha_L*pi_infs) + dvel_small(1) = (1/(2._wp*dy(k)))*( & + q_cons_vf(momxb)%sf(j, k + 1 + q, l)/rho_sf_small(1) - & + q_cons_vf(momxb)%sf(j, k - 1 + q, l)/rho_sf_small(-1)) + dvel_small(2) = (1/(2._wp*dy(k)))*( & + q_cons_vf(momxb + 1)%sf(j, k + 1 + q, l)/rho_sf_small(1) - & + q_cons_vf(momxb + 1)%sf(j, k - 1 + q, l)/rho_sf_small(-1)) + dvel_small(3) = (1/(2._wp*dy(k)))*( & + q_cons_vf(momxb + 2)%sf(j, k + 1 + q, l)/rho_sf_small(1) - & + q_cons_vf(momxb + 2)%sf(j, k - 1 + q, l)/rho_sf_small(-1)) + + if (q > vidxb) then + vflux_L_arr(1) = vflux_L_arr(1) + coeff_L(q)*(dvel_small(1)) + vflux_L_arr(2) = vflux_L_arr(2) + coeff_L(q)*(dvel_small(3)) + vflux_L_arr(3) = vflux_L_arr(3) + coeff_L(q)*(4._wp*dvel_small(2))/3._wp + end if + if (q < vidxe) then + vflux_R_arr(1) = vflux_R_arr(1) + coeff_R(q)*(dvel_small(1)) + vflux_R_arr(2) = vflux_R_arr(2) + coeff_R(q)*(dvel_small(3)) + vflux_R_arr(3) = vflux_R_arr(3) + coeff_R(q)*(4._wp*dvel_small(2))/3._wp + end if + + !z-direction contributions + $:GPU_LOOP(parallelism='[seq]') + do i = -1, 1 + rho_L = 0._wp + $:GPU_LOOP(parallelism='[seq]') + do r = 1, num_fluids + rho_L = rho_L + q_cons_vf(r)%sf(j, k + q, l + i) + end do + rho_sf_small(i) = rho_L + end do - rho_R = sum(alpha_rho_R) - gamma_R = sum(alpha_R*gammas) - pi_inf_R = sum(alpha_R*pi_infs) + dvel_small(2) = (1/(2._wp*dz(l)))*( & + q_cons_vf(momxb + 1)%sf(j, k + q, l + 1)/rho_sf_small(1) - & + q_cons_vf(momxb + 1)%sf(j, k + q, l - 1)/rho_sf_small(-1)) + dvel_small(3) = (1/(2._wp*dz(l)))*( & + q_cons_vf(momxb + 2)%sf(j, k + q, l + 1)/rho_sf_small(1) - & + q_cons_vf(momxb + 2)%sf(j, k + q, l - 1)/rho_sf_small(-1)) + if (q > vidxb) then + vflux_L_arr(2) = vflux_L_arr(2) + coeff_L(q)*(dvel_small(2)) + vflux_L_arr(3) = vflux_L_arr(3) + coeff_L(q)*(-2._wp*dvel_small(3))/3._wp + end if + if (q < vidxe) then + vflux_R_arr(2) = vflux_R_arr(2) + coeff_R(q)*(dvel_small(2)) + vflux_R_arr(3) = vflux_R_arr(3) + coeff_R(q)*(-2._wp*dvel_small(3))/3._wp + end if + end do + end if - vel_L = vel_L/rho_L - vel_R = vel_R/rho_R + alpha_rho_L = 0._wp; alpha_rho_R = 0._wp + alpha_L = 0._wp; alpha_R = 0._wp + vel_L = 0._wp; vel_R = 0._wp - if (viscous) then - mu_L = 0._wp - mu_R = 0._wp $:GPU_LOOP(parallelism='[seq]') - do i = 1, num_fluids - mu_L = alpha_L(i)/Res(1, i) + mu_L - mu_R = alpha_R(i)/Res(1, i) + mu_R + do q = vidxb + 1, vidxe + $:GPU_LOOP(parallelism='[seq]') + do i = 1, num_fluids + alpha_rho_L(i) = alpha_rho_L(i) + coeff_L(q)*q_cons_vf(i)%sf(j, k + q, l) + end do + + if (num_fluids > 1) then + $:GPU_LOOP(parallelism='[seq]') + do i = 1, num_fluids - 1 + alpha_L(i) = alpha_L(i) + coeff_L(q)*q_cons_vf(E_idx + i)%sf(j, k + q, l) + end do + else + alpha_L(1) = 1._wp + end if + + $:GPU_LOOP(parallelism='[seq]') + do i = 1, num_dims + vel_L(i) = vel_L(i) + coeff_L(q)*q_cons_vf(momxb + i - 1)%sf(j, k + q, l) + end do end do - $:GPU_ATOMIC(atomic='update') - rhs_vf(momxb)%sf(j, k + 1, l) = rhs_vf(momxb)%sf(j, k + 1, l) - & - 0.5_wp*mu_L*vflux_L_arr(1)*(1._wp/dy(k + 1)) - $:GPU_ATOMIC(atomic='update') - rhs_vf(E_idx)%sf(j, k + 1, l) = rhs_vf(E_idx)%sf(j, k + 1, l) - & - 0.5_wp*mu_L*vflux_L_arr(1)*vel_L(1)*(1._wp/dy(k + 1)) + $:GPU_LOOP(parallelism='[seq]') + do q = vidxb, vidxe - 1 + $:GPU_LOOP(parallelism='[seq]') + do i = 1, num_fluids + alpha_rho_R(i) = alpha_rho_R(i) + coeff_R(q)*q_cons_vf(i)%sf(j, k + q, l) + end do - $:GPU_ATOMIC(atomic='update') - rhs_vf(momxb)%sf(j, k, l) = rhs_vf(momxb)%sf(j, k, l) + & - 0.5_wp*mu_L*vflux_L_arr(1)*(1._wp/dy(k)) - $:GPU_ATOMIC(atomic='update') - rhs_vf(E_idx)%sf(j, k, l) = rhs_vf(E_idx)%sf(j, k, l) + & - 0.5_wp*mu_L*vflux_L_arr(1)*vel_L(1)*(1._wp/dy(k)) + if (num_fluids > 1) then + $:GPU_LOOP(parallelism='[seq]') + do i = 1, num_fluids - 1 + alpha_R(i) = alpha_R(i) + coeff_R(q)*q_cons_vf(E_idx + i)%sf(j, k + q, l) + end do + else + alpha_R(1) = 1._wp + end if - $:GPU_ATOMIC(atomic='update') - rhs_vf(momxb)%sf(j, k + 1, l) = rhs_vf(momxb)%sf(j, k + 1, l) - & - 0.5_wp*mu_R*vflux_R_arr(1)*(1._wp/dy(k + 1)) - $:GPU_ATOMIC(atomic='update') - rhs_vf(E_idx)%sf(j, k + 1, l) = rhs_vf(E_idx)%sf(j, k + 1, l) - & - 0.5_wp*mu_R*vflux_R_arr(1)*vel_R(1)*(1._wp/dy(k + 1)) + $:GPU_LOOP(parallelism='[seq]') + do i = 1, num_dims + vel_R(i) = vel_R(i) + coeff_R(q)*q_cons_vf(momxb + i - 1)%sf(j, k + q, l) + end do + end do - $:GPU_ATOMIC(atomic='update') - rhs_vf(momxb)%sf(j, k, l) = rhs_vf(momxb)%sf(j, k, l) + & - 0.5_wp*mu_R*vflux_R_arr(1)*(1._wp/dy(k)) - $:GPU_ATOMIC(atomic='update') - rhs_vf(E_idx)%sf(j, k, l) = rhs_vf(E_idx)%sf(j, k, l) + & - 0.5_wp*mu_R*vflux_R_arr(1)*vel_R(1)*(1._wp/dy(k)) + if (num_fluids > 1) then + alpha_L(num_fluids) = 1._wp - sum(alpha_L(1:num_fluids - 1)) + alpha_R(num_fluids) = 1._wp - sum(alpha_R(1:num_fluids - 1)) + end if - $:GPU_ATOMIC(atomic='update') - rhs_vf(momxb + 1)%sf(j, k + 1, l) = rhs_vf(momxb + 1)%sf(j, k + 1, l) - & - 0.5_wp*mu_L*vflux_L_arr(3)*(1._wp/dy(k + 1)) - $:GPU_ATOMIC(atomic='update') - rhs_vf(E_idx)%sf(j, k + 1, l) = rhs_vf(E_idx)%sf(j, k + 1, l) - & - 0.5_wp*mu_L*vflux_L_arr(3)*vel_L(2)*(1._wp/dy(k + 1)) + rho_L = sum(alpha_rho_L) + gamma_L = sum(alpha_L*gammas) + pi_inf_L = sum(alpha_L*pi_infs) - $:GPU_ATOMIC(atomic='update') - rhs_vf(momxb + 1)%sf(j, k, l) = rhs_vf(momxb + 1)%sf(j, k, l) + & - 0.5_wp*mu_L*vflux_L_arr(3)*(1._wp/dy(k)) - $:GPU_ATOMIC(atomic='update') - rhs_vf(E_idx)%sf(j, k, l) = rhs_vf(E_idx)%sf(j, k, l) + & - 0.5_wp*mu_L*vflux_L_arr(3)*vel_L(2)*(1._wp/dy(k)) + rho_R = sum(alpha_rho_R) + gamma_R = sum(alpha_R*gammas) + pi_inf_R = sum(alpha_R*pi_infs) - $:GPU_ATOMIC(atomic='update') - rhs_vf(momxb + 1)%sf(j, k + 1, l) = rhs_vf(momxb + 1)%sf(j, k + 1, l) - & - 0.5_wp*mu_R*vflux_R_arr(3)*(1._wp/dy(k + 1)) - $:GPU_ATOMIC(atomic='update') - rhs_vf(E_idx)%sf(j, k + 1, l) = rhs_vf(E_idx)%sf(j, k + 1, l) - & - 0.5_wp*mu_R*vflux_R_arr(3)*vel_R(2)*(1._wp/dy(k + 1)) + vel_L = vel_L/rho_L + vel_R = vel_R/rho_R - $:GPU_ATOMIC(atomic='update') - rhs_vf(momxb + 1)%sf(j, k, l) = rhs_vf(momxb + 1)%sf(j, k, l) + & - 0.5_wp*mu_R*vflux_R_arr(3)*(1._wp/dy(k)) - $:GPU_ATOMIC(atomic='update') - rhs_vf(E_idx)%sf(j, k, l) = rhs_vf(E_idx)%sf(j, k, l) + & - 0.5_wp*mu_R*vflux_R_arr(3)*vel_R(2)*(1._wp/dy(k)) - end if + if (viscous) then + mu_L = 0._wp + mu_R = 0._wp + $:GPU_LOOP(parallelism='[seq]') + do i = 1, num_fluids + mu_L = alpha_L(i)/Res_igr(1, i) + mu_L + mu_R = alpha_R(i)/Res_igr(1, i) + mu_R + end do - E_L = 0._wp; E_R = 0._wp - F_L = 0._wp; F_R = 0._wp + $:GPU_ATOMIC(atomic='update') + rhs_vf(momxb)%sf(j, k + 1, l) = rhs_vf(momxb)%sf(j, k + 1, l) - & + 0.5_wp*mu_L*vflux_L_arr(1)*(1._wp/dy(k + 1)) + $:GPU_ATOMIC(atomic='update') + rhs_vf(E_idx)%sf(j, k + 1, l) = rhs_vf(E_idx)%sf(j, k + 1, l) - & + 0.5_wp*mu_L*vflux_L_arr(1)*vel_L(1)*(1._wp/dy(k + 1)) - $:GPU_LOOP(parallelism='[seq]') - do q = vidxb + 1, vidxe - E_L = E_L + coeff_L(q)*q_cons_vf(E_idx)%sf(j, k + q, l) - F_L = F_L + coeff_L(q)*jac(j, k + q, l) - end do + $:GPU_ATOMIC(atomic='update') + rhs_vf(momxb)%sf(j, k, l) = rhs_vf(momxb)%sf(j, k, l) + & + 0.5_wp*mu_L*vflux_L_arr(1)*(1._wp/dy(k)) + $:GPU_ATOMIC(atomic='update') + rhs_vf(E_idx)%sf(j, k, l) = rhs_vf(E_idx)%sf(j, k, l) + & + 0.5_wp*mu_L*vflux_L_arr(1)*vel_L(1)*(1._wp/dy(k)) - $:GPU_LOOP(parallelism='[seq]') - do q = vidxb, vidxe - 1 - E_R = E_R + coeff_R(q)*q_cons_vf(E_idx)%sf(j, k + q, l) - F_R = F_R + coeff_R(q)*jac(j, k + q, l) - end do + $:GPU_ATOMIC(atomic='update') + rhs_vf(momxb)%sf(j, k + 1, l) = rhs_vf(momxb)%sf(j, k + 1, l) - & + 0.5_wp*mu_R*vflux_R_arr(1)*(1._wp/dy(k + 1)) + $:GPU_ATOMIC(atomic='update') + rhs_vf(E_idx)%sf(j, k + 1, l) = rhs_vf(E_idx)%sf(j, k + 1, l) - & + 0.5_wp*mu_R*vflux_R_arr(1)*vel_R(1)*(1._wp/dy(k + 1)) - call s_get_derived_states(E_L, gamma_L, pi_inf_L, rho_L, vel_L, & - E_R, gamma_R, pi_inf_R, rho_R, vel_R, & - pres_L, pres_R, cfl) + $:GPU_ATOMIC(atomic='update') + rhs_vf(momxb)%sf(j, k, l) = rhs_vf(momxb)%sf(j, k, l) + & + 0.5_wp*mu_R*vflux_R_arr(1)*(1._wp/dy(k)) + $:GPU_ATOMIC(atomic='update') + rhs_vf(E_idx)%sf(j, k, l) = rhs_vf(E_idx)%sf(j, k, l) + & + 0.5_wp*mu_R*vflux_R_arr(1)*vel_R(1)*(1._wp/dy(k)) - $:GPU_LOOP(parallelism='[seq]') - do i = 1, num_fluids - $:GPU_ATOMIC(atomic='update') - rhs_vf(i)%sf(j, k + 1, l) = rhs_vf(i)%sf(j, k + 1, l) + & - (0.5_wp*(alpha_rho_L(i)* & - vel_L(2))*(1._wp/dy(k + 1)) - & - 0.5_wp*cfl*(alpha_rho_L(i))*(1._wp/dy(k + 1))) + $:GPU_ATOMIC(atomic='update') + rhs_vf(momxb + 2)%sf(j, k + 1, l) = rhs_vf(momxb + 2)%sf(j, k + 1, l) - & + 0.5_wp*mu_L*vflux_L_arr(2)*(1._wp/dy(k + 1)) + $:GPU_ATOMIC(atomic='update') + rhs_vf(E_idx)%sf(j, k + 1, l) = rhs_vf(E_idx)%sf(j, k + 1, l) - & + 0.5_wp*mu_L*vflux_L_arr(2)*vel_L(3)*(1._wp/dy(k + 1)) - $:GPU_ATOMIC(atomic='update') - rhs_vf(i)%sf(j, k, l) = rhs_vf(i)%sf(j, k, l) - & - (0.5_wp*(alpha_rho_L(i)* & - vel_L(2))*(1._wp/dy(k)) - & - 0.5_wp*cfl*(alpha_rho_L(i))*(1._wp/dy(k))) - end do + $:GPU_ATOMIC(atomic='update') + rhs_vf(momxb + 2)%sf(j, k, l) = rhs_vf(momxb + 2)%sf(j, k, l) + & + 0.5_wp*mu_L*vflux_L_arr(2)*(1._wp/dy(k)) + $:GPU_ATOMIC(atomic='update') + rhs_vf(E_idx)%sf(j, k, l) = rhs_vf(E_idx)%sf(j, k, l) + & + 0.5_wp*mu_L*vflux_L_arr(2)*vel_L(3)*(1._wp/dy(k)) - if (num_fluids > 1) then - $:GPU_LOOP(parallelism='[seq]') - do i = 1, num_fluids - 1 $:GPU_ATOMIC(atomic='update') - rhs_vf(advxb + i - 1)%sf(j, k + 1, l) = rhs_vf(advxb + i - 1)%sf(j, k + 1, l) + & - (0.5_wp*(alpha_L(i)* & - vel_L(2))*(1._wp/dy(k + 1)) - & - 0.5_wp*cfl*(alpha_L(i))*(1._wp/dy(k + 1))) + rhs_vf(momxb + 2)%sf(j, k + 1, l) = rhs_vf(momxb + 2)%sf(j, k + 1, l) - & + 0.5_wp*mu_R*vflux_R_arr(2)*(1._wp/dy(k + 1)) + $:GPU_ATOMIC(atomic='update') + rhs_vf(E_idx)%sf(j, k + 1, l) = rhs_vf(E_idx)%sf(j, k + 1, l) - & + 0.5_wp*mu_R*vflux_R_arr(2)*vel_R(3)*(1._wp/dy(k + 1)) $:GPU_ATOMIC(atomic='update') - rhs_vf(advxb + i - 1)%sf(j, k + 1, l) = rhs_vf(advxb + i - 1)%sf(j, k + 1, l) & - - (0.5_wp*q_cons_vf(advxb + i - 1)%sf(j, k + 1, l)*vel_L(2)*(1._wp/dy(k + 1))) + rhs_vf(momxb + 2)%sf(j, k, l) = rhs_vf(momxb + 2)%sf(j, k, l) + & + 0.5_wp*mu_R*vflux_R_arr(2)*(1._wp/dy(k)) + $:GPU_ATOMIC(atomic='update') + rhs_vf(E_idx)%sf(j, k, l) = rhs_vf(E_idx)%sf(j, k, l) + & + 0.5_wp*mu_R*vflux_R_arr(2)*vel_R(3)*(1._wp/dy(k)) $:GPU_ATOMIC(atomic='update') - rhs_vf(advxb + i - 1)%sf(j, k, l) = rhs_vf(advxb + i - 1)%sf(j, k, l) - & - (0.5_wp*(alpha_L(i)* & - vel_L(2))*(1._wp/dy(k)) - & - 0.5_wp*cfl*(alpha_L(i))*(1._wp/dy(k))) + rhs_vf(momxb + 1)%sf(j, k + 1, l) = rhs_vf(momxb + 1)%sf(j, k + 1, l) - & + 0.5_wp*mu_L*vflux_L_arr(3)*(1._wp/dy(k + 1)) + $:GPU_ATOMIC(atomic='update') + rhs_vf(E_idx)%sf(j, k + 1, l) = rhs_vf(E_idx)%sf(j, k + 1, l) - & + 0.5_wp*mu_L*vflux_L_arr(3)*vel_L(2)*(1._wp/dy(k + 1)) $:GPU_ATOMIC(atomic='update') - rhs_vf(advxb + i - 1)%sf(j, k, l) = rhs_vf(advxb + i - 1)%sf(j, k, l) & - + (0.5_wp*q_cons_vf(advxb + i - 1)%sf(j, k, l)*vel_L(2)*(1._wp/dy(k))) + rhs_vf(momxb + 1)%sf(j, k, l) = rhs_vf(momxb + 1)%sf(j, k, l) + & + 0.5_wp*mu_L*vflux_L_arr(3)*(1._wp/dy(k)) + $:GPU_ATOMIC(atomic='update') + rhs_vf(E_idx)%sf(j, k, l) = rhs_vf(E_idx)%sf(j, k, l) + & + 0.5_wp*mu_L*vflux_L_arr(3)*vel_L(2)*(1._wp/dy(k)) + + $:GPU_ATOMIC(atomic='update') + rhs_vf(momxb + 1)%sf(j, k + 1, l) = rhs_vf(momxb + 1)%sf(j, k + 1, l) - & + 0.5_wp*mu_R*vflux_R_arr(3)*(1._wp/dy(k + 1)) + $:GPU_ATOMIC(atomic='update') + rhs_vf(E_idx)%sf(j, k + 1, l) = rhs_vf(E_idx)%sf(j, k + 1, l) - & + 0.5_wp*mu_R*vflux_R_arr(3)*vel_R(2)*(1._wp/dy(k + 1)) + + $:GPU_ATOMIC(atomic='update') + rhs_vf(momxb + 1)%sf(j, k, l) = rhs_vf(momxb + 1)%sf(j, k, l) + & + 0.5_wp*mu_R*vflux_R_arr(3)*(1._wp/dy(k)) + $:GPU_ATOMIC(atomic='update') + rhs_vf(E_idx)%sf(j, k, l) = rhs_vf(E_idx)%sf(j, k, l) + & + 0.5_wp*mu_R*vflux_R_arr(3)*vel_R(2)*(1._wp/dy(k)) + end if + + E_L = 0._wp; E_R = 0._wp + F_L = 0._wp; F_R = 0._wp + + $:GPU_LOOP(parallelism='[seq]') + do q = vidxb + 1, vidxe + E_L = E_L + coeff_L(q)*q_cons_vf(E_idx)%sf(j, k + q, l) + F_L = F_L + coeff_L(q)*jac(j, k + q, l) end do - end if - $:GPU_ATOMIC(atomic='update') - rhs_vf(momxb + 1)%sf(j, k + 1, l) = rhs_vf(momxb + 1)%sf(j, k + 1, l) + & - (0.5_wp*(rho_L*(vel_L(2))**2.0 + & - pres_L + F_L)*(1._wp/dy(k + 1)) - & - 0.5_wp*cfl*(rho_L*vel_L(2))*(1._wp/dy(k + 1))) + $:GPU_LOOP(parallelism='[seq]') + do q = vidxb, vidxe - 1 + E_R = E_R + coeff_R(q)*q_cons_vf(E_idx)%sf(j, k + q, l) + F_R = F_R + coeff_R(q)*jac(j, k + q, l) + end do - $:GPU_ATOMIC(atomic='update') - rhs_vf(momxb)%sf(j, k + 1, l) = rhs_vf(momxb)%sf(j, k + 1, l) + & - (0.5_wp*rho_L*vel_L(1)*vel_L(2)*(1._wp/dy(k + 1)) - & - 0.5_wp*cfl*(rho_L*vel_L(1))*(1._wp/dy(k + 1))) + call s_get_derived_states(E_L, gamma_L, pi_inf_L, rho_L, vel_L, & + E_R, gamma_R, pi_inf_R, rho_R, vel_R, & + pres_L, pres_R, cfl) - $:GPU_ATOMIC(atomic='update') - rhs_vf(E_idx)%sf(j, k + 1, l) = rhs_vf(E_idx)%sf(j, k + 1, l) + & - (0.5_wp*(vel_L(2)*(E_L + & - pres_L + F_L))*(1._wp/dy(k + 1)) - & - 0.5_wp*cfl*(E_L)*(1._wp/dy(k + 1))) + $:GPU_LOOP(parallelism='[seq]') + do i = 1, num_fluids + $:GPU_ATOMIC(atomic='update') + rhs_vf(i)%sf(j, k + 1, l) = rhs_vf(i)%sf(j, k + 1, l) + & + (0.5_wp*(alpha_rho_L(i)* & + vel_L(2))*(1._wp/dy(k + 1)) - & + 0.5_wp*cfl*(alpha_rho_L(i))*(1._wp/dy(k + 1))) - $:GPU_ATOMIC(atomic='update') - rhs_vf(momxb + 1)%sf(j, k, l) = rhs_vf(momxb + 1)%sf(j, k, l) - & - (0.5_wp*(rho_L*(vel_L(2))**2.0 + & - pres_L + F_L)*(1._wp/dy(k)) - & - 0.5_wp*cfl*(rho_L*vel_L(2))*(1._wp/dy(k))) + $:GPU_ATOMIC(atomic='update') + rhs_vf(i)%sf(j, k, l) = rhs_vf(i)%sf(j, k, l) - & + (0.5_wp*(alpha_rho_L(i)* & + vel_L(2))*(1._wp/dy(k)) - & + 0.5_wp*cfl*(alpha_rho_L(i))*(1._wp/dy(k))) + end do - $:GPU_ATOMIC(atomic='update') - rhs_vf(momxb)%sf(j, k, l) = rhs_vf(momxb)%sf(j, k, l) - & - (0.5_wp*rho_L*vel_L(1)*vel_L(2)*(1._wp/dy(k)) - & - 0.5_wp*cfl*(rho_L*vel_L(1))*(1._wp/dy(k))) + if (num_fluids > 1) then + $:GPU_LOOP(parallelism='[seq]') + do i = 1, num_fluids - 1 + $:GPU_ATOMIC(atomic='update') + rhs_vf(advxb + i - 1)%sf(j, k + 1, l) = rhs_vf(advxb + i - 1)%sf(j, k + 1, l) + & + (0.5_wp*(alpha_L(i)* & + vel_L(2))*(1._wp/dy(k + 1)) - & + 0.5_wp*cfl*(alpha_L(i))*(1._wp/dy(k + 1))) + + $:GPU_ATOMIC(atomic='update') + rhs_vf(advxb + i - 1)%sf(j, k + 1, l) = rhs_vf(advxb + i - 1)%sf(j, k + 1, l) & + - (0.5_wp*q_cons_vf(advxb + i - 1)%sf(j, k + 1, l)*vel_L(2)*(1._wp/dy(k + 1))) + + $:GPU_ATOMIC(atomic='update') + rhs_vf(advxb + i - 1)%sf(j, k, l) = rhs_vf(advxb + i - 1)%sf(j, k, l) - & + (0.5_wp*(alpha_L(i)* & + vel_L(2))*(1._wp/dy(k)) - & + 0.5_wp*cfl*(alpha_L(i))*(1._wp/dy(k))) - $:GPU_ATOMIC(atomic='update') - rhs_vf(E_idx)%sf(j, k, l) = rhs_vf(E_idx)%sf(j, k, l) - & - (0.5_wp*(vel_L(2)*(E_L + & - pres_L + F_L))*(1._wp/dy(k)) - & - 0.5_wp*cfl*(E_L)*(1._wp/dy(k))) + $:GPU_ATOMIC(atomic='update') + rhs_vf(advxb + i - 1)%sf(j, k, l) = rhs_vf(advxb + i - 1)%sf(j, k, l) & + + (0.5_wp*q_cons_vf(advxb + i - 1)%sf(j, k, l)*vel_L(2)*(1._wp/dy(k))) + end do + end if - $:GPU_LOOP(parallelism='[seq]') - do i = 1, num_fluids $:GPU_ATOMIC(atomic='update') - rhs_vf(i)%sf(j, k + 1, l) = rhs_vf(i)%sf(j, k + 1, l) + & - (0.5_wp*(alpha_rho_R(i)* & - vel_R(2))*(1._wp/dy(k + 1)) + & - 0.5_wp*cfl*(alpha_rho_R(i))*(1._wp/dy(k + 1))) + rhs_vf(momxb + 1)%sf(j, k + 1, l) = rhs_vf(momxb + 1)%sf(j, k + 1, l) + & + (0.5_wp*(rho_L*(vel_L(2))**2.0 + & + pres_L + F_L)*(1._wp/dy(k + 1)) - & + 0.5_wp*cfl*(rho_L*vel_L(2))*(1._wp/dy(k + 1))) + $:GPU_ATOMIC(atomic='update') - rhs_vf(i)%sf(j, k, l) = rhs_vf(i)%sf(j, k, l) - & - (0.5_wp*(alpha_rho_R(i)* & - vel_R(2))*(1._wp/dy(k)) + & - 0.5_wp*cfl*(alpha_rho_R(i))*(1._wp/dy(k))) - end do + rhs_vf(momxb)%sf(j, k + 1, l) = rhs_vf(momxb)%sf(j, k + 1, l) + & + (0.5_wp*rho_L*vel_L(1)*vel_L(2)*(1._wp/dy(k + 1)) - & + 0.5_wp*cfl*(rho_L*vel_L(1))*(1._wp/dy(k + 1))) + + $:GPU_ATOMIC(atomic='update') + rhs_vf(momxb + 2)%sf(j, k + 1, l) = rhs_vf(momxb + 2)%sf(j, k + 1, l) + & + (0.5_wp*rho_L*vel_L(3)*vel_L(2)*(1._wp/dy(k + 1)) - & + 0.5_wp*cfl*(rho_L*vel_L(3))*(1._wp/dy(k + 1))) + + $:GPU_ATOMIC(atomic='update') + rhs_vf(E_idx)%sf(j, k + 1, l) = rhs_vf(E_idx)%sf(j, k + 1, l) + & + (0.5_wp*(vel_L(2)*(E_L + & + pres_L + F_L))*(1._wp/dy(k + 1)) - & + 0.5_wp*cfl*(E_L)*(1._wp/dy(k + 1))) + + $:GPU_ATOMIC(atomic='update') + rhs_vf(momxb + 1)%sf(j, k, l) = rhs_vf(momxb + 1)%sf(j, k, l) - & + (0.5_wp*(rho_L*(vel_L(2))**2.0 + & + pres_L + F_L)*(1._wp/dy(k)) - & + 0.5_wp*cfl*(rho_L*vel_L(2))*(1._wp/dy(k))) + + $:GPU_ATOMIC(atomic='update') + rhs_vf(momxb)%sf(j, k, l) = rhs_vf(momxb)%sf(j, k, l) - & + (0.5_wp*rho_L*vel_L(1)*vel_L(2)*(1._wp/dy(k)) - & + 0.5_wp*cfl*(rho_L*vel_L(1))*(1._wp/dy(k))) + + $:GPU_ATOMIC(atomic='update') + rhs_vf(momxb + 2)%sf(j, k, l) = rhs_vf(momxb + 2)%sf(j, k, l) - & + (0.5_wp*rho_L*vel_L(3)*vel_L(2)*(1._wp/dy(k)) - & + 0.5_wp*cfl*(rho_L*vel_L(3))*(1._wp/dy(k))) + + $:GPU_ATOMIC(atomic='update') + rhs_vf(E_idx)%sf(j, k, l) = rhs_vf(E_idx)%sf(j, k, l) - & + (0.5_wp*(vel_L(2)*(E_L + & + pres_L + F_L))*(1._wp/dy(k)) - & + 0.5_wp*cfl*(E_L)*(1._wp/dy(k))) - if (num_fluids > 1) then $:GPU_LOOP(parallelism='[seq]') - do i = 1, num_fluids - 1 + do i = 1, num_fluids $:GPU_ATOMIC(atomic='update') - rhs_vf(advxb + i - 1)%sf(j, k + 1, l) = rhs_vf(advxb + i - 1)%sf(j, k + 1, l) + & - (0.5_wp*(alpha_R(i)* & - vel_R(2))*(1._wp/dy(k + 1)) + & - 0.5_wp*cfl*(alpha_R(i))*(1._wp/dy(k + 1))) + rhs_vf(i)%sf(j, k + 1, l) = rhs_vf(i)%sf(j, k + 1, l) + & + (0.5_wp*(alpha_rho_R(i)* & + vel_R(2))*(1._wp/dy(k + 1)) + & + 0.5_wp*cfl*(alpha_rho_R(i))*(1._wp/dy(k + 1))) $:GPU_ATOMIC(atomic='update') - rhs_vf(advxb + i - 1)%sf(j, k + 1, l) = rhs_vf(advxb + i - 1)%sf(j, k + 1, l) & - - (0.5_wp*q_cons_vf(advxb + i - 1)%sf(j, k + 1, l)*vel_R(2)*(1._wp/dy(k + 1))) + rhs_vf(i)%sf(j, k, l) = rhs_vf(i)%sf(j, k, l) - & + (0.5_wp*(alpha_rho_R(i)* & + vel_R(2))*(1._wp/dy(k)) + & + 0.5_wp*cfl*(alpha_rho_R(i))*(1._wp/dy(k))) + end do + + if (num_fluids > 1) then + $:GPU_LOOP(parallelism='[seq]') + do i = 1, num_fluids - 1 + $:GPU_ATOMIC(atomic='update') + rhs_vf(advxb + i - 1)%sf(j, k + 1, l) = rhs_vf(advxb + i - 1)%sf(j, k + 1, l) + & + (0.5_wp*(alpha_R(i)* & + vel_R(2))*(1._wp/dy(k + 1)) + & + 0.5_wp*cfl*(alpha_R(i))*(1._wp/dy(k + 1))) + + $:GPU_ATOMIC(atomic='update') + rhs_vf(advxb + i - 1)%sf(j, k + 1, l) = rhs_vf(advxb + i - 1)%sf(j, k + 1, l) & + - (0.5_wp*q_cons_vf(advxb + i - 1)%sf(j, k + 1, l)*vel_R(2)*(1._wp/dy(k + 1))) + + $:GPU_ATOMIC(atomic='update') + rhs_vf(advxb + i - 1)%sf(j, k, l) = rhs_vf(advxb + i - 1)%sf(j, k, l) - & + (0.5_wp*(alpha_R(i)* & + vel_R(2))*(1._wp/dy(k)) + & + 0.5_wp*cfl*(alpha_R(i))*(1._wp/dy(k))) + + $:GPU_ATOMIC(atomic='update') + rhs_vf(advxb + i - 1)%sf(j, k, l) = rhs_vf(advxb + i - 1)%sf(j, k, l) & + + (0.5_wp*q_cons_vf(advxb + i - 1)%sf(j, k, l)*vel_R(2)*(1._wp/dy(k))) + end do + end if + + $:GPU_ATOMIC(atomic='update') + rhs_vf(momxb + 1)%sf(j, k + 1, l) = rhs_vf(momxb + 1)%sf(j, k + 1, l) + & + (0.5_wp*(rho_R*(vel_R(2))**2.0 + & + pres_R + F_R)*(1._wp/dy(k + 1)) + & + 0.5_wp*cfl*(rho_R*vel_R(2))*(1._wp/dy(k + 1))) + + $:GPU_ATOMIC(atomic='update') + rhs_vf(momxb)%sf(j, k + 1, l) = rhs_vf(momxb)%sf(j, k + 1, l) + & + (0.5_wp*rho_R*vel_R(2)*vel_R(1)*(1._wp/dy(k + 1)) + & + 0.5_wp*cfl*(rho_R*vel_R(1))*(1._wp/dy(k + 1))) + + $:GPU_ATOMIC(atomic='update') + rhs_vf(momxb + 2)%sf(j, k + 1, l) = rhs_vf(momxb + 2)%sf(j, k + 1, l) + & + (0.5_wp*rho_R*vel_R(2)*vel_R(3)*(1._wp/dy(k + 1)) + & + 0.5_wp*cfl*(rho_R*vel_R(3))*(1._wp/dy(k + 1))) - $:GPU_ATOMIC(atomic='update') - rhs_vf(advxb + i - 1)%sf(j, k, l) = rhs_vf(advxb + i - 1)%sf(j, k, l) - & - (0.5_wp*(alpha_R(i)* & - vel_R(2))*(1._wp/dy(k)) + & - 0.5_wp*cfl*(alpha_R(i))*(1._wp/dy(k))) + $:GPU_ATOMIC(atomic='update') + rhs_vf(E_idx)%sf(j, k + 1, l) = rhs_vf(E_idx)%sf(j, k + 1, l) + & + (0.5_wp*(vel_R(2)*(E_R + & + pres_R + F_R))*(1._wp/dy(k + 1)) + & + 0.5_wp*cfl*(E_R)*(1._wp/dy(k + 1))) - $:GPU_ATOMIC(atomic='update') - rhs_vf(advxb + i - 1)%sf(j, k, l) = rhs_vf(advxb + i - 1)%sf(j, k, l) & - + (0.5_wp*q_cons_vf(advxb + i - 1)%sf(j, k, l)*vel_R(2)*(1._wp/dy(k))) - end do - end if - $:GPU_ATOMIC(atomic='update') - rhs_vf(momxb + 1)%sf(j, k + 1, l) = rhs_vf(momxb + 1)%sf(j, k + 1, l) + & + $:GPU_ATOMIC(atomic='update') + rhs_vf(momxb + 1)%sf(j, k, l) = rhs_vf(momxb + 1)%sf(j, k, l) - & (0.5_wp*(rho_R*(vel_R(2))**2.0 + & - pres_R + F_R)*(1._wp/dy(k + 1)) + & - 0.5_wp*cfl*(rho_R*vel_R(2))*(1._wp/dy(k + 1))) - $:GPU_ATOMIC(atomic='update') - rhs_vf(momxb)%sf(j, k + 1, l) = rhs_vf(momxb)%sf(j, k + 1, l) + & - (0.5_wp*rho_R*vel_R(2)*vel_R(1)*(1._wp/dy(k + 1)) + & - 0.5_wp*cfl*(rho_R*vel_R(1))*(1._wp/dy(k + 1))) - $:GPU_ATOMIC(atomic='update') - rhs_vf(E_idx)%sf(j, k + 1, l) = rhs_vf(E_idx)%sf(j, k + 1, l) + & + pres_R + F_R)*(1._wp/dy(k)) + & + 0.5_wp*cfl*(rho_R*vel_R(2))*(1._wp/dy(k))) + + $:GPU_ATOMIC(atomic='update') + rhs_vf(momxb)%sf(j, k, l) = rhs_vf(momxb)%sf(j, k, l) - & + (0.5_wp*rho_R*vel_R(2)*vel_R(1)*(1._wp/dy(k)) + & + 0.5_wp*cfl*(rho_R*vel_R(1))*(1._wp/dy(k))) + + $:GPU_ATOMIC(atomic='update') + rhs_vf(momxb + 2)%sf(j, k, l) = rhs_vf(momxb + 2)%sf(j, k, l) - & + (0.5_wp*rho_R*vel_R(2)*vel_R(3)*(1._wp/dy(k)) + & + 0.5_wp*cfl*(rho_R*vel_R(3))*(1._wp/dy(k))) + + $:GPU_ATOMIC(atomic='update') + rhs_vf(E_idx)%sf(j, k, l) = rhs_vf(E_idx)%sf(j, k, l) - & (0.5_wp*(vel_R(2)*(E_R + & - pres_R + F_R))*(1._wp/dy(k + 1)) + & - 0.5_wp*cfl*(E_R)*(1._wp/dy(k + 1))) - $:GPU_ATOMIC(atomic='update') - rhs_vf(momxb + 1)%sf(j, k, l) = rhs_vf(momxb + 1)%sf(j, k, l) - & - (0.5_wp*(rho_R*(vel_R(2))**2.0 + & - pres_R + F_R)*(1._wp/dy(k)) + & - 0.5_wp*cfl*(rho_R*vel_R(2))*(1._wp/dy(k))) - $:GPU_ATOMIC(atomic='update') - rhs_vf(momxb)%sf(j, k, l) = rhs_vf(momxb)%sf(j, k, l) - & - (0.5_wp*rho_R*vel_R(2)*vel_R(1)*(1._wp/dy(k)) + & - 0.5_wp*cfl*(rho_R*vel_R(1))*(1._wp/dy(k))) - $:GPU_ATOMIC(atomic='update') - rhs_vf(E_idx)%sf(j, k, l) = rhs_vf(E_idx)%sf(j, k, l) - & - (0.5_wp*(vel_R(2)*(E_R + & - pres_R + F_R))*(1._wp/dy(k)) + & - 0.5_wp*cfl*(E_R)*(1._wp/dy(k))) + pres_R + F_R))*(1._wp/dy(k)) + & + 0.5_wp*cfl*(E_R)*(1._wp/dy(k))) + + end do end do end do - end do - else - $:GPU_PARALLEL_LOOP(collapse=3, private='[rho_L, rho_R, gamma_L, & - & gamma_R, pi_inf_L, pi_inf_R, mu_L, mu_R, vel_L, vel_R, & - & pres_L, pres_R, alpha_L, alpha_R, alpha_rho_L, alpha_rho_R, & - & F_L, F_R, E_L, E_R, cfl, dvel_small, rho_sf_small, & - & vflux_L_arr, vflux_R_arr]') - do l = 0, p - do k = -1, n + #:endcall GPU_PARALLEL_LOOP + end if + elseif (idir == 3) then + #:call GPU_PARALLEL_LOOP(collapse=3, private='[rho_L, rho_R, gamma_L, gamma_R, pi_inf_L, pi_inf_R, mu_L, mu_R, vel_L, vel_R, pres_L, pres_R, alpha_L, alpha_R, alpha_rho_L, alpha_rho_R, F_L, F_R, E_L, E_R, cfl, dvel_small, rho_sf_small, vflux_L_arr, vflux_R_arr]') + do l = -1, p + do k = 0, n do j = 0, m if (viscous) then @@ -1670,24 +2123,24 @@ contains rho_L = 0._wp $:GPU_LOOP(parallelism='[seq]') do r = 1, num_fluids - rho_L = rho_L + q_cons_vf(r)%sf(j + i, k + q, l) + rho_L = rho_L + q_cons_vf(r)%sf(j + i, k, l + q) end do rho_sf_small(i) = rho_L end do dvel_small(1) = (1/(2._wp*dx(j)))*( & - q_cons_vf(momxb)%sf(j + 1, k + q, l)/rho_sf_small(1) - & - q_cons_vf(momxb)%sf(j - 1, k + q, l)/rho_sf_small(-1)) - dvel_small(2) = (1/(2._wp*dx(j)))*( & - q_cons_vf(momxb + 1)%sf(j + 1, k + q, l)/rho_sf_small(1) - & - q_cons_vf(momxb + 1)%sf(j - 1, k + q, l)/rho_sf_small(-1)) + q_cons_vf(momxb)%sf(j + 1, k, l + q)/rho_sf_small(1) - & + q_cons_vf(momxb)%sf(j - 1, k, l + q)/rho_sf_small(-1)) + dvel_small(3) = (1/(2._wp*dx(j)))*( & + q_cons_vf(momxb + 2)%sf(j + 1, k, l + q)/rho_sf_small(1) - & + q_cons_vf(momxb + 2)%sf(j - 1, k, l + q)/rho_sf_small(-1)) if (q > vidxb) then - vflux_L_arr(1) = vflux_L_arr(1) + coeff_L(q)*(dvel_small(2)) + vflux_L_arr(1) = vflux_L_arr(1) + coeff_L(q)*(dvel_small(3)) vflux_L_arr(3) = vflux_L_arr(3) + coeff_L(q)*(-2._wp*dvel_small(1))/3._wp end if if (q < vidxe) then - vflux_R_arr(1) = vflux_R_arr(1) + coeff_R(q)*(dvel_small(2)) + vflux_R_arr(1) = vflux_R_arr(1) + coeff_R(q)*(dvel_small(3)) vflux_R_arr(3) = vflux_R_arr(3) + coeff_R(q)*(-2._wp*dvel_small(1))/3._wp end if @@ -1697,30 +2150,25 @@ contains rho_L = 0._wp $:GPU_LOOP(parallelism='[seq]') do r = 1, num_fluids - rho_L = rho_L + q_cons_vf(r)%sf(j, k + i + q, l) + rho_L = rho_L + q_cons_vf(r)%sf(j, k + i, l + q) end do rho_sf_small(i) = rho_L end do - dvel_small(1) = (1/(2._wp*dy(k)))*( & - q_cons_vf(momxb)%sf(j, k + 1 + q, l)/rho_sf_small(1) - & - q_cons_vf(momxb)%sf(j, k - 1 + q, l)/rho_sf_small(-1)) dvel_small(2) = (1/(2._wp*dy(k)))*( & - q_cons_vf(momxb + 1)%sf(j, k + 1 + q, l)/rho_sf_small(1) - & - q_cons_vf(momxb + 1)%sf(j, k - 1 + q, l)/rho_sf_small(-1)) + q_cons_vf(momxb + 1)%sf(j, k + 1, l + q)/rho_sf_small(1) - & + q_cons_vf(momxb + 1)%sf(j, k - 1, l + q)/rho_sf_small(-1)) dvel_small(3) = (1/(2._wp*dy(k)))*( & - q_cons_vf(momxb + 2)%sf(j, k + 1 + q, l)/rho_sf_small(1) - & - q_cons_vf(momxb + 2)%sf(j, k - 1 + q, l)/rho_sf_small(-1)) + q_cons_vf(momxb + 2)%sf(j, k + 1, l + q)/rho_sf_small(1) - & + q_cons_vf(momxb + 2)%sf(j, k - 1, l + q)/rho_sf_small(-1)) if (q > vidxb) then - vflux_L_arr(1) = vflux_L_arr(1) + coeff_L(q)*(dvel_small(1)) vflux_L_arr(2) = vflux_L_arr(2) + coeff_L(q)*(dvel_small(3)) - vflux_L_arr(3) = vflux_L_arr(3) + coeff_L(q)*(4._wp*dvel_small(2))/3._wp + vflux_L_arr(3) = vflux_L_arr(3) + coeff_L(q)*(-2._wp*dvel_small(2))/3._wp end if if (q < vidxe) then - vflux_R_arr(1) = vflux_R_arr(1) + coeff_R(q)*(dvel_small(1)) vflux_R_arr(2) = vflux_R_arr(2) + coeff_R(q)*(dvel_small(3)) - vflux_R_arr(3) = vflux_R_arr(3) + coeff_R(q)*(4._wp*dvel_small(2))/3._wp + vflux_R_arr(3) = vflux_R_arr(3) + coeff_R(q)*(-2._wp*dvel_small(2))/3._wp end if !z-direction contributions @@ -1729,24 +2177,28 @@ contains rho_L = 0._wp $:GPU_LOOP(parallelism='[seq]') do r = 1, num_fluids - rho_L = rho_L + q_cons_vf(r)%sf(j, k + q, l + i) + rho_L = rho_L + q_cons_vf(r)%sf(j, k, l + i + q) end do rho_sf_small(i) = rho_L end do - + dvel_small(1) = (1/(2._wp*dz(l)))*( & + q_cons_vf(momxb)%sf(j, k, l + 1 + q)/rho_sf_small(1) - & + q_cons_vf(momxb)%sf(j, k, l - 1 + q)/rho_sf_small(-1)) dvel_small(2) = (1/(2._wp*dz(l)))*( & - q_cons_vf(momxb + 1)%sf(j, k + q, l + 1)/rho_sf_small(1) - & - q_cons_vf(momxb + 1)%sf(j, k + q, l - 1)/rho_sf_small(-1)) + q_cons_vf(momxb + 1)%sf(j, k, l + 1 + q)/rho_sf_small(1) - & + q_cons_vf(momxb + 1)%sf(j, k, l - 1 + q)/rho_sf_small(-1)) dvel_small(3) = (1/(2._wp*dz(l)))*( & - q_cons_vf(momxb + 2)%sf(j, k + q, l + 1)/rho_sf_small(1) - & - q_cons_vf(momxb + 2)%sf(j, k + q, l - 1)/rho_sf_small(-1)) + q_cons_vf(momxb + 2)%sf(j, k, l + 1 + q)/rho_sf_small(1) - & + q_cons_vf(momxb + 2)%sf(j, k, l - 1 + q)/rho_sf_small(-1)) if (q > vidxb) then + vflux_L_arr(1) = vflux_L_arr(1) + coeff_L(q)*(dvel_small(1)) vflux_L_arr(2) = vflux_L_arr(2) + coeff_L(q)*(dvel_small(2)) - vflux_L_arr(3) = vflux_L_arr(3) + coeff_L(q)*(-2._wp*dvel_small(3))/3._wp + vflux_L_arr(3) = vflux_L_arr(3) + coeff_L(q)*(4._wp*dvel_small(3))/3._wp end if if (q < vidxe) then + vflux_R_arr(1) = vflux_R_arr(1) + coeff_R(q)*(dvel_small(1)) vflux_R_arr(2) = vflux_R_arr(2) + coeff_R(q)*(dvel_small(2)) - vflux_R_arr(3) = vflux_R_arr(3) + coeff_R(q)*(-2._wp*dvel_small(3))/3._wp + vflux_R_arr(3) = vflux_R_arr(3) + coeff_R(q)*(4._wp*dvel_small(3))/3._wp end if end do end if @@ -1759,13 +2211,13 @@ contains do q = vidxb + 1, vidxe $:GPU_LOOP(parallelism='[seq]') do i = 1, num_fluids - alpha_rho_L(i) = alpha_rho_L(i) + coeff_L(q)*q_cons_vf(i)%sf(j, k + q, l) + alpha_rho_L(i) = alpha_rho_L(i) + coeff_L(q)*q_cons_vf(i)%sf(j, k, l + q) end do if (num_fluids > 1) then $:GPU_LOOP(parallelism='[seq]') do i = 1, num_fluids - 1 - alpha_L(i) = alpha_L(i) + coeff_L(q)*q_cons_vf(E_idx + i)%sf(j, k + q, l) + alpha_L(i) = alpha_L(i) + coeff_L(q)*q_cons_vf(E_idx + i)%sf(j, k, l + q) end do else alpha_L(1) = 1._wp @@ -1773,7 +2225,7 @@ contains $:GPU_LOOP(parallelism='[seq]') do i = 1, num_dims - vel_L(i) = vel_L(i) + coeff_L(q)*q_cons_vf(momxb + i - 1)%sf(j, k + q, l) + vel_L(i) = vel_L(i) + coeff_L(q)*q_cons_vf(momxb + i - 1)%sf(j, k, l + q) end do end do @@ -1781,13 +2233,13 @@ contains do q = vidxb, vidxe - 1 $:GPU_LOOP(parallelism='[seq]') do i = 1, num_fluids - alpha_rho_R(i) = alpha_rho_R(i) + coeff_R(q)*q_cons_vf(i)%sf(j, k + q, l) + alpha_rho_R(i) = alpha_rho_R(i) + coeff_R(q)*q_cons_vf(i)%sf(j, k, l + q) end do if (num_fluids > 1) then $:GPU_LOOP(parallelism='[seq]') do i = 1, num_fluids - 1 - alpha_R(i) = alpha_R(i) + coeff_R(q)*q_cons_vf(E_idx + i)%sf(j, k + q, l) + alpha_R(i) = alpha_R(i) + coeff_R(q)*q_cons_vf(E_idx + i)%sf(j, k, l + q) end do else alpha_R(1) = 1._wp @@ -1795,7 +2247,7 @@ contains $:GPU_LOOP(parallelism='[seq]') do i = 1, num_dims - vel_R(i) = vel_R(i) + coeff_R(q)*q_cons_vf(momxb + i - 1)%sf(j, k + q, l) + vel_R(i) = vel_R(i) + coeff_R(q)*q_cons_vf(momxb + i - 1)%sf(j, k, l + q) end do end do @@ -1820,93 +2272,93 @@ contains mu_R = 0._wp $:GPU_LOOP(parallelism='[seq]') do i = 1, num_fluids - mu_L = alpha_L(i)/Res(1, i) + mu_L - mu_R = alpha_R(i)/Res(1, i) + mu_R + mu_L = alpha_L(i)/Res_igr(1, i) + mu_L + mu_R = alpha_R(i)/Res_igr(1, i) + mu_R end do $:GPU_ATOMIC(atomic='update') - rhs_vf(momxb)%sf(j, k + 1, l) = rhs_vf(momxb)%sf(j, k + 1, l) - & - 0.5_wp*mu_L*vflux_L_arr(1)*(1._wp/dy(k + 1)) + rhs_vf(momxb)%sf(j, k, l + 1) = rhs_vf(momxb)%sf(j, k, l + 1) - & + 0.5_wp*mu_L*vflux_L_arr(1)*(1._wp/dz(l + 1)) $:GPU_ATOMIC(atomic='update') - rhs_vf(E_idx)%sf(j, k + 1, l) = rhs_vf(E_idx)%sf(j, k + 1, l) - & - 0.5_wp*mu_L*vflux_L_arr(1)*vel_L(1)*(1._wp/dy(k + 1)) + rhs_vf(E_idx)%sf(j, k, l + 1) = rhs_vf(E_idx)%sf(j, k, l + 1) - & + 0.5_wp*mu_L*vflux_L_arr(1)*vel_L(1)*(1._wp/dz(l + 1)) $:GPU_ATOMIC(atomic='update') rhs_vf(momxb)%sf(j, k, l) = rhs_vf(momxb)%sf(j, k, l) + & - 0.5_wp*mu_L*vflux_L_arr(1)*(1._wp/dy(k)) + 0.5_wp*mu_L*vflux_L_arr(1)*(1._wp/dz(l)) $:GPU_ATOMIC(atomic='update') rhs_vf(E_idx)%sf(j, k, l) = rhs_vf(E_idx)%sf(j, k, l) + & - 0.5_wp*mu_L*vflux_L_arr(1)*vel_L(1)*(1._wp/dy(k)) + 0.5_wp*mu_L*vflux_L_arr(1)*vel_L(1)*(1._wp/dz(l)) $:GPU_ATOMIC(atomic='update') - rhs_vf(momxb)%sf(j, k + 1, l) = rhs_vf(momxb)%sf(j, k + 1, l) - & - 0.5_wp*mu_R*vflux_R_arr(1)*(1._wp/dy(k + 1)) + rhs_vf(momxb)%sf(j, k, l + 1) = rhs_vf(momxb)%sf(j, k, l + 1) - & + 0.5_wp*mu_R*vflux_R_arr(1)*(1._wp/dz(l + 1)) $:GPU_ATOMIC(atomic='update') - rhs_vf(E_idx)%sf(j, k + 1, l) = rhs_vf(E_idx)%sf(j, k + 1, l) - & - 0.5_wp*mu_R*vflux_R_arr(1)*vel_R(1)*(1._wp/dy(k + 1)) + rhs_vf(E_idx)%sf(j, k, l + 1) = rhs_vf(E_idx)%sf(j, k, l + 1) - & + 0.5_wp*mu_R*vflux_R_arr(1)*vel_R(1)*(1._wp/dz(l + 1)) $:GPU_ATOMIC(atomic='update') rhs_vf(momxb)%sf(j, k, l) = rhs_vf(momxb)%sf(j, k, l) + & - 0.5_wp*mu_R*vflux_R_arr(1)*(1._wp/dy(k)) + 0.5_wp*mu_R*vflux_R_arr(1)*(1._wp/dz(l)) $:GPU_ATOMIC(atomic='update') rhs_vf(E_idx)%sf(j, k, l) = rhs_vf(E_idx)%sf(j, k, l) + & - 0.5_wp*mu_R*vflux_R_arr(1)*vel_R(1)*(1._wp/dy(k)) + 0.5_wp*mu_R*vflux_R_arr(1)*vel_R(1)*(1._wp/dz(l)) $:GPU_ATOMIC(atomic='update') - rhs_vf(momxb + 2)%sf(j, k + 1, l) = rhs_vf(momxb + 2)%sf(j, k + 1, l) - & - 0.5_wp*mu_L*vflux_L_arr(2)*(1._wp/dy(k + 1)) + rhs_vf(momxb + 1)%sf(j, k, l + 1) = rhs_vf(momxb + 1)%sf(j, k, l + 1) - & + 0.5_wp*mu_L*vflux_L_arr(2)*(1._wp/dz(l + 1)) $:GPU_ATOMIC(atomic='update') - rhs_vf(E_idx)%sf(j, k + 1, l) = rhs_vf(E_idx)%sf(j, k + 1, l) - & - 0.5_wp*mu_L*vflux_L_arr(2)*vel_L(3)*(1._wp/dy(k + 1)) + rhs_vf(E_idx)%sf(j, k, l + 1) = rhs_vf(E_idx)%sf(j, k, l + 1) - & + 0.5_wp*mu_L*vflux_L_arr(2)*vel_L(2)*(1._wp/dz(l + 1)) $:GPU_ATOMIC(atomic='update') - rhs_vf(momxb + 2)%sf(j, k, l) = rhs_vf(momxb + 2)%sf(j, k, l) + & - 0.5_wp*mu_L*vflux_L_arr(2)*(1._wp/dy(k)) + rhs_vf(momxb + 1)%sf(j, k, l) = rhs_vf(momxb + 1)%sf(j, k, l) + & + 0.5_wp*mu_L*vflux_L_arr(2)*(1._wp/dz(l)) $:GPU_ATOMIC(atomic='update') rhs_vf(E_idx)%sf(j, k, l) = rhs_vf(E_idx)%sf(j, k, l) + & - 0.5_wp*mu_L*vflux_L_arr(2)*vel_L(3)*(1._wp/dy(k)) + 0.5_wp*mu_L*vflux_L_arr(2)*vel_L(2)*(1._wp/dz(l)) $:GPU_ATOMIC(atomic='update') - rhs_vf(momxb + 2)%sf(j, k + 1, l) = rhs_vf(momxb + 2)%sf(j, k + 1, l) - & - 0.5_wp*mu_R*vflux_R_arr(2)*(1._wp/dy(k + 1)) + rhs_vf(momxb + 1)%sf(j, k, l + 1) = rhs_vf(momxb + 1)%sf(j, k, l + 1) - & + 0.5_wp*mu_R*vflux_R_arr(2)*(1._wp/dz(l + 1)) $:GPU_ATOMIC(atomic='update') - rhs_vf(E_idx)%sf(j, k + 1, l) = rhs_vf(E_idx)%sf(j, k + 1, l) - & - 0.5_wp*mu_R*vflux_R_arr(2)*vel_R(3)*(1._wp/dy(k + 1)) + rhs_vf(E_idx)%sf(j, k, l + 1) = rhs_vf(E_idx)%sf(j, k, l + 1) - & + 0.5_wp*mu_R*vflux_R_arr(2)*vel_R(2)*(1._wp/dz(l + 1)) $:GPU_ATOMIC(atomic='update') - rhs_vf(momxb + 2)%sf(j, k, l) = rhs_vf(momxb + 2)%sf(j, k, l) + & - 0.5_wp*mu_R*vflux_R_arr(2)*(1._wp/dy(k)) + rhs_vf(momxb + 1)%sf(j, k, l) = rhs_vf(momxb + 1)%sf(j, k, l) + & + 0.5_wp*mu_R*vflux_R_arr(2)*(1._wp/dz(l)) $:GPU_ATOMIC(atomic='update') rhs_vf(E_idx)%sf(j, k, l) = rhs_vf(E_idx)%sf(j, k, l) + & - 0.5_wp*mu_R*vflux_R_arr(2)*vel_R(3)*(1._wp/dy(k)) + 0.5_wp*mu_R*vflux_R_arr(2)*vel_R(2)*(1._wp/dz(l)) $:GPU_ATOMIC(atomic='update') - rhs_vf(momxb + 1)%sf(j, k + 1, l) = rhs_vf(momxb + 1)%sf(j, k + 1, l) - & - 0.5_wp*mu_L*vflux_L_arr(3)*(1._wp/dy(k + 1)) + rhs_vf(momxb + 2)%sf(j, k, l + 1) = rhs_vf(momxb + 2)%sf(j, k, l + 1) - & + 0.5_wp*mu_L*vflux_L_arr(3)*(1._wp/dz(l + 1)) $:GPU_ATOMIC(atomic='update') - rhs_vf(E_idx)%sf(j, k + 1, l) = rhs_vf(E_idx)%sf(j, k + 1, l) - & - 0.5_wp*mu_L*vflux_L_arr(3)*vel_L(2)*(1._wp/dy(k + 1)) + rhs_vf(E_idx)%sf(j, k, l + 1) = rhs_vf(E_idx)%sf(j, k, l + 1) - & + 0.5_wp*mu_L*vflux_L_arr(3)*vel_L(3)*(1._wp/dz(l + 1)) $:GPU_ATOMIC(atomic='update') - rhs_vf(momxb + 1)%sf(j, k, l) = rhs_vf(momxb + 1)%sf(j, k, l) + & - 0.5_wp*mu_L*vflux_L_arr(3)*(1._wp/dy(k)) + rhs_vf(momxb + 2)%sf(j, k, l) = rhs_vf(momxb + 2)%sf(j, k, l) + & + 0.5_wp*mu_L*vflux_L_arr(3)*(1._wp/dz(l)) $:GPU_ATOMIC(atomic='update') rhs_vf(E_idx)%sf(j, k, l) = rhs_vf(E_idx)%sf(j, k, l) + & - 0.5_wp*mu_L*vflux_L_arr(3)*vel_L(2)*(1._wp/dy(k)) + 0.5_wp*mu_L*vflux_L_arr(3)*vel_L(3)*(1._wp/dz(l)) $:GPU_ATOMIC(atomic='update') - rhs_vf(momxb + 1)%sf(j, k + 1, l) = rhs_vf(momxb + 1)%sf(j, k + 1, l) - & - 0.5_wp*mu_R*vflux_R_arr(3)*(1._wp/dy(k + 1)) + rhs_vf(momxb + 2)%sf(j, k, l + 1) = rhs_vf(momxb + 2)%sf(j, k, l + 1) - & + 0.5_wp*mu_R*vflux_R_arr(3)*(1._wp/dz(l + 1)) $:GPU_ATOMIC(atomic='update') - rhs_vf(E_idx)%sf(j, k + 1, l) = rhs_vf(E_idx)%sf(j, k + 1, l) - & - 0.5_wp*mu_R*vflux_R_arr(3)*vel_R(2)*(1._wp/dy(k + 1)) + rhs_vf(E_idx)%sf(j, k, l + 1) = rhs_vf(E_idx)%sf(j, k, l + 1) - & + 0.5_wp*mu_R*vflux_R_arr(3)*vel_R(3)*(1._wp/dz(l + 1)) $:GPU_ATOMIC(atomic='update') - rhs_vf(momxb + 1)%sf(j, k, l) = rhs_vf(momxb + 1)%sf(j, k, l) + & - 0.5_wp*mu_R*vflux_R_arr(3)*(1._wp/dy(k)) + rhs_vf(momxb + 2)%sf(j, k, l) = rhs_vf(momxb + 2)%sf(j, k, l) + & + 0.5_wp*mu_R*vflux_R_arr(3)*(1._wp/dz(l)) $:GPU_ATOMIC(atomic='update') rhs_vf(E_idx)%sf(j, k, l) = rhs_vf(E_idx)%sf(j, k, l) + & - 0.5_wp*mu_R*vflux_R_arr(3)*vel_R(2)*(1._wp/dy(k)) + 0.5_wp*mu_R*vflux_R_arr(3)*vel_R(3)*(1._wp/dz(l)) end if E_L = 0._wp; E_R = 0._wp @@ -1914,14 +2366,14 @@ contains $:GPU_LOOP(parallelism='[seq]') do q = vidxb + 1, vidxe - E_L = E_L + coeff_L(q)*q_cons_vf(E_idx)%sf(j, k + q, l) - F_L = F_L + coeff_L(q)*jac(j, k + q, l) + E_L = E_L + coeff_L(q)*q_cons_vf(E_idx)%sf(j, k, l + q) + F_L = F_L + coeff_L(q)*jac(j, k, l + q) end do $:GPU_LOOP(parallelism='[seq]') do q = vidxb, vidxe - 1 - E_R = E_R + coeff_R(q)*q_cons_vf(E_idx)%sf(j, k + q, l) - F_R = F_R + coeff_R(q)*jac(j, k + q, l) + E_R = E_R + coeff_R(q)*q_cons_vf(E_idx)%sf(j, k, l + q) + F_R = F_R + coeff_R(q)*jac(j, k, l + q) end do call s_get_derived_states(E_L, gamma_L, pi_inf_L, rho_L, vel_L, & @@ -1931,640 +2383,182 @@ contains $:GPU_LOOP(parallelism='[seq]') do i = 1, num_fluids $:GPU_ATOMIC(atomic='update') - rhs_vf(i)%sf(j, k + 1, l) = rhs_vf(i)%sf(j, k + 1, l) + & + rhs_vf(i)%sf(j, k, l + 1) = rhs_vf(i)%sf(j, k, l + 1) + & (0.5_wp*(alpha_rho_L(i)* & - vel_L(2))*(1._wp/dy(k + 1)) - & - 0.5_wp*cfl*(alpha_rho_L(i))*(1._wp/dy(k + 1))) + vel_L(3))*(1._wp/dz(l + 1)) - & + 0.5_wp*cfl*(alpha_rho_L(i))*(1._wp/dz(l + 1))) $:GPU_ATOMIC(atomic='update') rhs_vf(i)%sf(j, k, l) = rhs_vf(i)%sf(j, k, l) - & (0.5_wp*(alpha_rho_L(i)* & - vel_L(2))*(1._wp/dy(k)) - & - 0.5_wp*cfl*(alpha_rho_L(i))*(1._wp/dy(k))) + vel_L(3))*(1._wp/dz(l)) - & + 0.5_wp*cfl*(alpha_rho_L(i))*(1._wp/dz(l))) end do if (num_fluids > 1) then $:GPU_LOOP(parallelism='[seq]') do i = 1, num_fluids - 1 $:GPU_ATOMIC(atomic='update') - rhs_vf(advxb + i - 1)%sf(j, k + 1, l) = rhs_vf(advxb + i - 1)%sf(j, k + 1, l) + & + rhs_vf(advxb + i - 1)%sf(j, k, l + 1) = rhs_vf(advxb + i - 1)%sf(j, k, l + 1) + & (0.5_wp*(alpha_L(i)* & - vel_L(2))*(1._wp/dy(k + 1)) - & - 0.5_wp*cfl*(alpha_L(i))*(1._wp/dy(k + 1))) + vel_L(3))*(1._wp/dz(l + 1)) - & + 0.5_wp*cfl*(alpha_L(i))*(1._wp/dz(l + 1))) $:GPU_ATOMIC(atomic='update') - rhs_vf(advxb + i - 1)%sf(j, k + 1, l) = rhs_vf(advxb + i - 1)%sf(j, k + 1, l) & - - (0.5_wp*q_cons_vf(advxb + i - 1)%sf(j, k + 1, l)*vel_L(2)*(1._wp/dy(k + 1))) + rhs_vf(advxb + i - 1)%sf(j, k, l + 1) = rhs_vf(advxb + i - 1)%sf(j, k, l + 1) & + - (0.5_wp*q_cons_vf(advxb + i - 1)%sf(j, k, l + 1)*vel_L(3)*(1._wp/dz(l + 1))) $:GPU_ATOMIC(atomic='update') rhs_vf(advxb + i - 1)%sf(j, k, l) = rhs_vf(advxb + i - 1)%sf(j, k, l) - & (0.5_wp*(alpha_L(i)* & - vel_L(2))*(1._wp/dy(k)) - & - 0.5_wp*cfl*(alpha_L(i))*(1._wp/dy(k))) + vel_L(3))*(1._wp/dz(l)) - & + 0.5_wp*cfl*(alpha_L(i))*(1._wp/dz(l))) $:GPU_ATOMIC(atomic='update') rhs_vf(advxb + i - 1)%sf(j, k, l) = rhs_vf(advxb + i - 1)%sf(j, k, l) & - + (0.5_wp*q_cons_vf(advxb + i - 1)%sf(j, k, l)*vel_L(2)*(1._wp/dy(k))) + + (0.5_wp*q_cons_vf(advxb + i - 1)%sf(j, k, l)*vel_L(3)*(1._wp/dz(l))) end do end if $:GPU_ATOMIC(atomic='update') - rhs_vf(momxb + 1)%sf(j, k + 1, l) = rhs_vf(momxb + 1)%sf(j, k + 1, l) + & - (0.5_wp*(rho_L*(vel_L(2))**2.0 + & - pres_L + F_L)*(1._wp/dy(k + 1)) - & - 0.5_wp*cfl*(rho_L*vel_L(2))*(1._wp/dy(k + 1))) + rhs_vf(momxb + 2)%sf(j, k, l + 1) = rhs_vf(momxb + 2)%sf(j, k, l + 1) + & + (0.5_wp*(rho_L*(vel_L(3))**2.0 + & + pres_L + F_L)*(1._wp/dz(l + 1)) - & + 0.5_wp*cfl*(rho_L*vel_L(3))*(1._wp/dz(l + 1))) $:GPU_ATOMIC(atomic='update') - rhs_vf(momxb)%sf(j, k + 1, l) = rhs_vf(momxb)%sf(j, k + 1, l) + & - (0.5_wp*rho_L*vel_L(1)*vel_L(2)*(1._wp/dy(k + 1)) - & - 0.5_wp*cfl*(rho_L*vel_L(1))*(1._wp/dy(k + 1))) + rhs_vf(momxb)%sf(j, k, l + 1) = rhs_vf(momxb)%sf(j, k, l + 1) + & + (0.5_wp*rho_L*vel_L(1)*vel_L(3)*(1._wp/dz(l + 1)) - & + 0.5_wp*cfl*(rho_L*vel_L(1))*(1._wp/dz(l + 1))) $:GPU_ATOMIC(atomic='update') - rhs_vf(momxb + 2)%sf(j, k + 1, l) = rhs_vf(momxb + 2)%sf(j, k + 1, l) + & - (0.5_wp*rho_L*vel_L(3)*vel_L(2)*(1._wp/dy(k + 1)) - & - 0.5_wp*cfl*(rho_L*vel_L(3))*(1._wp/dy(k + 1))) + rhs_vf(momxb + 1)%sf(j, k, l + 1) = rhs_vf(momxb + 1)%sf(j, k, l + 1) + & + (0.5_wp*rho_L*vel_L(2)*vel_L(3)*(1._wp/dz(l + 1)) - & + 0.5_wp*cfl*(rho_L*vel_L(2))*(1._wp/dz(l + 1))) $:GPU_ATOMIC(atomic='update') - rhs_vf(E_idx)%sf(j, k + 1, l) = rhs_vf(E_idx)%sf(j, k + 1, l) + & - (0.5_wp*(vel_L(2)*(E_L + & - pres_L + F_L))*(1._wp/dy(k + 1)) - & - 0.5_wp*cfl*(E_L)*(1._wp/dy(k + 1))) + rhs_vf(E_idx)%sf(j, k, l + 1) = rhs_vf(E_idx)%sf(j, k, l + 1) + & + (0.5_wp*(vel_L(3)*(E_L + & + pres_L + F_L))*(1._wp/dz(l + 1)) - & + 0.5_wp*cfl*(E_L)*(1._wp/dz(l + 1))) $:GPU_ATOMIC(atomic='update') - rhs_vf(momxb + 1)%sf(j, k, l) = rhs_vf(momxb + 1)%sf(j, k, l) - & - (0.5_wp*(rho_L*(vel_L(2))**2.0 + & - pres_L + F_L)*(1._wp/dy(k)) - & - 0.5_wp*cfl*(rho_L*vel_L(2))*(1._wp/dy(k))) + rhs_vf(momxb + 2)%sf(j, k, l) = rhs_vf(momxb + 2)%sf(j, k, l) - & + (0.5_wp*(rho_L*(vel_L(3))**2.0 + & + pres_L + F_L)*(1._wp/dz(l)) - & + 0.5_wp*cfl*(rho_L*vel_L(3))*(1._wp/dz(l))) $:GPU_ATOMIC(atomic='update') rhs_vf(momxb)%sf(j, k, l) = rhs_vf(momxb)%sf(j, k, l) - & - (0.5_wp*rho_L*vel_L(1)*vel_L(2)*(1._wp/dy(k)) - & - 0.5_wp*cfl*(rho_L*vel_L(1))*(1._wp/dy(k))) + (0.5_wp*rho_L*vel_L(1)*vel_L(3)*(1._wp/dz(l)) - & + 0.5_wp*cfl*(rho_L*vel_L(1))*(1._wp/dz(l))) $:GPU_ATOMIC(atomic='update') - rhs_vf(momxb + 2)%sf(j, k, l) = rhs_vf(momxb + 2)%sf(j, k, l) - & - (0.5_wp*rho_L*vel_L(3)*vel_L(2)*(1._wp/dy(k)) - & - 0.5_wp*cfl*(rho_L*vel_L(3))*(1._wp/dy(k))) + rhs_vf(momxb + 1)%sf(j, k, l) = rhs_vf(momxb + 1)%sf(j, k, l) - & + (0.5_wp*rho_L*vel_L(2)*vel_L(3)*(1._wp/dz(l)) - & + 0.5_wp*cfl*(rho_L*vel_L(2))*(1._wp/dz(l))) $:GPU_ATOMIC(atomic='update') rhs_vf(E_idx)%sf(j, k, l) = rhs_vf(E_idx)%sf(j, k, l) - & - (0.5_wp*(vel_L(2)*(E_L + & - pres_L + F_L))*(1._wp/dy(k)) - & - 0.5_wp*cfl*(E_L)*(1._wp/dy(k))) + (0.5_wp*(vel_L(3)*(E_L + & + pres_L + F_L))*(1._wp/dz(l)) - & + 0.5_wp*cfl*(E_L)*(1._wp/dz(l))) $:GPU_LOOP(parallelism='[seq]') do i = 1, num_fluids $:GPU_ATOMIC(atomic='update') - rhs_vf(i)%sf(j, k + 1, l) = rhs_vf(i)%sf(j, k + 1, l) + & + rhs_vf(i)%sf(j, k, l + 1) = rhs_vf(i)%sf(j, k, l + 1) + & (0.5_wp*(alpha_rho_R(i)* & - vel_R(2))*(1._wp/dy(k + 1)) + & - 0.5_wp*cfl*(alpha_rho_R(i))*(1._wp/dy(k + 1))) + vel_R(3))*(1._wp/dz(l + 1)) + & + 0.5_wp*cfl*(alpha_rho_R(i))*(1._wp/dz(l + 1))) $:GPU_ATOMIC(atomic='update') rhs_vf(i)%sf(j, k, l) = rhs_vf(i)%sf(j, k, l) - & (0.5_wp*(alpha_rho_R(i)* & - vel_R(2))*(1._wp/dy(k)) + & - 0.5_wp*cfl*(alpha_rho_R(i))*(1._wp/dy(k))) + vel_R(3))*(1._wp/dz(l)) + & + 0.5_wp*cfl*(alpha_rho_R(i))*(1._wp/dz(l))) end do if (num_fluids > 1) then $:GPU_LOOP(parallelism='[seq]') do i = 1, num_fluids - 1 $:GPU_ATOMIC(atomic='update') - rhs_vf(advxb + i - 1)%sf(j, k + 1, l) = rhs_vf(advxb + i - 1)%sf(j, k + 1, l) + & + rhs_vf(advxb + i - 1)%sf(j, k, l + 1) = rhs_vf(advxb + i - 1)%sf(j, k, l + 1) + & (0.5_wp*(alpha_R(i)* & - vel_R(2))*(1._wp/dy(k + 1)) + & - 0.5_wp*cfl*(alpha_R(i))*(1._wp/dy(k + 1))) + vel_R(3))*(1._wp/dz(l + 1)) + & + 0.5_wp*cfl*(alpha_R(i))*(1._wp/dz(l + 1))) $:GPU_ATOMIC(atomic='update') - rhs_vf(advxb + i - 1)%sf(j, k + 1, l) = rhs_vf(advxb + i - 1)%sf(j, k + 1, l) & - - (0.5_wp*q_cons_vf(advxb + i - 1)%sf(j, k + 1, l)*vel_R(2)*(1._wp/dy(k + 1))) + rhs_vf(advxb + i - 1)%sf(j, k, l + 1) = rhs_vf(advxb + i - 1)%sf(j, k, l + 1) & + - (0.5_wp*q_cons_vf(advxb + i - 1)%sf(j, k, l + 1)*vel_R(3)*(1._wp/dz(l + 1))) $:GPU_ATOMIC(atomic='update') rhs_vf(advxb + i - 1)%sf(j, k, l) = rhs_vf(advxb + i - 1)%sf(j, k, l) - & (0.5_wp*(alpha_R(i)* & - vel_R(2))*(1._wp/dy(k)) + & - 0.5_wp*cfl*(alpha_R(i))*(1._wp/dy(k))) + vel_R(3))*(1._wp/dz(l)) + & + 0.5_wp*cfl*(alpha_R(i))*(1._wp/dz(l))) $:GPU_ATOMIC(atomic='update') rhs_vf(advxb + i - 1)%sf(j, k, l) = rhs_vf(advxb + i - 1)%sf(j, k, l) & - + (0.5_wp*q_cons_vf(advxb + i - 1)%sf(j, k, l)*vel_R(2)*(1._wp/dy(k))) + + (0.5_wp*q_cons_vf(advxb + i - 1)%sf(j, k, l)*vel_R(3)*(1._wp/dz(l))) end do end if $:GPU_ATOMIC(atomic='update') - rhs_vf(momxb + 1)%sf(j, k + 1, l) = rhs_vf(momxb + 1)%sf(j, k + 1, l) + & - (0.5_wp*(rho_R*(vel_R(2))**2.0 + & - pres_R + F_R)*(1._wp/dy(k + 1)) + & - 0.5_wp*cfl*(rho_R*vel_R(2))*(1._wp/dy(k + 1))) + rhs_vf(momxb + 2)%sf(j, k, l + 1) = rhs_vf(momxb + 2)%sf(j, k, l + 1) + & + (0.5_wp*(rho_R*(vel_R(3))**2.0 + & + pres_R + F_R)*(1._wp/dz(l + 1)) + & + 0.5_wp*cfl*(rho_R*vel_R(3))*(1._wp/dz(l + 1))) $:GPU_ATOMIC(atomic='update') - rhs_vf(momxb)%sf(j, k + 1, l) = rhs_vf(momxb)%sf(j, k + 1, l) + & - (0.5_wp*rho_R*vel_R(2)*vel_R(1)*(1._wp/dy(k + 1)) + & - 0.5_wp*cfl*(rho_R*vel_R(1))*(1._wp/dy(k + 1))) + rhs_vf(momxb)%sf(j, k, l + 1) = rhs_vf(momxb)%sf(j, k, l + 1) + & + (0.5_wp*rho_R*vel_R(1)*vel_R(3)*(1._wp/dz(l + 1)) + & + 0.5_wp*cfl*(rho_R*vel_R(1))*(1._wp/dz(l + 1))) $:GPU_ATOMIC(atomic='update') - rhs_vf(momxb + 2)%sf(j, k + 1, l) = rhs_vf(momxb + 2)%sf(j, k + 1, l) + & - (0.5_wp*rho_R*vel_R(2)*vel_R(3)*(1._wp/dy(k + 1)) + & - 0.5_wp*cfl*(rho_R*vel_R(3))*(1._wp/dy(k + 1))) + rhs_vf(momxb + 1)%sf(j, k, l + 1) = rhs_vf(momxb + 1)%sf(j, k, l + 1) + & + (0.5_wp*rho_R*vel_R(2)*vel_R(3)*(1._wp/dz(l + 1)) + & + 0.5_wp*cfl*(rho_R*vel_R(2))*(1._wp/dz(l + 1))) $:GPU_ATOMIC(atomic='update') - rhs_vf(E_idx)%sf(j, k + 1, l) = rhs_vf(E_idx)%sf(j, k + 1, l) + & - (0.5_wp*(vel_R(2)*(E_R + & - pres_R + F_R))*(1._wp/dy(k + 1)) + & - 0.5_wp*cfl*(E_R)*(1._wp/dy(k + 1))) + rhs_vf(E_idx)%sf(j, k, l + 1) = rhs_vf(E_idx)%sf(j, k, l + 1) + & + (0.5_wp*(vel_R(3)*(E_R + & + pres_R + F_R))*(1._wp/dz(l + 1)) + & + 0.5_wp*cfl*(E_R)*(1._wp/dz(l + 1))) $:GPU_ATOMIC(atomic='update') - rhs_vf(momxb + 1)%sf(j, k, l) = rhs_vf(momxb + 1)%sf(j, k, l) - & - (0.5_wp*(rho_R*(vel_R(2))**2.0 + & - pres_R + F_R)*(1._wp/dy(k)) + & - 0.5_wp*cfl*(rho_R*vel_R(2))*(1._wp/dy(k))) + rhs_vf(momxb + 2)%sf(j, k, l) = rhs_vf(momxb + 2)%sf(j, k, l) - & + (0.5_wp*(rho_R*(vel_R(3))**2.0 + & + pres_R + F_R)*(1._wp/dz(l)) + & + 0.5_wp*cfl*(rho_R*vel_R(3))*(1._wp/dz(l))) $:GPU_ATOMIC(atomic='update') rhs_vf(momxb)%sf(j, k, l) = rhs_vf(momxb)%sf(j, k, l) - & - (0.5_wp*rho_R*vel_R(2)*vel_R(1)*(1._wp/dy(k)) + & - 0.5_wp*cfl*(rho_R*vel_R(1))*(1._wp/dy(k))) + (0.5_wp*rho_R*vel_R(1)*vel_R(3)*(1._wp/dz(l)) + & + 0.5_wp*cfl*(rho_R*vel_R(1))*(1._wp/dz(l))) $:GPU_ATOMIC(atomic='update') - rhs_vf(momxb + 2)%sf(j, k, l) = rhs_vf(momxb + 2)%sf(j, k, l) - & - (0.5_wp*rho_R*vel_R(2)*vel_R(3)*(1._wp/dy(k)) + & - 0.5_wp*cfl*(rho_R*vel_R(3))*(1._wp/dy(k))) + rhs_vf(momxb + 1)%sf(j, k, l) = rhs_vf(momxb + 1)%sf(j, k, l) - & + (0.5_wp*rho_R*vel_R(2)*vel_R(3)*(1._wp/dz(l)) + & + 0.5_wp*cfl*(rho_R*vel_R(2))*(1._wp/dz(l))) $:GPU_ATOMIC(atomic='update') rhs_vf(E_idx)%sf(j, k, l) = rhs_vf(E_idx)%sf(j, k, l) - & - (0.5_wp*(vel_R(2)*(E_R + & - pres_R + F_R))*(1._wp/dy(k)) + & - 0.5_wp*cfl*(E_R)*(1._wp/dy(k))) - - end do - end do - end do - end if - elseif (idir == 3) then - $:GPU_PARALLEL_LOOP(collapse=3, private='[rho_L, rho_R, gamma_L, & - & gamma_R, pi_inf_L, pi_inf_R, mu_L, mu_R, vel_L, vel_R, & - & pres_L, pres_R, alpha_L, alpha_R, alpha_rho_L, alpha_rho_R, & - & F_L, F_R, E_L, E_R, cfl, dvel_small, rho_sf_small, & - & vflux_L_arr, vflux_R_arr]') - do l = -1, p - do k = 0, n - do j = 0, m - - if (viscous) then - vflux_L_arr = 0._wp - vflux_R_arr = 0._wp - - #:if MFC_CASE_OPTIMIZATION - #:if igr_order == 5 - !DIR$ unroll 6 - #:elif igr_order == 3 - !DIR$ unroll 4 - #:endif - #:endif - $:GPU_LOOP(parallelism='[seq]') - do q = vidxb, vidxe - dvel_small = 0._wp - !x-direction contributions - $:GPU_LOOP(parallelism='[seq]') - do i = -1, 1 - rho_L = 0._wp - $:GPU_LOOP(parallelism='[seq]') - do r = 1, num_fluids - rho_L = rho_L + q_cons_vf(r)%sf(j + i, k, l + q) - end do - rho_sf_small(i) = rho_L - end do - - dvel_small(1) = (1/(2._wp*dx(j)))*( & - q_cons_vf(momxb)%sf(j + 1, k, l + q)/rho_sf_small(1) - & - q_cons_vf(momxb)%sf(j - 1, k, l + q)/rho_sf_small(-1)) - dvel_small(3) = (1/(2._wp*dx(j)))*( & - q_cons_vf(momxb + 2)%sf(j + 1, k, l + q)/rho_sf_small(1) - & - q_cons_vf(momxb + 2)%sf(j - 1, k, l + q)/rho_sf_small(-1)) - - if (q > vidxb) then - vflux_L_arr(1) = vflux_L_arr(1) + coeff_L(q)*(dvel_small(3)) - vflux_L_arr(3) = vflux_L_arr(3) + coeff_L(q)*(-2._wp*dvel_small(1))/3._wp - end if - if (q < vidxe) then - vflux_R_arr(1) = vflux_R_arr(1) + coeff_R(q)*(dvel_small(3)) - vflux_R_arr(3) = vflux_R_arr(3) + coeff_R(q)*(-2._wp*dvel_small(1))/3._wp - end if - - !y-direction contributions - $:GPU_LOOP(parallelism='[seq]') - do i = -1, 1 - rho_L = 0._wp - $:GPU_LOOP(parallelism='[seq]') - do r = 1, num_fluids - rho_L = rho_L + q_cons_vf(r)%sf(j, k + i, l + q) - end do - rho_sf_small(i) = rho_L - end do - - dvel_small(2) = (1/(2._wp*dy(k)))*( & - q_cons_vf(momxb + 1)%sf(j, k + 1, l + q)/rho_sf_small(1) - & - q_cons_vf(momxb + 1)%sf(j, k - 1, l + q)/rho_sf_small(-1)) - dvel_small(3) = (1/(2._wp*dy(k)))*( & - q_cons_vf(momxb + 2)%sf(j, k + 1, l + q)/rho_sf_small(1) - & - q_cons_vf(momxb + 2)%sf(j, k - 1, l + q)/rho_sf_small(-1)) - - if (q > vidxb) then - vflux_L_arr(2) = vflux_L_arr(2) + coeff_L(q)*(dvel_small(3)) - vflux_L_arr(3) = vflux_L_arr(3) + coeff_L(q)*(-2._wp*dvel_small(2))/3._wp - end if - if (q < vidxe) then - vflux_R_arr(2) = vflux_R_arr(2) + coeff_R(q)*(dvel_small(3)) - vflux_R_arr(3) = vflux_R_arr(3) + coeff_R(q)*(-2._wp*dvel_small(2))/3._wp - end if - - !z-direction contributions - $:GPU_LOOP(parallelism='[seq]') - do i = -1, 1 - rho_L = 0._wp - $:GPU_LOOP(parallelism='[seq]') - do r = 1, num_fluids - rho_L = rho_L + q_cons_vf(r)%sf(j, k, l + i + q) - end do - rho_sf_small(i) = rho_L - end do - dvel_small(1) = (1/(2._wp*dz(l)))*( & - q_cons_vf(momxb)%sf(j, k, l + 1 + q)/rho_sf_small(1) - & - q_cons_vf(momxb)%sf(j, k, l - 1 + q)/rho_sf_small(-1)) - dvel_small(2) = (1/(2._wp*dz(l)))*( & - q_cons_vf(momxb + 1)%sf(j, k, l + 1 + q)/rho_sf_small(1) - & - q_cons_vf(momxb + 1)%sf(j, k, l - 1 + q)/rho_sf_small(-1)) - dvel_small(3) = (1/(2._wp*dz(l)))*( & - q_cons_vf(momxb + 2)%sf(j, k, l + 1 + q)/rho_sf_small(1) - & - q_cons_vf(momxb + 2)%sf(j, k, l - 1 + q)/rho_sf_small(-1)) - if (q > vidxb) then - vflux_L_arr(1) = vflux_L_arr(1) + coeff_L(q)*(dvel_small(1)) - vflux_L_arr(2) = vflux_L_arr(2) + coeff_L(q)*(dvel_small(2)) - vflux_L_arr(3) = vflux_L_arr(3) + coeff_L(q)*(4._wp*dvel_small(3))/3._wp - end if - if (q < vidxe) then - vflux_R_arr(1) = vflux_R_arr(1) + coeff_R(q)*(dvel_small(1)) - vflux_R_arr(2) = vflux_R_arr(2) + coeff_R(q)*(dvel_small(2)) - vflux_R_arr(3) = vflux_R_arr(3) + coeff_R(q)*(4._wp*dvel_small(3))/3._wp - end if - end do - end if - - alpha_rho_L = 0._wp; alpha_rho_R = 0._wp - alpha_L = 0._wp; alpha_R = 0._wp - vel_L = 0._wp; vel_R = 0._wp - - $:GPU_LOOP(parallelism='[seq]') - do q = vidxb + 1, vidxe - $:GPU_LOOP(parallelism='[seq]') - do i = 1, num_fluids - alpha_rho_L(i) = alpha_rho_L(i) + coeff_L(q)*q_cons_vf(i)%sf(j, k, l + q) - end do - - if (num_fluids > 1) then - $:GPU_LOOP(parallelism='[seq]') - do i = 1, num_fluids - 1 - alpha_L(i) = alpha_L(i) + coeff_L(q)*q_cons_vf(E_idx + i)%sf(j, k, l + q) - end do - else - alpha_L(1) = 1._wp - end if - - $:GPU_LOOP(parallelism='[seq]') - do i = 1, num_dims - vel_L(i) = vel_L(i) + coeff_L(q)*q_cons_vf(momxb + i - 1)%sf(j, k, l + q) - end do - end do - - $:GPU_LOOP(parallelism='[seq]') - do q = vidxb, vidxe - 1 - $:GPU_LOOP(parallelism='[seq]') - do i = 1, num_fluids - alpha_rho_R(i) = alpha_rho_R(i) + coeff_R(q)*q_cons_vf(i)%sf(j, k, l + q) - end do - - if (num_fluids > 1) then - $:GPU_LOOP(parallelism='[seq]') - do i = 1, num_fluids - 1 - alpha_R(i) = alpha_R(i) + coeff_R(q)*q_cons_vf(E_idx + i)%sf(j, k, l + q) - end do - else - alpha_R(1) = 1._wp - end if - - $:GPU_LOOP(parallelism='[seq]') - do i = 1, num_dims - vel_R(i) = vel_R(i) + coeff_R(q)*q_cons_vf(momxb + i - 1)%sf(j, k, l + q) - end do - end do - - if (num_fluids > 1) then - alpha_L(num_fluids) = 1._wp - sum(alpha_L(1:num_fluids - 1)) - alpha_R(num_fluids) = 1._wp - sum(alpha_R(1:num_fluids - 1)) - end if - - rho_L = sum(alpha_rho_L) - gamma_L = sum(alpha_L*gammas) - pi_inf_L = sum(alpha_L*pi_infs) - - rho_R = sum(alpha_rho_R) - gamma_R = sum(alpha_R*gammas) - pi_inf_R = sum(alpha_R*pi_infs) - - vel_L = vel_L/rho_L - vel_R = vel_R/rho_R - - if (viscous) then - mu_L = 0._wp - mu_R = 0._wp - $:GPU_LOOP(parallelism='[seq]') - do i = 1, num_fluids - mu_L = alpha_L(i)/Res(1, i) + mu_L - mu_R = alpha_R(i)/Res(1, i) + mu_R - end do - - $:GPU_ATOMIC(atomic='update') - rhs_vf(momxb)%sf(j, k, l + 1) = rhs_vf(momxb)%sf(j, k, l + 1) - & - 0.5_wp*mu_L*vflux_L_arr(1)*(1._wp/dz(l + 1)) - $:GPU_ATOMIC(atomic='update') - rhs_vf(E_idx)%sf(j, k, l + 1) = rhs_vf(E_idx)%sf(j, k, l + 1) - & - 0.5_wp*mu_L*vflux_L_arr(1)*vel_L(1)*(1._wp/dz(l + 1)) - - $:GPU_ATOMIC(atomic='update') - rhs_vf(momxb)%sf(j, k, l) = rhs_vf(momxb)%sf(j, k, l) + & - 0.5_wp*mu_L*vflux_L_arr(1)*(1._wp/dz(l)) - $:GPU_ATOMIC(atomic='update') - rhs_vf(E_idx)%sf(j, k, l) = rhs_vf(E_idx)%sf(j, k, l) + & - 0.5_wp*mu_L*vflux_L_arr(1)*vel_L(1)*(1._wp/dz(l)) - - $:GPU_ATOMIC(atomic='update') - rhs_vf(momxb)%sf(j, k, l + 1) = rhs_vf(momxb)%sf(j, k, l + 1) - & - 0.5_wp*mu_R*vflux_R_arr(1)*(1._wp/dz(l + 1)) - $:GPU_ATOMIC(atomic='update') - rhs_vf(E_idx)%sf(j, k, l + 1) = rhs_vf(E_idx)%sf(j, k, l + 1) - & - 0.5_wp*mu_R*vflux_R_arr(1)*vel_R(1)*(1._wp/dz(l + 1)) - - $:GPU_ATOMIC(atomic='update') - rhs_vf(momxb)%sf(j, k, l) = rhs_vf(momxb)%sf(j, k, l) + & - 0.5_wp*mu_R*vflux_R_arr(1)*(1._wp/dz(l)) - $:GPU_ATOMIC(atomic='update') - rhs_vf(E_idx)%sf(j, k, l) = rhs_vf(E_idx)%sf(j, k, l) + & - 0.5_wp*mu_R*vflux_R_arr(1)*vel_R(1)*(1._wp/dz(l)) - - $:GPU_ATOMIC(atomic='update') - rhs_vf(momxb + 1)%sf(j, k, l + 1) = rhs_vf(momxb + 1)%sf(j, k, l + 1) - & - 0.5_wp*mu_L*vflux_L_arr(2)*(1._wp/dz(l + 1)) - $:GPU_ATOMIC(atomic='update') - rhs_vf(E_idx)%sf(j, k, l + 1) = rhs_vf(E_idx)%sf(j, k, l + 1) - & - 0.5_wp*mu_L*vflux_L_arr(2)*vel_L(2)*(1._wp/dz(l + 1)) - - $:GPU_ATOMIC(atomic='update') - rhs_vf(momxb + 1)%sf(j, k, l) = rhs_vf(momxb + 1)%sf(j, k, l) + & - 0.5_wp*mu_L*vflux_L_arr(2)*(1._wp/dz(l)) - $:GPU_ATOMIC(atomic='update') - rhs_vf(E_idx)%sf(j, k, l) = rhs_vf(E_idx)%sf(j, k, l) + & - 0.5_wp*mu_L*vflux_L_arr(2)*vel_L(2)*(1._wp/dz(l)) - - $:GPU_ATOMIC(atomic='update') - rhs_vf(momxb + 1)%sf(j, k, l + 1) = rhs_vf(momxb + 1)%sf(j, k, l + 1) - & - 0.5_wp*mu_R*vflux_R_arr(2)*(1._wp/dz(l + 1)) - $:GPU_ATOMIC(atomic='update') - rhs_vf(E_idx)%sf(j, k, l + 1) = rhs_vf(E_idx)%sf(j, k, l + 1) - & - 0.5_wp*mu_R*vflux_R_arr(2)*vel_R(2)*(1._wp/dz(l + 1)) - - $:GPU_ATOMIC(atomic='update') - rhs_vf(momxb + 1)%sf(j, k, l) = rhs_vf(momxb + 1)%sf(j, k, l) + & - 0.5_wp*mu_R*vflux_R_arr(2)*(1._wp/dz(l)) - $:GPU_ATOMIC(atomic='update') - rhs_vf(E_idx)%sf(j, k, l) = rhs_vf(E_idx)%sf(j, k, l) + & - 0.5_wp*mu_R*vflux_R_arr(2)*vel_R(2)*(1._wp/dz(l)) - - $:GPU_ATOMIC(atomic='update') - rhs_vf(momxb + 2)%sf(j, k, l + 1) = rhs_vf(momxb + 2)%sf(j, k, l + 1) - & - 0.5_wp*mu_L*vflux_L_arr(3)*(1._wp/dz(l + 1)) - $:GPU_ATOMIC(atomic='update') - rhs_vf(E_idx)%sf(j, k, l + 1) = rhs_vf(E_idx)%sf(j, k, l + 1) - & - 0.5_wp*mu_L*vflux_L_arr(3)*vel_L(3)*(1._wp/dz(l + 1)) - - $:GPU_ATOMIC(atomic='update') - rhs_vf(momxb + 2)%sf(j, k, l) = rhs_vf(momxb + 2)%sf(j, k, l) + & - 0.5_wp*mu_L*vflux_L_arr(3)*(1._wp/dz(l)) - $:GPU_ATOMIC(atomic='update') - rhs_vf(E_idx)%sf(j, k, l) = rhs_vf(E_idx)%sf(j, k, l) + & - 0.5_wp*mu_L*vflux_L_arr(3)*vel_L(3)*(1._wp/dz(l)) - - $:GPU_ATOMIC(atomic='update') - rhs_vf(momxb + 2)%sf(j, k, l + 1) = rhs_vf(momxb + 2)%sf(j, k, l + 1) - & - 0.5_wp*mu_R*vflux_R_arr(3)*(1._wp/dz(l + 1)) - $:GPU_ATOMIC(atomic='update') - rhs_vf(E_idx)%sf(j, k, l + 1) = rhs_vf(E_idx)%sf(j, k, l + 1) - & - 0.5_wp*mu_R*vflux_R_arr(3)*vel_R(3)*(1._wp/dz(l + 1)) - - $:GPU_ATOMIC(atomic='update') - rhs_vf(momxb + 2)%sf(j, k, l) = rhs_vf(momxb + 2)%sf(j, k, l) + & - 0.5_wp*mu_R*vflux_R_arr(3)*(1._wp/dz(l)) - $:GPU_ATOMIC(atomic='update') - rhs_vf(E_idx)%sf(j, k, l) = rhs_vf(E_idx)%sf(j, k, l) + & - 0.5_wp*mu_R*vflux_R_arr(3)*vel_R(3)*(1._wp/dz(l)) - end if - - E_L = 0._wp; E_R = 0._wp - F_L = 0._wp; F_R = 0._wp - - $:GPU_LOOP(parallelism='[seq]') - do q = vidxb + 1, vidxe - E_L = E_L + coeff_L(q)*q_cons_vf(E_idx)%sf(j, k, l + q) - F_L = F_L + coeff_L(q)*jac(j, k, l + q) - end do - - $:GPU_LOOP(parallelism='[seq]') - do q = vidxb, vidxe - 1 - E_R = E_R + coeff_R(q)*q_cons_vf(E_idx)%sf(j, k, l + q) - F_R = F_R + coeff_R(q)*jac(j, k, l + q) - end do - - call s_get_derived_states(E_L, gamma_L, pi_inf_L, rho_L, vel_L, & - E_R, gamma_R, pi_inf_R, rho_R, vel_R, & - pres_L, pres_R, cfl) - - $:GPU_LOOP(parallelism='[seq]') - do i = 1, num_fluids - $:GPU_ATOMIC(atomic='update') - rhs_vf(i)%sf(j, k, l + 1) = rhs_vf(i)%sf(j, k, l + 1) + & - (0.5_wp*(alpha_rho_L(i)* & - vel_L(3))*(1._wp/dz(l + 1)) - & - 0.5_wp*cfl*(alpha_rho_L(i))*(1._wp/dz(l + 1))) - - $:GPU_ATOMIC(atomic='update') - rhs_vf(i)%sf(j, k, l) = rhs_vf(i)%sf(j, k, l) - & - (0.5_wp*(alpha_rho_L(i)* & - vel_L(3))*(1._wp/dz(l)) - & - 0.5_wp*cfl*(alpha_rho_L(i))*(1._wp/dz(l))) - end do - - if (num_fluids > 1) then - $:GPU_LOOP(parallelism='[seq]') - do i = 1, num_fluids - 1 - $:GPU_ATOMIC(atomic='update') - rhs_vf(advxb + i - 1)%sf(j, k, l + 1) = rhs_vf(advxb + i - 1)%sf(j, k, l + 1) + & - (0.5_wp*(alpha_L(i)* & - vel_L(3))*(1._wp/dz(l + 1)) - & - 0.5_wp*cfl*(alpha_L(i))*(1._wp/dz(l + 1))) - - $:GPU_ATOMIC(atomic='update') - rhs_vf(advxb + i - 1)%sf(j, k, l + 1) = rhs_vf(advxb + i - 1)%sf(j, k, l + 1) & - - (0.5_wp*q_cons_vf(advxb + i - 1)%sf(j, k, l + 1)*vel_L(3)*(1._wp/dz(l + 1))) - - $:GPU_ATOMIC(atomic='update') - rhs_vf(advxb + i - 1)%sf(j, k, l) = rhs_vf(advxb + i - 1)%sf(j, k, l) - & - (0.5_wp*(alpha_L(i)* & - vel_L(3))*(1._wp/dz(l)) - & - 0.5_wp*cfl*(alpha_L(i))*(1._wp/dz(l))) - - $:GPU_ATOMIC(atomic='update') - rhs_vf(advxb + i - 1)%sf(j, k, l) = rhs_vf(advxb + i - 1)%sf(j, k, l) & - + (0.5_wp*q_cons_vf(advxb + i - 1)%sf(j, k, l)*vel_L(3)*(1._wp/dz(l))) - end do - end if - - $:GPU_ATOMIC(atomic='update') - rhs_vf(momxb + 2)%sf(j, k, l + 1) = rhs_vf(momxb + 2)%sf(j, k, l + 1) + & - (0.5_wp*(rho_L*(vel_L(3))**2.0 + & - pres_L + F_L)*(1._wp/dz(l + 1)) - & - 0.5_wp*cfl*(rho_L*vel_L(3))*(1._wp/dz(l + 1))) - - $:GPU_ATOMIC(atomic='update') - rhs_vf(momxb)%sf(j, k, l + 1) = rhs_vf(momxb)%sf(j, k, l + 1) + & - (0.5_wp*rho_L*vel_L(1)*vel_L(3)*(1._wp/dz(l + 1)) - & - 0.5_wp*cfl*(rho_L*vel_L(1))*(1._wp/dz(l + 1))) - - $:GPU_ATOMIC(atomic='update') - rhs_vf(momxb + 1)%sf(j, k, l + 1) = rhs_vf(momxb + 1)%sf(j, k, l + 1) + & - (0.5_wp*rho_L*vel_L(2)*vel_L(3)*(1._wp/dz(l + 1)) - & - 0.5_wp*cfl*(rho_L*vel_L(2))*(1._wp/dz(l + 1))) - - $:GPU_ATOMIC(atomic='update') - rhs_vf(E_idx)%sf(j, k, l + 1) = rhs_vf(E_idx)%sf(j, k, l + 1) + & - (0.5_wp*(vel_L(3)*(E_L + & - pres_L + F_L))*(1._wp/dz(l + 1)) - & - 0.5_wp*cfl*(E_L)*(1._wp/dz(l + 1))) - - $:GPU_ATOMIC(atomic='update') - rhs_vf(momxb + 2)%sf(j, k, l) = rhs_vf(momxb + 2)%sf(j, k, l) - & - (0.5_wp*(rho_L*(vel_L(3))**2.0 + & - pres_L + F_L)*(1._wp/dz(l)) - & - 0.5_wp*cfl*(rho_L*vel_L(3))*(1._wp/dz(l))) - - $:GPU_ATOMIC(atomic='update') - rhs_vf(momxb)%sf(j, k, l) = rhs_vf(momxb)%sf(j, k, l) - & - (0.5_wp*rho_L*vel_L(1)*vel_L(3)*(1._wp/dz(l)) - & - 0.5_wp*cfl*(rho_L*vel_L(1))*(1._wp/dz(l))) - - $:GPU_ATOMIC(atomic='update') - rhs_vf(momxb + 1)%sf(j, k, l) = rhs_vf(momxb + 1)%sf(j, k, l) - & - (0.5_wp*rho_L*vel_L(2)*vel_L(3)*(1._wp/dz(l)) - & - 0.5_wp*cfl*(rho_L*vel_L(2))*(1._wp/dz(l))) - - $:GPU_ATOMIC(atomic='update') - rhs_vf(E_idx)%sf(j, k, l) = rhs_vf(E_idx)%sf(j, k, l) - & - (0.5_wp*(vel_L(3)*(E_L + & - pres_L + F_L))*(1._wp/dz(l)) - & - 0.5_wp*cfl*(E_L)*(1._wp/dz(l))) - - $:GPU_LOOP(parallelism='[seq]') - do i = 1, num_fluids - $:GPU_ATOMIC(atomic='update') - rhs_vf(i)%sf(j, k, l + 1) = rhs_vf(i)%sf(j, k, l + 1) + & - (0.5_wp*(alpha_rho_R(i)* & - vel_R(3))*(1._wp/dz(l + 1)) + & - 0.5_wp*cfl*(alpha_rho_R(i))*(1._wp/dz(l + 1))) - - $:GPU_ATOMIC(atomic='update') - rhs_vf(i)%sf(j, k, l) = rhs_vf(i)%sf(j, k, l) - & - (0.5_wp*(alpha_rho_R(i)* & - vel_R(3))*(1._wp/dz(l)) + & - 0.5_wp*cfl*(alpha_rho_R(i))*(1._wp/dz(l))) - end do - - if (num_fluids > 1) then - $:GPU_LOOP(parallelism='[seq]') - do i = 1, num_fluids - 1 - $:GPU_ATOMIC(atomic='update') - rhs_vf(advxb + i - 1)%sf(j, k, l + 1) = rhs_vf(advxb + i - 1)%sf(j, k, l + 1) + & - (0.5_wp*(alpha_R(i)* & - vel_R(3))*(1._wp/dz(l + 1)) + & - 0.5_wp*cfl*(alpha_R(i))*(1._wp/dz(l + 1))) - - $:GPU_ATOMIC(atomic='update') - rhs_vf(advxb + i - 1)%sf(j, k, l + 1) = rhs_vf(advxb + i - 1)%sf(j, k, l + 1) & - - (0.5_wp*q_cons_vf(advxb + i - 1)%sf(j, k, l + 1)*vel_R(3)*(1._wp/dz(l + 1))) - - $:GPU_ATOMIC(atomic='update') - rhs_vf(advxb + i - 1)%sf(j, k, l) = rhs_vf(advxb + i - 1)%sf(j, k, l) - & - (0.5_wp*(alpha_R(i)* & - vel_R(3))*(1._wp/dz(l)) + & - 0.5_wp*cfl*(alpha_R(i))*(1._wp/dz(l))) - - $:GPU_ATOMIC(atomic='update') - rhs_vf(advxb + i - 1)%sf(j, k, l) = rhs_vf(advxb + i - 1)%sf(j, k, l) & - + (0.5_wp*q_cons_vf(advxb + i - 1)%sf(j, k, l)*vel_R(3)*(1._wp/dz(l))) - end do - end if - - $:GPU_ATOMIC(atomic='update') - rhs_vf(momxb + 2)%sf(j, k, l + 1) = rhs_vf(momxb + 2)%sf(j, k, l + 1) + & - (0.5_wp*(rho_R*(vel_R(3))**2.0 + & - pres_R + F_R)*(1._wp/dz(l + 1)) + & - 0.5_wp*cfl*(rho_R*vel_R(3))*(1._wp/dz(l + 1))) - - $:GPU_ATOMIC(atomic='update') - rhs_vf(momxb)%sf(j, k, l + 1) = rhs_vf(momxb)%sf(j, k, l + 1) + & - (0.5_wp*rho_R*vel_R(1)*vel_R(3)*(1._wp/dz(l + 1)) + & - 0.5_wp*cfl*(rho_R*vel_R(1))*(1._wp/dz(l + 1))) - - $:GPU_ATOMIC(atomic='update') - rhs_vf(momxb + 1)%sf(j, k, l + 1) = rhs_vf(momxb + 1)%sf(j, k, l + 1) + & - (0.5_wp*rho_R*vel_R(2)*vel_R(3)*(1._wp/dz(l + 1)) + & - 0.5_wp*cfl*(rho_R*vel_R(2))*(1._wp/dz(l + 1))) - - $:GPU_ATOMIC(atomic='update') - rhs_vf(E_idx)%sf(j, k, l + 1) = rhs_vf(E_idx)%sf(j, k, l + 1) + & (0.5_wp*(vel_R(3)*(E_R + & - pres_R + F_R))*(1._wp/dz(l + 1)) + & - 0.5_wp*cfl*(E_R)*(1._wp/dz(l + 1))) - - $:GPU_ATOMIC(atomic='update') - rhs_vf(momxb + 2)%sf(j, k, l) = rhs_vf(momxb + 2)%sf(j, k, l) - & - (0.5_wp*(rho_R*(vel_R(3))**2.0 + & - pres_R + F_R)*(1._wp/dz(l)) + & - 0.5_wp*cfl*(rho_R*vel_R(3))*(1._wp/dz(l))) - - $:GPU_ATOMIC(atomic='update') - rhs_vf(momxb)%sf(j, k, l) = rhs_vf(momxb)%sf(j, k, l) - & - (0.5_wp*rho_R*vel_R(1)*vel_R(3)*(1._wp/dz(l)) + & - 0.5_wp*cfl*(rho_R*vel_R(1))*(1._wp/dz(l))) - - $:GPU_ATOMIC(atomic='update') - rhs_vf(momxb + 1)%sf(j, k, l) = rhs_vf(momxb + 1)%sf(j, k, l) - & - (0.5_wp*rho_R*vel_R(2)*vel_R(3)*(1._wp/dz(l)) + & - 0.5_wp*cfl*(rho_R*vel_R(2))*(1._wp/dz(l))) - - $:GPU_ATOMIC(atomic='update') - rhs_vf(E_idx)%sf(j, k, l) = rhs_vf(E_idx)%sf(j, k, l) - & - (0.5_wp*(vel_R(3)*(E_R + & - pres_R + F_R))*(1._wp/dz(l)) + & - 0.5_wp*cfl*(E_R)*(1._wp/dz(l))) + pres_R + F_R))*(1._wp/dz(l)) + & + 0.5_wp*cfl*(E_R)*(1._wp/dz(l))) + end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP end if end subroutine s_igr_riemann_solver - pure subroutine s_get_derived_states(E_L, gamma_L, pi_inf_L, rho_L, vel_L, & - E_R, gamma_R, pi_inf_R, rho_R, vel_R, & - pres_L, pres_R, cfl) + subroutine s_get_derived_states(E_L, gamma_L, pi_inf_L, rho_L, vel_L, & + E_R, gamma_R, pi_inf_R, rho_R, vel_R, & + pres_L, pres_R, cfl) $:GPU_ROUTINE(parallelism='[seq]') real(wp), intent(in) :: E_L, gamma_L, pi_inf_L, rho_L @@ -2617,46 +2611,49 @@ contains integer, intent(in) :: idir if (idir == 1) then - $:GPU_PARALLEL_LOOP(collapse=4) - do i = 1, sys_size - do l = 0, p - do k = 0, n - do j = 0, m - rhs_vf(i)%sf(j, k, l) = 1._wp/dx(j)* & - (flux_vf(i)%sf(j - 1, k, l) & - - flux_vf(i)%sf(j, k, l)) + #:call GPU_PARALLEL_LOOP(collapse=4) + do i = 1, sys_size + do l = 0, p + do k = 0, n + do j = 0, m + rhs_vf(i)%sf(j, k, l) = 1._wp/dx(j)* & + (flux_vf(i)%sf(j - 1, k, l) & + - flux_vf(i)%sf(j, k, l)) + end do end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP elseif (idir == 2) then - $:GPU_PARALLEL_LOOP(collapse=4) - do i = 1, sys_size - do l = 0, p - do k = 0, n - do j = 0, m - rhs_vf(i)%sf(j, k, l) = & - rhs_vf(i)%sf(j, k, l) + 1._wp/dy(k)* & - (flux_vf(i)%sf(j, k - 1, l) & - - flux_vf(i)%sf(j, k, l)) + #:call GPU_PARALLEL_LOOP(collapse=4) + do i = 1, sys_size + do l = 0, p + do k = 0, n + do j = 0, m + rhs_vf(i)%sf(j, k, l) = & + rhs_vf(i)%sf(j, k, l) + 1._wp/dy(k)* & + (flux_vf(i)%sf(j, k - 1, l) & + - flux_vf(i)%sf(j, k, l)) + end do end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP elseif (idir == 3) then - $:GPU_PARALLEL_LOOP(collapse=4) - do i = 1, sys_size - do l = 0, p - do k = 0, n - do j = 0, m - rhs_vf(i)%sf(j, k, l) = & - rhs_vf(i)%sf(j, k, l) + 1._wp/dz(l)* & - (flux_vf(i)%sf(j, k, l - 1) & - - flux_vf(i)%sf(j, k, l)) + #:call GPU_PARALLEL_LOOP(collapse=4) + do i = 1, sys_size + do l = 0, p + do k = 0, n + do j = 0, m + rhs_vf(i)%sf(j, k, l) = & + rhs_vf(i)%sf(j, k, l) + 1._wp/dz(l)* & + (flux_vf(i)%sf(j, k, l - 1) & + - flux_vf(i)%sf(j, k, l)) + end do end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP end if end subroutine s_igr_flux_add @@ -2664,7 +2661,7 @@ contains subroutine s_finalize_igr_module() if (viscous) then - @:DEALLOCATE(Res) + @:DEALLOCATE(Res_igr) end if #ifndef __NVCOMPILER_GPU_UNIFIED_MEM diff --git a/src/simulation/m_mhd.fpp b/src/simulation/m_mhd.fpp index 8112b3af7e..12ba72809c 100644 --- a/src/simulation/m_mhd.fpp +++ b/src/simulation/m_mhd.fpp @@ -21,10 +21,10 @@ module m_mhd s_finalize_mhd_powell_module, & s_compute_mhd_powell_rhs - real(wp), allocatable, dimension(:, :, :) :: du_dx, du_dy, du_dz - real(wp), allocatable, dimension(:, :, :) :: dv_dx, dv_dy, dv_dz - real(wp), allocatable, dimension(:, :, :) :: dw_dx, dw_dy, dw_dz - $:GPU_DECLARE(create='[du_dx,du_dy,du_dz,dv_dx,dv_dy,dv_dz,dw_dx,dw_dy,dw_dz]') + real(wp), allocatable, dimension(:, :, :) :: du_dx_mhd, du_dy_mhd, du_dz_mhd + real(wp), allocatable, dimension(:, :, :) :: dv_dx_mhd, dv_dy_mhd, dv_dz_mhd + real(wp), allocatable, dimension(:, :, :) :: dw_dx_mhd, dw_dy_mhd, dw_dz_mhd + $:GPU_DECLARE(create='[du_dx_mhd,du_dy_mhd,du_dz_mhd,dv_dx_mhd,dv_dy_mhd,dv_dz_mhd,dw_dx_mhd,dw_dy_mhd,dw_dz_mhd]') real(wp), allocatable, dimension(:, :) :: fd_coeff_x_h real(wp), allocatable, dimension(:, :) :: fd_coeff_y_h @@ -38,10 +38,10 @@ contains ! Additional safety check beyond m_checker if (n == 0) call s_mpi_abort('Fatal Error: Powell correction is not applicable for 1D') - @:ALLOCATE(du_dx(0:m,0:n,0:p), dv_dx(0:m,0:n,0:p), dw_dx(0:m,0:n,0:p)) - @:ALLOCATE(du_dy(0:m,0:n,0:p), dv_dy(0:m,0:n,0:p), dw_dy(0:m,0:n,0:p)) + @:ALLOCATE(du_dx_mhd(0:m,0:n,0:p), dv_dx_mhd(0:m,0:n,0:p), dw_dx_mhd(0:m,0:n,0:p)) + @:ALLOCATE(du_dy_mhd(0:m,0:n,0:p), dv_dy_mhd(0:m,0:n,0:p), dw_dy_mhd(0:m,0:n,0:p)) if (p > 0) then - @:ALLOCATE(dw_dx(0:m,0:n,0:p), dw_dy(0:m,0:n,0:p), dw_dz(0:m,0:n,0:p)) + @:ALLOCATE(dw_dx_mhd(0:m,0:n,0:p), dw_dy_mhd(0:m,0:n,0:p), dw_dz_mhd(0:m,0:n,0:p)) end if @:ALLOCATE(fd_coeff_x_h(-fd_number:fd_number, 0:m)) @@ -67,7 +67,7 @@ contains !! S = - (divB) [ 0, Bx, By, Bz, vdotB, vx, vy, vz ]^T !! @param q_prim_vf Primitive variables !! @param rhs_vf rhs variables - pure subroutine s_compute_mhd_powell_rhs(q_prim_vf, rhs_vf) + subroutine s_compute_mhd_powell_rhs(q_prim_vf, rhs_vf) type(scalar_field), dimension(sys_size), intent(in) :: q_prim_vf type(scalar_field), dimension(sys_size), intent(inout) :: rhs_vf @@ -76,70 +76,71 @@ contains real(wp), dimension(3) :: v, B real(wp) :: divB, vdotB - $:GPU_PARALLEL_LOOP(collapse=3, private='[v, B]') - do q = 0, p - do l = 0, n - do k = 0, m + #:call GPU_PARALLEL_LOOP(collapse=3, private='[v, B]') + do q = 0, p + do l = 0, n + do k = 0, m - divB = 0._wp - $:GPU_LOOP(parallelism='[seq]') - do r = -fd_number, fd_number - divB = divB + q_prim_vf(B_idx%beg)%sf(k + r, l, q)*fd_coeff_x_h(r, k) - end do - $:GPU_LOOP(parallelism='[seq]') - do r = -fd_number, fd_number - divB = divB + q_prim_vf(B_idx%beg + 1)%sf(k, l + r, q)*fd_coeff_y_h(r, l) - end do - if (p > 0) then + divB = 0._wp $:GPU_LOOP(parallelism='[seq]') do r = -fd_number, fd_number - divB = divB + q_prim_vf(B_idx%beg + 2)%sf(k, l, q + r)*fd_coeff_z_h(r, q) + divB = divB + q_prim_vf(B_idx%beg)%sf(k + r, l, q)*fd_coeff_x_h(r, k) end do - end if - - v(1) = q_prim_vf(momxb)%sf(k, l, q) - v(2) = q_prim_vf(momxb + 1)%sf(k, l, q) - v(3) = q_prim_vf(momxb + 2)%sf(k, l, q) - - B(1) = q_prim_vf(B_idx%beg)%sf(k, l, q) - B(2) = q_prim_vf(B_idx%beg + 1)%sf(k, l, q) - B(3) = q_prim_vf(B_idx%beg + 2)%sf(k, l, q) - - vdotB = sum(v*B) - - ! 1: rho -> unchanged - ! 2: vx -> - (divB) * Bx - ! 3: vy -> - (divB) * By - ! 4: vz -> - (divB) * Bz - ! 5: E -> - (divB) * (vdotB) - ! 6: Bx -> - (divB) * vx - ! 7: By -> - (divB) * vy - ! 8: Bz -> - (divB) * vz - - rhs_vf(momxb)%sf(k, l, q) = rhs_vf(momxb)%sf(k, l, q) - divB*B(1) - rhs_vf(momxb + 1)%sf(k, l, q) = rhs_vf(momxb + 1)%sf(k, l, q) - divB*B(2) - rhs_vf(momxb + 2)%sf(k, l, q) = rhs_vf(momxb + 2)%sf(k, l, q) - divB*B(3) - - rhs_vf(E_idx)%sf(k, l, q) = rhs_vf(E_idx)%sf(k, l, q) - divB*vdotB - - rhs_vf(B_idx%beg)%sf(k, l, q) = rhs_vf(B_idx%beg)%sf(k, l, q) - divB*v(1) - rhs_vf(B_idx%beg + 1)%sf(k, l, q) = rhs_vf(B_idx%beg + 1)%sf(k, l, q) - divB*v(2) - rhs_vf(B_idx%beg + 2)%sf(k, l, q) = rhs_vf(B_idx%beg + 2)%sf(k, l, q) - divB*v(3) + $:GPU_LOOP(parallelism='[seq]') + do r = -fd_number, fd_number + divB = divB + q_prim_vf(B_idx%beg + 1)%sf(k, l + r, q)*fd_coeff_y_h(r, l) + end do + if (p > 0) then + $:GPU_LOOP(parallelism='[seq]') + do r = -fd_number, fd_number + divB = divB + q_prim_vf(B_idx%beg + 2)%sf(k, l, q + r)*fd_coeff_z_h(r, q) + end do + end if + + v(1) = q_prim_vf(momxb)%sf(k, l, q) + v(2) = q_prim_vf(momxb + 1)%sf(k, l, q) + v(3) = q_prim_vf(momxb + 2)%sf(k, l, q) + + B(1) = q_prim_vf(B_idx%beg)%sf(k, l, q) + B(2) = q_prim_vf(B_idx%beg + 1)%sf(k, l, q) + B(3) = q_prim_vf(B_idx%beg + 2)%sf(k, l, q) + + vdotB = sum(v*B) + + ! 1: rho -> unchanged + ! 2: vx -> - (divB) * Bx + ! 3: vy -> - (divB) * By + ! 4: vz -> - (divB) * Bz + ! 5: E -> - (divB) * (vdotB) + ! 6: Bx -> - (divB) * vx + ! 7: By -> - (divB) * vy + ! 8: Bz -> - (divB) * vz + + rhs_vf(momxb)%sf(k, l, q) = rhs_vf(momxb)%sf(k, l, q) - divB*B(1) + rhs_vf(momxb + 1)%sf(k, l, q) = rhs_vf(momxb + 1)%sf(k, l, q) - divB*B(2) + rhs_vf(momxb + 2)%sf(k, l, q) = rhs_vf(momxb + 2)%sf(k, l, q) - divB*B(3) + + rhs_vf(E_idx)%sf(k, l, q) = rhs_vf(E_idx)%sf(k, l, q) - divB*vdotB + + rhs_vf(B_idx%beg)%sf(k, l, q) = rhs_vf(B_idx%beg)%sf(k, l, q) - divB*v(1) + rhs_vf(B_idx%beg + 1)%sf(k, l, q) = rhs_vf(B_idx%beg + 1)%sf(k, l, q) - divB*v(2) + rhs_vf(B_idx%beg + 2)%sf(k, l, q) = rhs_vf(B_idx%beg + 2)%sf(k, l, q) - divB*v(3) + end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP end subroutine s_compute_mhd_powell_rhs impure subroutine s_finalize_mhd_powell_module - @:DEALLOCATE(du_dx, dv_dx, dw_dx) + @:DEALLOCATE(du_dx_mhd, dv_dx_mhd, dw_dx_mhd) @:DEALLOCATE(fd_coeff_x_h) - @:DEALLOCATE(du_dy, dv_dy, dw_dy) + @:DEALLOCATE(du_dy_mhd, dv_dy_mhd, dw_dy_mhd) @:DEALLOCATE(fd_coeff_y_h) if (p > 0) then - @:DEALLOCATE(dw_dx, dw_dy, dw_dz) + @:DEALLOCATE(dw_dx_mhd, dw_dy_mhd, dw_dz_mhd) @:DEALLOCATE(fd_coeff_z_h) end if diff --git a/src/simulation/m_mpi_proxy.fpp b/src/simulation/m_mpi_proxy.fpp index 617afb7c98..875cd6fe53 100644 --- a/src/simulation/m_mpi_proxy.fpp +++ b/src/simulation/m_mpi_proxy.fpp @@ -310,37 +310,40 @@ contains #:for mpi_dir in [1, 2, 3] if (mpi_dir == ${mpi_dir}$) then #:if mpi_dir == 1 - $:GPU_PARALLEL_LOOP(collapse=3,private='[r]') - do l = 0, p - do k = 0, n - do j = 0, buff_size - 1 - r = (j + buff_size*(k + (n + 1)*l)) - ib_buff_send(r) = ib_markers%sf(j + pack_offset, k, l) + #:call GPU_PARALLEL_LOOP(collapse=3,private='[r]') + do l = 0, p + do k = 0, n + do j = 0, buff_size - 1 + r = (j + buff_size*(k + (n + 1)*l)) + ib_buff_send(r) = ib_markers%sf(j + pack_offset, k, l) + end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP #:elif mpi_dir == 2 - $:GPU_PARALLEL_LOOP(collapse=3,private='[r]') - do l = 0, p - do k = 0, buff_size - 1 - do j = -buff_size, m + buff_size - r = ((j + buff_size) + (m + 2*buff_size + 1)* & - (k + buff_size*l)) - ib_buff_send(r) = ib_markers%sf(j, k + pack_offset, l) + #:call GPU_PARALLEL_LOOP(collapse=3,private='[r]') + do l = 0, p + do k = 0, buff_size - 1 + do j = -buff_size, m + buff_size + r = ((j + buff_size) + (m + 2*buff_size + 1)* & + (k + buff_size*l)) + ib_buff_send(r) = ib_markers%sf(j, k + pack_offset, l) + end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP #:else - $:GPU_PARALLEL_LOOP(collapse=3,private='[r]') - do l = 0, buff_size - 1 - do k = -buff_size, n + buff_size - do j = -buff_size, m + buff_size - r = ((j + buff_size) + (m + 2*buff_size + 1)* & - ((k + buff_size) + (n + 2*buff_size + 1)*l)) - ib_buff_send(r) = ib_markers%sf(j, k, l + pack_offset) + #:call GPU_PARALLEL_LOOP(collapse=3,private='[r]') + do l = 0, buff_size - 1 + do k = -buff_size, n + buff_size + do j = -buff_size, m + buff_size + r = ((j + buff_size) + (m + 2*buff_size + 1)* & + ((k + buff_size) + (n + 2*buff_size + 1)*l)) + ib_buff_send(r) = ib_markers%sf(j, k, l + pack_offset) + end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP #:endif end if #:endfor @@ -349,7 +352,7 @@ contains #:for rdma_mpi in [False, True] if (rdma_mpi .eqv. ${'.true.' if rdma_mpi else '.false.'}$) then #:if rdma_mpi - #:call GPU_HOST_DATA(use_device='[ib_buff_send, ib_buff_recv]') + #:call GPU_HOST_DATA(use_device_addr='[ib_buff_send, ib_buff_recv]') call nvtxStartRange("IB-MARKER-SENDRECV-RDMA") call MPI_SENDRECV( & @@ -384,39 +387,42 @@ contains #:for mpi_dir in [1, 2, 3] if (mpi_dir == ${mpi_dir}$) then #:if mpi_dir == 1 - $:GPU_PARALLEL_LOOP(collapse=3,private='[r]') - do l = 0, p - do k = 0, n - do j = -buff_size, -1 - r = (j + buff_size*((k + 1) + (n + 1)*l)) - ib_markers%sf(j + unpack_offset, k, l) = ib_buff_recv(r) + #:call GPU_PARALLEL_LOOP(collapse=3,private='[r]') + do l = 0, p + do k = 0, n + do j = -buff_size, -1 + r = (j + buff_size*((k + 1) + (n + 1)*l)) + ib_markers%sf(j + unpack_offset, k, l) = ib_buff_recv(r) + end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP #:elif mpi_dir == 2 - $:GPU_PARALLEL_LOOP(collapse=3,private='[r]') - do l = 0, p - do k = -buff_size, -1 - do j = -buff_size, m + buff_size - r = ((j + buff_size) + (m + 2*buff_size + 1)* & - ((k + buff_size) + buff_size*l)) - ib_markers%sf(j, k + unpack_offset, l) = ib_buff_recv(r) + #:call GPU_PARALLEL_LOOP(collapse=3,private='[r]') + do l = 0, p + do k = -buff_size, -1 + do j = -buff_size, m + buff_size + r = ((j + buff_size) + (m + 2*buff_size + 1)* & + ((k + buff_size) + buff_size*l)) + ib_markers%sf(j, k + unpack_offset, l) = ib_buff_recv(r) + end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP #:else ! Unpacking buffer from bc_z%beg - $:GPU_PARALLEL_LOOP(collapse=3,private='[r]') - do l = -buff_size, -1 - do k = -buff_size, n + buff_size - do j = -buff_size, m + buff_size - r = ((j + buff_size) + (m + 2*buff_size + 1)* & - ((k + buff_size) + (n + 2*buff_size + 1)* & - (l + buff_size))) - ib_markers%sf(j, k, l + unpack_offset) = ib_buff_recv(r) + #:call GPU_PARALLEL_LOOP(collapse=3,private='[r]') + do l = -buff_size, -1 + do k = -buff_size, n + buff_size + do j = -buff_size, m + buff_size + r = ((j + buff_size) + (m + 2*buff_size + 1)* & + ((k + buff_size) + (n + 2*buff_size + 1)* & + (l + buff_size))) + ib_markers%sf(j, k, l + unpack_offset) = ib_buff_recv(r) + end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP #:endif end if #:endfor diff --git a/src/simulation/m_muscl.fpp b/src/simulation/m_muscl.fpp index 11136673ec..335ef7e03f 100644 --- a/src/simulation/m_muscl.fpp +++ b/src/simulation/m_muscl.fpp @@ -118,97 +118,101 @@ contains if (muscl_order == 1) then if (muscl_dir == 1) then - $:GPU_PARALLEL_LOOP(collapse=4) - do i = 1, ubound(v_vf, 1) - do l = is3_muscl%beg, is3_muscl%end - do k = is2_muscl%beg, is2_muscl%end - do j = is1_muscl%beg, is1_muscl%end - vL_rs_vf_x(j, k, l, i) = v_vf(i)%sf(j, k, l) - vR_rs_vf_x(j, k, l, i) = v_vf(i)%sf(j, k, l) + #:call GPU_PARALLEL_LOOP(collapse=4) + do i = 1, ubound(v_vf, 1) + do l = is3_muscl%beg, is3_muscl%end + do k = is2_muscl%beg, is2_muscl%end + do j = is1_muscl%beg, is1_muscl%end + vL_rs_vf_x(j, k, l, i) = v_vf(i)%sf(j, k, l) + vR_rs_vf_x(j, k, l, i) = v_vf(i)%sf(j, k, l) + end do end do end do end do - end do + #:endcall else if (muscl_dir == 2) then - $:GPU_PARALLEL_LOOP(collapse=4) - do i = 1, ubound(v_vf, 1) - do l = is3_muscl%beg, is3_muscl%end - do k = is2_muscl%beg, is2_muscl%end - do j = is1_muscl%beg, is1_muscl%end - vL_rs_vf_y(j, k, l, i) = v_vf(i)%sf(k, j, l) - vR_rs_vf_y(j, k, l, i) = v_vf(i)%sf(k, j, l) + #:call GPU_PARALLEL_LOOP(collapse=4) + do i = 1, ubound(v_vf, 1) + do l = is3_muscl%beg, is3_muscl%end + do k = is2_muscl%beg, is2_muscl%end + do j = is1_muscl%beg, is1_muscl%end + vL_rs_vf_y(j, k, l, i) = v_vf(i)%sf(k, j, l) + vR_rs_vf_y(j, k, l, i) = v_vf(i)%sf(k, j, l) + end do end do end do end do - end do + #:endcall else if (muscl_dir == 3) then - $:GPU_PARALLEL_LOOP(collapse=4) - do i = 1, ubound(v_vf, 1) - do l = is3_muscl%beg, is3_muscl%end - do k = is2_muscl%beg, is2_muscl%end - do j = is1_muscl%beg, is1_muscl%end - vL_rs_vf_z(j, k, l, i) = v_vf(i)%sf(l, k, j) - vR_rs_vf_z(j, k, l, i) = v_vf(i)%sf(l, k, j) + #:call GPU_PARALLEL_LOOP(collapse=4) + do i = 1, ubound(v_vf, 1) + do l = is3_muscl%beg, is3_muscl%end + do k = is2_muscl%beg, is2_muscl%end + do j = is1_muscl%beg, is1_muscl%end + vL_rs_vf_z(j, k, l, i) = v_vf(i)%sf(l, k, j) + vR_rs_vf_z(j, k, l, i) = v_vf(i)%sf(l, k, j) + end do end do end do end do - end do + #:endcall end if else if (muscl_order == 2) then ! MUSCL Reconstruction #:for MUSCL_DIR, XYZ in [(1, 'x'), (2, 'y'), (3, 'z')] if (muscl_dir == ${MUSCL_DIR}$) then - $:GPU_PARALLEL_LOOP(collapse=4,private='[slopeL,slopeR,slope]') - do l = is3_muscl%beg, is3_muscl%end - do k = is2_muscl%beg, is2_muscl%end - do j = is1_muscl%beg, is1_muscl%end - do i = 1, v_size - - slopeL = v_rs_ws_${XYZ}$ (j + 1, k, l, i) - & - v_rs_ws_${XYZ}$ (j, k, l, i) - slopeR = v_rs_ws_${XYZ}$ (j, k, l, i) - & - v_rs_ws_${XYZ}$ (j - 1, k, l, i) - slope = 0._wp - - if (muscl_lim == 1) then ! minmod - if (slopeL*slopeR > 1e-9_wp) then - slope = min(abs(slopeL), abs(slopeR)) - end if - if (slopeL < 0._wp) slope = -slope - elseif (muscl_lim == 2) then ! MC - if (slopeL*slopeR > 1e-9_wp) then - slope = min(2._wp*abs(slopeL), 2._wp*abs(slopeR)) - slope = min(slope, 5e-1_wp*(abs(slopeL) + abs(slopeR))) - end if - if (slopeL < 0._wp) slope = -slope - elseif (muscl_lim == 3) then ! Van Albada - if (abs(slopeL) > 1e-6_wp .and. abs(slopeR) > 1e-6_wp .and. & - abs(slopeL + slopeR) > 1e-6_wp .and. slopeL*slopeR > 1e-6_wp) then - slope = ((slopeL + slopeR)*slopeL*slopeR)/(slopeL**2._wp + slopeR**2._wp) - end if - elseif (muscl_lim == 4) then ! Van Leer - if (abs(slopeL + slopeR) > 1.e-6_wp .and. slopeL*slopeR > 1.e-6_wp) then - slope = 2._wp*slopeL*slopeR/(slopeL + slopeR) - end if - elseif (muscl_lim == 5) then ! SUPERBEE - if (slopeL*slopeR > 1e-6_wp) then - slope = -1._wp*min(-min(2._wp*abs(slopeL), abs(slopeR)), -min(abs(slopeL), 2._wp*abs(slopeR))) + #:call GPU_PARALLEL_LOOP(collapse=4,private='[slopeL,slopeR,slope]') + do l = is3_muscl%beg, is3_muscl%end + do k = is2_muscl%beg, is2_muscl%end + do j = is1_muscl%beg, is1_muscl%end + do i = 1, v_size + + slopeL = v_rs_ws_${XYZ}$ (j + 1, k, l, i) - & + v_rs_ws_${XYZ}$ (j, k, l, i) + slopeR = v_rs_ws_${XYZ}$ (j, k, l, i) - & + v_rs_ws_${XYZ}$ (j - 1, k, l, i) + slope = 0._wp + + if (muscl_lim == 1) then ! minmod + if (slopeL*slopeR > 1e-9_wp) then + slope = min(abs(slopeL), abs(slopeR)) + end if + if (slopeL < 0._wp) slope = -slope + elseif (muscl_lim == 2) then ! MC + if (slopeL*slopeR > 1e-9_wp) then + slope = min(2._wp*abs(slopeL), 2._wp*abs(slopeR)) + slope = min(slope, 5e-1_wp*(abs(slopeL) + abs(slopeR))) + end if + if (slopeL < 0._wp) slope = -slope + elseif (muscl_lim == 3) then ! Van Albada + if (abs(slopeL) > 1e-6_wp .and. abs(slopeR) > 1e-6_wp .and. & + abs(slopeL + slopeR) > 1e-6_wp .and. slopeL*slopeR > 1e-6_wp) then + slope = ((slopeL + slopeR)*slopeL*slopeR)/(slopeL**2._wp + slopeR**2._wp) + end if + elseif (muscl_lim == 4) then ! Van Leer + if (abs(slopeL + slopeR) > 1.e-6_wp .and. slopeL*slopeR > 1.e-6_wp) then + slope = 2._wp*slopeL*slopeR/(slopeL + slopeR) + end if + elseif (muscl_lim == 5) then ! SUPERBEE + if (slopeL*slopeR > 1e-6_wp) then + slope = -1._wp*min(-min(2._wp*abs(slopeL), abs(slopeR)), -min(abs(slopeL), 2._wp*abs(slopeR))) + end if end if - end if - ! reconstruct from left side - vL_rs_vf_${XYZ}$ (j, k, l, i) = & - v_rs_ws_${XYZ}$ (j, k, l, i) - (5.e-1_wp*slope) + ! reconstruct from left side + vL_rs_vf_${XYZ}$ (j, k, l, i) = & + v_rs_ws_${XYZ}$ (j, k, l, i) - (5.e-1_wp*slope) - ! reconstruct from the right side - vR_rs_vf_${XYZ}$ (j, k, l, i) = & - v_rs_ws_${XYZ}$ (j, k, l, i) + (5.e-1_wp*slope) + ! reconstruct from the right side + vR_rs_vf_${XYZ}$ (j, k, l, i) = & + v_rs_ws_${XYZ}$ (j, k, l, i) + (5.e-1_wp*slope) + end do end do end do end do - end do + #:endcall end if #:endfor end if @@ -239,59 +243,60 @@ contains #:for MUSCL_DIR, XYZ in [(1, 'x'), (2, 'y'), (3, 'z')] if (muscl_dir == ${MUSCL_DIR}$) then - $:GPU_PARALLEL_LOOP(collapse=3,private='[aCL,aC,aCR,aTHINC,moncon,sign,qmin,qmax]') - do l = is3_muscl%beg, is3_muscl%end - do k = is2_muscl%beg, is2_muscl%end - do j = is1_muscl%beg, is1_muscl%end + #:call GPU_PARALLEL_LOOP(collapse=3,private='[aCL,aC,aCR,aTHINC,moncon,sign,qmin,qmax]') + do l = is3_muscl%beg, is3_muscl%end + do k = is2_muscl%beg, is2_muscl%end + do j = is1_muscl%beg, is1_muscl%end - aCL = v_rs_ws_${XYZ}$ (j - 1, k, l, advxb) - aC = v_rs_ws_${XYZ}$ (j, k, l, advxb) - aCR = v_rs_ws_${XYZ}$ (j + 1, k, l, advxb) + aCL = v_rs_ws_${XYZ}$ (j - 1, k, l, advxb) + aC = v_rs_ws_${XYZ}$ (j, k, l, advxb) + aCR = v_rs_ws_${XYZ}$ (j + 1, k, l, advxb) - moncon = (aCR - aC)*(aC - aCL) + moncon = (aCR - aC)*(aC - aCL) - if (aC >= ic_eps .and. aC <= 1._wp - ic_eps .and. moncon > moncon_cutoff) then ! Interface cell + if (aC >= ic_eps .and. aC <= 1._wp - ic_eps .and. moncon > moncon_cutoff) then ! Interface cell - if (aCR - aCL > 0._wp) then - sign = 1._wp - else - sign = -1._wp - end if + if (aCR - aCL > 0._wp) then + sign = 1._wp + else + sign = -1._wp + end if - qmin = min(aCR, aCL) - qmax = max(aCR, aCL) - qmin - - C = (aC - qmin + sgm_eps)/(qmax + sgm_eps) - B = exp(sign*ic_beta*(2._wp*C - 1._wp)) - A = (B/cosh(ic_beta) - 1._wp)/tanh(ic_beta) - - ! Left reconstruction - aTHINC = qmin + 5e-1_wp*qmax*(1._wp + sign*A) - if (aTHINC < ic_eps) aTHINC = ic_eps - if (aTHINC > 1 - ic_eps) aTHINC = 1 - ic_eps - vL_rs_vf_${XYZ}$ (j, k, l, contxb) = vL_rs_vf_${XYZ}$ (j, k, l, contxb)/ & - vL_rs_vf_${XYZ}$ (j, k, l, advxb)*aTHINC - vL_rs_vf_${XYZ}$ (j, k, l, contxe) = vL_rs_vf_${XYZ}$ (j, k, l, contxe)/ & - (1._wp - vL_rs_vf_${XYZ}$ (j, k, l, advxb))*(1._wp - aTHINC) - vL_rs_vf_${XYZ}$ (j, k, l, advxb) = aTHINC - vL_rs_vf_${XYZ}$ (j, k, l, advxe) = 1 - aTHINC - - ! Right reconstruction - aTHINC = qmin + 5e-1_wp*qmax*(1._wp + sign*(tanh(ic_beta) + A)/(1._wp + A*tanh(ic_beta))) - if (aTHINC < ic_eps) aTHINC = ic_eps - if (aTHINC > 1 - ic_eps) aTHINC = 1 - ic_eps - vR_rs_vf_${XYZ}$ (j, k, l, contxb) = vL_rs_vf_${XYZ}$ (j, k, l, contxb)/ & - vL_rs_vf_${XYZ}$ (j, k, l, advxb)*aTHINC - vR_rs_vf_${XYZ}$ (j, k, l, contxe) = vL_rs_vf_${XYZ}$ (j, k, l, contxe)/ & - (1._wp - vL_rs_vf_${XYZ}$ (j, k, l, advxb))*(1._wp - aTHINC) - vR_rs_vf_${XYZ}$ (j, k, l, advxb) = aTHINC - vR_rs_vf_${XYZ}$ (j, k, l, advxe) = 1 - aTHINC - - end if + qmin = min(aCR, aCL) + qmax = max(aCR, aCL) - qmin + + C = (aC - qmin + sgm_eps)/(qmax + sgm_eps) + B = exp(sign*ic_beta*(2._wp*C - 1._wp)) + A = (B/cosh(ic_beta) - 1._wp)/tanh(ic_beta) + + ! Left reconstruction + aTHINC = qmin + 5e-1_wp*qmax*(1._wp + sign*A) + if (aTHINC < ic_eps) aTHINC = ic_eps + if (aTHINC > 1 - ic_eps) aTHINC = 1 - ic_eps + vL_rs_vf_${XYZ}$ (j, k, l, contxb) = vL_rs_vf_${XYZ}$ (j, k, l, contxb)/ & + vL_rs_vf_${XYZ}$ (j, k, l, advxb)*aTHINC + vL_rs_vf_${XYZ}$ (j, k, l, contxe) = vL_rs_vf_${XYZ}$ (j, k, l, contxe)/ & + (1._wp - vL_rs_vf_${XYZ}$ (j, k, l, advxb))*(1._wp - aTHINC) + vL_rs_vf_${XYZ}$ (j, k, l, advxb) = aTHINC + vL_rs_vf_${XYZ}$ (j, k, l, advxe) = 1 - aTHINC + + ! Right reconstruction + aTHINC = qmin + 5e-1_wp*qmax*(1._wp + sign*(tanh(ic_beta) + A)/(1._wp + A*tanh(ic_beta))) + if (aTHINC < ic_eps) aTHINC = ic_eps + if (aTHINC > 1 - ic_eps) aTHINC = 1 - ic_eps + vR_rs_vf_${XYZ}$ (j, k, l, contxb) = vL_rs_vf_${XYZ}$ (j, k, l, contxb)/ & + vL_rs_vf_${XYZ}$ (j, k, l, advxb)*aTHINC + vR_rs_vf_${XYZ}$ (j, k, l, contxe) = vL_rs_vf_${XYZ}$ (j, k, l, contxe)/ & + (1._wp - vL_rs_vf_${XYZ}$ (j, k, l, advxb))*(1._wp - aTHINC) + vR_rs_vf_${XYZ}$ (j, k, l, advxb) = aTHINC + vR_rs_vf_${XYZ}$ (j, k, l, advxe) = 1 - aTHINC + end if + + end do end do end do - end do + #:endcall end if #:endfor @@ -313,47 +318,50 @@ contains $:GPU_UPDATE(device='[v_size]') if (muscl_dir == 1) then - $:GPU_PARALLEL_LOOP(collapse=4) - do j = 1, v_size - do q = is3_muscl%beg, is3_muscl%end - do l = is2_muscl%beg, is2_muscl%end - do k = is1_muscl%beg - muscl_polyn, is1_muscl%end + muscl_polyn - v_rs_ws_x(k, l, q, j) = v_vf(j)%sf(k, l, q) + #:call GPU_PARALLEL_LOOP(collapse=4) + do j = 1, v_size + do q = is3_muscl%beg, is3_muscl%end + do l = is2_muscl%beg, is2_muscl%end + do k = is1_muscl%beg - muscl_polyn, is1_muscl%end + muscl_polyn + v_rs_ws_x(k, l, q, j) = v_vf(j)%sf(k, l, q) + end do end do end do end do - end do + #:endcall end if ! Reshaping/Projecting onto Characteristic Fields in y-direction if (n == 0) return if (muscl_dir == 2) then - $:GPU_PARALLEL_LOOP(collapse=4) - do j = 1, v_size - do q = is3_muscl%beg, is3_muscl%end - do l = is2_muscl%beg, is2_muscl%end - do k = is1_muscl%beg - muscl_polyn, is1_muscl%end + muscl_polyn - v_rs_ws_y(k, l, q, j) = v_vf(j)%sf(l, k, q) + #:call GPU_PARALLEL_LOOP(collapse=4) + do j = 1, v_size + do q = is3_muscl%beg, is3_muscl%end + do l = is2_muscl%beg, is2_muscl%end + do k = is1_muscl%beg - muscl_polyn, is1_muscl%end + muscl_polyn + v_rs_ws_y(k, l, q, j) = v_vf(j)%sf(l, k, q) + end do end do end do end do - end do + #:endcall end if ! Reshaping/Projecting onto Characteristic Fields in z-direction if (p == 0) return if (muscl_dir == 3) then - $:GPU_PARALLEL_LOOP(collapse=4) - do j = 1, v_size - do q = is3_muscl%beg, is3_muscl%end - do l = is2_muscl%beg, is2_muscl%end - do k = is1_muscl%beg - muscl_polyn, is1_muscl%end + muscl_polyn - v_rs_ws_z(k, l, q, j) = v_vf(j)%sf(q, l, k) + #:call GPU_PARALLEL_LOOP(collapse=4) + do j = 1, v_size + do q = is3_muscl%beg, is3_muscl%end + do l = is2_muscl%beg, is2_muscl%end + do k = is1_muscl%beg - muscl_polyn, is1_muscl%end + muscl_polyn + v_rs_ws_z(k, l, q, j) = v_vf(j)%sf(q, l, k) + end do end do end do end do - end do + #:endcall end if end subroutine s_initialize_muscl diff --git a/src/simulation/m_pressure_relaxation.fpp b/src/simulation/m_pressure_relaxation.fpp index 5affd3342f..407c01ff10 100644 --- a/src/simulation/m_pressure_relaxation.fpp +++ b/src/simulation/m_pressure_relaxation.fpp @@ -23,8 +23,8 @@ module m_pressure_relaxation real(wp), allocatable, dimension(:) :: gamma_min, pres_inf $:GPU_DECLARE(create='[gamma_min, pres_inf]') - real(wp), allocatable, dimension(:, :) :: Res - $:GPU_DECLARE(create='[Res]') + real(wp), allocatable, dimension(:, :) :: Res_pr + $:GPU_DECLARE(create='[Res_pr]') contains @@ -42,13 +42,13 @@ contains $:GPU_UPDATE(device='[gamma_min, pres_inf]') if (viscous) then - @:ALLOCATE(Res(1:2, 1:Re_size_max)) + @:ALLOCATE(Res_pr(1:2, 1:Re_size_max)) do i = 1, 2 do j = 1, Re_size(i) - Res(i, j) = fluid_pp(Re_idx(i, j))%Re(i) + Res_pr(i, j) = fluid_pp(Re_idx(i, j))%Re(i) end do end do - $:GPU_UPDATE(device='[Res, Re_idx, Re_size]') + $:GPU_UPDATE(device='[Res_pr, Re_idx, Re_size]') end if end subroutine s_initialize_pressure_relaxation_module @@ -58,31 +58,32 @@ contains @:DEALLOCATE(gamma_min, pres_inf) if (viscous) then - @:DEALLOCATE(Res) + @:DEALLOCATE(Res_pr) end if end subroutine s_finalize_pressure_relaxation_module !> The main pressure relaxation procedure !! @param q_cons_vf Cell-average conservative variables - pure subroutine s_pressure_relaxation_procedure(q_cons_vf) + subroutine s_pressure_relaxation_procedure(q_cons_vf) type(scalar_field), dimension(sys_size), intent(inout) :: q_cons_vf integer :: j, k, l - $:GPU_PARALLEL_LOOP(collapse=3) - do l = 0, p - do k = 0, n - do j = 0, m - call s_relax_cell_pressure(q_cons_vf, j, k, l) + #:call GPU_PARALLEL_LOOP(collapse=3) + do l = 0, p + do k = 0, n + do j = 0, m + call s_relax_cell_pressure(q_cons_vf, j, k, l) + end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP end subroutine s_pressure_relaxation_procedure !> Process pressure relaxation for a single cell - pure subroutine s_relax_cell_pressure(q_cons_vf, j, k, l) + subroutine s_relax_cell_pressure(q_cons_vf, j, k, l) $:GPU_ROUTINE(parallelism='[seq]') type(scalar_field), dimension(sys_size), intent(inout) :: q_cons_vf @@ -102,7 +103,7 @@ contains end subroutine s_relax_cell_pressure !> Check if pressure relaxation is needed for this cell - pure logical function s_needs_pressure_relaxation(q_cons_vf, j, k, l) + logical function s_needs_pressure_relaxation(q_cons_vf, j, k, l) $:GPU_ROUTINE(parallelism='[seq]') type(scalar_field), dimension(sys_size), intent(in) :: q_cons_vf @@ -120,7 +121,7 @@ contains end function s_needs_pressure_relaxation !> Correct volume fractions to physical bounds - pure subroutine s_correct_volume_fractions(q_cons_vf, j, k, l) + subroutine s_correct_volume_fractions(q_cons_vf, j, k, l) $:GPU_ROUTINE(parallelism='[seq]') type(scalar_field), dimension(sys_size), intent(inout) :: q_cons_vf @@ -150,7 +151,7 @@ contains end subroutine s_correct_volume_fractions !> Main pressure equilibration using Newton-Raphson - pure subroutine s_equilibrate_pressure(q_cons_vf, j, k, l) + subroutine s_equilibrate_pressure(q_cons_vf, j, k, l) $:GPU_ROUTINE(parallelism='[seq]') type(scalar_field), dimension(sys_size), intent(inout) :: q_cons_vf @@ -219,7 +220,7 @@ contains end subroutine s_equilibrate_pressure !> Correct internal energies using equilibrated pressure - pure subroutine s_correct_internal_energies(q_cons_vf, j, k, l) + subroutine s_correct_internal_energies(q_cons_vf, j, k, l) $:GPU_ROUTINE(parallelism='[seq]') type(scalar_field), dimension(sys_size), intent(inout) :: q_cons_vf @@ -287,7 +288,7 @@ contains if (Re_size(i) > 0) Re(i) = 0._wp $:GPU_LOOP(parallelism='[seq]') do q = 1, Re_size(i) - Re(i) = alpha(Re_idx(i, q))/Res(i, q) + Re(i) + Re(i) = alpha(Re_idx(i, q))/Res_pr(i, q) + Re(i) end do Re(i) = 1._wp/max(Re(i), sgm_eps) end do diff --git a/src/simulation/m_qbmm.fpp b/src/simulation/m_qbmm.fpp index 62e170c42d..85dd51b166 100644 --- a/src/simulation/m_qbmm.fpp +++ b/src/simulation/m_qbmm.fpp @@ -37,9 +37,9 @@ module m_qbmm type(int_bounds_info) :: is1_qbmm, is2_qbmm, is3_qbmm $:GPU_DECLARE(create='[is1_qbmm,is2_qbmm,is3_qbmm]') - integer, allocatable, dimension(:) :: bubrs + integer, allocatable, dimension(:) :: bubrs_qbmm integer, allocatable, dimension(:, :) :: bubmoms - $:GPU_DECLARE(create='[bubrs,bubmoms]') + $:GPU_DECLARE(create='[bubrs_qbmm,bubmoms]') contains @@ -394,13 +394,13 @@ contains $:GPU_UPDATE(device='[momrhs]') - @:ALLOCATE(bubrs(1:nb)) + @:ALLOCATE(bubrs_qbmm(1:nb)) @:ALLOCATE(bubmoms(1:nb, 1:nmom)) do i = 1, nb - bubrs(i) = bub_idx%rs(i) + bubrs_qbmm(i) = bub_idx%rs(i) end do - $:GPU_UPDATE(device='[bubrs]') + $:GPU_UPDATE(device='[bubrs_qbmm]') do j = 1, nmom do i = 1, nb @@ -411,7 +411,7 @@ contains end subroutine s_initialize_qbmm_module - pure subroutine s_compute_qbmm_rhs(idir, q_cons_vf, q_prim_vf, rhs_vf, flux_n_vf, pb, rhs_pb) + subroutine s_compute_qbmm_rhs(idir, q_cons_vf, q_prim_vf, rhs_vf, flux_n_vf, pb, rhs_pb) integer, intent(in) :: idir type(scalar_field), dimension(sys_size), intent(in) :: q_cons_vf, q_prim_vf @@ -433,136 +433,138 @@ contains end select if (.not. polytropic) then - $:GPU_PARALLEL_LOOP(collapse=5,private='[nb_q,nR,nR2,R,R2,nb_dot,nR_dot,nR2_dot,var,AX]') - do i = 1, nb - do q = 1, nnode - do l = 0, p - do k = 0, n - do j = 0, m - nb_q = q_cons_vf(bubxb + (i - 1)*nmom)%sf(j, k, l) - nR = q_cons_vf(bubxb + 1 + (i - 1)*nmom)%sf(j, k, l) - nR2 = q_cons_vf(bubxb + 3 + (i - 1)*nmom)%sf(j, k, l) - R = q_prim_vf(bubxb + 1 + (i - 1)*nmom)%sf(j, k, l) - R2 = q_prim_vf(bubxb + 3 + (i - 1)*nmom)%sf(j, k, l) - var = max(R2 - R**2._wp, verysmall) - if (q <= 2) then - AX = R - sqrt(var) - else - AX = R + sqrt(var) - end if - select case (idir) - case (1) - nb_dot = flux_n_vf(bubxb + (i - 1)*nmom)%sf(j - 1, k, l) - flux_n_vf(bubxb + (i - 1)*nmom)%sf(j, k, l) - nR_dot = flux_n_vf(bubxb + 1 + (i - 1)*nmom)%sf(j - 1, k, l) - flux_n_vf(bubxb + 1 + (i - 1)*nmom)%sf(j, k, l) - nR2_dot = flux_n_vf(bubxb + 3 + (i - 1)*nmom)%sf(j - 1, k, l) - flux_n_vf(bubxb + 3 + (i - 1)*nmom)%sf(j, k, l) - rhs_pb(j, k, l, q, i) = rhs_pb(j, k, l, q, i) - 3._wp*gam/(dx(j)*AX*nb_q**2)* & - (nR_dot*nb_q - nR*nb_dot)*(pb(j, k, l, q, i)) - case (2) - nb_dot = flux_n_vf(bubxb + (i - 1)*nmom)%sf(j, k - 1, l) - flux_n_vf(bubxb + (i - 1)*nmom)%sf(j, k, l) - nR_dot = flux_n_vf(bubxb + 1 + (i - 1)*nmom)%sf(j, k - 1, l) - flux_n_vf(bubxb + 1 + (i - 1)*nmom)%sf(j, k, l) - nR2_dot = flux_n_vf(bubxb + 3 + (i - 1)*nmom)%sf(j, k - 1, l) - flux_n_vf(bubxb + 3 + (i - 1)*nmom)%sf(j, k, l) - rhs_pb(j, k, l, q, i) = rhs_pb(j, k, l, q, i) - 3._wp*gam/(dy(k)*AX*nb_q**2)* & - (nR_dot*nb_q - nR*nb_dot)*(pb(j, k, l, q, i)) - case (3) - if (is_axisym) then - nb_dot = q_prim_vf(contxe + idir)%sf(j, k, l)*(flux_n_vf(bubxb + (i - 1)*nmom)%sf(j, k, l - 1) - flux_n_vf(bubxb + (i - 1)*nmom)%sf(j, k, l)) - nR_dot = q_prim_vf(contxe + idir)%sf(j, k, l)*(flux_n_vf(bubxb + 1 + (i - 1)*nmom)%sf(j, k, l - 1) - flux_n_vf(bubxb + 1 + (i - 1)*nmom)%sf(j, k, l)) - nR2_dot = q_prim_vf(contxe + idir)%sf(j, k, l)*(flux_n_vf(bubxb + 3 + (i - 1)*nmom)%sf(j, k, l - 1) - flux_n_vf(bubxb + 3 + (i - 1)*nmom)%sf(j, k, l)) - rhs_pb(j, k, l, q, i) = rhs_pb(j, k, l, q, i) - 3._wp*gam/(dz(l)*y_cc(k)*AX*nb_q**2)* & - (nR_dot*nb_q - nR*nb_dot)*(pb(j, k, l, q, i)) + #:call GPU_PARALLEL_LOOP(collapse=5,private='[nb_q,nR,nR2,R,R2,nb_dot,nR_dot,nR2_dot,var,AX]') + do i = 1, nb + do q = 1, nnode + do l = 0, p + do k = 0, n + do j = 0, m + nb_q = q_cons_vf(bubxb + (i - 1)*nmom)%sf(j, k, l) + nR = q_cons_vf(bubxb + 1 + (i - 1)*nmom)%sf(j, k, l) + nR2 = q_cons_vf(bubxb + 3 + (i - 1)*nmom)%sf(j, k, l) + R = q_prim_vf(bubxb + 1 + (i - 1)*nmom)%sf(j, k, l) + R2 = q_prim_vf(bubxb + 3 + (i - 1)*nmom)%sf(j, k, l) + var = max(R2 - R**2._wp, verysmall) + if (q <= 2) then + AX = R - sqrt(var) else - nb_dot = flux_n_vf(bubxb + (i - 1)*nmom)%sf(j, k, l - 1) - flux_n_vf(bubxb + (i - 1)*nmom)%sf(j, k, l) - nR_dot = flux_n_vf(bubxb + 1 + (i - 1)*nmom)%sf(j, k, l - 1) - flux_n_vf(bubxb + 1 + (i - 1)*nmom)%sf(j, k, l) - nR2_dot = flux_n_vf(bubxb + 3 + (i - 1)*nmom)%sf(j, k, l - 1) - flux_n_vf(bubxb + 3 + (i - 1)*nmom)%sf(j, k, l) - rhs_pb(j, k, l, q, i) = rhs_pb(j, k, l, q, i) - 3._wp*gam/(dz(l)*AX*nb_q**2)* & - (nR_dot*nb_q - nR*nb_dot)*(pb(j, k, l, q, i)) + AX = R + sqrt(var) end if - end select - if (q <= 2) then select case (idir) case (1) - rhs_pb(j, k, l, q, i) = rhs_pb(j, k, l, q, i) + 3._wp*gam/(dx(j)*AX*nb_q**2*sqrt(var)*2._wp)* & - (nR2_dot*nb_q - nR2*nb_dot)*(pb(j, k, l, q, i)) - rhs_pb(j, k, l, q, i) = rhs_pb(j, k, l, q, i) + 3._wp*gam/(dx(j)*AX*nb_q**2*sqrt(var)*2._wp)* & - (-2._wp*(nR/nb_q)*(nR_dot*nb_q - nR*nb_dot))*(pb(j, k, l, q, i)) + nb_dot = flux_n_vf(bubxb + (i - 1)*nmom)%sf(j - 1, k, l) - flux_n_vf(bubxb + (i - 1)*nmom)%sf(j, k, l) + nR_dot = flux_n_vf(bubxb + 1 + (i - 1)*nmom)%sf(j - 1, k, l) - flux_n_vf(bubxb + 1 + (i - 1)*nmom)%sf(j, k, l) + nR2_dot = flux_n_vf(bubxb + 3 + (i - 1)*nmom)%sf(j - 1, k, l) - flux_n_vf(bubxb + 3 + (i - 1)*nmom)%sf(j, k, l) + rhs_pb(j, k, l, q, i) = rhs_pb(j, k, l, q, i) - 3._wp*gam/(dx(j)*AX*nb_q**2)* & + (nR_dot*nb_q - nR*nb_dot)*(pb(j, k, l, q, i)) case (2) - rhs_pb(j, k, l, q, i) = rhs_pb(j, k, l, q, i) + 3._wp*gam/(dy(k)*AX*nb_q**2*sqrt(var)*2._wp)* & - (nR2_dot*nb_q - nR2*nb_dot)*(pb(j, k, l, q, i)) - rhs_pb(j, k, l, q, i) = rhs_pb(j, k, l, q, i) + 3._wp*gam/(dy(k)*AX*nb_q**2*sqrt(var)*2._wp)* & - (-2._wp*(nR/nb_q)*(nR_dot*nb_q - nR*nb_dot))*(pb(j, k, l, q, i)) + nb_dot = flux_n_vf(bubxb + (i - 1)*nmom)%sf(j, k - 1, l) - flux_n_vf(bubxb + (i - 1)*nmom)%sf(j, k, l) + nR_dot = flux_n_vf(bubxb + 1 + (i - 1)*nmom)%sf(j, k - 1, l) - flux_n_vf(bubxb + 1 + (i - 1)*nmom)%sf(j, k, l) + nR2_dot = flux_n_vf(bubxb + 3 + (i - 1)*nmom)%sf(j, k - 1, l) - flux_n_vf(bubxb + 3 + (i - 1)*nmom)%sf(j, k, l) + rhs_pb(j, k, l, q, i) = rhs_pb(j, k, l, q, i) - 3._wp*gam/(dy(k)*AX*nb_q**2)* & + (nR_dot*nb_q - nR*nb_dot)*(pb(j, k, l, q, i)) case (3) if (is_axisym) then - rhs_pb(j, k, l, q, i) = rhs_pb(j, k, l, q, i) + 3._wp*gam/(dz(l)*y_cc(k)*AX*nb_q**2*sqrt(var)*2._wp)* & + nb_dot = q_prim_vf(contxe + idir)%sf(j, k, l)*(flux_n_vf(bubxb + (i - 1)*nmom)%sf(j, k, l - 1) - flux_n_vf(bubxb + (i - 1)*nmom)%sf(j, k, l)) + nR_dot = q_prim_vf(contxe + idir)%sf(j, k, l)*(flux_n_vf(bubxb + 1 + (i - 1)*nmom)%sf(j, k, l - 1) - flux_n_vf(bubxb + 1 + (i - 1)*nmom)%sf(j, k, l)) + nR2_dot = q_prim_vf(contxe + idir)%sf(j, k, l)*(flux_n_vf(bubxb + 3 + (i - 1)*nmom)%sf(j, k, l - 1) - flux_n_vf(bubxb + 3 + (i - 1)*nmom)%sf(j, k, l)) + rhs_pb(j, k, l, q, i) = rhs_pb(j, k, l, q, i) - 3._wp*gam/(dz(l)*y_cc(k)*AX*nb_q**2)* & + (nR_dot*nb_q - nR*nb_dot)*(pb(j, k, l, q, i)) + else + nb_dot = flux_n_vf(bubxb + (i - 1)*nmom)%sf(j, k, l - 1) - flux_n_vf(bubxb + (i - 1)*nmom)%sf(j, k, l) + nR_dot = flux_n_vf(bubxb + 1 + (i - 1)*nmom)%sf(j, k, l - 1) - flux_n_vf(bubxb + 1 + (i - 1)*nmom)%sf(j, k, l) + nR2_dot = flux_n_vf(bubxb + 3 + (i - 1)*nmom)%sf(j, k, l - 1) - flux_n_vf(bubxb + 3 + (i - 1)*nmom)%sf(j, k, l) + rhs_pb(j, k, l, q, i) = rhs_pb(j, k, l, q, i) - 3._wp*gam/(dz(l)*AX*nb_q**2)* & + (nR_dot*nb_q - nR*nb_dot)*(pb(j, k, l, q, i)) + end if + end select + if (q <= 2) then + select case (idir) + case (1) + rhs_pb(j, k, l, q, i) = rhs_pb(j, k, l, q, i) + 3._wp*gam/(dx(j)*AX*nb_q**2*sqrt(var)*2._wp)* & (nR2_dot*nb_q - nR2*nb_dot)*(pb(j, k, l, q, i)) - rhs_pb(j, k, l, q, i) = rhs_pb(j, k, l, q, i) + 3._wp*gam/(dz(l)*y_cc(k)*AX*nb_q**2*sqrt(var)*2._wp)* & + rhs_pb(j, k, l, q, i) = rhs_pb(j, k, l, q, i) + 3._wp*gam/(dx(j)*AX*nb_q**2*sqrt(var)*2._wp)* & (-2._wp*(nR/nb_q)*(nR_dot*nb_q - nR*nb_dot))*(pb(j, k, l, q, i)) - else - rhs_pb(j, k, l, q, i) = rhs_pb(j, k, l, q, i) + 3._wp*gam/(dz(l)*AX*nb_q**2*sqrt(var)*2._wp)* & + case (2) + rhs_pb(j, k, l, q, i) = rhs_pb(j, k, l, q, i) + 3._wp*gam/(dy(k)*AX*nb_q**2*sqrt(var)*2._wp)* & (nR2_dot*nb_q - nR2*nb_dot)*(pb(j, k, l, q, i)) - rhs_pb(j, k, l, q, i) = rhs_pb(j, k, l, q, i) + 3._wp*gam/(dz(l)*AX*nb_q**2*sqrt(var)*2._wp)* & + rhs_pb(j, k, l, q, i) = rhs_pb(j, k, l, q, i) + 3._wp*gam/(dy(k)*AX*nb_q**2*sqrt(var)*2._wp)* & (-2._wp*(nR/nb_q)*(nR_dot*nb_q - nR*nb_dot))*(pb(j, k, l, q, i)) - end if - end select - else - select case (idir) - case (1) - rhs_pb(j, k, l, q, i) = rhs_pb(j, k, l, q, i) - 3._wp*gam/(dx(j)*AX*nb_q**2*sqrt(var)*2._wp)* & - (nR2_dot*nb_q - nR2*nb_dot)*(pb(j, k, l, q, i)) - rhs_pb(j, k, l, q, i) = rhs_pb(j, k, l, q, i) - 3._wp*gam/(dx(j)*AX*nb_q**2*sqrt(var)*2._wp)* & - (-2._wp*(nR/nb_q)*(nR_dot*nb_q - nR*nb_dot))*(pb(j, k, l, q, i)) - case (2) - rhs_pb(j, k, l, q, i) = rhs_pb(j, k, l, q, i) - 3._wp*gam/(dy(k)*AX*nb_q**2*sqrt(var)*2._wp)* & - (nR2_dot*nb_q - nR2*nb_dot)*(pb(j, k, l, q, i)) - rhs_pb(j, k, l, q, i) = rhs_pb(j, k, l, q, i) - 3._wp*gam/(dy(k)*AX*nb_q**2*sqrt(var)*2._wp)* & - (-2._wp*(nR/nb_q)*(nR_dot*nb_q - nR*nb_dot))*(pb(j, k, l, q, i)) - case (3) - if (is_axisym) then - rhs_pb(j, k, l, q, i) = rhs_pb(j, k, l, q, i) - 3._wp*gam/(dz(l)*y_cc(k)*AX*nb_q**2*sqrt(var)*2._wp)* & + case (3) + if (is_axisym) then + rhs_pb(j, k, l, q, i) = rhs_pb(j, k, l, q, i) + 3._wp*gam/(dz(l)*y_cc(k)*AX*nb_q**2*sqrt(var)*2._wp)* & + (nR2_dot*nb_q - nR2*nb_dot)*(pb(j, k, l, q, i)) + rhs_pb(j, k, l, q, i) = rhs_pb(j, k, l, q, i) + 3._wp*gam/(dz(l)*y_cc(k)*AX*nb_q**2*sqrt(var)*2._wp)* & + (-2._wp*(nR/nb_q)*(nR_dot*nb_q - nR*nb_dot))*(pb(j, k, l, q, i)) + else + rhs_pb(j, k, l, q, i) = rhs_pb(j, k, l, q, i) + 3._wp*gam/(dz(l)*AX*nb_q**2*sqrt(var)*2._wp)* & + (nR2_dot*nb_q - nR2*nb_dot)*(pb(j, k, l, q, i)) + rhs_pb(j, k, l, q, i) = rhs_pb(j, k, l, q, i) + 3._wp*gam/(dz(l)*AX*nb_q**2*sqrt(var)*2._wp)* & + (-2._wp*(nR/nb_q)*(nR_dot*nb_q - nR*nb_dot))*(pb(j, k, l, q, i)) + end if + end select + else + select case (idir) + case (1) + rhs_pb(j, k, l, q, i) = rhs_pb(j, k, l, q, i) - 3._wp*gam/(dx(j)*AX*nb_q**2*sqrt(var)*2._wp)* & (nR2_dot*nb_q - nR2*nb_dot)*(pb(j, k, l, q, i)) - rhs_pb(j, k, l, q, i) = rhs_pb(j, k, l, q, i) - 3._wp*gam/(dz(l)*y_cc(k)*AX*nb_q**2*sqrt(var)*2._wp)* & + rhs_pb(j, k, l, q, i) = rhs_pb(j, k, l, q, i) - 3._wp*gam/(dx(j)*AX*nb_q**2*sqrt(var)*2._wp)* & (-2._wp*(nR/nb_q)*(nR_dot*nb_q - nR*nb_dot))*(pb(j, k, l, q, i)) - else - rhs_pb(j, k, l, q, i) = rhs_pb(j, k, l, q, i) - 3._wp*gam/(dz(l)*AX*nb_q**2*sqrt(var)*2._wp)* & + case (2) + rhs_pb(j, k, l, q, i) = rhs_pb(j, k, l, q, i) - 3._wp*gam/(dy(k)*AX*nb_q**2*sqrt(var)*2._wp)* & (nR2_dot*nb_q - nR2*nb_dot)*(pb(j, k, l, q, i)) - rhs_pb(j, k, l, q, i) = rhs_pb(j, k, l, q, i) - 3._wp*gam/(dz(l)*AX*nb_q**2*sqrt(var)*2._wp)* & + rhs_pb(j, k, l, q, i) = rhs_pb(j, k, l, q, i) - 3._wp*gam/(dy(k)*AX*nb_q**2*sqrt(var)*2._wp)* & (-2._wp*(nR/nb_q)*(nR_dot*nb_q - nR*nb_dot))*(pb(j, k, l, q, i)) - end if - end select - end if + case (3) + if (is_axisym) then + rhs_pb(j, k, l, q, i) = rhs_pb(j, k, l, q, i) - 3._wp*gam/(dz(l)*y_cc(k)*AX*nb_q**2*sqrt(var)*2._wp)* & + (nR2_dot*nb_q - nR2*nb_dot)*(pb(j, k, l, q, i)) + rhs_pb(j, k, l, q, i) = rhs_pb(j, k, l, q, i) - 3._wp*gam/(dz(l)*y_cc(k)*AX*nb_q**2*sqrt(var)*2._wp)* & + (-2._wp*(nR/nb_q)*(nR_dot*nb_q - nR*nb_dot))*(pb(j, k, l, q, i)) + else + rhs_pb(j, k, l, q, i) = rhs_pb(j, k, l, q, i) - 3._wp*gam/(dz(l)*AX*nb_q**2*sqrt(var)*2._wp)* & + (nR2_dot*nb_q - nR2*nb_dot)*(pb(j, k, l, q, i)) + rhs_pb(j, k, l, q, i) = rhs_pb(j, k, l, q, i) - 3._wp*gam/(dz(l)*AX*nb_q**2*sqrt(var)*2._wp)* & + (-2._wp*(nR/nb_q)*(nR_dot*nb_q - nR*nb_dot))*(pb(j, k, l, q, i)) + end if + end select + end if + end do end do end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP end if ! The following block is not repeated and is left as is if (idir == 1) then - $:GPU_PARALLEL_LOOP(collapse=3) - do l = 0, p - do q = 0, n - do i = 0, m - rhs_vf(alf_idx)%sf(i, q, l) = rhs_vf(alf_idx)%sf(i, q, l) + mom_sp(2)%sf(i, q, l) - j = bubxb - $:GPU_LOOP(parallelism='[seq]') - do k = 1, nb - rhs_vf(j)%sf(i, q, l) = rhs_vf(j)%sf(i, q, l) + mom_3d(0, 0, k)%sf(i, q, l) - rhs_vf(j + 1)%sf(i, q, l) = rhs_vf(j + 1)%sf(i, q, l) + mom_3d(1, 0, k)%sf(i, q, l) - rhs_vf(j + 2)%sf(i, q, l) = rhs_vf(j + 2)%sf(i, q, l) + mom_3d(0, 1, k)%sf(i, q, l) - rhs_vf(j + 3)%sf(i, q, l) = rhs_vf(j + 3)%sf(i, q, l) + mom_3d(2, 0, k)%sf(i, q, l) - rhs_vf(j + 4)%sf(i, q, l) = rhs_vf(j + 4)%sf(i, q, l) + mom_3d(1, 1, k)%sf(i, q, l) - rhs_vf(j + 5)%sf(i, q, l) = rhs_vf(j + 5)%sf(i, q, l) + mom_3d(0, 2, k)%sf(i, q, l) - j = j + 6 + #:call GPU_PARALLEL_LOOP(collapse=3) + do l = 0, p + do q = 0, n + do i = 0, m + rhs_vf(alf_idx)%sf(i, q, l) = rhs_vf(alf_idx)%sf(i, q, l) + mom_sp(2)%sf(i, q, l) + j = bubxb + $:GPU_LOOP(parallelism='[seq]') + do k = 1, nb + rhs_vf(j)%sf(i, q, l) = rhs_vf(j)%sf(i, q, l) + mom_3d(0, 0, k)%sf(i, q, l) + rhs_vf(j + 1)%sf(i, q, l) = rhs_vf(j + 1)%sf(i, q, l) + mom_3d(1, 0, k)%sf(i, q, l) + rhs_vf(j + 2)%sf(i, q, l) = rhs_vf(j + 2)%sf(i, q, l) + mom_3d(0, 1, k)%sf(i, q, l) + rhs_vf(j + 3)%sf(i, q, l) = rhs_vf(j + 3)%sf(i, q, l) + mom_3d(2, 0, k)%sf(i, q, l) + rhs_vf(j + 4)%sf(i, q, l) = rhs_vf(j + 4)%sf(i, q, l) + mom_3d(1, 1, k)%sf(i, q, l) + rhs_vf(j + 5)%sf(i, q, l) = rhs_vf(j + 5)%sf(i, q, l) + mom_3d(0, 2, k)%sf(i, q, l) + j = j + 6 + end do end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP end if end subroutine s_compute_qbmm_rhs !Coefficient array for non-polytropic model (pb and mv values are accounted in wght_pb and wght_mv) - pure subroutine s_coeff_nonpoly(pres, rho, c, coeffs) + subroutine s_coeff_nonpoly(pres, rho, c, coeffs) $:GPU_ROUTINE(function_name='s_coeff_nonpoly',parallelism='[seq]', & & cray_inline=True) @@ -633,7 +635,7 @@ contains end subroutine s_coeff_nonpoly !Coefficient array for polytropic model (pb for each R0 bin accounted for in wght_pb) - pure subroutine s_coeff(pres, rho, c, coeffs) + subroutine s_coeff(pres, rho, c, coeffs) $:GPU_ROUTINE(function_name='s_coeff',parallelism='[seq]', & & cray_inline=True) @@ -712,146 +714,144 @@ contains is1_qbmm = ix; is2_qbmm = iy; is3_qbmm = iz $:GPU_UPDATE(device='[is1_qbmm,is2_qbmm,is3_qbmm]') - $:GPU_PARALLEL_LOOP(collapse=3, private='[moms, msum, wght, abscX, & - & abscY, wght_pb, wght_mv, wght_ht, coeff, ht, r, q, & - & n_tait, B_tait, pres, rho, nbub, c, alf, momsum, & - & drdt, drdt2, chi_vw, x_vw, rho_mw, k_mw, T_bar, grad_T]') - do id3 = is3_qbmm%beg, is3_qbmm%end - do id2 = is2_qbmm%beg, is2_qbmm%end - do id1 = is1_qbmm%beg, is1_qbmm%end - - alf = q_prim_vf(alf_idx)%sf(id1, id2, id3) - pres = q_prim_vf(E_idx)%sf(id1, id2, id3) - rho = q_prim_vf(contxb)%sf(id1, id2, id3) - - if (bubble_model == 2) then - n_tait = 1._wp/gammas(1) + 1._wp - B_tait = pi_infs(1)*(n_tait - 1)/n_tait - c = n_tait*(pres + B_tait)*(1._wp - alf)/(rho) - c = merge(sqrt(c), sgm_eps, c > 0._wp) - end if + #:call GPU_PARALLEL_LOOP(collapse=3, private='[moms, msum, wght, abscX, abscY, wght_pb, wght_mv, wght_ht, coeff, ht, r, q, n_tait, B_tait, pres, rho, nbub, c, alf, momsum, drdt, drdt2, chi_vw, x_vw, rho_mw, k_mw, T_bar, grad_T]') + do id3 = is3_qbmm%beg, is3_qbmm%end + do id2 = is2_qbmm%beg, is2_qbmm%end + do id1 = is1_qbmm%beg, is1_qbmm%end - call s_coeff_selector(pres, rho, c, coeff, polytropic) + alf = q_prim_vf(alf_idx)%sf(id1, id2, id3) + pres = q_prim_vf(E_idx)%sf(id1, id2, id3) + rho = q_prim_vf(contxb)%sf(id1, id2, id3) - if (alf > small_alf) then - nbub = q_cons_vf(bubxb)%sf(id1, id2, id3) - $:GPU_LOOP(parallelism='[seq]') - do q = 1, nb - ! Gather moments for this bubble bin - $:GPU_LOOP(parallelism='[seq]') - do r = 2, nmom - moms(r) = q_prim_vf(bubmoms(q, r))%sf(id1, id2, id3) - end do - moms(1) = 1._wp - call s_chyqmom(moms, wght(:, q), abscX(:, q), abscY(:, q)) + if (bubble_model == 2) then + n_tait = 1._wp/gammas(1) + 1._wp + B_tait = pi_infs(1)*(n_tait - 1)/n_tait + c = n_tait*(pres + B_tait)*(1._wp - alf)/(rho) + c = merge(sqrt(c), sgm_eps, c > 0._wp) + end if - if (polytropic) then - $:GPU_LOOP(parallelism='[seq]') - do j = 1, nnode - wght_pb(j, q) = wght(j, q)*(pb0(q) - pv) - end do - else - $:GPU_LOOP(parallelism='[seq]') - do j = 1, nnode - chi_vw = 1._wp/(1._wp + R_v/R_n*(pb(id1, id2, id3, j, q)/pv - 1._wp)) - x_vw = M_n*chi_vw/(M_v + (M_n - M_v)*chi_vw) - k_mw = x_vw*k_v(q)/(x_vw + (1._wp - x_vw)*phi_vn) + (1._wp - x_vw)*k_n(q)/(x_vw*phi_nv + 1._wp - x_vw) - rho_mw = pv/(chi_vw*R_v*Tw) - rhs_mv(id1, id2, id3, j, q) = -Re_trans_c(q)*((mv(id1, id2, id3, j, q)/(mv(id1, id2, id3, j, q) + mass_n0(q))) - chi_vw) - rhs_mv(id1, id2, id3, j, q) = rho_mw*rhs_mv(id1, id2, id3, j, q)/Pe_c/(1._wp - chi_vw)/abscX(j, q) - T_bar = Tw*(pb(id1, id2, id3, j, q)/pb0(q))*(abscX(j, q)/R0(q))**3*(mass_n0(q) + mass_v0(q))/(mass_n0(q) + mv(id1, id2, id3, j, q)) - grad_T = -Re_trans_T(q)*(T_bar - Tw) - ht(j, q) = pb0(q)*k_mw*grad_T/Pe_T(q)/abscX(j, q) - wght_pb(j, q) = wght(j, q)*(pb(id1, id2, id3, j, q)) - wght_mv(j, q) = wght(j, q)*(rhs_mv(id1, id2, id3, j, q)) - wght_ht(j, q) = wght(j, q)*ht(j, q) - end do - end if + call s_coeff_selector(pres, rho, c, coeff, polytropic) - ! Compute change in moments due to bubble dynamics - r = 1 + if (alf > small_alf) then + nbub = q_cons_vf(bubxb)%sf(id1, id2, id3) $:GPU_LOOP(parallelism='[seq]') - do i2 = 0, 2 + do q = 1, nb + ! Gather moments for this bubble bin $:GPU_LOOP(parallelism='[seq]') - do i1 = 0, 2 - if ((i1 + i2) <= 2) then - momsum = 0._wp - $:GPU_LOOP(parallelism='[seq]') - do j = 1, nterms - select case (bubble_model) - case (3) - if (j == 3) then - momsum = momsum + coeff(j, i1, i2)*(R0(q)**momrhs(3, i1, i2, j, q))*f_quad2D(abscX(:, q), abscY(:, q), wght_pb(:, q), momrhs(:, i1, i2, j, q)) - else - momsum = momsum + coeff(j, i1, i2)*(R0(q)**momrhs(3, i1, i2, j, q))*f_quad2D(abscX(:, q), abscY(:, q), wght(:, q), momrhs(:, i1, i2, j, q)) - end if - case (2) - if ((j >= 7 .and. j <= 9) .or. (j >= 22 .and. j <= 23) .or. (j >= 10 .and. j <= 11) .or. (j == 26)) then - momsum = momsum + coeff(j, i1, i2)*(R0(q)**momrhs(3, i1, i2, j, q))*f_quad2D(abscX(:, q), abscY(:, q), wght_pb(:, q), momrhs(:, i1, i2, j, q)) - else if ((j >= 27 .and. j <= 29) .and. (.not. polytropic)) then - momsum = momsum + coeff(j, i1, i2)*(R0(q)**momrhs(3, i1, i2, j, q))*f_quad2D(abscX(:, q), abscY(:, q), wght_mv(:, q), momrhs(:, i1, i2, j, q)) - else if ((j >= 30 .and. j <= 32) .and. (.not. polytropic)) then - momsum = momsum + coeff(j, i1, i2)*(R0(q)**momrhs(3, i1, i2, j, q))*f_quad2D(abscX(:, q), abscY(:, q), wght_ht(:, q), momrhs(:, i1, i2, j, q)) - else - momsum = momsum + coeff(j, i1, i2)*(R0(q)**momrhs(3, i1, i2, j, q))*f_quad2D(abscX(:, q), abscY(:, q), wght(:, q), momrhs(:, i1, i2, j, q)) - end if - end select - end do - moms3d(i1, i2, q)%sf(id1, id2, id3) = nbub*momsum - msum(r) = momsum - r = r + 1 - end if + do r = 2, nmom + moms(r) = q_prim_vf(bubmoms(q, r))%sf(id1, id2, id3) end do - end do + moms(1) = 1._wp + call s_chyqmom(moms, wght(:, q), abscX(:, q), abscY(:, q)) + + if (polytropic) then + $:GPU_LOOP(parallelism='[seq]') + do j = 1, nnode + wght_pb(j, q) = wght(j, q)*(pb0(q) - pv) + end do + else + $:GPU_LOOP(parallelism='[seq]') + do j = 1, nnode + chi_vw = 1._wp/(1._wp + R_v/R_n*(pb(id1, id2, id3, j, q)/pv - 1._wp)) + x_vw = M_n*chi_vw/(M_v + (M_n - M_v)*chi_vw) + k_mw = x_vw*k_v(q)/(x_vw + (1._wp - x_vw)*phi_vn) + (1._wp - x_vw)*k_n(q)/(x_vw*phi_nv + 1._wp - x_vw) + rho_mw = pv/(chi_vw*R_v*Tw) + rhs_mv(id1, id2, id3, j, q) = -Re_trans_c(q)*((mv(id1, id2, id3, j, q)/(mv(id1, id2, id3, j, q) + mass_n0(q))) - chi_vw) + rhs_mv(id1, id2, id3, j, q) = rho_mw*rhs_mv(id1, id2, id3, j, q)/Pe_c/(1._wp - chi_vw)/abscX(j, q) + T_bar = Tw*(pb(id1, id2, id3, j, q)/pb0(q))*(abscX(j, q)/R0(q))**3*(mass_n0(q) + mass_v0(q))/(mass_n0(q) + mv(id1, id2, id3, j, q)) + grad_T = -Re_trans_T(q)*(T_bar - Tw) + ht(j, q) = pb0(q)*k_mw*grad_T/Pe_T(q)/abscX(j, q) + wght_pb(j, q) = wght(j, q)*(pb(id1, id2, id3, j, q)) + wght_mv(j, q) = wght(j, q)*(rhs_mv(id1, id2, id3, j, q)) + wght_ht(j, q) = wght(j, q)*ht(j, q) + end do + end if - ! Compute change in pb and mv for non-polytropic model - if (.not. polytropic) then + ! Compute change in moments due to bubble dynamics + r = 1 $:GPU_LOOP(parallelism='[seq]') - do j = 1, nnode - drdt = msum(2) - drdt2 = merge(-1._wp, 1._wp, j == 1 .or. j == 2)/(2._wp*sqrt(merge(moms(4) - moms(2)**2._wp, verysmall, moms(4) - moms(2)**2._wp > 0._wp))) - drdt2 = drdt2*(msum(3) - 2._wp*moms(2)*msum(2)) - drdt = drdt + drdt2 - rhs_pb(id1, id2, id3, j, q) = (-3._wp*gam*drdt/abscX(j, q))*(pb(id1, id2, id3, j, q)) - rhs_pb(id1, id2, id3, j, q) = rhs_pb(id1, id2, id3, j, q) + (3._wp*gam/abscX(j, q))*rhs_mv(id1, id2, id3, j, q)*R_v*Tw - rhs_pb(id1, id2, id3, j, q) = rhs_pb(id1, id2, id3, j, q) + (3._wp*gam/abscX(j, q))*ht(j, q) - rhs_mv(id1, id2, id3, j, q) = rhs_mv(id1, id2, id3, j, q)*(4._wp*pi*abscX(j, q)**2._wp) + do i2 = 0, 2 + $:GPU_LOOP(parallelism='[seq]') + do i1 = 0, 2 + if ((i1 + i2) <= 2) then + momsum = 0._wp + $:GPU_LOOP(parallelism='[seq]') + do j = 1, nterms + select case (bubble_model) + case (3) + if (j == 3) then + momsum = momsum + coeff(j, i1, i2)*(R0(q)**momrhs(3, i1, i2, j, q))*f_quad2D(abscX(:, q), abscY(:, q), wght_pb(:, q), momrhs(:, i1, i2, j, q)) + else + momsum = momsum + coeff(j, i1, i2)*(R0(q)**momrhs(3, i1, i2, j, q))*f_quad2D(abscX(:, q), abscY(:, q), wght(:, q), momrhs(:, i1, i2, j, q)) + end if + case (2) + if ((j >= 7 .and. j <= 9) .or. (j >= 22 .and. j <= 23) .or. (j >= 10 .and. j <= 11) .or. (j == 26)) then + momsum = momsum + coeff(j, i1, i2)*(R0(q)**momrhs(3, i1, i2, j, q))*f_quad2D(abscX(:, q), abscY(:, q), wght_pb(:, q), momrhs(:, i1, i2, j, q)) + else if ((j >= 27 .and. j <= 29) .and. (.not. polytropic)) then + momsum = momsum + coeff(j, i1, i2)*(R0(q)**momrhs(3, i1, i2, j, q))*f_quad2D(abscX(:, q), abscY(:, q), wght_mv(:, q), momrhs(:, i1, i2, j, q)) + else if ((j >= 30 .and. j <= 32) .and. (.not. polytropic)) then + momsum = momsum + coeff(j, i1, i2)*(R0(q)**momrhs(3, i1, i2, j, q))*f_quad2D(abscX(:, q), abscY(:, q), wght_ht(:, q), momrhs(:, i1, i2, j, q)) + else + momsum = momsum + coeff(j, i1, i2)*(R0(q)**momrhs(3, i1, i2, j, q))*f_quad2D(abscX(:, q), abscY(:, q), wght(:, q), momrhs(:, i1, i2, j, q)) + end if + end select + end do + moms3d(i1, i2, q)%sf(id1, id2, id3) = nbub*momsum + msum(r) = momsum + r = r + 1 + end if + end do end do - end if - end do - ! Compute special high-order moments - momsp(1)%sf(id1, id2, id3) = f_quad(abscX, abscY, wght, 3._wp, 0._wp, 0._wp) - momsp(2)%sf(id1, id2, id3) = 4._wp*pi*nbub*f_quad(abscX, abscY, wght, 2._wp, 1._wp, 0._wp) - momsp(3)%sf(id1, id2, id3) = f_quad(abscX, abscY, wght, 3._wp, 2._wp, 0._wp) - if (abs(gam - 1._wp) <= 1.e-4_wp) then - momsp(4)%sf(id1, id2, id3) = 1._wp - else - if (polytropic) then - momsp(4)%sf(id1, id2, id3) = f_quad(abscX, abscY, wght_pb, 3._wp*(1._wp - gam), 0._wp, 3._wp*gam) + pv*f_quad(abscX, abscY, wght, 3._wp, 0._wp, 0._wp) - 4._wp*Re_inv*f_quad(abscX, abscY, wght, 2._wp, 1._wp, 0._wp) - (2._wp/Web)*f_quad(abscX, abscY, wght, 2._wp, 0._wp, 0._wp) + ! Compute change in pb and mv for non-polytropic model + if (.not. polytropic) then + $:GPU_LOOP(parallelism='[seq]') + do j = 1, nnode + drdt = msum(2) + drdt2 = merge(-1._wp, 1._wp, j == 1 .or. j == 2)/(2._wp*sqrt(merge(moms(4) - moms(2)**2._wp, verysmall, moms(4) - moms(2)**2._wp > 0._wp))) + drdt2 = drdt2*(msum(3) - 2._wp*moms(2)*msum(2)) + drdt = drdt + drdt2 + rhs_pb(id1, id2, id3, j, q) = (-3._wp*gam*drdt/abscX(j, q))*(pb(id1, id2, id3, j, q)) + rhs_pb(id1, id2, id3, j, q) = rhs_pb(id1, id2, id3, j, q) + (3._wp*gam/abscX(j, q))*rhs_mv(id1, id2, id3, j, q)*R_v*Tw + rhs_pb(id1, id2, id3, j, q) = rhs_pb(id1, id2, id3, j, q) + (3._wp*gam/abscX(j, q))*ht(j, q) + rhs_mv(id1, id2, id3, j, q) = rhs_mv(id1, id2, id3, j, q)*(4._wp*pi*abscX(j, q)**2._wp) + end do + end if + end do + + ! Compute special high-order moments + momsp(1)%sf(id1, id2, id3) = f_quad(abscX, abscY, wght, 3._wp, 0._wp, 0._wp) + momsp(2)%sf(id1, id2, id3) = 4._wp*pi*nbub*f_quad(abscX, abscY, wght, 2._wp, 1._wp, 0._wp) + momsp(3)%sf(id1, id2, id3) = f_quad(abscX, abscY, wght, 3._wp, 2._wp, 0._wp) + if (abs(gam - 1._wp) <= 1.e-4_wp) then + momsp(4)%sf(id1, id2, id3) = 1._wp else - momsp(4)%sf(id1, id2, id3) = f_quad(abscX, abscY, wght_pb, 3._wp, 0._wp, 0._wp) - 4._wp*Re_inv*f_quad(abscX, abscY, wght, 2._wp, 1._wp, 0._wp) - (2._wp/Web)*f_quad(abscX, abscY, wght, 2._wp, 0._wp, 0._wp) + if (polytropic) then + momsp(4)%sf(id1, id2, id3) = f_quad(abscX, abscY, wght_pb, 3._wp*(1._wp - gam), 0._wp, 3._wp*gam) + pv*f_quad(abscX, abscY, wght, 3._wp, 0._wp, 0._wp) - 4._wp*Re_inv*f_quad(abscX, abscY, wght, 2._wp, 1._wp, 0._wp) - (2._wp/Web)*f_quad(abscX, abscY, wght, 2._wp, 0._wp, 0._wp) + else + momsp(4)%sf(id1, id2, id3) = f_quad(abscX, abscY, wght_pb, 3._wp, 0._wp, 0._wp) - 4._wp*Re_inv*f_quad(abscX, abscY, wght, 2._wp, 1._wp, 0._wp) - (2._wp/Web)*f_quad(abscX, abscY, wght, 2._wp, 0._wp, 0._wp) + end if end if - end if - else - $:GPU_LOOP(parallelism='[seq]') - do q = 1, nb + else $:GPU_LOOP(parallelism='[seq]') - do i1 = 0, 2 + do q = 1, nb $:GPU_LOOP(parallelism='[seq]') - do i2 = 0, 2 - moms3d(i1, i2, q)%sf(id1, id2, id3) = 0._wp + do i1 = 0, 2 + $:GPU_LOOP(parallelism='[seq]') + do i2 = 0, 2 + moms3d(i1, i2, q)%sf(id1, id2, id3) = 0._wp + end do end do end do - end do - momsp(1)%sf(id1, id2, id3) = 0._wp - momsp(2)%sf(id1, id2, id3) = 0._wp - momsp(3)%sf(id1, id2, id3) = 0._wp - momsp(4)%sf(id1, id2, id3) = 0._wp - end if + momsp(1)%sf(id1, id2, id3) = 0._wp + momsp(2)%sf(id1, id2, id3) = 0._wp + momsp(3)%sf(id1, id2, id3) = 0._wp + momsp(4)%sf(id1, id2, id3) = 0._wp + end if + end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP contains ! Helper to select the correct coefficient routine @@ -868,7 +868,7 @@ contains end if end subroutine s_coeff_selector - pure subroutine s_chyqmom(momin, wght, abscX, abscY) + subroutine s_chyqmom(momin, wght, abscX, abscY) $:GPU_ROUTINE(function_name='s_chyqmom',parallelism='[seq]', & & cray_inline=True) @@ -926,7 +926,7 @@ contains end subroutine s_chyqmom - pure subroutine s_hyqmom(frho, fup, fmom) + subroutine s_hyqmom(frho, fup, fmom) $:GPU_ROUTINE(function_name='s_hyqmom',parallelism='[seq]', & & cray_inline=True) @@ -946,7 +946,7 @@ contains end subroutine s_hyqmom - pure function f_quad(abscX, abscY, wght_in, q, r, s) + function f_quad(abscX, abscY, wght_in, q, r, s) $:GPU_ROUTINE(parallelism='[seq]') real(wp), dimension(nnode, nb), intent(in) :: abscX, abscY, wght_in real(wp), intent(in) :: q, r, s @@ -962,7 +962,7 @@ contains end function f_quad - pure function f_quad2D(abscX, abscY, wght_in, pow) + function f_quad2D(abscX, abscY, wght_in, pow) $:GPU_ROUTINE(parallelism='[seq]') real(wp), dimension(nnode), intent(in) :: abscX, abscY, wght_in real(wp), dimension(3), intent(in) :: pow diff --git a/src/simulation/m_rhs.fpp b/src/simulation/m_rhs.fpp index cdcb418075..7e2cc240e8 100644 --- a/src/simulation/m_rhs.fpp +++ b/src/simulation/m_rhs.fpp @@ -564,18 +564,19 @@ contains end do end if ! end allocation of viscous variables - $:GPU_PARALLEL_LOOP(collapse=4) - do id = 1, num_dims - do i = 1, sys_size - do l = idwbuff(3)%beg, idwbuff(3)%end - do k = idwbuff(2)%beg, idwbuff(2)%end - do j = idwbuff(1)%beg, idwbuff(1)%end - flux_gsrc_n(id)%vf(i)%sf(j, k, l) = 0._wp + #:call GPU_PARALLEL_LOOP(collapse=4) + do id = 1, num_dims + do i = 1, sys_size + do l = idwbuff(3)%beg, idwbuff(3)%end + do k = idwbuff(2)%beg, idwbuff(2)%end + do j = idwbuff(1)%beg, idwbuff(1)%end + flux_gsrc_n(id)%vf(i)%sf(j, k, l) = 0._wp + end do end do end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP end if ! end allocation for .not. igr @@ -646,37 +647,39 @@ contains if (.not. igr) then ! Association/Population of Working Variables - $:GPU_PARALLEL_LOOP(collapse=4) - do i = 1, sys_size - do l = idwbuff(3)%beg, idwbuff(3)%end - do k = idwbuff(2)%beg, idwbuff(2)%end - do j = idwbuff(1)%beg, idwbuff(1)%end - q_cons_qp%vf(i)%sf(j, k, l) = q_cons_vf(i)%sf(j, k, l) + #:call GPU_PARALLEL_LOOP(collapse=4) + do i = 1, sys_size + do l = idwbuff(3)%beg, idwbuff(3)%end + do k = idwbuff(2)%beg, idwbuff(2)%end + do j = idwbuff(1)%beg, idwbuff(1)%end + q_cons_qp%vf(i)%sf(j, k, l) = q_cons_vf(i)%sf(j, k, l) + end do end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP ! Converting Conservative to Primitive Variables if (mpp_lim .and. bubbles_euler) then - $:GPU_PARALLEL_LOOP(collapse=3) - do l = idwbuff(3)%beg, idwbuff(3)%end - do k = idwbuff(2)%beg, idwbuff(2)%end - do j = idwbuff(1)%beg, idwbuff(1)%end - alf_sum%sf(j, k, l) = 0._wp - $:GPU_LOOP(parallelism='[seq]') - do i = advxb, advxe - 1 - alf_sum%sf(j, k, l) = alf_sum%sf(j, k, l) + q_cons_qp%vf(i)%sf(j, k, l) - end do - $:GPU_LOOP(parallelism='[seq]') - do i = advxb, advxe - 1 - q_cons_qp%vf(i)%sf(j, k, l) = q_cons_qp%vf(i)%sf(j, k, l)*(1._wp - q_cons_qp%vf(alf_idx)%sf(j, k, l)) & - /alf_sum%sf(j, k, l) + #:call GPU_PARALLEL_LOOP(collapse=3) + do l = idwbuff(3)%beg, idwbuff(3)%end + do k = idwbuff(2)%beg, idwbuff(2)%end + do j = idwbuff(1)%beg, idwbuff(1)%end + alf_sum%sf(j, k, l) = 0._wp + $:GPU_LOOP(parallelism='[seq]') + do i = advxb, advxe - 1 + alf_sum%sf(j, k, l) = alf_sum%sf(j, k, l) + q_cons_qp%vf(i)%sf(j, k, l) + end do + $:GPU_LOOP(parallelism='[seq]') + do i = advxb, advxe - 1 + q_cons_qp%vf(i)%sf(j, k, l) = q_cons_qp%vf(i)%sf(j, k, l)*(1._wp - q_cons_qp%vf(alf_idx)%sf(j, k, l)) & + /alf_sum%sf(j, k, l) + end do end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP end if end if @@ -736,16 +739,17 @@ contains if (igr) then if (id == 1) then - $:GPU_PARALLEL_LOOP(collapse=4) - do l = -1, p + 1 - do k = -1, n + 1 - do j = -1, m + 1 - do i = 1, sys_size - rhs_vf(i)%sf(j, k, l) = 0._wp + #:call GPU_PARALLEL_LOOP(collapse=4) + do l = -1, p + 1 + do k = -1, n + 1 + do j = -1, m + 1 + do i = 1, sys_size + rhs_vf(i)%sf(j, k, l) = 0._wp + end do end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP end if call nvtxStartRange("IGR_RIEMANN") @@ -877,6 +881,9 @@ contains irx%beg = 0; iry%beg = 0; irz%beg = -1 end if irx%end = m; iry%end = n; irz%end = p + ! $:GPU_UPDATE(host='[qL_rsx_vf,qR_rsx_vf]') + ! print *, "L", qL_rsx_vf(100:300, 0, 0, 1) + ! print *, "R", qR_rsx_vf(100:300, 0, 0, 1) !Computing Riemann Solver Flux and Source Flux call nvtxStartRange("RHS-RIEMANN-SOLVER") @@ -897,6 +904,9 @@ contains id, irx, iry, irz) call nvtxEndRange + !$:GPU_UPDATE(host='[flux_n(1)%vf(1)%sf]') + !print *, "FLUX", flux_n(1)%vf(1)%sf(100:300, 0, 0) + ! Additional physics and source terms ! RHS addition for advection source call nvtxStartRange("RHS-ADVECTION-SRC") @@ -966,18 +976,19 @@ contains ! END: Dimensional Splitting Loop if (ib) then - $:GPU_PARALLEL_LOOP(collapse=3) - do l = 0, p - do k = 0, n - do j = 0, m - if (ib_markers%sf(j, k, l) /= 0) then - do i = 1, sys_size - rhs_vf(i)%sf(j, k, l) = 0._wp - end do - end if + #:call GPU_PARALLEL_LOOP(collapse=3) + do l = 0, p + do k = 0, n + do j = 0, m + if (ib_markers%sf(j, k, l) /= 0) then + do i = 1, sys_size + rhs_vf(i)%sf(j, k, l) = 0._wp + end do + end if + end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP end if ! Additional Physics and Source Temrs @@ -1032,16 +1043,17 @@ contains if (run_time_info .or. probe_wrt .or. ib .or. bubbles_lagrange) then if (.not. igr) then - $:GPU_PARALLEL_LOOP(collapse=4) - do i = 1, sys_size - do l = idwbuff(3)%beg, idwbuff(3)%end - do k = idwbuff(2)%beg, idwbuff(2)%end - do j = idwbuff(1)%beg, idwbuff(1)%end - q_prim_vf(i)%sf(j, k, l) = q_prim_qp%vf(i)%sf(j, k, l) + #:call GPU_PARALLEL_LOOP(collapse=4) + do i = 1, sys_size + do l = idwbuff(3)%beg, idwbuff(3)%end + do k = idwbuff(2)%beg, idwbuff(2)%end + do j = idwbuff(1)%beg, idwbuff(1)%end + q_prim_vf(i)%sf(j, k, l) = q_prim_qp%vf(i)%sf(j, k, l) + end do end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP end if end if @@ -1073,29 +1085,30 @@ contains real(wp) :: advected_qty_val, pressure_val, velocity_val if (alt_soundspeed) then - $:GPU_PARALLEL_LOOP(collapse=3) - do q_loop = 0, p - do l_loop = 0, n - do k_loop = 0, m - blkmod1(k_loop, l_loop, q_loop) = ((gammas(1) + 1._wp)*q_prim_vf%vf(E_idx)%sf(k_loop, l_loop, q_loop) + & - pi_infs(1))/gammas(1) - blkmod2(k_loop, l_loop, q_loop) = ((gammas(2) + 1._wp)*q_prim_vf%vf(E_idx)%sf(k_loop, l_loop, q_loop) + & - pi_infs(2))/gammas(2) - alpha1(k_loop, l_loop, q_loop) = q_cons_vf%vf(advxb)%sf(k_loop, l_loop, q_loop) - - if (bubbles_euler) then - alpha2(k_loop, l_loop, q_loop) = q_cons_vf%vf(alf_idx - 1)%sf(k_loop, l_loop, q_loop) - else - alpha2(k_loop, l_loop, q_loop) = q_cons_vf%vf(advxe)%sf(k_loop, l_loop, q_loop) - end if + #:call GPU_PARALLEL_LOOP(collapse=3) + do q_loop = 0, p + do l_loop = 0, n + do k_loop = 0, m + blkmod1(k_loop, l_loop, q_loop) = ((gammas(1) + 1._wp)*q_prim_vf%vf(E_idx)%sf(k_loop, l_loop, q_loop) + & + pi_infs(1))/gammas(1) + blkmod2(k_loop, l_loop, q_loop) = ((gammas(2) + 1._wp)*q_prim_vf%vf(E_idx)%sf(k_loop, l_loop, q_loop) + & + pi_infs(2))/gammas(2) + alpha1(k_loop, l_loop, q_loop) = q_cons_vf%vf(advxb)%sf(k_loop, l_loop, q_loop) + + if (bubbles_euler) then + alpha2(k_loop, l_loop, q_loop) = q_cons_vf%vf(alf_idx - 1)%sf(k_loop, l_loop, q_loop) + else + alpha2(k_loop, l_loop, q_loop) = q_cons_vf%vf(advxe)%sf(k_loop, l_loop, q_loop) + end if - Kterm(k_loop, l_loop, q_loop) = alpha1(k_loop, l_loop, q_loop)*alpha2(k_loop, l_loop, q_loop)* & - (blkmod2(k_loop, l_loop, q_loop) - blkmod1(k_loop, l_loop, q_loop))/ & - (alpha1(k_loop, l_loop, q_loop)*blkmod2(k_loop, l_loop, q_loop) + & - alpha2(k_loop, l_loop, q_loop)*blkmod1(k_loop, l_loop, q_loop)) + Kterm(k_loop, l_loop, q_loop) = alpha1(k_loop, l_loop, q_loop)*alpha2(k_loop, l_loop, q_loop)* & + (blkmod2(k_loop, l_loop, q_loop) - blkmod1(k_loop, l_loop, q_loop))/ & + (alpha1(k_loop, l_loop, q_loop)*blkmod2(k_loop, l_loop, q_loop) + & + alpha2(k_loop, l_loop, q_loop)*blkmod1(k_loop, l_loop, q_loop)) + end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP end if select case (idir) @@ -1107,39 +1120,41 @@ contains call s_cbc(q_prim_vf%vf, flux_n(idir)%vf, flux_src_n_vf%vf, idir, 1, irx, iry, irz) end if - $:GPU_PARALLEL_LOOP(collapse=4,private='[inv_ds,flux_face1,flux_face2]') - do j = 1, sys_size - do q_loop = 0, p - do l_loop = 0, n - do k_loop = 0, m - inv_ds = 1._wp/dx(k_loop) - flux_face1 = flux_n(1)%vf(j)%sf(k_loop - 1, l_loop, q_loop) - flux_face2 = flux_n(1)%vf(j)%sf(k_loop, l_loop, q_loop) - rhs_vf(j)%sf(k_loop, l_loop, q_loop) = inv_ds*(flux_face1 - flux_face2) + #:call GPU_PARALLEL_LOOP(collapse=4,private='[inv_ds,flux_face1,flux_face2]') + do j = 1, sys_size + do q_loop = 0, p + do l_loop = 0, n + do k_loop = 0, m + inv_ds = 1._wp/dx(k_loop) + flux_face1 = flux_n(1)%vf(j)%sf(k_loop - 1, l_loop, q_loop) + flux_face2 = flux_n(1)%vf(j)%sf(k_loop, l_loop, q_loop) + rhs_vf(j)%sf(k_loop, l_loop, q_loop) = inv_ds*(flux_face1 - flux_face2) + end do end do end do end do - end do - + #:endcall GPU_PARALLEL_LOOP + ! $:GPU_UPDATE(host='[rhs_vf(1)%sf]') + ! print *, "RHS", rhs_vf(1)%sf(100:300, 0, 0) if (model_eqns == 3) then - $:GPU_PARALLEL_LOOP(collapse=4,private='[inv_ds,advected_qty_val, & - & pressure_val,flux_face1,flux_face2]') - do q_loop = 0, p - do l_loop = 0, n - do k_loop = 0, m - do i_fluid_loop = 1, num_fluids - inv_ds = 1._wp/dx(k_loop) - advected_qty_val = q_cons_vf%vf(i_fluid_loop + advxb - 1)%sf(k_loop, l_loop, q_loop) - pressure_val = q_prim_vf%vf(E_idx)%sf(k_loop, l_loop, q_loop) - flux_face1 = flux_src_n_vf%vf(advxb)%sf(k_loop, l_loop, q_loop) - flux_face2 = flux_src_n_vf%vf(advxb)%sf(k_loop - 1, l_loop, q_loop) - rhs_vf(i_fluid_loop + intxb - 1)%sf(k_loop, l_loop, q_loop) = & - rhs_vf(i_fluid_loop + intxb - 1)%sf(k_loop, l_loop, q_loop) - & - inv_ds*advected_qty_val*pressure_val*(flux_face1 - flux_face2) + #:call GPU_PARALLEL_LOOP(collapse=4,private='[inv_ds,advected_qty_val, pressure_val,flux_face1,flux_face2]') + do q_loop = 0, p + do l_loop = 0, n + do k_loop = 0, m + do i_fluid_loop = 1, num_fluids + inv_ds = 1._wp/dx(k_loop) + advected_qty_val = q_cons_vf%vf(i_fluid_loop + advxb - 1)%sf(k_loop, l_loop, q_loop) + pressure_val = q_prim_vf%vf(E_idx)%sf(k_loop, l_loop, q_loop) + flux_face1 = flux_src_n_vf%vf(advxb)%sf(k_loop, l_loop, q_loop) + flux_face2 = flux_src_n_vf%vf(advxb)%sf(k_loop - 1, l_loop, q_loop) + rhs_vf(i_fluid_loop + intxb - 1)%sf(k_loop, l_loop, q_loop) = & + rhs_vf(i_fluid_loop + intxb - 1)%sf(k_loop, l_loop, q_loop) - & + inv_ds*advected_qty_val*pressure_val*(flux_face1 - flux_face2) + end do end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP end if call s_add_directional_advection_source_terms(idir, rhs_vf, q_cons_vf, q_prim_vf, flux_src_n_vf, Kterm) @@ -1152,60 +1167,62 @@ contains call s_cbc(q_prim_vf%vf, flux_n(idir)%vf, flux_src_n_vf%vf, idir, 1, irx, iry, irz) end if - $:GPU_PARALLEL_LOOP(collapse=4,private='[inv_ds,flux_face1,flux_face2]') - do j = 1, sys_size - do l = 0, p - do k = 0, n - do q = 0, m - inv_ds = 1._wp/dy(k) - flux_face1 = flux_n(2)%vf(j)%sf(q, k - 1, l) - flux_face2 = flux_n(2)%vf(j)%sf(q, k, l) - rhs_vf(j)%sf(q, k, l) = rhs_vf(j)%sf(q, k, l) + inv_ds*(flux_face1 - flux_face2) + #:call GPU_PARALLEL_LOOP(collapse=4,private='[inv_ds,flux_face1,flux_face2]') + do j = 1, sys_size + do l = 0, p + do k = 0, n + do q = 0, m + inv_ds = 1._wp/dy(k) + flux_face1 = flux_n(2)%vf(j)%sf(q, k - 1, l) + flux_face2 = flux_n(2)%vf(j)%sf(q, k, l) + rhs_vf(j)%sf(q, k, l) = rhs_vf(j)%sf(q, k, l) + inv_ds*(flux_face1 - flux_face2) + end do end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP if (model_eqns == 3) then - $:GPU_PARALLEL_LOOP(collapse=4,private='[inv_ds,advected_qty_val, & - & pressure_val,flux_face1,flux_face2]') - do l = 0, p - do k = 0, n - do q = 0, m - do i_fluid_loop = 1, num_fluids - inv_ds = 1._wp/dy(k) - advected_qty_val = q_cons_vf%vf(i_fluid_loop + advxb - 1)%sf(q, k, l) - pressure_val = q_prim_vf%vf(E_idx)%sf(q, k, l) - flux_face1 = flux_src_n_vf%vf(advxb)%sf(q, k, l) - flux_face2 = flux_src_n_vf%vf(advxb)%sf(q, k - 1, l) - rhs_vf(i_fluid_loop + intxb - 1)%sf(q, k, l) = & - rhs_vf(i_fluid_loop + intxb - 1)%sf(q, k, l) - & - inv_ds*advected_qty_val*pressure_val*(flux_face1 - flux_face2) - if (cyl_coord) then + #:call GPU_PARALLEL_LOOP(collapse=4,private='[inv_ds,advected_qty_val, pressure_val,flux_face1,flux_face2]') + do l = 0, p + do k = 0, n + do q = 0, m + do i_fluid_loop = 1, num_fluids + inv_ds = 1._wp/dy(k) + advected_qty_val = q_cons_vf%vf(i_fluid_loop + advxb - 1)%sf(q, k, l) + pressure_val = q_prim_vf%vf(E_idx)%sf(q, k, l) + flux_face1 = flux_src_n_vf%vf(advxb)%sf(q, k, l) + flux_face2 = flux_src_n_vf%vf(advxb)%sf(q, k - 1, l) rhs_vf(i_fluid_loop + intxb - 1)%sf(q, k, l) = & rhs_vf(i_fluid_loop + intxb - 1)%sf(q, k, l) - & - 5.e-1_wp/y_cc(k)*advected_qty_val*pressure_val*(flux_face1 + flux_face2) - end if + inv_ds*advected_qty_val*pressure_val*(flux_face1 - flux_face2) + if (cyl_coord) then + rhs_vf(i_fluid_loop + intxb - 1)%sf(q, k, l) = & + rhs_vf(i_fluid_loop + intxb - 1)%sf(q, k, l) - & + 5.e-1_wp/y_cc(k)*advected_qty_val*pressure_val*(flux_face1 + flux_face2) + end if + end do end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP end if if (cyl_coord) then - $:GPU_PARALLEL_LOOP(collapse=4,private='[flux_face1,flux_face2]') - do j = 1, sys_size - do l = 0, p - do k = 0, n - do q = 0, m - flux_face1 = flux_gsrc_n(2)%vf(j)%sf(q, k - 1, l) - flux_face2 = flux_gsrc_n(2)%vf(j)%sf(q, k, l) - rhs_vf(j)%sf(q, k, l) = rhs_vf(j)%sf(q, k, l) - & - 5.e-1_wp/y_cc(k)*(flux_face1 + flux_face2) + #:call GPU_PARALLEL_LOOP(collapse=4,private='[flux_face1,flux_face2]') + do j = 1, sys_size + do l = 0, p + do k = 0, n + do q = 0, m + flux_face1 = flux_gsrc_n(2)%vf(j)%sf(q, k - 1, l) + flux_face2 = flux_gsrc_n(2)%vf(j)%sf(q, k, l) + rhs_vf(j)%sf(q, k, l) = rhs_vf(j)%sf(q, k, l) - & + 5.e-1_wp/y_cc(k)*(flux_face1 + flux_face2) + end do end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP end if call s_add_directional_advection_source_terms(idir, rhs_vf, q_cons_vf, q_prim_vf, flux_src_n_vf, Kterm) @@ -1219,70 +1236,72 @@ contains end if if (grid_geometry == 3) then ! Cylindrical Coordinates - $:GPU_PARALLEL_LOOP(collapse=4,private='[inv_ds,velocity_val, & - & flux_face1,flux_face2]') - do j = 1, sys_size - do k = 0, p - do q = 0, n - do l = 0, m - inv_ds = 1._wp/(dz(k)*y_cc(q)) - velocity_val = q_prim_vf%vf(contxe + idir)%sf(l, q, k) - flux_face1 = flux_n(3)%vf(j)%sf(l, q, k - 1) - flux_face2 = flux_n(3)%vf(j)%sf(l, q, k) - rhs_vf(j)%sf(l, q, k) = rhs_vf(j)%sf(l, q, k) + & - inv_ds*velocity_val*(flux_face1 - flux_face2) + #:call GPU_PARALLEL_LOOP(collapse=4,private='[inv_ds,velocity_val,flux_face1,flux_face2]') + do j = 1, sys_size + do k = 0, p + do q = 0, n + do l = 0, m + inv_ds = 1._wp/(dz(k)*y_cc(q)) + velocity_val = q_prim_vf%vf(contxe + idir)%sf(l, q, k) + flux_face1 = flux_n(3)%vf(j)%sf(l, q, k - 1) + flux_face2 = flux_n(3)%vf(j)%sf(l, q, k) + rhs_vf(j)%sf(l, q, k) = rhs_vf(j)%sf(l, q, k) + & + inv_ds*velocity_val*(flux_face1 - flux_face2) + end do end do end do end do - end do - $:GPU_PARALLEL_LOOP(collapse=4,private='[flux_face1,flux_face2]') - do j = 1, sys_size - do k = 0, p - do q = 0, n - do l = 0, m - flux_face1 = flux_gsrc_n(3)%vf(j)%sf(l, q, k - 1) - flux_face2 = flux_gsrc_n(3)%vf(j)%sf(l, q, k) - rhs_vf(j)%sf(l, q, k) = rhs_vf(j)%sf(l, q, k) - & - 5.e-1_wp/y_cc(q)*(flux_face1 + flux_face2) + #:endcall GPU_PARALLEL_LOOP + #:call GPU_PARALLEL_LOOP(collapse=4,private='[flux_face1,flux_face2]') + do j = 1, sys_size + do k = 0, p + do q = 0, n + do l = 0, m + flux_face1 = flux_gsrc_n(3)%vf(j)%sf(l, q, k - 1) + flux_face2 = flux_gsrc_n(3)%vf(j)%sf(l, q, k) + rhs_vf(j)%sf(l, q, k) = rhs_vf(j)%sf(l, q, k) - & + 5.e-1_wp/y_cc(q)*(flux_face1 + flux_face2) + end do end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP else ! Cartesian Coordinates - $:GPU_PARALLEL_LOOP(collapse=4,private='[inv_ds,flux_face1,flux_face2]') - do j = 1, sys_size - do k = 0, p - do q = 0, n - do l = 0, m - inv_ds = 1._wp/dz(k) - flux_face1 = flux_n(3)%vf(j)%sf(l, q, k - 1) - flux_face2 = flux_n(3)%vf(j)%sf(l, q, k) - rhs_vf(j)%sf(l, q, k) = rhs_vf(j)%sf(l, q, k) + inv_ds*(flux_face1 - flux_face2) + #:call GPU_PARALLEL_LOOP(collapse=4,private='[inv_ds,flux_face1,flux_face2]') + do j = 1, sys_size + do k = 0, p + do q = 0, n + do l = 0, m + inv_ds = 1._wp/dz(k) + flux_face1 = flux_n(3)%vf(j)%sf(l, q, k - 1) + flux_face2 = flux_n(3)%vf(j)%sf(l, q, k) + rhs_vf(j)%sf(l, q, k) = rhs_vf(j)%sf(l, q, k) + inv_ds*(flux_face1 - flux_face2) + end do end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP end if if (model_eqns == 3) then - $:GPU_PARALLEL_LOOP(collapse=4,private='[inv_ds,advected_qty_val, & - & pressure_val,flux_face1,flux_face2]') - do k = 0, p - do q = 0, n - do l = 0, m - do i_fluid_loop = 1, num_fluids - inv_ds = 1._wp/dz(k) - advected_qty_val = q_cons_vf%vf(i_fluid_loop + advxb - 1)%sf(l, q, k) - pressure_val = q_prim_vf%vf(E_idx)%sf(l, q, k) - flux_face1 = flux_src_n_vf%vf(advxb)%sf(l, q, k) - flux_face2 = flux_src_n_vf%vf(advxb)%sf(l, q, k - 1) - rhs_vf(i_fluid_loop + intxb - 1)%sf(l, q, k) = & - rhs_vf(i_fluid_loop + intxb - 1)%sf(l, q, k) - & - inv_ds*advected_qty_val*pressure_val*(flux_face1 - flux_face2) + #:call GPU_PARALLEL_LOOP(collapse=4,private='[inv_ds,advected_qty_val, pressure_val,flux_face1,flux_face2]') + do k = 0, p + do q = 0, n + do l = 0, m + do i_fluid_loop = 1, num_fluids + inv_ds = 1._wp/dz(k) + advected_qty_val = q_cons_vf%vf(i_fluid_loop + advxb - 1)%sf(l, q, k) + pressure_val = q_prim_vf%vf(E_idx)%sf(l, q, k) + flux_face1 = flux_src_n_vf%vf(advxb)%sf(l, q, k) + flux_face2 = flux_src_n_vf%vf(advxb)%sf(l, q, k - 1) + rhs_vf(i_fluid_loop + intxb - 1)%sf(l, q, k) = & + rhs_vf(i_fluid_loop + intxb - 1)%sf(l, q, k) - & + inv_ds*advected_qty_val*pressure_val*(flux_face1 - flux_face2) + end do end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP end if call s_add_directional_advection_source_terms(idir, rhs_vf, q_cons_vf, q_prim_vf, flux_src_n_vf, Kterm) @@ -1310,143 +1329,136 @@ contains case (1) ! x-direction use_standard_riemann = (riemann_solver == 1 .or. riemann_solver == 4) if (use_standard_riemann) then - $:GPU_PARALLEL_LOOP(collapse=4,private='[local_inv_ds, & - & local_term_coeff,local_flux1,local_flux2]') - do j_adv = advxb, advxe - do q_idx = 0, p ! z_extent - do l_idx = 0, n ! y_extent - do k_idx = 0, m ! x_extent - local_inv_ds = 1._wp/dx(k_idx) - local_term_coeff = q_prim_vf_arg%vf(contxe + current_idir)%sf(k_idx, l_idx, q_idx) - local_flux1 = flux_src_n_vf_arg%vf(j_adv)%sf(k_idx - 1, l_idx, q_idx) - local_flux2 = flux_src_n_vf_arg%vf(j_adv)%sf(k_idx, l_idx, q_idx) - rhs_vf_arg(j_adv)%sf(k_idx, l_idx, q_idx) = rhs_vf_arg(j_adv)%sf(k_idx, l_idx, q_idx) + & - local_inv_ds*local_term_coeff*(local_flux1 - local_flux2) + #:call GPU_PARALLEL_LOOP(collapse=4,private='[local_inv_ds, local_term_coeff,local_flux1,local_flux2]') + do j_adv = advxb, advxe + do q_idx = 0, p ! z_extent + do l_idx = 0, n ! y_extent + do k_idx = 0, m ! x_extent + local_inv_ds = 1._wp/dx(k_idx) + local_term_coeff = q_prim_vf_arg%vf(contxe + current_idir)%sf(k_idx, l_idx, q_idx) + local_flux1 = flux_src_n_vf_arg%vf(j_adv)%sf(k_idx - 1, l_idx, q_idx) + local_flux2 = flux_src_n_vf_arg%vf(j_adv)%sf(k_idx, l_idx, q_idx) + rhs_vf_arg(j_adv)%sf(k_idx, l_idx, q_idx) = rhs_vf_arg(j_adv)%sf(k_idx, l_idx, q_idx) + & + local_inv_ds*local_term_coeff*(local_flux1 - local_flux2) + end do end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP else ! Other Riemann solvers if (alt_soundspeed) then if (bubbles_euler .neqv. .true.) then - $:GPU_PARALLEL_LOOP(collapse=3, private='[local_inv_ds, & - & local_q_cons_val, local_k_term_val, & - & local_term_coeff, local_flux1, local_flux2]') - do q_idx = 0, p; do l_idx = 0, n; do k_idx = 0, m - local_inv_ds = 1._wp/dx(k_idx) - local_q_cons_val = q_cons_vf_arg%vf(advxe)%sf(k_idx, l_idx, q_idx) - local_k_term_val = Kterm_arg(k_idx, l_idx, q_idx) ! Access is safe due to outer alt_soundspeed check - local_term_coeff = local_q_cons_val - local_k_term_val - local_flux1 = flux_src_n_vf_arg%vf(advxe)%sf(k_idx, l_idx, q_idx) - local_flux2 = flux_src_n_vf_arg%vf(advxe)%sf(k_idx - 1, l_idx, q_idx) - rhs_vf_arg(advxe)%sf(k_idx, l_idx, q_idx) = rhs_vf_arg(advxe)%sf(k_idx, l_idx, q_idx) + & - local_inv_ds*local_term_coeff*(local_flux1 - local_flux2) - end do; end do; end do - - $:GPU_PARALLEL_LOOP(collapse=3, private='[local_inv_ds,& - & local_q_cons_val, local_k_term_val, & - & local_term_coeff, local_flux1, & - & local_flux2]') - do q_idx = 0, p; do l_idx = 0, n; do k_idx = 0, m - local_inv_ds = 1._wp/dx(k_idx) - local_q_cons_val = q_cons_vf_arg%vf(advxb)%sf(k_idx, l_idx, q_idx) - local_k_term_val = Kterm_arg(k_idx, l_idx, q_idx) ! Access is safe - local_term_coeff = local_q_cons_val + local_k_term_val - local_flux1 = flux_src_n_vf_arg%vf(advxb)%sf(k_idx, l_idx, q_idx) - local_flux2 = flux_src_n_vf_arg%vf(advxb)%sf(k_idx - 1, l_idx, q_idx) - rhs_vf_arg(advxb)%sf(k_idx, l_idx, q_idx) = rhs_vf_arg(advxb)%sf(k_idx, l_idx, q_idx) + & - local_inv_ds*local_term_coeff*(local_flux1 - local_flux2) - end do; end do; end do + #:call GPU_PARALLEL_LOOP(collapse=3, private='[local_inv_ds, local_q_cons_val, local_k_term_val, local_term_coeff, local_flux1, local_flux2]') + do q_idx = 0, p; do l_idx = 0, n; do k_idx = 0, m + local_inv_ds = 1._wp/dx(k_idx) + local_q_cons_val = q_cons_vf_arg%vf(advxe)%sf(k_idx, l_idx, q_idx) + local_k_term_val = Kterm_arg(k_idx, l_idx, q_idx) ! Access is safe due to outer alt_soundspeed check + local_term_coeff = local_q_cons_val - local_k_term_val + local_flux1 = flux_src_n_vf_arg%vf(advxe)%sf(k_idx, l_idx, q_idx) + local_flux2 = flux_src_n_vf_arg%vf(advxe)%sf(k_idx - 1, l_idx, q_idx) + rhs_vf_arg(advxe)%sf(k_idx, l_idx, q_idx) = rhs_vf_arg(advxe)%sf(k_idx, l_idx, q_idx) + & + local_inv_ds*local_term_coeff*(local_flux1 - local_flux2) + end do; end do; end do + #:endcall GPU_PARALLEL_LOOP + + #:call GPU_PARALLEL_LOOP(collapse=3, private='[local_inv_ds,local_q_cons_val, local_k_term_val,local_term_coeff, local_flux1, local_flux2]') + do q_idx = 0, p; do l_idx = 0, n; do k_idx = 0, m + local_inv_ds = 1._wp/dx(k_idx) + local_q_cons_val = q_cons_vf_arg%vf(advxb)%sf(k_idx, l_idx, q_idx) + local_k_term_val = Kterm_arg(k_idx, l_idx, q_idx) ! Access is safe + local_term_coeff = local_q_cons_val + local_k_term_val + local_flux1 = flux_src_n_vf_arg%vf(advxb)%sf(k_idx, l_idx, q_idx) + local_flux2 = flux_src_n_vf_arg%vf(advxb)%sf(k_idx - 1, l_idx, q_idx) + rhs_vf_arg(advxb)%sf(k_idx, l_idx, q_idx) = rhs_vf_arg(advxb)%sf(k_idx, l_idx, q_idx) + & + local_inv_ds*local_term_coeff*(local_flux1 - local_flux2) + end do; end do; end do + #:endcall GPU_PARALLEL_LOOP end if else ! NOT alt_soundspeed - $:GPU_PARALLEL_LOOP(collapse=4,private='[local_inv_ds, & - & local_term_coeff,local_flux1,local_flux2]') - do j_adv = advxb, advxe - do q_idx = 0, p; do l_idx = 0, n; do k_idx = 0, m - local_inv_ds = 1._wp/dx(k_idx) - local_term_coeff = q_cons_vf_arg%vf(j_adv)%sf(k_idx, l_idx, q_idx) - local_flux1 = flux_src_n_vf_arg%vf(j_adv)%sf(k_idx, l_idx, q_idx) - local_flux2 = flux_src_n_vf_arg%vf(j_adv)%sf(k_idx - 1, l_idx, q_idx) - rhs_vf_arg(j_adv)%sf(k_idx, l_idx, q_idx) = rhs_vf_arg(j_adv)%sf(k_idx, l_idx, q_idx) + & - local_inv_ds*local_term_coeff*(local_flux1 - local_flux2) - end do; end do; end do - end do + #:call GPU_PARALLEL_LOOP(collapse=4,private='[local_inv_ds, local_term_coeff,local_flux1,local_flux2]') + do j_adv = advxb, advxe + do q_idx = 0, p; do l_idx = 0, n; do k_idx = 0, m + local_inv_ds = 1._wp/dx(k_idx) + local_term_coeff = q_cons_vf_arg%vf(j_adv)%sf(k_idx, l_idx, q_idx) + local_flux1 = flux_src_n_vf_arg%vf(j_adv)%sf(k_idx, l_idx, q_idx) + local_flux2 = flux_src_n_vf_arg%vf(j_adv)%sf(k_idx - 1, l_idx, q_idx) + rhs_vf_arg(j_adv)%sf(k_idx, l_idx, q_idx) = rhs_vf_arg(j_adv)%sf(k_idx, l_idx, q_idx) + & + local_inv_ds*local_term_coeff*(local_flux1 - local_flux2) + end do; end do; end do + end do + #:endcall GPU_PARALLEL_LOOP end if end if case (2) ! y-direction: loops q_idx (x), k_idx (y), l_idx (z); sf(q_idx, k_idx, l_idx); dy(k_idx); Kterm(q_idx,k_idx,l_idx) use_standard_riemann = (riemann_solver == 1 .or. riemann_solver == 4) if (use_standard_riemann) then - $:GPU_PARALLEL_LOOP(collapse=4,private='[local_inv_ds, & - & local_term_coeff,local_flux1,local_flux2]') - do j_adv = advxb, advxe - do l_idx = 0, p ! z_extent - do k_idx = 0, n ! y_extent - do q_idx = 0, m ! x_extent - local_inv_ds = 1._wp/dy(k_idx) - local_term_coeff = q_prim_vf_arg%vf(contxe + current_idir)%sf(q_idx, k_idx, l_idx) - local_flux1 = flux_src_n_vf_arg%vf(j_adv)%sf(q_idx, k_idx - 1, l_idx) - local_flux2 = flux_src_n_vf_arg%vf(j_adv)%sf(q_idx, k_idx, l_idx) - rhs_vf_arg(j_adv)%sf(q_idx, k_idx, l_idx) = rhs_vf_arg(j_adv)%sf(q_idx, k_idx, l_idx) + & - local_inv_ds*local_term_coeff*(local_flux1 - local_flux2) + #:call GPU_PARALLEL_LOOP(collapse=4,private='[local_inv_ds, local_term_coeff,local_flux1,local_flux2]') + do j_adv = advxb, advxe + do l_idx = 0, p ! z_extent + do k_idx = 0, n ! y_extent + do q_idx = 0, m ! x_extent + local_inv_ds = 1._wp/dy(k_idx) + local_term_coeff = q_prim_vf_arg%vf(contxe + current_idir)%sf(q_idx, k_idx, l_idx) + local_flux1 = flux_src_n_vf_arg%vf(j_adv)%sf(q_idx, k_idx - 1, l_idx) + local_flux2 = flux_src_n_vf_arg%vf(j_adv)%sf(q_idx, k_idx, l_idx) + rhs_vf_arg(j_adv)%sf(q_idx, k_idx, l_idx) = rhs_vf_arg(j_adv)%sf(q_idx, k_idx, l_idx) + & + local_inv_ds*local_term_coeff*(local_flux1 - local_flux2) + end do end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP else ! Other Riemann solvers if (alt_soundspeed) then if (bubbles_euler .neqv. .true.) then - $:GPU_PARALLEL_LOOP(collapse=3, private='[local_inv_ds, & - & local_q_cons_val, local_k_term_val, & - & local_term_coeff, local_flux1, & - & local_flux2]') - do l_idx = 0, p; do k_idx = 0, n; do q_idx = 0, m - local_inv_ds = 1._wp/dy(k_idx) - local_q_cons_val = q_cons_vf_arg%vf(advxe)%sf(q_idx, k_idx, l_idx) - local_k_term_val = Kterm_arg(q_idx, k_idx, l_idx) ! Access is safe - local_term_coeff = local_q_cons_val - local_k_term_val - local_flux1 = flux_src_n_vf_arg%vf(advxe)%sf(q_idx, k_idx, l_idx) - local_flux2 = flux_src_n_vf_arg%vf(advxe)%sf(q_idx, k_idx - 1, l_idx) - rhs_vf_arg(advxe)%sf(q_idx, k_idx, l_idx) = rhs_vf_arg(advxe)%sf(q_idx, k_idx, l_idx) + & - local_inv_ds*local_term_coeff*(local_flux1 - local_flux2) - if (cyl_coord) then - rhs_vf_arg(advxe)%sf(q_idx, k_idx, l_idx) = rhs_vf_arg(advxe)%sf(q_idx, k_idx, l_idx) - & - (local_k_term_val/(2._wp*y_cc(k_idx)))*(local_flux1 + local_flux2) - end if - end do; end do; end do - - $:GPU_PARALLEL_LOOP(collapse=3, private='[local_inv_ds, & - & local_q_cons_val, local_k_term_val, & - & local_term_coeff, local_flux1, & - & local_flux2]') - do l_idx = 0, p; do k_idx = 0, n; do q_idx = 0, m - local_inv_ds = 1._wp/dy(k_idx) - local_q_cons_val = q_cons_vf_arg%vf(advxb)%sf(q_idx, k_idx, l_idx) - local_k_term_val = Kterm_arg(q_idx, k_idx, l_idx) ! Access is safe - local_term_coeff = local_q_cons_val + local_k_term_val - local_flux1 = flux_src_n_vf_arg%vf(advxb)%sf(q_idx, k_idx, l_idx) - local_flux2 = flux_src_n_vf_arg%vf(advxb)%sf(q_idx, k_idx - 1, l_idx) - rhs_vf_arg(advxb)%sf(q_idx, k_idx, l_idx) = rhs_vf_arg(advxb)%sf(q_idx, k_idx, l_idx) + & - local_inv_ds*local_term_coeff*(local_flux1 - local_flux2) - if (cyl_coord) then + #:call GPU_PARALLEL_LOOP(collapse=3, private='[local_inv_ds, local_q_cons_val, local_k_term_val, local_term_coeff, local_flux1, local_flux2]') + do l_idx = 0, p; do k_idx = 0, n; do q_idx = 0, m + local_inv_ds = 1._wp/dy(k_idx) + local_q_cons_val = q_cons_vf_arg%vf(advxe)%sf(q_idx, k_idx, l_idx) + local_k_term_val = Kterm_arg(q_idx, k_idx, l_idx) ! Access is safe + local_term_coeff = local_q_cons_val - local_k_term_val + local_flux1 = flux_src_n_vf_arg%vf(advxe)%sf(q_idx, k_idx, l_idx) + local_flux2 = flux_src_n_vf_arg%vf(advxe)%sf(q_idx, k_idx - 1, l_idx) + rhs_vf_arg(advxe)%sf(q_idx, k_idx, l_idx) = rhs_vf_arg(advxe)%sf(q_idx, k_idx, l_idx) + & + local_inv_ds*local_term_coeff*(local_flux1 - local_flux2) + if (cyl_coord) then + rhs_vf_arg(advxe)%sf(q_idx, k_idx, l_idx) = rhs_vf_arg(advxe)%sf(q_idx, k_idx, l_idx) - & + (local_k_term_val/(2._wp*y_cc(k_idx)))*(local_flux1 + local_flux2) + end if + end do; end do; end do + #:endcall GPU_PARALLEL_LOOP + + #:call GPU_PARALLEL_LOOP(collapse=3, private='[local_inv_ds, local_q_cons_val, local_k_term_val,local_term_coeff, local_flux1, local_flux2]') + do l_idx = 0, p; do k_idx = 0, n; do q_idx = 0, m + local_inv_ds = 1._wp/dy(k_idx) + local_q_cons_val = q_cons_vf_arg%vf(advxb)%sf(q_idx, k_idx, l_idx) + local_k_term_val = Kterm_arg(q_idx, k_idx, l_idx) ! Access is safe + local_term_coeff = local_q_cons_val + local_k_term_val + local_flux1 = flux_src_n_vf_arg%vf(advxb)%sf(q_idx, k_idx, l_idx) + local_flux2 = flux_src_n_vf_arg%vf(advxb)%sf(q_idx, k_idx - 1, l_idx) rhs_vf_arg(advxb)%sf(q_idx, k_idx, l_idx) = rhs_vf_arg(advxb)%sf(q_idx, k_idx, l_idx) + & - (local_k_term_val/(2._wp*y_cc(k_idx)))*(local_flux1 + local_flux2) - end if - end do; end do; end do + local_inv_ds*local_term_coeff*(local_flux1 - local_flux2) + if (cyl_coord) then + rhs_vf_arg(advxb)%sf(q_idx, k_idx, l_idx) = rhs_vf_arg(advxb)%sf(q_idx, k_idx, l_idx) + & + (local_k_term_val/(2._wp*y_cc(k_idx)))*(local_flux1 + local_flux2) + end if + end do; end do; end do + #:endcall GPU_PARALLEL_LOOP end if else ! NOT alt_soundspeed - $:GPU_PARALLEL_LOOP(collapse=4,private='[local_inv_ds, & - & local_term_coeff,local_flux1,local_flux2]') - do j_adv = advxb, advxe - do l_idx = 0, p; do k_idx = 0, n; do q_idx = 0, m - local_inv_ds = 1._wp/dy(k_idx) - local_term_coeff = q_cons_vf_arg%vf(j_adv)%sf(q_idx, k_idx, l_idx) - local_flux1 = flux_src_n_vf_arg%vf(j_adv)%sf(q_idx, k_idx, l_idx) - local_flux2 = flux_src_n_vf_arg%vf(j_adv)%sf(q_idx, k_idx - 1, l_idx) - rhs_vf_arg(j_adv)%sf(q_idx, k_idx, l_idx) = rhs_vf_arg(j_adv)%sf(q_idx, k_idx, l_idx) + & - local_inv_ds*local_term_coeff*(local_flux1 - local_flux2) - end do; end do; end do - end do + #:call GPU_PARALLEL_LOOP(collapse=4,private='[local_inv_ds, local_term_coeff,local_flux1,local_flux2]') + do j_adv = advxb, advxe + do l_idx = 0, p; do k_idx = 0, n; do q_idx = 0, m + local_inv_ds = 1._wp/dy(k_idx) + local_term_coeff = q_cons_vf_arg%vf(j_adv)%sf(q_idx, k_idx, l_idx) + local_flux1 = flux_src_n_vf_arg%vf(j_adv)%sf(q_idx, k_idx, l_idx) + local_flux2 = flux_src_n_vf_arg%vf(j_adv)%sf(q_idx, k_idx - 1, l_idx) + rhs_vf_arg(j_adv)%sf(q_idx, k_idx, l_idx) = rhs_vf_arg(j_adv)%sf(q_idx, k_idx, l_idx) + & + local_inv_ds*local_term_coeff*(local_flux1 - local_flux2) + end do; end do; end do + end do + #:endcall GPU_PARALLEL_LOOP end if end if @@ -1458,68 +1470,64 @@ contains end if if (use_standard_riemann) then - $:GPU_PARALLEL_LOOP(collapse=4,private='[local_inv_ds, & - & local_term_coeff,local_flux1,local_flux2]') - do j_adv = advxb, advxe - do k_idx = 0, p ! z_extent - do q_idx = 0, n ! y_extent - do l_idx = 0, m ! x_extent - local_inv_ds = 1._wp/dz(k_idx) - local_term_coeff = q_prim_vf_arg%vf(contxe + current_idir)%sf(l_idx, q_idx, k_idx) - local_flux1 = flux_src_n_vf_arg%vf(j_adv)%sf(l_idx, q_idx, k_idx - 1) - local_flux2 = flux_src_n_vf_arg%vf(j_adv)%sf(l_idx, q_idx, k_idx) - rhs_vf_arg(j_adv)%sf(l_idx, q_idx, k_idx) = rhs_vf_arg(j_adv)%sf(l_idx, q_idx, k_idx) + & - local_inv_ds*local_term_coeff*(local_flux1 - local_flux2) + #:call GPU_PARALLEL_LOOP(collapse=4,private='[local_inv_ds, local_term_coeff,local_flux1,local_flux2]') + do j_adv = advxb, advxe + do k_idx = 0, p ! z_extent + do q_idx = 0, n ! y_extent + do l_idx = 0, m ! x_extent + local_inv_ds = 1._wp/dz(k_idx) + local_term_coeff = q_prim_vf_arg%vf(contxe + current_idir)%sf(l_idx, q_idx, k_idx) + local_flux1 = flux_src_n_vf_arg%vf(j_adv)%sf(l_idx, q_idx, k_idx - 1) + local_flux2 = flux_src_n_vf_arg%vf(j_adv)%sf(l_idx, q_idx, k_idx) + rhs_vf_arg(j_adv)%sf(l_idx, q_idx, k_idx) = rhs_vf_arg(j_adv)%sf(l_idx, q_idx, k_idx) + & + local_inv_ds*local_term_coeff*(local_flux1 - local_flux2) + end do end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP else ! Other Riemann solvers if (alt_soundspeed) then if (bubbles_euler .neqv. .true.) then - $:GPU_PARALLEL_LOOP(collapse=3, private='[local_inv_ds, & - & local_q_cons_val, local_k_term_val, & - & local_term_coeff, local_flux1, & - & local_flux2]') - do k_idx = 0, p; do q_idx = 0, n; do l_idx = 0, m - local_inv_ds = 1._wp/dz(k_idx) - local_q_cons_val = q_cons_vf_arg%vf(advxe)%sf(l_idx, q_idx, k_idx) - local_k_term_val = Kterm_arg(l_idx, q_idx, k_idx) ! Access is safe - local_term_coeff = local_q_cons_val - local_k_term_val - local_flux1 = flux_src_n_vf_arg%vf(advxe)%sf(l_idx, q_idx, k_idx) - local_flux2 = flux_src_n_vf_arg%vf(advxe)%sf(l_idx, q_idx, k_idx - 1) - rhs_vf_arg(advxe)%sf(l_idx, q_idx, k_idx) = rhs_vf_arg(advxe)%sf(l_idx, q_idx, k_idx) + & - local_inv_ds*local_term_coeff*(local_flux1 - local_flux2) - end do; end do; end do - - $:GPU_PARALLEL_LOOP(collapse=3, private='[local_inv_ds, & - & local_q_cons_val, local_k_term_val, & - & local_term_coeff, local_flux1, & - & local_flux2]') - do k_idx = 0, p; do q_idx = 0, n; do l_idx = 0, m - local_inv_ds = 1._wp/dz(k_idx) - local_q_cons_val = q_cons_vf_arg%vf(advxb)%sf(l_idx, q_idx, k_idx) - local_k_term_val = Kterm_arg(l_idx, q_idx, k_idx) ! Access is safe - local_term_coeff = local_q_cons_val + local_k_term_val - local_flux1 = flux_src_n_vf_arg%vf(advxb)%sf(l_idx, q_idx, k_idx) - local_flux2 = flux_src_n_vf_arg%vf(advxb)%sf(l_idx, q_idx, k_idx - 1) - rhs_vf_arg(advxb)%sf(l_idx, q_idx, k_idx) = rhs_vf_arg(advxb)%sf(l_idx, q_idx, k_idx) + & - local_inv_ds*local_term_coeff*(local_flux1 - local_flux2) - end do; end do; end do + #:call GPU_PARALLEL_LOOP(collapse=3, private='[local_inv_ds,local_q_cons_val, local_k_term_val, local_term_coeff, local_flux1, local_flux2]') + do k_idx = 0, p; do q_idx = 0, n; do l_idx = 0, m + local_inv_ds = 1._wp/dz(k_idx) + local_q_cons_val = q_cons_vf_arg%vf(advxe)%sf(l_idx, q_idx, k_idx) + local_k_term_val = Kterm_arg(l_idx, q_idx, k_idx) ! Access is safe + local_term_coeff = local_q_cons_val - local_k_term_val + local_flux1 = flux_src_n_vf_arg%vf(advxe)%sf(l_idx, q_idx, k_idx) + local_flux2 = flux_src_n_vf_arg%vf(advxe)%sf(l_idx, q_idx, k_idx - 1) + rhs_vf_arg(advxe)%sf(l_idx, q_idx, k_idx) = rhs_vf_arg(advxe)%sf(l_idx, q_idx, k_idx) + & + local_inv_ds*local_term_coeff*(local_flux1 - local_flux2) + end do; end do; end do + #:endcall GPU_PARALLEL_LOOP + + #:call GPU_PARALLEL_LOOP(collapse=3, private='[local_inv_ds, local_q_cons_val, local_k_term_val, local_term_coeff, local_flux1, local_flux2]') + do k_idx = 0, p; do q_idx = 0, n; do l_idx = 0, m + local_inv_ds = 1._wp/dz(k_idx) + local_q_cons_val = q_cons_vf_arg%vf(advxb)%sf(l_idx, q_idx, k_idx) + local_k_term_val = Kterm_arg(l_idx, q_idx, k_idx) ! Access is safe + local_term_coeff = local_q_cons_val + local_k_term_val + local_flux1 = flux_src_n_vf_arg%vf(advxb)%sf(l_idx, q_idx, k_idx) + local_flux2 = flux_src_n_vf_arg%vf(advxb)%sf(l_idx, q_idx, k_idx - 1) + rhs_vf_arg(advxb)%sf(l_idx, q_idx, k_idx) = rhs_vf_arg(advxb)%sf(l_idx, q_idx, k_idx) + & + local_inv_ds*local_term_coeff*(local_flux1 - local_flux2) + end do; end do; end do + #:endcall GPU_PARALLEL_LOOP end if else ! NOT alt_soundspeed - $:GPU_PARALLEL_LOOP(collapse=4,private='[local_inv_ds, & - & local_term_coeff,local_flux1,local_flux2]') - do j_adv = advxb, advxe - do k_idx = 0, p; do q_idx = 0, n; do l_idx = 0, m - local_inv_ds = 1._wp/dz(k_idx) - local_term_coeff = q_cons_vf_arg%vf(j_adv)%sf(l_idx, q_idx, k_idx) - local_flux1 = flux_src_n_vf_arg%vf(j_adv)%sf(l_idx, q_idx, k_idx) - local_flux2 = flux_src_n_vf_arg%vf(j_adv)%sf(l_idx, q_idx, k_idx - 1) - rhs_vf_arg(j_adv)%sf(l_idx, q_idx, k_idx) = rhs_vf_arg(j_adv)%sf(l_idx, q_idx, k_idx) + & - local_inv_ds*local_term_coeff*(local_flux1 - local_flux2) - end do; end do; end do - end do + #:call GPU_PARALLEL_LOOP(collapse=4,private='[local_inv_ds, local_term_coeff,local_flux1,local_flux2]') + do j_adv = advxb, advxe + do k_idx = 0, p; do q_idx = 0, n; do l_idx = 0, m + local_inv_ds = 1._wp/dz(k_idx) + local_term_coeff = q_cons_vf_arg%vf(j_adv)%sf(l_idx, q_idx, k_idx) + local_flux1 = flux_src_n_vf_arg%vf(j_adv)%sf(l_idx, q_idx, k_idx) + local_flux2 = flux_src_n_vf_arg%vf(j_adv)%sf(l_idx, q_idx, k_idx - 1) + rhs_vf_arg(j_adv)%sf(l_idx, q_idx, k_idx) = rhs_vf_arg(j_adv)%sf(l_idx, q_idx, k_idx) + & + local_inv_ds*local_term_coeff*(local_flux1 - local_flux2) + end do; end do; end do + end do + #:endcall GPU_PARALLEL_LOOP end if end if end select @@ -1541,71 +1549,74 @@ contains if (idir == 1) then ! x-direction if (surface_tension) then - $:GPU_PARALLEL_LOOP(collapse=3) - do l = 0, p - do k = 0, n - do j = 0, m - rhs_vf(c_idx)%sf(j, k, l) = & - rhs_vf(c_idx)%sf(j, k, l) + 1._wp/dx(j)* & - q_prim_vf(c_idx)%sf(j, k, l)* & - (flux_src_n_in(advxb)%sf(j, k, l) - & - flux_src_n_in(advxb)%sf(j - 1, k, l)) + #:call GPU_PARALLEL_LOOP(collapse=3) + do l = 0, p + do k = 0, n + do j = 0, m + rhs_vf(c_idx)%sf(j, k, l) = & + rhs_vf(c_idx)%sf(j, k, l) + 1._wp/dx(j)* & + q_prim_vf(c_idx)%sf(j, k, l)* & + (flux_src_n_in(advxb)%sf(j, k, l) - & + flux_src_n_in(advxb)%sf(j - 1, k, l)) + end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP end if if ((surface_tension .or. viscous) .or. chem_params%diffusion) then - $:GPU_PARALLEL_LOOP(collapse=3) - do l = 0, p - do k = 0, n - do j = 0, m - if (surface_tension .or. viscous) then - $:GPU_LOOP(parallelism='[seq]') - do i = momxb, E_idx - rhs_vf(i)%sf(j, k, l) = & - rhs_vf(i)%sf(j, k, l) + 1._wp/dx(j)* & - (flux_src_n_in(i)%sf(j - 1, k, l) & - - flux_src_n_in(i)%sf(j, k, l)) - end do - end if + #:call GPU_PARALLEL_LOOP(collapse=3) + do l = 0, p + do k = 0, n + do j = 0, m + if (surface_tension .or. viscous) then + $:GPU_LOOP(parallelism='[seq]') + do i = momxb, E_idx + rhs_vf(i)%sf(j, k, l) = & + rhs_vf(i)%sf(j, k, l) + 1._wp/dx(j)* & + (flux_src_n_in(i)%sf(j - 1, k, l) & + - flux_src_n_in(i)%sf(j, k, l)) + end do + end if - if (chem_params%diffusion) then - $:GPU_LOOP(parallelism='[seq]') - do i = chemxb, chemxe - rhs_vf(i)%sf(j, k, l) = & - rhs_vf(i)%sf(j, k, l) + 1._wp/dx(j)* & - (flux_src_n_in(i)%sf(j - 1, k, l) & - - flux_src_n_in(i)%sf(j, k, l)) - end do + if (chem_params%diffusion) then + $:GPU_LOOP(parallelism='[seq]') + do i = chemxb, chemxe + rhs_vf(i)%sf(j, k, l) = & + rhs_vf(i)%sf(j, k, l) + 1._wp/dx(j)* & + (flux_src_n_in(i)%sf(j - 1, k, l) & + - flux_src_n_in(i)%sf(j, k, l)) + end do - if (.not. viscous) then - rhs_vf(E_idx)%sf(j, k, l) = & - rhs_vf(E_idx)%sf(j, k, l) + 1._wp/dx(j)* & - (flux_src_n_in(E_idx)%sf(j - 1, k, l) & - - flux_src_n_in(E_idx)%sf(j, k, l)) + if (.not. viscous) then + rhs_vf(E_idx)%sf(j, k, l) = & + rhs_vf(E_idx)%sf(j, k, l) + 1._wp/dx(j)* & + (flux_src_n_in(E_idx)%sf(j - 1, k, l) & + - flux_src_n_in(E_idx)%sf(j, k, l)) + end if end if - end if + end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP end if elseif (idir == 2) then ! y-direction if (surface_tension) then - $:GPU_PARALLEL_LOOP(collapse=3) - do l = 0, p - do k = 0, n - do j = 0, m - rhs_vf(c_idx)%sf(j, k, l) = & - rhs_vf(c_idx)%sf(j, k, l) + 1._wp/dy(k)* & - q_prim_vf(c_idx)%sf(j, k, l)* & - (flux_src_n_in(advxb)%sf(j, k, l) - & - flux_src_n_in(advxb)%sf(j, k - 1, l)) + #:call GPU_PARALLEL_LOOP(collapse=3) + do l = 0, p + do k = 0, n + do j = 0, m + rhs_vf(c_idx)%sf(j, k, l) = & + rhs_vf(c_idx)%sf(j, k, l) + 1._wp/dy(k)* & + q_prim_vf(c_idx)%sf(j, k, l)* & + (flux_src_n_in(advxb)%sf(j, k, l) - & + flux_src_n_in(advxb)%sf(j, k - 1, l)) + end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP end if if (cyl_coord .and. ((bc_y%beg == -2) .or. (bc_y%beg == -14))) then @@ -1626,71 +1637,74 @@ contains idwbuff(1), idwbuff(2), idwbuff(3)) end if - $:GPU_PARALLEL_LOOP(collapse=2) - do l = 0, p - do j = 0, m - $:GPU_LOOP(parallelism='[seq]') - do i = momxb, E_idx - rhs_vf(i)%sf(j, 0, l) = & - rhs_vf(i)%sf(j, 0, l) + 1._wp/(y_cc(1) - y_cc(-1))* & - (tau_Re_vf(i)%sf(j, -1, l) & - - tau_Re_vf(i)%sf(j, 1, l)) + #:call GPU_PARALLEL_LOOP(collapse=2) + do l = 0, p + do j = 0, m + $:GPU_LOOP(parallelism='[seq]') + do i = momxb, E_idx + rhs_vf(i)%sf(j, 0, l) = & + rhs_vf(i)%sf(j, 0, l) + 1._wp/(y_cc(1) - y_cc(-1))* & + (tau_Re_vf(i)%sf(j, -1, l) & + - tau_Re_vf(i)%sf(j, 1, l)) + end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP end if - $:GPU_PARALLEL_LOOP(collapse=3) - do l = 0, p - do k = 1, n - do j = 0, m - $:GPU_LOOP(parallelism='[seq]') - do i = momxb, E_idx - rhs_vf(i)%sf(j, k, l) = & - rhs_vf(i)%sf(j, k, l) + 1._wp/dy(k)* & - (flux_src_n_in(i)%sf(j, k - 1, l) & - - flux_src_n_in(i)%sf(j, k, l)) + #:call GPU_PARALLEL_LOOP(collapse=3) + do l = 0, p + do k = 1, n + do j = 0, m + $:GPU_LOOP(parallelism='[seq]') + do i = momxb, E_idx + rhs_vf(i)%sf(j, k, l) = & + rhs_vf(i)%sf(j, k, l) + 1._wp/dy(k)* & + (flux_src_n_in(i)%sf(j, k - 1, l) & + - flux_src_n_in(i)%sf(j, k, l)) + end do end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP else if ((surface_tension .or. viscous) .or. chem_params%diffusion) then - $:GPU_PARALLEL_LOOP(collapse=3) - do l = 0, p - do k = 0, n - do j = 0, m - if (surface_tension .or. viscous) then - $:GPU_LOOP(parallelism='[seq]') - do i = momxb, E_idx - rhs_vf(i)%sf(j, k, l) = & - rhs_vf(i)%sf(j, k, l) + 1._wp/dy(k)* & - (flux_src_n_in(i)%sf(j, k - 1, l) & - - flux_src_n_in(i)%sf(j, k, l)) - end do - end if + #:call GPU_PARALLEL_LOOP(collapse=3) + do l = 0, p + do k = 0, n + do j = 0, m + if (surface_tension .or. viscous) then + $:GPU_LOOP(parallelism='[seq]') + do i = momxb, E_idx + rhs_vf(i)%sf(j, k, l) = & + rhs_vf(i)%sf(j, k, l) + 1._wp/dy(k)* & + (flux_src_n_in(i)%sf(j, k - 1, l) & + - flux_src_n_in(i)%sf(j, k, l)) + end do + end if - if (chem_params%diffusion) then - $:GPU_LOOP(parallelism='[seq]') - do i = chemxb, chemxe - rhs_vf(i)%sf(j, k, l) = & - rhs_vf(i)%sf(j, k, l) + 1._wp/dy(k)* & - (flux_src_n_in(i)%sf(j, k - 1, l) & - - flux_src_n_in(i)%sf(j, k, l)) - end do - if (.not. viscous) then - rhs_vf(E_idx)%sf(j, k, l) = & - rhs_vf(E_idx)%sf(j, k, l) + 1._wp/dy(k)* & - (flux_src_n_in(E_idx)%sf(j, k - 1, l) & - - flux_src_n_in(E_idx)%sf(j, k, l)) + if (chem_params%diffusion) then + $:GPU_LOOP(parallelism='[seq]') + do i = chemxb, chemxe + rhs_vf(i)%sf(j, k, l) = & + rhs_vf(i)%sf(j, k, l) + 1._wp/dy(k)* & + (flux_src_n_in(i)%sf(j, k - 1, l) & + - flux_src_n_in(i)%sf(j, k, l)) + end do + if (.not. viscous) then + rhs_vf(E_idx)%sf(j, k, l) = & + rhs_vf(E_idx)%sf(j, k, l) + 1._wp/dy(k)* & + (flux_src_n_in(E_idx)%sf(j, k - 1, l) & + - flux_src_n_in(E_idx)%sf(j, k, l)) + end if end if - end if + end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP end if end if @@ -1699,123 +1713,108 @@ contains if (cyl_coord) then if ((bc_y%beg == -2) .or. (bc_y%beg == -14)) then - $:GPU_PARALLEL_LOOP(collapse=3) - do l = 0, p - do k = 1, n - do j = 0, m - $:GPU_LOOP(parallelism='[seq]') - do i = momxb, E_idx - rhs_vf(i)%sf(j, k, l) = & - rhs_vf(i)%sf(j, k, l) - 5.e-1_wp/y_cc(k)* & - (flux_src_n_in(i)%sf(j, k - 1, l) & - + flux_src_n_in(i)%sf(j, k, l)) + #:call GPU_PARALLEL_LOOP(collapse=3) + do l = 0, p + do k = 1, n + do j = 0, m + $:GPU_LOOP(parallelism='[seq]') + do i = momxb, E_idx + rhs_vf(i)%sf(j, k, l) = & + rhs_vf(i)%sf(j, k, l) - 5.e-1_wp/y_cc(k)* & + (flux_src_n_in(i)%sf(j, k - 1, l) & + + flux_src_n_in(i)%sf(j, k, l)) + end do end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP if (viscous) then - $:GPU_PARALLEL_LOOP(collapse=2) - do l = 0, p - do j = 0, m - $:GPU_LOOP(parallelism='[seq]') - do i = momxb, E_idx - rhs_vf(i)%sf(j, 0, l) = & - rhs_vf(i)%sf(j, 0, l) - 1._wp/y_cc(0)* & - tau_Re_vf(i)%sf(j, 0, l) + #:call GPU_PARALLEL_LOOP(collapse=2) + do l = 0, p + do j = 0, m + $:GPU_LOOP(parallelism='[seq]') + do i = momxb, E_idx + rhs_vf(i)%sf(j, 0, l) = & + rhs_vf(i)%sf(j, 0, l) - 1._wp/y_cc(0)* & + tau_Re_vf(i)%sf(j, 0, l) + end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP end if else - $:GPU_PARALLEL_LOOP(collapse=3) - do l = 0, p - do k = 0, n - do j = 0, m - $:GPU_LOOP(parallelism='[seq]') - do i = momxb, E_idx - rhs_vf(i)%sf(j, k, l) = & - rhs_vf(i)%sf(j, k, l) - 5.e-1_wp/y_cc(k)* & - (flux_src_n_in(i)%sf(j, k - 1, l) & - + flux_src_n_in(i)%sf(j, k, l)) + #:call GPU_PARALLEL_LOOP(collapse=3) + do l = 0, p + do k = 0, n + do j = 0, m + $:GPU_LOOP(parallelism='[seq]') + do i = momxb, E_idx + rhs_vf(i)%sf(j, k, l) = & + rhs_vf(i)%sf(j, k, l) - 5.e-1_wp/y_cc(k)* & + (flux_src_n_in(i)%sf(j, k - 1, l) & + + flux_src_n_in(i)%sf(j, k, l)) + end do end do end do end do - end do - + #:endcall GPU_PARALLEL_LOOP end if end if elseif (idir == 3) then ! z-direction if (surface_tension) then - $:GPU_PARALLEL_LOOP(collapse=3) - do l = 0, p - do k = 0, n - do j = 0, m - rhs_vf(c_idx)%sf(j, k, l) = & - rhs_vf(c_idx)%sf(j, k, l) + 1._wp/dz(l)* & - q_prim_vf(c_idx)%sf(j, k, l)* & - (flux_src_n_in(advxb)%sf(j, k, l) - & - flux_src_n_in(advxb)%sf(j, k, l - 1)) + #:call GPU_PARALLEL_LOOP(collapse=3) + do l = 0, p + do k = 0, n + do j = 0, m + rhs_vf(c_idx)%sf(j, k, l) = & + rhs_vf(c_idx)%sf(j, k, l) + 1._wp/dz(l)* & + q_prim_vf(c_idx)%sf(j, k, l)* & + (flux_src_n_in(advxb)%sf(j, k, l) - & + flux_src_n_in(advxb)%sf(j, k, l - 1)) + end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP end if - if ((surface_tension .or. viscous) .or. chem_params%diffusion) then - $:GPU_PARALLEL_LOOP(collapse=3) + #:call GPU_PARALLEL_LOOP(collapse=3) do l = 0, p do k = 0, n do j = 0, m - if (surface_tension .or. viscous) then - $:GPU_LOOP(parallelism='[seq]') - do i = momxb, E_idx - rhs_vf(i)%sf(j, k, l) = & - rhs_vf(i)%sf(j, k, l) + 1._wp/dz(l)* & - (flux_src_n_in(i)%sf(j, k, l - 1) & - - flux_src_n_in(i)%sf(j, k, l)) - end do - end if - - if (chem_params%diffusion) then - $:GPU_LOOP(parallelism='[seq]') - do i = chemxb, chemxe - rhs_vf(i)%sf(j, k, l) = & - rhs_vf(i)%sf(j, k, l) + 1._wp/dz(l)* & - (flux_src_n_in(i)%sf(j, k, l - 1) & - - flux_src_n_in(i)%sf(j, k, l)) - end do - if (.not. viscous) then - rhs_vf(E_idx)%sf(j, k, l) = & - rhs_vf(E_idx)%sf(j, k, l) + 1._wp/dz(l)* & - (flux_src_n_in(E_idx)%sf(j, k, l - 1) & - - flux_src_n_in(E_idx)%sf(j, k, l)) - end if - end if + $:GPU_LOOP(parallelism='[seq]') + do i = momxb, E_idx + rhs_vf(i)%sf(j, k, l) = & + rhs_vf(i)%sf(j, k, l) + 1._wp/dz(l)* & + (flux_src_n_in(i)%sf(j, k, l - 1) & + - flux_src_n_in(i)%sf(j, k, l)) + end do end do end do end do - end if + #:endcall GPU_PARALLEL_LOOP if (grid_geometry == 3) then - $:GPU_PARALLEL_LOOP(collapse=3) - do l = 0, p - do k = 0, n - do j = 0, m - rhs_vf(momxb + 1)%sf(j, k, l) = & - rhs_vf(momxb + 1)%sf(j, k, l) + 5.e-1_wp* & - (flux_src_n_in(momxe)%sf(j, k, l - 1) & - + flux_src_n_in(momxe)%sf(j, k, l)) - - rhs_vf(momxe)%sf(j, k, l) = & - rhs_vf(momxe)%sf(j, k, l) - 5.e-1_wp* & - (flux_src_n_in(momxb + 1)%sf(j, k, l - 1) & - + flux_src_n_in(momxb + 1)%sf(j, k, l)) + #:call GPU_PARALLEL_LOOP(collapse=3) + do l = 0, p + do k = 0, n + do j = 0, m + rhs_vf(momxb + 1)%sf(j, k, l) = & + rhs_vf(momxb + 1)%sf(j, k, l) + 5.e-1_wp* & + (flux_src_n_in(momxe)%sf(j, k, l - 1) & + + flux_src_n_in(momxe)%sf(j, k, l)) + + rhs_vf(momxe)%sf(j, k, l) = & + rhs_vf(momxe)%sf(j, k, l) - 5.e-1_wp* & + (flux_src_n_in(momxb + 1)%sf(j, k, l - 1) & + + flux_src_n_in(momxb + 1)%sf(j, k, l)) + end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP end if end if @@ -1927,41 +1926,44 @@ contains $:GPU_UPDATE(device='[is1,is2,is3,iv]') if (recon_dir == 1) then - $:GPU_PARALLEL_LOOP(collapse=4) - do i = iv%beg, iv%end - do l = is3%beg, is3%end - do k = is2%beg, is2%end - do j = is1%beg, is1%end - vL_x(j, k, l, i) = v_vf(i)%sf(j, k, l) - vR_x(j, k, l, i) = v_vf(i)%sf(j, k, l) + #:call GPU_PARALLEL_LOOP(collapse=4) + do i = iv%beg, iv%end + do l = is3%beg, is3%end + do k = is2%beg, is2%end + do j = is1%beg, is1%end + vL_x(j, k, l, i) = v_vf(i)%sf(j, k, l) + vR_x(j, k, l, i) = v_vf(i)%sf(j, k, l) + end do end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP else if (recon_dir == 2) then - $:GPU_PARALLEL_LOOP(collapse=4) - do i = iv%beg, iv%end - do l = is3%beg, is3%end - do k = is2%beg, is2%end - do j = is1%beg, is1%end - vL_y(j, k, l, i) = v_vf(i)%sf(k, j, l) - vR_y(j, k, l, i) = v_vf(i)%sf(k, j, l) + #:call GPU_PARALLEL_LOOP(collapse=4) + do i = iv%beg, iv%end + do l = is3%beg, is3%end + do k = is2%beg, is2%end + do j = is1%beg, is1%end + vL_y(j, k, l, i) = v_vf(i)%sf(k, j, l) + vR_y(j, k, l, i) = v_vf(i)%sf(k, j, l) + end do end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP else if (recon_dir == 3) then - $:GPU_PARALLEL_LOOP(collapse=4) - do i = iv%beg, iv%end - do l = is3%beg, is3%end - do k = is2%beg, is2%end - do j = is1%beg, is1%end - vL_z(j, k, l, i) = v_vf(i)%sf(l, k, j) - vR_z(j, k, l, i) = v_vf(i)%sf(l, k, j) + #:call GPU_PARALLEL_LOOP(collapse=4) + do i = iv%beg, iv%end + do l = is3%beg, is3%end + do k = is2%beg, is2%end + do j = is1%beg, is1%end + vL_z(j, k, l, i) = v_vf(i)%sf(l, k, j) + vR_z(j, k, l, i) = v_vf(i)%sf(l, k, j) + end do end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP end if end if #:endfor diff --git a/src/simulation/m_riemann_solvers.fpp b/src/simulation/m_riemann_solvers.fpp index b4db230d75..6cbc16e978 100644 --- a/src/simulation/m_riemann_solvers.fpp +++ b/src/simulation/m_riemann_solvers.fpp @@ -107,11 +107,11 @@ module m_riemann_solvers $:GPU_DECLARE(create='[is1,is2,is3,isx,isy,isz]') - real(wp), allocatable, dimension(:) :: Gs - $:GPU_DECLARE(create='[Gs]') + real(wp), allocatable, dimension(:) :: Gs_rs + $:GPU_DECLARE(create='[Gs_rs]') - real(wp), allocatable, dimension(:, :) :: Res - $:GPU_DECLARE(create='[Res]') + real(wp), allocatable, dimension(:, :) :: Res_gs + $:GPU_DECLARE(create='[Res_gs]') contains @@ -203,17 +203,17 @@ contains !! For more information please refer to: !! 1) s_compute_cartesian_viscous_source_flux !! 2) s_compute_cylindrical_viscous_source_flux - pure subroutine s_compute_viscous_source_flux(velL_vf, & - dvelL_dx_vf, & - dvelL_dy_vf, & - dvelL_dz_vf, & - velR_vf, & - dvelR_dx_vf, & - dvelR_dy_vf, & - dvelR_dz_vf, & - flux_src_vf, & - norm_dir, & - ix, iy, iz) + subroutine s_compute_viscous_source_flux(velL_vf, & + dvelL_dx_vf, & + dvelL_dy_vf, & + dvelL_dz_vf, & + velR_vf, & + dvelR_dx_vf, & + dvelR_dy_vf, & + dvelR_dz_vf, & + flux_src_vf, & + norm_dir, & + ix, iy, iz) type(scalar_field), & dimension(num_vels), & @@ -357,654 +357,649 @@ contains #:for NORM_DIR, XYZ in [(1, 'x'), (2, 'y'), (3, 'z')] if (norm_dir == ${NORM_DIR}$) then - $:GPU_PARALLEL_LOOP(collapse=3, private='[alpha_rho_L, alpha_rho_R, & - & vel_L, vel_R, alpha_L, alpha_R, tau_e_L, tau_e_R, & - & G_L, G_R, Re_L, Re_R, rho_avg, h_avg, gamma_avg, & - & s_L, s_R, s_S, Ys_L, Ys_R, xi_field_L, xi_field_R, & - & Cp_iL, Cp_iR, Xs_L, Xs_R, Gamma_iL, Gamma_iR, & - & Yi_avg, Phi_avg, h_iL, h_iR, h_avg_2, c_fast, & - & pres_mag, B, Ga, vdotB, B2, b4, cm, pcorr, & - & zcoef, vel_L_tmp, vel_R_tmp]') - do l = is3%beg, is3%end - do k = is2%beg, is2%end - do j = is1%beg, is1%end - $:GPU_LOOP(parallelism='[seq]') - do i = 1, contxe - alpha_rho_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, i) - alpha_rho_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, i) - end do + #:call GPU_PARALLEL_LOOP(collapse=3, private='[alpha_rho_L, alpha_rho_R, vel_L, vel_R, alpha_L, alpha_R, tau_e_L, tau_e_R,G_L, G_R, Re_L, Re_R, rho_avg, h_avg, gamma_avg, s_L, s_R, s_S, Ys_L, Ys_R, xi_field_L, xi_field_R, Cp_iL, Cp_iR, Xs_L, Xs_R, Gamma_iL, Gamma_iR, Yi_avg, Phi_avg, h_iL, h_iR, h_avg_2, c_fast, pres_mag, B, Ga, vdotB, B2, b4, cm, pcorr, zcoef, vel_L_tmp, vel_R_tmp]') + do l = is3%beg, is3%end + do k = is2%beg, is2%end + do j = is1%beg, is1%end + $:GPU_LOOP(parallelism='[seq]') + do i = 1, contxe + alpha_rho_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, i) + alpha_rho_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, i) + end do - vel_L_rms = 0._wp; vel_R_rms = 0._wp + vel_L_rms = 0._wp; vel_R_rms = 0._wp - $:GPU_LOOP(parallelism='[seq]') - do i = 1, num_vels - vel_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, contxe + i) - vel_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, contxe + i) - vel_L_rms = vel_L_rms + vel_L(i)**2._wp - vel_R_rms = vel_R_rms + vel_R(i)**2._wp - end do + $:GPU_LOOP(parallelism='[seq]') + do i = 1, num_vels + vel_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, contxe + i) + vel_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, contxe + i) + vel_L_rms = vel_L_rms + vel_L(i)**2._wp + vel_R_rms = vel_R_rms + vel_R(i)**2._wp + end do - $:GPU_LOOP(parallelism='[seq]') - do i = 1, num_fluids - alpha_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i) - alpha_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + i) - end do + $:GPU_LOOP(parallelism='[seq]') + do i = 1, num_fluids + alpha_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i) + alpha_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + i) + end do + + pres_L = qL_prim_rs${XYZ}$_vf(j, k, l, E_idx) + pres_R = qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx) - pres_L = qL_prim_rs${XYZ}$_vf(j, k, l, E_idx) - pres_R = qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx) - - if (mhd) then - if (n == 0) then ! 1D: constant Bx; By, Bz as variables - B%L(1) = Bx0 - B%R(1) = Bx0 - B%L(2) = qL_prim_rs${XYZ}$_vf(j, k, l, B_idx%beg) - B%R(2) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, B_idx%beg) - B%L(3) = qL_prim_rs${XYZ}$_vf(j, k, l, B_idx%beg + 1) - B%R(3) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, B_idx%beg + 1) - else ! 2D/3D: Bx, By, Bz as variables - B%L(1) = qL_prim_rs${XYZ}$_vf(j, k, l, B_idx%beg) - B%R(1) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, B_idx%beg) - B%L(2) = qL_prim_rs${XYZ}$_vf(j, k, l, B_idx%beg + 1) - B%R(2) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, B_idx%beg + 1) - B%L(3) = qL_prim_rs${XYZ}$_vf(j, k, l, B_idx%beg + 2) - B%R(3) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, B_idx%beg + 2) + if (mhd) then + if (n == 0) then ! 1D: constant Bx; By, Bz as variables + B%L(1) = Bx0 + B%R(1) = Bx0 + B%L(2) = qL_prim_rs${XYZ}$_vf(j, k, l, B_idx%beg) + B%R(2) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, B_idx%beg) + B%L(3) = qL_prim_rs${XYZ}$_vf(j, k, l, B_idx%beg + 1) + B%R(3) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, B_idx%beg + 1) + else ! 2D/3D: Bx, By, Bz as variables + B%L(1) = qL_prim_rs${XYZ}$_vf(j, k, l, B_idx%beg) + B%R(1) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, B_idx%beg) + B%L(2) = qL_prim_rs${XYZ}$_vf(j, k, l, B_idx%beg + 1) + B%R(2) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, B_idx%beg + 1) + B%L(3) = qL_prim_rs${XYZ}$_vf(j, k, l, B_idx%beg + 2) + B%R(3) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, B_idx%beg + 2) + end if end if - end if - rho_L = 0._wp - gamma_L = 0._wp - pi_inf_L = 0._wp - qv_L = 0._wp + rho_L = 0._wp + gamma_L = 0._wp + pi_inf_L = 0._wp + qv_L = 0._wp + + rho_R = 0._wp + gamma_R = 0._wp + pi_inf_R = 0._wp + qv_R = 0._wp - rho_R = 0._wp - gamma_R = 0._wp - pi_inf_R = 0._wp - qv_R = 0._wp + alpha_L_sum = 0._wp + alpha_R_sum = 0._wp - alpha_L_sum = 0._wp - alpha_R_sum = 0._wp + pres_mag%L = 0._wp + pres_mag%R = 0._wp + + if (mpp_lim) then + $:GPU_LOOP(parallelism='[seq]') + do i = 1, num_fluids + alpha_rho_L(i) = max(0._wp, alpha_rho_L(i)) + alpha_L(i) = min(max(0._wp, alpha_L(i)), 1._wp) + alpha_L_sum = alpha_L_sum + alpha_L(i) + alpha_rho_R(i) = max(0._wp, alpha_rho_R(i)) + alpha_R(i) = min(max(0._wp, alpha_R(i)), 1._wp) + alpha_R_sum = alpha_R_sum + alpha_R(i) + end do - pres_mag%L = 0._wp - pres_mag%R = 0._wp + alpha_L = alpha_L/max(alpha_L_sum, sgm_eps) + alpha_R = alpha_R/max(alpha_R_sum, sgm_eps) + end if - if (mpp_lim) then $:GPU_LOOP(parallelism='[seq]') do i = 1, num_fluids - alpha_rho_L(i) = max(0._wp, alpha_rho_L(i)) - alpha_L(i) = min(max(0._wp, alpha_L(i)), 1._wp) - alpha_L_sum = alpha_L_sum + alpha_L(i) - alpha_rho_R(i) = max(0._wp, alpha_rho_R(i)) - alpha_R(i) = min(max(0._wp, alpha_R(i)), 1._wp) - alpha_R_sum = alpha_R_sum + alpha_R(i) + rho_L = rho_L + alpha_rho_L(i) + gamma_L = gamma_L + alpha_L(i)*gammas(i) + pi_inf_L = pi_inf_L + alpha_L(i)*pi_infs(i) + qv_L = qv_L + alpha_rho_L(i)*qvs(i) + + rho_R = rho_R + alpha_rho_R(i) + gamma_R = gamma_R + alpha_R(i)*gammas(i) + pi_inf_R = pi_inf_R + alpha_R(i)*pi_infs(i) + qv_R = qv_R + alpha_rho_R(i)*qvs(i) end do - alpha_L = alpha_L/max(alpha_L_sum, sgm_eps) - alpha_R = alpha_R/max(alpha_R_sum, sgm_eps) - end if + if (viscous) then + $:GPU_LOOP(parallelism='[seq]') + do i = 1, 2 + Re_L(i) = dflt_real + Re_R(i) = dflt_real - $:GPU_LOOP(parallelism='[seq]') - do i = 1, num_fluids - rho_L = rho_L + alpha_rho_L(i) - gamma_L = gamma_L + alpha_L(i)*gammas(i) - pi_inf_L = pi_inf_L + alpha_L(i)*pi_infs(i) - qv_L = qv_L + alpha_rho_L(i)*qvs(i) - - rho_R = rho_R + alpha_rho_R(i) - gamma_R = gamma_R + alpha_R(i)*gammas(i) - pi_inf_R = pi_inf_R + alpha_R(i)*pi_infs(i) - qv_R = qv_R + alpha_rho_R(i)*qvs(i) - end do + if (Re_size(i) > 0) Re_L(i) = 0._wp + if (Re_size(i) > 0) Re_R(i) = 0._wp - if (viscous) then - $:GPU_LOOP(parallelism='[seq]') - do i = 1, 2 - Re_L(i) = dflt_real - Re_R(i) = dflt_real + $:GPU_LOOP(parallelism='[seq]') + do q = 1, Re_size(i) + Re_L(i) = alpha_L(Re_idx(i, q))/Res_gs(i, q) & + + Re_L(i) + Re_R(i) = alpha_R(Re_idx(i, q))/Res_gs(i, q) & + + Re_R(i) + end do - if (Re_size(i) > 0) Re_L(i) = 0._wp - if (Re_size(i) > 0) Re_R(i) = 0._wp + Re_L(i) = 1._wp/max(Re_L(i), sgm_eps) + Re_R(i) = 1._wp/max(Re_R(i), sgm_eps) + end do + end if + if (chemistry) then $:GPU_LOOP(parallelism='[seq]') - do q = 1, Re_size(i) - Re_L(i) = alpha_L(Re_idx(i, q))/Res(i, q) & - + Re_L(i) - Re_R(i) = alpha_R(Re_idx(i, q))/Res(i, q) & - + Re_R(i) + do i = chemxb, chemxe + Ys_L(i - chemxb + 1) = qL_prim_rs${XYZ}$_vf(j, k, l, i) + Ys_R(i - chemxb + 1) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, i) end do - Re_L(i) = 1._wp/max(Re_L(i), sgm_eps) - Re_R(i) = 1._wp/max(Re_R(i), sgm_eps) - end do - end if + call get_mixture_molecular_weight(Ys_L, MW_L) + call get_mixture_molecular_weight(Ys_R, MW_R) - if (chemistry) then - $:GPU_LOOP(parallelism='[seq]') - do i = chemxb, chemxe - Ys_L(i - chemxb + 1) = qL_prim_rs${XYZ}$_vf(j, k, l, i) - Ys_R(i - chemxb + 1) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, i) - end do + Xs_L(:) = Ys_L(:)*MW_L/molecular_weights(:) + Xs_R(:) = Ys_R(:)*MW_R/molecular_weights(:) - call get_mixture_molecular_weight(Ys_L, MW_L) - call get_mixture_molecular_weight(Ys_R, MW_R) - - Xs_L(:) = Ys_L(:)*MW_L/molecular_weights(:) - Xs_R(:) = Ys_R(:)*MW_R/molecular_weights(:) - - R_gas_L = gas_constant/MW_L - R_gas_R = gas_constant/MW_R - T_L = pres_L/rho_L/R_gas_L - T_R = pres_R/rho_R/R_gas_R - - call get_species_specific_heats_r(T_L, Cp_iL) - call get_species_specific_heats_r(T_R, Cp_iR) - - if (chem_params%gamma_method == 1) then - ! gamma_method = 1: Ref. Section 2.3.1 Formulation of doi:10.7907/ZKW8-ES97. - Gamma_iL = Cp_iL/(Cp_iL - 1.0_wp) - Gamma_iR = Cp_iR/(Cp_iR - 1.0_wp) - - gamma_L = sum(Xs_L(:)/(Gamma_iL(:) - 1.0_wp)) - gamma_R = sum(Xs_R(:)/(Gamma_iR(:) - 1.0_wp)) - else if (chem_params%gamma_method == 2) then - ! gamma_method = 2: c_p / c_v where c_p, c_v are specific heats. - call get_mixture_specific_heat_cp_mass(T_L, Ys_L, Cp_L) - call get_mixture_specific_heat_cp_mass(T_R, Ys_R, Cp_R) - call get_mixture_specific_heat_cv_mass(T_L, Ys_L, Cv_L) - call get_mixture_specific_heat_cv_mass(T_R, Ys_R, Cv_R) - - Gamm_L = Cp_L/Cv_L - gamma_L = 1.0_wp/(Gamm_L - 1.0_wp) - Gamm_R = Cp_R/Cv_R - gamma_R = 1.0_wp/(Gamm_R - 1.0_wp) - end if + R_gas_L = gas_constant/MW_L + R_gas_R = gas_constant/MW_R + T_L = pres_L/rho_L/R_gas_L + T_R = pres_R/rho_R/R_gas_R - call get_mixture_energy_mass(T_L, Ys_L, E_L) - call get_mixture_energy_mass(T_R, Ys_R, E_R) - - E_L = rho_L*E_L + 5.e-1*rho_L*vel_L_rms - E_R = rho_R*E_R + 5.e-1*rho_R*vel_R_rms - H_L = (E_L + pres_L)/rho_L - H_R = (E_R + pres_R)/rho_R - elseif (mhd .and. relativity) then - Ga%L = 1._wp/sqrt(1._wp - vel_L_rms) - Ga%R = 1._wp/sqrt(1._wp - vel_R_rms) - vdotB%L = vel_L(1)*B%L(1) + vel_L(2)*B%L(2) + vel_L(3)*B%L(3) - vdotB%R = vel_R(1)*B%R(1) + vel_R(2)*B%R(2) + vel_R(3)*B%R(3) - - b4%L(1:3) = B%L(1:3)/Ga%L + Ga%L*vel_L(1:3)*vdotB%L - b4%R(1:3) = B%R(1:3)/Ga%R + Ga%R*vel_R(1:3)*vdotB%R - B2%L = B%L(1)**2._wp + B%L(2)**2._wp + B%L(3)**2._wp - B2%R = B%R(1)**2._wp + B%R(2)**2._wp + B%R(3)**2._wp - - pres_mag%L = 0.5_wp*(B2%L/Ga%L**2._wp + vdotB%L**2._wp) - pres_mag%R = 0.5_wp*(B2%R/Ga%R**2._wp + vdotB%R**2._wp) - - ! Hard-coded EOS - H_L = 1._wp + (gamma_L + 1)*pres_L/rho_L - H_R = 1._wp + (gamma_R + 1)*pres_R/rho_R - - cm%L(1:3) = (rho_L*H_L*Ga%L**2 + B2%L)*vel_L(1:3) - vdotB%L*B%L(1:3) - cm%R(1:3) = (rho_R*H_R*Ga%R**2 + B2%R)*vel_R(1:3) - vdotB%R*B%R(1:3) - - E_L = rho_L*H_L*Ga%L**2 - pres_L + 0.5_wp*(B2%L + vel_L_rms*B2%L - vdotB%L**2._wp) - rho_L*Ga%L - E_R = rho_R*H_R*Ga%R**2 - pres_R + 0.5_wp*(B2%R + vel_R_rms*B2%R - vdotB%R**2._wp) - rho_R*Ga%R - elseif (mhd .and. .not. relativity) then - pres_mag%L = 0.5_wp*(B%L(1)**2._wp + B%L(2)**2._wp + B%L(3)**2._wp) - pres_mag%R = 0.5_wp*(B%R(1)**2._wp + B%R(2)**2._wp + B%R(3)**2._wp) - E_L = gamma_L*pres_L + pi_inf_L + 0.5_wp*rho_L*vel_L_rms + qv_L + pres_mag%L - E_R = gamma_R*pres_R + pi_inf_R + 0.5_wp*rho_R*vel_R_rms + qv_R + pres_mag%R ! includes magnetic energy - H_L = (E_L + pres_L - pres_mag%L)/rho_L - H_R = (E_R + pres_R - pres_mag%R)/rho_R ! stagnation enthalpy here excludes magnetic energy (only used to find speed of sound) - else - E_L = gamma_L*pres_L + pi_inf_L + 5.e-1*rho_L*vel_L_rms + qv_L - E_R = gamma_R*pres_R + pi_inf_R + 5.e-1*rho_R*vel_R_rms + qv_R - H_L = (E_L + pres_L)/rho_L - H_R = (E_R + pres_R)/rho_R - end if + call get_species_specific_heats_r(T_L, Cp_iL) + call get_species_specific_heats_r(T_R, Cp_iR) - ! elastic energy update - if (hypoelasticity) then - G_L = 0._wp; G_R = 0._wp + if (chem_params%gamma_method == 1) then + ! gamma_method = 1: Ref. Section 2.3.1 Formulation of doi:10.7907/ZKW8-ES97. + Gamma_iL = Cp_iL/(Cp_iL - 1.0_wp) + Gamma_iR = Cp_iR/(Cp_iR - 1.0_wp) - $:GPU_LOOP(parallelism='[seq]') - do i = 1, num_fluids - G_L = G_L + alpha_L(i)*Gs(i) - G_R = G_R + alpha_R(i)*Gs(i) - end do + gamma_L = sum(Xs_L(:)/(Gamma_iL(:) - 1.0_wp)) + gamma_R = sum(Xs_R(:)/(Gamma_iR(:) - 1.0_wp)) + else if (chem_params%gamma_method == 2) then + ! gamma_method = 2: c_p / c_v where c_p, c_v are specific heats. + call get_mixture_specific_heat_cp_mass(T_L, Ys_L, Cp_L) + call get_mixture_specific_heat_cp_mass(T_R, Ys_R, Cp_R) + call get_mixture_specific_heat_cv_mass(T_L, Ys_L, Cv_L) + call get_mixture_specific_heat_cv_mass(T_R, Ys_R, Cv_R) + + Gamm_L = Cp_L/Cv_L + gamma_L = 1.0_wp/(Gamm_L - 1.0_wp) + Gamm_R = Cp_R/Cv_R + gamma_R = 1.0_wp/(Gamm_R - 1.0_wp) + end if - if (cont_damage) then - G_L = G_L*max((1._wp - qL_prim_rs${XYZ}$_vf(j, k, l, damage_idx)), 0._wp) - G_R = G_R*max((1._wp - qR_prim_rs${XYZ}$_vf(j, k, l, damage_idx)), 0._wp) + call get_mixture_energy_mass(T_L, Ys_L, E_L) + call get_mixture_energy_mass(T_R, Ys_R, E_R) + + E_L = rho_L*E_L + 5.e-1*rho_L*vel_L_rms + E_R = rho_R*E_R + 5.e-1*rho_R*vel_R_rms + H_L = (E_L + pres_L)/rho_L + H_R = (E_R + pres_R)/rho_R + elseif (mhd .and. relativity) then + Ga%L = 1._wp/sqrt(1._wp - vel_L_rms) + Ga%R = 1._wp/sqrt(1._wp - vel_R_rms) + vdotB%L = vel_L(1)*B%L(1) + vel_L(2)*B%L(2) + vel_L(3)*B%L(3) + vdotB%R = vel_R(1)*B%R(1) + vel_R(2)*B%R(2) + vel_R(3)*B%R(3) + + b4%L(1:3) = B%L(1:3)/Ga%L + Ga%L*vel_L(1:3)*vdotB%L + b4%R(1:3) = B%R(1:3)/Ga%R + Ga%R*vel_R(1:3)*vdotB%R + B2%L = B%L(1)**2._wp + B%L(2)**2._wp + B%L(3)**2._wp + B2%R = B%R(1)**2._wp + B%R(2)**2._wp + B%R(3)**2._wp + + pres_mag%L = 0.5_wp*(B2%L/Ga%L**2._wp + vdotB%L**2._wp) + pres_mag%R = 0.5_wp*(B2%R/Ga%R**2._wp + vdotB%R**2._wp) + + ! Hard-coded EOS + H_L = 1._wp + (gamma_L + 1)*pres_L/rho_L + H_R = 1._wp + (gamma_R + 1)*pres_R/rho_R + + cm%L(1:3) = (rho_L*H_L*Ga%L**2 + B2%L)*vel_L(1:3) - vdotB%L*B%L(1:3) + cm%R(1:3) = (rho_R*H_R*Ga%R**2 + B2%R)*vel_R(1:3) - vdotB%R*B%R(1:3) + + E_L = rho_L*H_L*Ga%L**2 - pres_L + 0.5_wp*(B2%L + vel_L_rms*B2%L - vdotB%L**2._wp) - rho_L*Ga%L + E_R = rho_R*H_R*Ga%R**2 - pres_R + 0.5_wp*(B2%R + vel_R_rms*B2%R - vdotB%R**2._wp) - rho_R*Ga%R + elseif (mhd .and. .not. relativity) then + pres_mag%L = 0.5_wp*(B%L(1)**2._wp + B%L(2)**2._wp + B%L(3)**2._wp) + pres_mag%R = 0.5_wp*(B%R(1)**2._wp + B%R(2)**2._wp + B%R(3)**2._wp) + E_L = gamma_L*pres_L + pi_inf_L + 0.5_wp*rho_L*vel_L_rms + qv_L + pres_mag%L + E_R = gamma_R*pres_R + pi_inf_R + 0.5_wp*rho_R*vel_R_rms + qv_R + pres_mag%R ! includes magnetic energy + H_L = (E_L + pres_L - pres_mag%L)/rho_L + H_R = (E_R + pres_R - pres_mag%R)/rho_R ! stagnation enthalpy here excludes magnetic energy (only used to find speed of sound) + else + E_L = gamma_L*pres_L + pi_inf_L + 5.e-1*rho_L*vel_L_rms + qv_L + E_R = gamma_R*pres_R + pi_inf_R + 5.e-1*rho_R*vel_R_rms + qv_R + H_L = (E_L + pres_L)/rho_L + H_R = (E_R + pres_R)/rho_R end if - do i = 1, strxe - strxb + 1 - tau_e_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, strxb - 1 + i) - tau_e_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, strxb - 1 + i) - ! Elastic contribution to energy if G large enough - !TODO take out if statement if stable without - if ((G_L > 1000) .and. (G_R > 1000)) then - E_L = E_L + (tau_e_L(i)*tau_e_L(i))/(4._wp*G_L) - E_R = E_R + (tau_e_R(i)*tau_e_R(i))/(4._wp*G_R) - ! Double for shear stresses - if (any(strxb - 1 + i == shear_indices)) then + ! elastic energy update + if (hypoelasticity) then + G_L = 0._wp; G_R = 0._wp + + $:GPU_LOOP(parallelism='[seq]') + do i = 1, num_fluids + G_L = G_L + alpha_L(i)*Gs_rs(i) + G_R = G_R + alpha_R(i)*Gs_rs(i) + end do + + if (cont_damage) then + G_L = G_L*max((1._wp - qL_prim_rs${XYZ}$_vf(j, k, l, damage_idx)), 0._wp) + G_R = G_R*max((1._wp - qR_prim_rs${XYZ}$_vf(j, k, l, damage_idx)), 0._wp) + end if + + $:GPU_LOOP(parallelism='[seq]') + do i = 1, strxe - strxb + 1 + tau_e_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, strxb - 1 + i) + tau_e_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, strxb - 1 + i) + ! Elastic contribution to energy if G large enough + !TODO take out if statement if stable without + if ((G_L > 1000) .and. (G_R > 1000)) then E_L = E_L + (tau_e_L(i)*tau_e_L(i))/(4._wp*G_L) E_R = E_R + (tau_e_R(i)*tau_e_R(i))/(4._wp*G_R) + ! Double for shear stresses + if (any(strxb - 1 + i == shear_indices)) then + E_L = E_L + (tau_e_L(i)*tau_e_L(i))/(4._wp*G_L) + E_R = E_R + (tau_e_R(i)*tau_e_R(i))/(4._wp*G_R) + end if end if - end if - end do - end if + end do + end if - ! elastic energy update - !if ( hyperelasticity ) then - ! G_L = 0._wp - ! G_R = 0._wp - ! - ! $:GPU_LOOP(parallelism='[seq]') - ! do i = 1, num_fluids - ! G_L = G_L + alpha_L(i)*Gs(i) - ! G_R = G_R + alpha_R(i)*Gs(i) - ! end do - ! ! Elastic contribution to energy if G large enough - ! if ((G_L > 1.e-3_wp) .and. (G_R > 1.e-3_wp)) then - ! E_L = E_L + G_L*qL_prim_rs${XYZ}$_vf(j, k, l, xiend + 1) - ! E_R = E_R + G_R*qR_prim_rs${XYZ}$_vf(j + 1, k, l, xiend + 1) - ! $:GPU_LOOP(parallelism='[seq]') - ! do i = 1, b_size-1 - ! tau_e_L(i) = G_L*qL_prim_rs${XYZ}$_vf(j, k, l, strxb - 1 + i) - ! tau_e_R(i) = G_R*qR_prim_rs${XYZ}$_vf(j + 1, k, l, strxb - 1 + i) - ! end do - ! $:GPU_LOOP(parallelism='[seq]') - ! do i = 1, b_size-1 - ! tau_e_L(i) = 0._wp - ! tau_e_R(i) = 0._wp - ! end do - ! $:GPU_LOOP(parallelism='[seq]') - ! do i = 1, num_dims - ! xi_field_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, xibeg - 1 + i) - ! xi_field_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, xibeg - 1 + i) - ! end do - ! end if - !end if - - @:compute_average_state() - - call s_compute_speed_of_sound(pres_L, rho_L, gamma_L, pi_inf_L, H_L, alpha_L, & - vel_L_rms, 0._wp, c_L) - - call s_compute_speed_of_sound(pres_R, rho_R, gamma_R, pi_inf_R, H_R, alpha_R, & - vel_R_rms, 0._wp, c_R) - - !> The computation of c_avg does not require all the variables, and therefore the non '_avg' - ! variables are placeholders to call the subroutine. - - call s_compute_speed_of_sound(pres_R, rho_avg, gamma_avg, pi_inf_R, H_avg, alpha_R, & - vel_avg_rms, c_sum_Yi_Phi, c_avg) - - if (mhd) then - call s_compute_fast_magnetosonic_speed(rho_L, c_L, B%L, norm_dir, c_fast%L, H_L) - call s_compute_fast_magnetosonic_speed(rho_R, c_R, B%R, norm_dir, c_fast%R, H_R) - end if + ! elastic energy update + !if ( hyperelasticity ) then + ! G_L = 0._wp + ! G_R = 0._wp + ! + ! $:GPU_LOOP(parallelism='[seq]') + ! do i = 1, num_fluids + ! G_L = G_L + alpha_L(i)*Gs_rs(i) + ! G_R = G_R + alpha_R(i)*Gs_rs(i) + ! end do + ! ! Elastic contribution to energy if G large enough + ! if ((G_L > 1.e-3_wp) .and. (G_R > 1.e-3_wp)) then + ! E_L = E_L + G_L*qL_prim_rs${XYZ}$_vf(j, k, l, xiend + 1) + ! E_R = E_R + G_R*qR_prim_rs${XYZ}$_vf(j + 1, k, l, xiend + 1) + ! $:GPU_LOOP(parallelism='[seq]') + ! do i = 1, b_size-1 + ! tau_e_L(i) = G_L*qL_prim_rs${XYZ}$_vf(j, k, l, strxb - 1 + i) + ! tau_e_R(i) = G_R*qR_prim_rs${XYZ}$_vf(j + 1, k, l, strxb - 1 + i) + ! end do + ! $:GPU_LOOP(parallelism='[seq]') + ! do i = 1, b_size-1 + ! tau_e_L(i) = 0._wp + ! tau_e_R(i) = 0._wp + ! end do + ! $:GPU_LOOP(parallelism='[seq]') + ! do i = 1, num_dims + ! xi_field_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, xibeg - 1 + i) + ! xi_field_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, xibeg - 1 + i) + ! end do + ! end if + !end if - if (viscous) then - if (chemistry) then - call compute_viscosity_and_inversion(T_L, Ys_L, T_R, Ys_R, Re_L(1), Re_R(1)) - end if - $:GPU_LOOP(parallelism='[seq]') - do i = 1, 2 - Re_avg_rs${XYZ}$_vf(j, k, l, i) = 2._wp/(1._wp/Re_L(i) + 1._wp/Re_R(i)) - end do - end if + @:compute_average_state() - if (wave_speeds == 1) then - if (mhd) then - s_L = min(vel_L(dir_idx(1)) - c_fast%L, vel_R(dir_idx(1)) - c_fast%R) - s_R = max(vel_R(dir_idx(1)) + c_fast%R, vel_L(dir_idx(1)) + c_fast%L) - elseif (hypoelasticity) then - s_L = min(vel_L(dir_idx(1)) - sqrt(c_L*c_L + & - (((4._wp*G_L)/3._wp) + & - tau_e_L(dir_idx_tau(1)))/rho_L) & - , vel_R(dir_idx(1)) - sqrt(c_R*c_R + & - (((4._wp*G_R)/3._wp) + & - tau_e_R(dir_idx_tau(1)))/rho_R)) - s_R = max(vel_R(dir_idx(1)) + sqrt(c_R*c_R + & - (((4._wp*G_R)/3._wp) + & - tau_e_R(dir_idx_tau(1)))/rho_R) & - , vel_L(dir_idx(1)) + sqrt(c_L*c_L + & - (((4._wp*G_L)/3._wp) + & - tau_e_L(dir_idx_tau(1)))/rho_L)) - else if (hyperelasticity) then - s_L = min(vel_L(dir_idx(1)) - sqrt(c_L*c_L + (4._wp*G_L/3._wp)/rho_L) & - , vel_R(dir_idx(1)) - sqrt(c_R*c_R + (4._wp*G_R/3._wp)/rho_R)) - s_R = max(vel_R(dir_idx(1)) + sqrt(c_R*c_R + (4._wp*G_R/3._wp)/rho_R) & - , vel_L(dir_idx(1)) + sqrt(c_L*c_L + (4._wp*G_L/3._wp)/rho_L)) - else - s_L = min(vel_L(dir_idx(1)) - c_L, vel_R(dir_idx(1)) - c_R) - s_R = max(vel_R(dir_idx(1)) + c_R, vel_L(dir_idx(1)) + c_L) - end if + call s_compute_speed_of_sound(pres_L, rho_L, gamma_L, pi_inf_L, H_L, alpha_L, & + vel_L_rms, 0._wp, c_L) - s_S = (pres_R - pres_L + rho_L*vel_L(dir_idx(1))* & - (s_L - vel_L(dir_idx(1))) - & - rho_R*vel_R(dir_idx(1))* & - (s_R - vel_R(dir_idx(1)))) & - /(rho_L*(s_L - vel_L(dir_idx(1))) - & - rho_R*(s_R - vel_R(dir_idx(1)))) - elseif (wave_speeds == 2) then - pres_SL = 5.e-1_wp*(pres_L + pres_R + rho_avg*c_avg* & - (vel_L(dir_idx(1)) - & - vel_R(dir_idx(1)))) - - pres_SR = pres_SL - - Ms_L = max(1._wp, sqrt(1._wp + ((5.e-1_wp + gamma_L)/(1._wp + gamma_L))* & - (pres_SL/pres_L - 1._wp)*pres_L/ & - ((pres_L + pi_inf_L/(1._wp + gamma_L))))) - Ms_R = max(1._wp, sqrt(1._wp + ((5.e-1_wp + gamma_R)/(1._wp + gamma_R))* & - (pres_SR/pres_R - 1._wp)*pres_R/ & - ((pres_R + pi_inf_R/(1._wp + gamma_R))))) - - s_L = vel_L(dir_idx(1)) - c_L*Ms_L - s_R = vel_R(dir_idx(1)) + c_R*Ms_R - - s_S = 5.e-1_wp*((vel_L(dir_idx(1)) + vel_R(dir_idx(1))) + & - (pres_L - pres_R)/ & - (rho_avg*c_avg)) - end if + call s_compute_speed_of_sound(pres_R, rho_R, gamma_R, pi_inf_R, H_R, alpha_R, & + vel_R_rms, 0._wp, c_R) - s_M = min(0._wp, s_L); s_P = max(0._wp, s_R) + !> The computation of c_avg does not require all the variables, and therefore the non '_avg' + ! variables are placeholders to call the subroutine. - xi_M = (5.e-1_wp + sign(5.e-1_wp, s_L)) & - + (5.e-1_wp - sign(5.e-1_wp, s_L)) & - *(5.e-1_wp + sign(5.e-1_wp, s_R)) - xi_P = (5.e-1_wp - sign(5.e-1_wp, s_R)) & - + (5.e-1_wp - sign(5.e-1_wp, s_L)) & - *(5.e-1_wp + sign(5.e-1_wp, s_R)) + call s_compute_speed_of_sound(pres_R, rho_avg, gamma_avg, pi_inf_R, H_avg, alpha_R, & + vel_avg_rms, c_sum_Yi_Phi, c_avg) - ! Low Mach correction - if (low_Mach == 1) then - @:compute_low_Mach_correction() - else - pcorr = 0._wp - end if + if (mhd) then + call s_compute_fast_magnetosonic_speed(rho_L, c_L, B%L, norm_dir, c_fast%L, H_L) + call s_compute_fast_magnetosonic_speed(rho_R, c_R, B%R, norm_dir, c_fast%R, H_R) + end if - ! Mass - if (.not. relativity) then - $:GPU_LOOP(parallelism='[seq]') - do i = 1, contxe - flux_rs${XYZ}$_vf(j, k, l, i) = & - (s_M*alpha_rho_R(i)*vel_R(norm_dir) & - - s_P*alpha_rho_L(i)*vel_L(norm_dir) & - + s_M*s_P*(alpha_rho_L(i) & - - alpha_rho_R(i))) & - /(s_M - s_P) - end do - elseif (relativity) then - $:GPU_LOOP(parallelism='[seq]') - do i = 1, contxe - flux_rs${XYZ}$_vf(j, k, l, i) = & - (s_M*Ga%R*alpha_rho_R(i)*vel_R(norm_dir) & - - s_P*Ga%L*alpha_rho_L(i)*vel_L(norm_dir) & - + s_M*s_P*(Ga%L*alpha_rho_L(i) & - - Ga%R*alpha_rho_R(i))) & - /(s_M - s_P) - end do - end if + if (viscous) then + if (chemistry) then + call compute_viscosity_and_inversion(T_L, Ys_L, T_R, Ys_R, Re_L(1), Re_R(1)) + end if + $:GPU_LOOP(parallelism='[seq]') + do i = 1, 2 + Re_avg_rs${XYZ}$_vf(j, k, l, i) = 2._wp/(1._wp/Re_L(i) + 1._wp/Re_R(i)) + end do + end if - ! Momentum - if (mhd .and. (.not. relativity)) then - $:GPU_LOOP(parallelism='[seq]') - do i = 1, 3 - ! Flux of rho*v_i in the ${XYZ}$ direction - ! = rho * v_i * v_${XYZ}$ - B_i * B_${XYZ}$ + delta_(${XYZ}$,i) * p_tot - flux_rs${XYZ}$_vf(j, k, l, contxe + i) = & - (s_M*(rho_R*vel_R(i)*vel_R(norm_dir) & - - B%R(i)*B%R(norm_dir) & - + dir_flg(i)*(pres_R + pres_mag%R)) & - - s_P*(rho_L*vel_L(i)*vel_L(norm_dir) & - - B%L(i)*B%L(norm_dir) & - + dir_flg(i)*(pres_L + pres_mag%L)) & - + s_M*s_P*(rho_L*vel_L(i) - rho_R*vel_R(i))) & - /(s_M - s_P) - end do - elseif (mhd .and. relativity) then - $:GPU_LOOP(parallelism='[seq]') - do i = 1, 3 - ! Flux of m_i in the ${XYZ}$ direction - ! = m_i * v_${XYZ}$ - b_i/Gamma * B_${XYZ}$ + delta_(${XYZ}$,i) * p_tot - flux_rs${XYZ}$_vf(j, k, l, contxe + i) = & - (s_M*(cm%R(i)*vel_R(norm_dir) & - - b4%R(i)/Ga%R*B%R(norm_dir) & - + dir_flg(i)*(pres_R + pres_mag%R)) & - - s_P*(cm%L(i)*vel_L(norm_dir) & - - b4%L(i)/Ga%L*B%L(norm_dir) & - + dir_flg(i)*(pres_L + pres_mag%L)) & - + s_M*s_P*(cm%L(i) - cm%R(i))) & - /(s_M - s_P) - end do - elseif (bubbles_euler) then - $:GPU_LOOP(parallelism='[seq]') - do i = 1, num_vels - flux_rs${XYZ}$_vf(j, k, l, contxe + dir_idx(i)) = & - (s_M*(rho_R*vel_R(dir_idx(1)) & - *vel_R(dir_idx(i)) & - + dir_flg(dir_idx(i))*(pres_R - ptilde_R)) & - - s_P*(rho_L*vel_L(dir_idx(1)) & - *vel_L(dir_idx(i)) & - + dir_flg(dir_idx(i))*(pres_L - ptilde_L)) & - + s_M*s_P*(rho_L*vel_L(dir_idx(i)) & - - rho_R*vel_R(dir_idx(i)))) & - /(s_M - s_P) & - + (s_M/s_L)*(s_P/s_R)*pcorr*(vel_R(dir_idx(i)) - vel_L(dir_idx(i))) - end do - else if (hypoelasticity) then - $:GPU_LOOP(parallelism='[seq]') - do i = 1, num_vels - flux_rs${XYZ}$_vf(j, k, l, contxe + dir_idx(i)) = & - (s_M*(rho_R*vel_R(dir_idx(1)) & - *vel_R(dir_idx(i)) & - + dir_flg(dir_idx(i))*pres_R & - - tau_e_R(dir_idx_tau(i))) & - - s_P*(rho_L*vel_L(dir_idx(1)) & - *vel_L(dir_idx(i)) & - + dir_flg(dir_idx(i))*pres_L & - - tau_e_L(dir_idx_tau(i))) & - + s_M*s_P*(rho_L*vel_L(dir_idx(i)) & - - rho_R*vel_R(dir_idx(i)))) & - /(s_M - s_P) - end do - else - $:GPU_LOOP(parallelism='[seq]') - do i = 1, num_vels - flux_rs${XYZ}$_vf(j, k, l, contxe + dir_idx(i)) = & - (s_M*(rho_R*vel_R(dir_idx(1)) & - *vel_R(dir_idx(i)) & - + dir_flg(dir_idx(i))*pres_R) & - - s_P*(rho_L*vel_L(dir_idx(1)) & - *vel_L(dir_idx(i)) & - + dir_flg(dir_idx(i))*pres_L) & - + s_M*s_P*(rho_L*vel_L(dir_idx(i)) & - - rho_R*vel_R(dir_idx(i)))) & - /(s_M - s_P) & - + (s_M/s_L)*(s_P/s_R)*pcorr*(vel_R(dir_idx(i)) - vel_L(dir_idx(i))) - end do - end if + if (wave_speeds == 1) then + if (mhd) then + s_L = min(vel_L(dir_idx(1)) - c_fast%L, vel_R(dir_idx(1)) - c_fast%R) + s_R = max(vel_R(dir_idx(1)) + c_fast%R, vel_L(dir_idx(1)) + c_fast%L) + elseif (hypoelasticity) then + s_L = min(vel_L(dir_idx(1)) - sqrt(c_L*c_L + & + (((4._wp*G_L)/3._wp) + & + tau_e_L(dir_idx_tau(1)))/rho_L) & + , vel_R(dir_idx(1)) - sqrt(c_R*c_R + & + (((4._wp*G_R)/3._wp) + & + tau_e_R(dir_idx_tau(1)))/rho_R)) + s_R = max(vel_R(dir_idx(1)) + sqrt(c_R*c_R + & + (((4._wp*G_R)/3._wp) + & + tau_e_R(dir_idx_tau(1)))/rho_R) & + , vel_L(dir_idx(1)) + sqrt(c_L*c_L + & + (((4._wp*G_L)/3._wp) + & + tau_e_L(dir_idx_tau(1)))/rho_L)) + else if (hyperelasticity) then + s_L = min(vel_L(dir_idx(1)) - sqrt(c_L*c_L + (4._wp*G_L/3._wp)/rho_L) & + , vel_R(dir_idx(1)) - sqrt(c_R*c_R + (4._wp*G_R/3._wp)/rho_R)) + s_R = max(vel_R(dir_idx(1)) + sqrt(c_R*c_R + (4._wp*G_R/3._wp)/rho_R) & + , vel_L(dir_idx(1)) + sqrt(c_L*c_L + (4._wp*G_L/3._wp)/rho_L)) + else + s_L = min(vel_L(dir_idx(1)) - c_L, vel_R(dir_idx(1)) - c_R) + s_R = max(vel_R(dir_idx(1)) + c_R, vel_L(dir_idx(1)) + c_L) + end if - ! Energy - if (mhd .and. (.not. relativity)) then - ! energy flux = (E + p + p_mag) * v_${XYZ}$ - B_${XYZ}$ * (v_x*B_x + v_y*B_y + v_z*B_z) - flux_rs${XYZ}$_vf(j, k, l, E_idx) = & - (s_M*(vel_R(norm_dir)*(E_R + pres_R + pres_mag%R) - B%R(norm_dir)*(vel_R(1)*B%R(1) + vel_R(2)*B%R(2) + vel_R(3)*B%R(3))) & - - s_P*(vel_L(norm_dir)*(E_L + pres_L + pres_mag%L) - B%L(norm_dir)*(vel_L(1)*B%L(1) + vel_L(2)*B%L(2) + vel_L(3)*B%L(3))) & - + s_M*s_P*(E_L - E_R)) & - /(s_M - s_P) - elseif (mhd .and. relativity) then - ! energy flux = m_${XYZ}$ - mass flux - ! Hard-coded for single-component for now - flux_rs${XYZ}$_vf(j, k, l, E_idx) = & - (s_M*(cm%R(norm_dir) - Ga%R*alpha_rho_R(1)*vel_R(norm_dir)) & - - s_P*(cm%L(norm_dir) - Ga%L*alpha_rho_L(1)*vel_L(norm_dir)) & - + s_M*s_P*(E_L - E_R)) & - /(s_M - s_P) - else if (bubbles_euler) then - flux_rs${XYZ}$_vf(j, k, l, E_idx) = & - (s_M*vel_R(dir_idx(1))*(E_R + pres_R - ptilde_R) & - - s_P*vel_L(dir_idx(1))*(E_L + pres_L - ptilde_L) & - + s_M*s_P*(E_L - E_R)) & - /(s_M - s_P) & - + (s_M/s_L)*(s_P/s_R)*pcorr*(vel_R_rms - vel_L_rms)/2._wp - else if (hypoelasticity) then - flux_tau_L = 0._wp; flux_tau_R = 0._wp - $:GPU_LOOP(parallelism='[seq]') - do i = 1, num_dims - flux_tau_L = flux_tau_L + tau_e_L(dir_idx_tau(i))*vel_L(dir_idx(i)) - flux_tau_R = flux_tau_R + tau_e_R(dir_idx_tau(i))*vel_R(dir_idx(i)) - end do - flux_rs${XYZ}$_vf(j, k, l, E_idx) = & - (s_M*(vel_R(dir_idx(1))*(E_R + pres_R) - flux_tau_R) & - - s_P*(vel_L(dir_idx(1))*(E_L + pres_L) - flux_tau_L) & - + s_M*s_P*(E_L - E_R))/(s_M - s_P) - else - flux_rs${XYZ}$_vf(j, k, l, E_idx) = & - (s_M*vel_R(dir_idx(1))*(E_R + pres_R) & - - s_P*vel_L(dir_idx(1))*(E_L + pres_L) & - + s_M*s_P*(E_L - E_R)) & - /(s_M - s_P) & - + (s_M/s_L)*(s_P/s_R)*pcorr*(vel_R_rms - vel_L_rms)/2._wp - end if + s_S = (pres_R - pres_L + rho_L*vel_L(dir_idx(1))* & + (s_L - vel_L(dir_idx(1))) - & + rho_R*vel_R(dir_idx(1))* & + (s_R - vel_R(dir_idx(1)))) & + /(rho_L*(s_L - vel_L(dir_idx(1))) - & + rho_R*(s_R - vel_R(dir_idx(1)))) + elseif (wave_speeds == 2) then + pres_SL = 5.e-1_wp*(pres_L + pres_R + rho_avg*c_avg* & + (vel_L(dir_idx(1)) - & + vel_R(dir_idx(1)))) - ! Elastic Stresses - if (hypoelasticity) then - do i = 1, strxe - strxb + 1 !TODO: this indexing may be slow - flux_rs${XYZ}$_vf(j, k, l, strxb - 1 + i) = & - (s_M*(rho_R*vel_R(dir_idx(1)) & - *tau_e_R(i)) & - - s_P*(rho_L*vel_L(dir_idx(1)) & - *tau_e_L(i)) & - + s_M*s_P*(rho_L*tau_e_L(i) & - - rho_R*tau_e_R(i))) & - /(s_M - s_P) - end do - end if + pres_SR = pres_SL - ! Advection - $:GPU_LOOP(parallelism='[seq]') - do i = advxb, advxe - flux_rs${XYZ}$_vf(j, k, l, i) = & - (qL_prim_rs${XYZ}$_vf(j, k, l, i) & - - qR_prim_rs${XYZ}$_vf(j + 1, k, l, i)) & - *s_M*s_P/(s_M - s_P) - flux_src_rs${XYZ}$_vf(j, k, l, i) = & - (s_M*qR_prim_rs${XYZ}$_vf(j + 1, k, l, i) & - - s_P*qL_prim_rs${XYZ}$_vf(j, k, l, i)) & - /(s_M - s_P) - end do + Ms_L = max(1._wp, sqrt(1._wp + ((5.e-1_wp + gamma_L)/(1._wp + gamma_L))* & + (pres_SL/pres_L - 1._wp)*pres_L/ & + ((pres_L + pi_inf_L/(1._wp + gamma_L))))) + Ms_R = max(1._wp, sqrt(1._wp + ((5.e-1_wp + gamma_R)/(1._wp + gamma_R))* & + (pres_SR/pres_R - 1._wp)*pres_R/ & + ((pres_R + pi_inf_R/(1._wp + gamma_R))))) - ! Xi field - !if ( hyperelasticity ) then - ! do i = 1, num_dims - ! flux_rs${XYZ}$_vf(j, k, l, xibeg - 1 + i) = & - ! (s_M*rho_R*vel_R(dir_idx(1))*xi_field_R(i) & - ! - s_P*rho_L*vel_L(dir_idx(1))*xi_field_L(i) & - ! + s_M*s_P*(rho_L*xi_field_L(i) & - ! - rho_R*xi_field_R(i))) & - ! /(s_M - s_P) - ! end do - !end if - - ! Div(U)? - $:GPU_LOOP(parallelism='[seq]') - do i = 1, num_vels - vel_src_rs${XYZ}$_vf(j, k, l, dir_idx(i)) = & - (xi_M*(rho_L*vel_L(dir_idx(i))* & - (s_L - vel_L(dir_idx(1))) - & - pres_L*dir_flg(dir_idx(i))) - & - xi_P*(rho_R*vel_R(dir_idx(i))* & - (s_R - vel_R(dir_idx(1))) - & - pres_R*dir_flg(dir_idx(i)))) & - /(xi_M*rho_L*(s_L - vel_L(dir_idx(1))) - & - xi_P*rho_R*(s_R - vel_R(dir_idx(1)))) - end do + s_L = vel_L(dir_idx(1)) - c_L*Ms_L + s_R = vel_R(dir_idx(1)) + c_R*Ms_R - if (bubbles_euler) then - ! From HLLC: Kills mass transport @ bubble gas density - if (num_fluids > 1) then - flux_rs${XYZ}$_vf(j, k, l, contxe) = 0._wp + s_S = 5.e-1_wp*((vel_L(dir_idx(1)) + vel_R(dir_idx(1))) + & + (pres_L - pres_R)/ & + (rho_avg*c_avg)) end if - end if - if (chemistry) then - $:GPU_LOOP(parallelism='[seq]') - do i = chemxb, chemxe - Y_L = qL_prim_rs${XYZ}$_vf(j, k, l, i) - Y_R = qR_prim_rs${XYZ}$_vf(j + 1, k, l, i) - - flux_rs${XYZ}$_vf(j, k, l, i) = (s_M*Y_R*rho_R*vel_R(dir_idx(1)) & - - s_P*Y_L*rho_L*vel_L(dir_idx(1)) & - + s_M*s_P*(Y_L*rho_L - Y_R*rho_R)) & - /(s_M - s_P) - flux_src_rs${XYZ}$_vf(j, k, l, i) = 0._wp - end do - end if + s_M = min(0._wp, s_L); s_P = max(0._wp, s_R) + + xi_M = (5.e-1_wp + sign(5.e-1_wp, s_L)) & + + (5.e-1_wp - sign(5.e-1_wp, s_L)) & + *(5.e-1_wp + sign(5.e-1_wp, s_R)) + xi_P = (5.e-1_wp - sign(5.e-1_wp, s_R)) & + + (5.e-1_wp - sign(5.e-1_wp, s_L)) & + *(5.e-1_wp + sign(5.e-1_wp, s_R)) + + ! Low Mach correction + if (low_Mach == 1) then + @:compute_low_Mach_correction() + else + pcorr = 0._wp + end if - if (mhd) then - if (n == 0) then ! 1D: d/dx flux only & Bx = Bx0 = const. - ! B_y flux = v_x * B_y - v_y * Bx0 - ! B_z flux = v_x * B_z - v_z * Bx0 + ! Mass + if (.not. relativity) then $:GPU_LOOP(parallelism='[seq]') - do i = 0, 1 - flux_rsx_vf(j, k, l, B_idx%beg + i) = (s_M*(vel_R(1)*B%R(2 + i) - vel_R(2 + i)*Bx0) & - - s_P*(vel_L(1)*B%L(2 + i) - vel_L(2 + i)*Bx0) & - + s_M*s_P*(B%L(2 + i) - B%R(2 + i)))/(s_M - s_P) + do i = 1, contxe + flux_rs${XYZ}$_vf(j, k, l, i) = & + (s_M*alpha_rho_R(i)*vel_R(norm_dir) & + - s_P*alpha_rho_L(i)*vel_L(norm_dir) & + + s_M*s_P*(alpha_rho_L(i) & + - alpha_rho_R(i))) & + /(s_M - s_P) end do - else ! 2D/3D: Bx, By, Bz /= const. but zero flux component in the same direction - ! B_x d/d${XYZ}$ flux = (1 - delta(x,${XYZ}$)) * (v_${XYZ}$ * B_x - v_x * B_${XYZ}$) - ! B_y d/d${XYZ}$ flux = (1 - delta(y,${XYZ}$)) * (v_${XYZ}$ * B_y - v_y * B_${XYZ}$) - ! B_z d/d${XYZ}$ flux = (1 - delta(z,${XYZ}$)) * (v_${XYZ}$ * B_z - v_z * B_${XYZ}$) + elseif (relativity) then $:GPU_LOOP(parallelism='[seq]') - do i = 0, 2 - flux_rs${XYZ}$_vf(j, k, l, B_idx%beg + i) = (1 - dir_flg(i + 1))*( & - s_M*(vel_R(dir_idx(1))*B%R(i + 1) - vel_R(i + 1)*B%R(norm_dir)) - & - s_P*(vel_L(dir_idx(1))*B%L(i + 1) - vel_L(i + 1)*B%L(norm_dir)) + & - s_M*s_P*(B%L(i + 1) - B%R(i + 1)))/(s_M - s_P) + do i = 1, contxe + flux_rs${XYZ}$_vf(j, k, l, i) = & + (s_M*Ga%R*alpha_rho_R(i)*vel_R(norm_dir) & + - s_P*Ga%L*alpha_rho_L(i)*vel_L(norm_dir) & + + s_M*s_P*(Ga%L*alpha_rho_L(i) & + - Ga%R*alpha_rho_R(i))) & + /(s_M - s_P) end do end if - flux_src_rs${XYZ}$_vf(j, k, l, advxb) = 0._wp - end if - #:if (NORM_DIR == 2) - if (cyl_coord) then - !Substituting the advective flux into the inviscid geometrical source flux + ! Momentum + if (mhd .and. (.not. relativity)) then $:GPU_LOOP(parallelism='[seq]') - do i = 1, E_idx - flux_gsrc_rs${XYZ}$_vf(j, k, l, i) = flux_rs${XYZ}$_vf(j, k, l, i) + do i = 1, 3 + ! Flux of rho*v_i in the ${XYZ}$ direction + ! = rho * v_i * v_${XYZ}$ - B_i * B_${XYZ}$ + delta_(${XYZ}$,i) * p_tot + flux_rs${XYZ}$_vf(j, k, l, contxe + i) = & + (s_M*(rho_R*vel_R(i)*vel_R(norm_dir) & + - B%R(i)*B%R(norm_dir) & + + dir_flg(i)*(pres_R + pres_mag%R)) & + - s_P*(rho_L*vel_L(i)*vel_L(norm_dir) & + - B%L(i)*B%L(norm_dir) & + + dir_flg(i)*(pres_L + pres_mag%L)) & + + s_M*s_P*(rho_L*vel_L(i) - rho_R*vel_R(i))) & + /(s_M - s_P) end do - ! Recalculating the radial momentum geometric source flux - flux_gsrc_rs${XYZ}$_vf(j, k, l, contxe + 2) = & - flux_rs${XYZ}$_vf(j, k, l, contxe + 2) & - - (s_M*pres_R - s_P*pres_L)/(s_M - s_P) - ! Geometrical source of the void fraction(s) is zero + elseif (mhd .and. relativity) then $:GPU_LOOP(parallelism='[seq]') - do i = advxb, advxe - flux_gsrc_rs${XYZ}$_vf(j, k, l, i) = flux_rs${XYZ}$_vf(j, k, l, i) + do i = 1, 3 + ! Flux of m_i in the ${XYZ}$ direction + ! = m_i * v_${XYZ}$ - b_i/Gamma * B_${XYZ}$ + delta_(${XYZ}$,i) * p_tot + flux_rs${XYZ}$_vf(j, k, l, contxe + i) = & + (s_M*(cm%R(i)*vel_R(norm_dir) & + - b4%R(i)/Ga%R*B%R(norm_dir) & + + dir_flg(i)*(pres_R + pres_mag%R)) & + - s_P*(cm%L(i)*vel_L(norm_dir) & + - b4%L(i)/Ga%L*B%L(norm_dir) & + + dir_flg(i)*(pres_L + pres_mag%L)) & + + s_M*s_P*(cm%L(i) - cm%R(i))) & + /(s_M - s_P) + end do + elseif (bubbles_euler) then + $:GPU_LOOP(parallelism='[seq]') + do i = 1, num_vels + flux_rs${XYZ}$_vf(j, k, l, contxe + dir_idx(i)) = & + (s_M*(rho_R*vel_R(dir_idx(1)) & + *vel_R(dir_idx(i)) & + + dir_flg(dir_idx(i))*(pres_R - ptilde_R)) & + - s_P*(rho_L*vel_L(dir_idx(1)) & + *vel_L(dir_idx(i)) & + + dir_flg(dir_idx(i))*(pres_L - ptilde_L)) & + + s_M*s_P*(rho_L*vel_L(dir_idx(i)) & + - rho_R*vel_R(dir_idx(i)))) & + /(s_M - s_P) & + + (s_M/s_L)*(s_P/s_R)*pcorr*(vel_R(dir_idx(i)) - vel_L(dir_idx(i))) + end do + else if (hypoelasticity) then + $:GPU_LOOP(parallelism='[seq]') + do i = 1, num_vels + flux_rs${XYZ}$_vf(j, k, l, contxe + dir_idx(i)) = & + (s_M*(rho_R*vel_R(dir_idx(1)) & + *vel_R(dir_idx(i)) & + + dir_flg(dir_idx(i))*pres_R & + - tau_e_R(dir_idx_tau(i))) & + - s_P*(rho_L*vel_L(dir_idx(1)) & + *vel_L(dir_idx(i)) & + + dir_flg(dir_idx(i))*pres_L & + - tau_e_L(dir_idx_tau(i))) & + + s_M*s_P*(rho_L*vel_L(dir_idx(i)) & + - rho_R*vel_R(dir_idx(i)))) & + /(s_M - s_P) + end do + else + $:GPU_LOOP(parallelism='[seq]') + do i = 1, num_vels + flux_rs${XYZ}$_vf(j, k, l, contxe + dir_idx(i)) = & + (s_M*(rho_R*vel_R(dir_idx(1)) & + *vel_R(dir_idx(i)) & + + dir_flg(dir_idx(i))*pres_R) & + - s_P*(rho_L*vel_L(dir_idx(1)) & + *vel_L(dir_idx(i)) & + + dir_flg(dir_idx(i))*pres_L) & + + s_M*s_P*(rho_L*vel_L(dir_idx(i)) & + - rho_R*vel_R(dir_idx(i)))) & + /(s_M - s_P) & + + (s_M/s_L)*(s_P/s_R)*pcorr*(vel_R(dir_idx(i)) - vel_L(dir_idx(i))) + end do + end if + + ! Energy + if (mhd .and. (.not. relativity)) then + ! energy flux = (E + p + p_mag) * v_${XYZ}$ - B_${XYZ}$ * (v_x*B_x + v_y*B_y + v_z*B_z) + flux_rs${XYZ}$_vf(j, k, l, E_idx) = & + (s_M*(vel_R(norm_dir)*(E_R + pres_R + pres_mag%R) - B%R(norm_dir)*(vel_R(1)*B%R(1) + vel_R(2)*B%R(2) + vel_R(3)*B%R(3))) & + - s_P*(vel_L(norm_dir)*(E_L + pres_L + pres_mag%L) - B%L(norm_dir)*(vel_L(1)*B%L(1) + vel_L(2)*B%L(2) + vel_L(3)*B%L(3))) & + + s_M*s_P*(E_L - E_R)) & + /(s_M - s_P) + elseif (mhd .and. relativity) then + ! energy flux = m_${XYZ}$ - mass flux + ! Hard-coded for single-component for now + flux_rs${XYZ}$_vf(j, k, l, E_idx) = & + (s_M*(cm%R(norm_dir) - Ga%R*alpha_rho_R(1)*vel_R(norm_dir)) & + - s_P*(cm%L(norm_dir) - Ga%L*alpha_rho_L(1)*vel_L(norm_dir)) & + + s_M*s_P*(E_L - E_R)) & + /(s_M - s_P) + else if (bubbles_euler) then + flux_rs${XYZ}$_vf(j, k, l, E_idx) = & + (s_M*vel_R(dir_idx(1))*(E_R + pres_R - ptilde_R) & + - s_P*vel_L(dir_idx(1))*(E_L + pres_L - ptilde_L) & + + s_M*s_P*(E_L - E_R)) & + /(s_M - s_P) & + + (s_M/s_L)*(s_P/s_R)*pcorr*(vel_R_rms - vel_L_rms)/2._wp + else if (hypoelasticity) then + flux_tau_L = 0._wp; flux_tau_R = 0._wp + $:GPU_LOOP(parallelism='[seq]') + do i = 1, num_dims + flux_tau_L = flux_tau_L + tau_e_L(dir_idx_tau(i))*vel_L(dir_idx(i)) + flux_tau_R = flux_tau_R + tau_e_R(dir_idx_tau(i))*vel_R(dir_idx(i)) + end do + flux_rs${XYZ}$_vf(j, k, l, E_idx) = & + (s_M*(vel_R(dir_idx(1))*(E_R + pres_R) - flux_tau_R) & + - s_P*(vel_L(dir_idx(1))*(E_L + pres_L) - flux_tau_L) & + + s_M*s_P*(E_L - E_R))/(s_M - s_P) + else + flux_rs${XYZ}$_vf(j, k, l, E_idx) = & + (s_M*vel_R(dir_idx(1))*(E_R + pres_R) & + - s_P*vel_L(dir_idx(1))*(E_L + pres_L) & + + s_M*s_P*(E_L - E_R)) & + /(s_M - s_P) & + + (s_M/s_L)*(s_P/s_R)*pcorr*(vel_R_rms - vel_L_rms)/2._wp + end if + + ! Elastic Stresses + if (hypoelasticity) then + do i = 1, strxe - strxb + 1 !TODO: this indexing may be slow + flux_rs${XYZ}$_vf(j, k, l, strxb - 1 + i) = & + (s_M*(rho_R*vel_R(dir_idx(1)) & + *tau_e_R(i)) & + - s_P*(rho_L*vel_L(dir_idx(1)) & + *tau_e_L(i)) & + + s_M*s_P*(rho_L*tau_e_L(i) & + - rho_R*tau_e_R(i))) & + /(s_M - s_P) end do end if - if (cyl_coord .and. hypoelasticity) then - ! += tau_sigmasigma using HLL - flux_gsrc_rs${XYZ}$_vf(j, k, l, contxe + 2) = & - flux_gsrc_rs${XYZ}$_vf(j, k, l, contxe + 2) + & - (s_M*tau_e_R(4) - s_P*tau_e_L(4)) & + ! Advection + $:GPU_LOOP(parallelism='[seq]') + do i = advxb, advxe + flux_rs${XYZ}$_vf(j, k, l, i) = & + (qL_prim_rs${XYZ}$_vf(j, k, l, i) & + - qR_prim_rs${XYZ}$_vf(j + 1, k, l, i)) & + *s_M*s_P/(s_M - s_P) + flux_src_rs${XYZ}$_vf(j, k, l, i) = & + (s_M*qR_prim_rs${XYZ}$_vf(j + 1, k, l, i) & + - s_P*qL_prim_rs${XYZ}$_vf(j, k, l, i)) & /(s_M - s_P) + end do + ! Xi field + !if ( hyperelasticity ) then + ! do i = 1, num_dims + ! flux_rs${XYZ}$_vf(j, k, l, xibeg - 1 + i) = & + ! (s_M*rho_R*vel_R(dir_idx(1))*xi_field_R(i) & + ! - s_P*rho_L*vel_L(dir_idx(1))*xi_field_L(i) & + ! + s_M*s_P*(rho_L*xi_field_L(i) & + ! - rho_R*xi_field_R(i))) & + ! /(s_M - s_P) + ! end do + !end if + + ! Div(U)? + $:GPU_LOOP(parallelism='[seq]') + do i = 1, num_vels + vel_src_rs${XYZ}$_vf(j, k, l, dir_idx(i)) = & + (xi_M*(rho_L*vel_L(dir_idx(i))* & + (s_L - vel_L(dir_idx(1))) - & + pres_L*dir_flg(dir_idx(i))) - & + xi_P*(rho_R*vel_R(dir_idx(i))* & + (s_R - vel_R(dir_idx(1))) - & + pres_R*dir_flg(dir_idx(i)))) & + /(xi_M*rho_L*(s_L - vel_L(dir_idx(1))) - & + xi_P*rho_R*(s_R - vel_R(dir_idx(1)))) + end do + + if (bubbles_euler) then + ! From HLLC: Kills mass transport @ bubble gas density + if (num_fluids > 1) then + flux_rs${XYZ}$_vf(j, k, l, contxe) = 0._wp + end if + end if + + if (chemistry) then $:GPU_LOOP(parallelism='[seq]') - do i = strxb, strxe - flux_gsrc_rs${XYZ}$_vf(j, k, l, i) = flux_rs${XYZ}$_vf(j, k, l, i) + do i = chemxb, chemxe + Y_L = qL_prim_rs${XYZ}$_vf(j, k, l, i) + Y_R = qR_prim_rs${XYZ}$_vf(j + 1, k, l, i) + + flux_rs${XYZ}$_vf(j, k, l, i) = (s_M*Y_R*rho_R*vel_R(dir_idx(1)) & + - s_P*Y_L*rho_L*vel_L(dir_idx(1)) & + + s_M*s_P*(Y_L*rho_L - Y_R*rho_R)) & + /(s_M - s_P) + flux_src_rs${XYZ}$_vf(j, k, l, i) = 0._wp end do end if - #:endif + if (mhd) then + if (n == 0) then ! 1D: d/dx flux only & Bx = Bx0 = const. + ! B_y flux = v_x * B_y - v_y * Bx0 + ! B_z flux = v_x * B_z - v_z * Bx0 + $:GPU_LOOP(parallelism='[seq]') + do i = 0, 1 + flux_rsx_vf(j, k, l, B_idx%beg + i) = (s_M*(vel_R(1)*B%R(2 + i) - vel_R(2 + i)*Bx0) & + - s_P*(vel_L(1)*B%L(2 + i) - vel_L(2 + i)*Bx0) & + + s_M*s_P*(B%L(2 + i) - B%R(2 + i)))/(s_M - s_P) + end do + else ! 2D/3D: Bx, By, Bz /= const. but zero flux component in the same direction + ! B_x d/d${XYZ}$ flux = (1 - delta(x,${XYZ}$)) * (v_${XYZ}$ * B_x - v_x * B_${XYZ}$) + ! B_y d/d${XYZ}$ flux = (1 - delta(y,${XYZ}$)) * (v_${XYZ}$ * B_y - v_y * B_${XYZ}$) + ! B_z d/d${XYZ}$ flux = (1 - delta(z,${XYZ}$)) * (v_${XYZ}$ * B_z - v_z * B_${XYZ}$) + $:GPU_LOOP(parallelism='[seq]') + do i = 0, 2 + flux_rs${XYZ}$_vf(j, k, l, B_idx%beg + i) = (1 - dir_flg(i + 1))*( & + s_M*(vel_R(dir_idx(1))*B%R(i + 1) - vel_R(i + 1)*B%R(norm_dir)) - & + s_P*(vel_L(dir_idx(1))*B%L(i + 1) - vel_L(i + 1)*B%L(norm_dir)) + & + s_M*s_P*(B%L(i + 1) - B%R(i + 1)))/(s_M - s_P) + end do + end if + flux_src_rs${XYZ}$_vf(j, k, l, advxb) = 0._wp + end if + + #:if (NORM_DIR == 2) + if (cyl_coord) then + !Substituting the advective flux into the inviscid geometrical source flux + $:GPU_LOOP(parallelism='[seq]') + do i = 1, E_idx + flux_gsrc_rs${XYZ}$_vf(j, k, l, i) = flux_rs${XYZ}$_vf(j, k, l, i) + end do + ! Recalculating the radial momentum geometric source flux + flux_gsrc_rs${XYZ}$_vf(j, k, l, contxe + 2) = & + flux_rs${XYZ}$_vf(j, k, l, contxe + 2) & + - (s_M*pres_R - s_P*pres_L)/(s_M - s_P) + ! Geometrical source of the void fraction(s) is zero + $:GPU_LOOP(parallelism='[seq]') + do i = advxb, advxe + flux_gsrc_rs${XYZ}$_vf(j, k, l, i) = flux_rs${XYZ}$_vf(j, k, l, i) + end do + end if + + if (cyl_coord .and. hypoelasticity) then + ! += tau_sigmasigma using HLL + flux_gsrc_rs${XYZ}$_vf(j, k, l, contxe + 2) = & + flux_gsrc_rs${XYZ}$_vf(j, k, l, contxe + 2) + & + (s_M*tau_e_R(4) - s_P*tau_e_L(4)) & + /(s_M - s_P) + + $:GPU_LOOP(parallelism='[seq]') + do i = strxb, strxe + flux_gsrc_rs${XYZ}$_vf(j, k, l, i) = flux_rs${XYZ}$_vf(j, k, l, i) + end do + end if + #:endif + + end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP end if #:endfor @@ -1191,1576 +1186,1589 @@ contains ! 6-EQUATION MODEL WITH HLLC if (model_eqns == 3) then !ME3 - $:GPU_PARALLEL_LOOP(collapse=3, private='[vel_L, vel_R, & - & vel_K_Star, Re_L, Re_R, rho_avg, h_avg, & - & gamma_avg, s_L, s_R, s_S, vel_avg_rms, & - & alpha_L, alpha_R, Ys_L, Ys_R, Xs_L, Xs_R, & - & Gamma_iL, Gamma_iR, Cp_iL, Cp_iR, Yi_avg, & - & Phi_avg, h_iL, h_iR, h_avg_2, tau_e_L, & - & tau_e_R, G_L, G_R, flux_ene_e, xi_field_L, & - & xi_field_R, pcorr, zcoef, vel_L_tmp, vel_R_tmp]') - do l = is3%beg, is3%end - do k = is2%beg, is2%end - do j = is1%beg, is1%end - - idx1 = dir_idx(1) - - vel_L_rms = 0._wp; vel_R_rms = 0._wp - rho_L = 0._wp; rho_R = 0._wp - gamma_L = 0._wp; gamma_R = 0._wp - pi_inf_L = 0._wp; pi_inf_R = 0._wp - qv_L = 0._wp; qv_R = 0._wp - alpha_L_sum = 0._wp; alpha_R_sum = 0._wp + #:call GPU_PARALLEL_LOOP(collapse=3, private='[vel_L, vel_R, vel_K_Star, Re_L, Re_R, rho_avg, h_avg, gamma_avg, s_L, s_R, s_S, vel_avg_rms, alpha_L, alpha_R, Ys_L, Ys_R, Xs_L, Xs_R, Gamma_iL, Gamma_iR, Cp_iL, Cp_iR, Yi_avg, Phi_avg, h_iL, h_iR, h_avg_2, tau_e_L, tau_e_R, G_L, G_R, flux_ene_e, xi_field_L, xi_field_R, pcorr, zcoef, vel_L_tmp, vel_R_tmp]') + do l = is3%beg, is3%end + do k = is2%beg, is2%end + do j = is1%beg, is1%end - $:GPU_LOOP(parallelism='[seq]') - do i = 1, num_dims - vel_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, contxe + i) - vel_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, contxe + i) - vel_L_rms = vel_L_rms + vel_L(i)**2._wp - vel_R_rms = vel_R_rms + vel_R(i)**2._wp - end do + idx1 = dir_idx(1) - pres_L = qL_prim_rs${XYZ}$_vf(j, k, l, E_idx) - pres_R = qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx) + vel_L_rms = 0._wp; vel_R_rms = 0._wp + rho_L = 0._wp; rho_R = 0._wp + gamma_L = 0._wp; gamma_R = 0._wp + pi_inf_L = 0._wp; pi_inf_R = 0._wp + qv_L = 0._wp; qv_R = 0._wp + alpha_L_sum = 0._wp; alpha_R_sum = 0._wp - if (mpp_lim) then $:GPU_LOOP(parallelism='[seq]') - do i = 1, num_fluids - qL_prim_rs${XYZ}$_vf(j, k, l, i) = max(0._wp, qL_prim_rs${XYZ}$_vf(j, k, l, i)) - qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i) = min(max(0._wp, qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i)), 1._wp) - qR_prim_rs${XYZ}$_vf(j + 1, k, l, i) = max(0._wp, qR_prim_rs${XYZ}$_vf(j + 1, k, l, i)) - qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + i) = min(max(0._wp, qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + i)), 1._wp) - alpha_L_sum = alpha_L_sum + qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i) - alpha_R_sum = alpha_R_sum + qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + i) - end do - - $:GPU_LOOP(parallelism='[seq]') - do i = 1, num_fluids - qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i) = qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i)/max(alpha_L_sum, sgm_eps) - qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + i)/max(alpha_R_sum, sgm_eps) + do i = 1, num_dims + vel_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, contxe + i) + vel_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, contxe + i) + vel_L_rms = vel_L_rms + vel_L(i)**2._wp + vel_R_rms = vel_R_rms + vel_R(i)**2._wp end do - end if - $:GPU_LOOP(parallelism='[seq]') - do i = 1, num_fluids - rho_L = rho_L + qL_prim_rs${XYZ}$_vf(j, k, l, i) - gamma_L = gamma_L + qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i)*gammas(i) - pi_inf_L = pi_inf_L + qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i)*pi_infs(i) - qv_L = qv_L + qL_prim_rs${XYZ}$_vf(j, k, l, i)*qvs(i) - - rho_R = rho_R + qR_prim_rs${XYZ}$_vf(j + 1, k, l, i) - gamma_R = gamma_R + qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + i)*gammas(i) - pi_inf_R = pi_inf_R + qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + i)*pi_infs(i) - qv_R = qv_R + qR_prim_rs${XYZ}$_vf(j + 1, k, l, i)*qvs(i) - - alpha_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, advxb + i - 1) - alpha_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, advxb + i - 1) - end do + pres_L = qL_prim_rs${XYZ}$_vf(j, k, l, E_idx) + pres_R = qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx) - if (viscous) then - $:GPU_LOOP(parallelism='[seq]') - do i = 1, 2 - Re_L(i) = dflt_real - Re_R(i) = dflt_real - if (Re_size(i) > 0) Re_L(i) = 0._wp - if (Re_size(i) > 0) Re_R(i) = 0._wp + if (mpp_lim) then $:GPU_LOOP(parallelism='[seq]') - do q = 1, Re_size(i) - Re_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + Re_idx(i, q))/Res(i, q) & - + Re_L(i) - Re_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + Re_idx(i, q))/Res(i, q) & - + Re_R(i) + do i = 1, num_fluids + qL_prim_rs${XYZ}$_vf(j, k, l, i) = max(0._wp, qL_prim_rs${XYZ}$_vf(j, k, l, i)) + qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i) = min(max(0._wp, qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i)), 1._wp) + qR_prim_rs${XYZ}$_vf(j + 1, k, l, i) = max(0._wp, qR_prim_rs${XYZ}$_vf(j + 1, k, l, i)) + qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + i) = min(max(0._wp, qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + i)), 1._wp) + alpha_L_sum = alpha_L_sum + qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i) + alpha_R_sum = alpha_R_sum + qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + i) end do - Re_L(i) = 1._wp/max(Re_L(i), sgm_eps) - Re_R(i) = 1._wp/max(Re_R(i), sgm_eps) - end do - end if - - E_L = gamma_L*pres_L + pi_inf_L + 5.e-1_wp*rho_L*vel_L_rms + qv_L - E_R = gamma_R*pres_R + pi_inf_R + 5.e-1_wp*rho_R*vel_R_rms + qv_R - ! ENERGY ADJUSTMENTS FOR HYPOELASTIC ENERGY - if (hypoelasticity) then - $:GPU_LOOP(parallelism='[seq]') - do i = 1, strxe - strxb + 1 - tau_e_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, strxb - 1 + i) - tau_e_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, strxb - 1 + i) - end do - G_L = 0._wp; G_R = 0._wp - $:GPU_LOOP(parallelism='[seq]') - do i = 1, num_fluids - G_L = G_L + alpha_L(i)*Gs(i) - G_R = G_R + alpha_R(i)*Gs(i) - end do - $:GPU_LOOP(parallelism='[seq]') - do i = 1, strxe - strxb + 1 - ! Elastic contribution to energy if G large enough - if ((G_L > verysmall) .and. (G_R > verysmall)) then - E_L = E_L + (tau_e_L(i)*tau_e_L(i))/(4._wp*G_L) - E_R = E_R + (tau_e_R(i)*tau_e_R(i))/(4._wp*G_R) - ! Additional terms in 2D and 3D - if ((i == 2) .or. (i == 4) .or. (i == 5)) then - E_L = E_L + (tau_e_L(i)*tau_e_L(i))/(4._wp*G_L) - E_R = E_R + (tau_e_R(i)*tau_e_R(i))/(4._wp*G_R) - end if - end if - end do - end if + $:GPU_LOOP(parallelism='[seq]') + do i = 1, num_fluids + qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i) = qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i)/max(alpha_L_sum, sgm_eps) + qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + i)/max(alpha_R_sum, sgm_eps) + end do + end if - ! ENERGY ADJUSTMENTS FOR HYPERELASTIC ENERGY - if (hyperelasticity) then - $:GPU_LOOP(parallelism='[seq]') - do i = 1, num_dims - xi_field_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, xibeg - 1 + i) - xi_field_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, xibeg - 1 + i) - end do - G_L = 0._wp; G_R = 0._wp; $:GPU_LOOP(parallelism='[seq]') do i = 1, num_fluids - ! Mixture left and right shear modulus - G_L = G_L + alpha_L(i)*Gs(i) - G_R = G_R + alpha_R(i)*Gs(i) - end do - ! Elastic contribution to energy if G large enough - if (G_L > verysmall .and. G_R > verysmall) then - E_L = E_L + G_L*qL_prim_rs${XYZ}$_vf(j, k, l, xiend + 1) - E_R = E_R + G_R*qR_prim_rs${XYZ}$_vf(j + 1, k, l, xiend + 1) - end if - $:GPU_LOOP(parallelism='[seq]') - do i = 1, b_size - 1 - tau_e_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, strxb - 1 + i) - tau_e_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, strxb - 1 + i) - end do - end if + rho_L = rho_L + qL_prim_rs${XYZ}$_vf(j, k, l, i) + gamma_L = gamma_L + qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i)*gammas(i) + pi_inf_L = pi_inf_L + qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i)*pi_infs(i) + qv_L = qv_L + qL_prim_rs${XYZ}$_vf(j, k, l, i)*qvs(i) - H_L = (E_L + pres_L)/rho_L - H_R = (E_R + pres_R)/rho_R + rho_R = rho_R + qR_prim_rs${XYZ}$_vf(j + 1, k, l, i) + gamma_R = gamma_R + qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + i)*gammas(i) + pi_inf_R = pi_inf_R + qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + i)*pi_infs(i) + qv_R = qv_R + qR_prim_rs${XYZ}$_vf(j + 1, k, l, i)*qvs(i) - @:compute_average_state() + alpha_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, advxb + i - 1) + alpha_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, advxb + i - 1) + end do - call s_compute_speed_of_sound(pres_L, rho_L, gamma_L, pi_inf_L, H_L, alpha_L, & - vel_L_rms, 0._wp, c_L) + if (viscous) then + $:GPU_LOOP(parallelism='[seq]') + do i = 1, 2 + Re_L(i) = dflt_real + Re_R(i) = dflt_real + if (Re_size(i) > 0) Re_L(i) = 0._wp + if (Re_size(i) > 0) Re_R(i) = 0._wp + $:GPU_LOOP(parallelism='[seq]') + do q = 1, Re_size(i) + Re_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + Re_idx(i, q))/Res_gs(i, q) & + + Re_L(i) + Re_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + Re_idx(i, q))/Res_gs(i, q) & + + Re_R(i) + end do + Re_L(i) = 1._wp/max(Re_L(i), sgm_eps) + Re_R(i) = 1._wp/max(Re_R(i), sgm_eps) + end do + end if + if (viscous) then + $:GPU_LOOP(parallelism='[seq]') + do i = 1, 2 + Re_L(i) = dflt_real - call s_compute_speed_of_sound(pres_R, rho_R, gamma_R, pi_inf_R, H_R, alpha_R, & - vel_R_rms, 0._wp, c_R) + if (Re_size(i) > 0) Re_L(i) = 0._wp - !> The computation of c_avg does not require all the variables, and therefore the non '_avg' - ! variables are placeholders to call the subroutine. - call s_compute_speed_of_sound(pres_R, rho_avg, gamma_avg, pi_inf_R, H_avg, alpha_R, & - vel_avg_rms, 0._wp, c_avg) + $:GPU_LOOP(parallelism='[seq]') + do q = 1, Re_size(i) + Re_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + Re_idx(i, q))/Res_gs(i, q) & + + Re_L(i) + end do - if (viscous) then - $:GPU_LOOP(parallelism='[seq]') - do i = 1, 2 - Re_avg_rs${XYZ}$_vf(j, k, l, i) = 2._wp/(1._wp/Re_L(i) + 1._wp/Re_R(i)) - end do - end if + Re_L(i) = 1._wp/max(Re_L(i), sgm_eps) - ! Low Mach correction - if (low_Mach == 2) then - @:compute_low_Mach_correction() - end if + end do - ! COMPUTING THE DIRECT WAVE SPEEDS - if (wave_speeds == 1) then - if (elasticity) then - s_L = min(vel_L(dir_idx(1)) - sqrt(c_L*c_L + & - (((4._wp*G_L)/3._wp) + tau_e_L(dir_idx_tau(1)))/rho_L), vel_R(dir_idx(1)) - sqrt(c_R*c_R + & - (((4._wp*G_R)/3._wp) + tau_e_R(dir_idx_tau(1)))/rho_R)) - s_R = max(vel_R(dir_idx(1)) + sqrt(c_R*c_R + & - (((4._wp*G_R)/3._wp) + tau_e_R(dir_idx_tau(1)))/rho_R), vel_L(dir_idx(1)) + sqrt(c_L*c_L + & - (((4._wp*G_L)/3._wp) + tau_e_L(dir_idx_tau(1)))/rho_L)) - s_S = (pres_R - tau_e_R(dir_idx_tau(1)) - pres_L + & - tau_e_L(dir_idx_tau(1)) + rho_L*vel_L(idx1)*(s_L - vel_L(idx1)) - & - rho_R*vel_R(idx1)*(s_R - vel_R(idx1)))/(rho_L*(s_L - vel_L(idx1)) - & - rho_R*(s_R - vel_R(idx1))) - else - s_L = min(vel_L(dir_idx(1)) - c_L, vel_R(dir_idx(1)) - c_R) - s_R = max(vel_R(dir_idx(1)) + c_R, vel_L(dir_idx(1)) + c_L) - s_S = (pres_R - pres_L + rho_L*vel_L(dir_idx(1))* & - (s_L - vel_L(dir_idx(1))) - rho_R*vel_R(dir_idx(1))*(s_R - vel_R(dir_idx(1)))) & - /(rho_L*(s_L - vel_L(dir_idx(1))) - rho_R*(s_R - vel_R(dir_idx(1)))) + $:GPU_LOOP(parallelism='[seq]') + do i = 1, 2 + Re_R(i) = dflt_real - end if - elseif (wave_speeds == 2) then - pres_SL = 5.e-1_wp*(pres_L + pres_R + rho_avg*c_avg* & - (vel_L(dir_idx(1)) - & - vel_R(dir_idx(1)))) + if (Re_size(i) > 0) Re_R(i) = 0._wp - pres_SR = pres_SL + $:GPU_LOOP(parallelism='[seq]') + do q = 1, Re_size(i) + Re_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + Re_idx(i, q))/Res_gs(i, q) & + + Re_R(i) + end do - Ms_L = max(1._wp, sqrt(1._wp + ((5.e-1_wp + gamma_L)/(1._wp + gamma_L))* & - (pres_SL/pres_L - 1._wp)*pres_L/ & - ((pres_L + pi_inf_L/(1._wp + gamma_L))))) - Ms_R = max(1._wp, sqrt(1._wp + ((5.e-1_wp + gamma_R)/(1._wp + gamma_R))* & - (pres_SR/pres_R - 1._wp)*pres_R/ & - ((pres_R + pi_inf_R/(1._wp + gamma_R))))) + Re_R(i) = 1._wp/max(Re_R(i), sgm_eps) + end do + end if - s_L = vel_L(dir_idx(1)) - c_L*Ms_L - s_R = vel_R(dir_idx(1)) + c_R*Ms_R + E_L = gamma_L*pres_L + pi_inf_L + 5.e-1_wp*rho_L*vel_L_rms + qv_L + E_R = gamma_R*pres_R + pi_inf_R + 5.e-1_wp*rho_R*vel_R_rms + qv_R - s_S = 5.e-1_wp*((vel_L(dir_idx(1)) + vel_R(dir_idx(1))) + & - (pres_L - pres_R)/ & - (rho_avg*c_avg)) - end if + ! ENERGY ADJUSTMENTS FOR HYPOELASTIC ENERGY + if (hypoelasticity) then + $:GPU_LOOP(parallelism='[seq]') + do i = 1, strxe - strxb + 1 + tau_e_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, strxb - 1 + i) + tau_e_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, strxb - 1 + i) + end do + G_L = 0._wp; G_R = 0._wp + $:GPU_LOOP(parallelism='[seq]') + do i = 1, num_fluids + G_L = G_L + alpha_L(i)*Gs_rs(i) + G_R = G_R + alpha_R(i)*Gs_rs(i) + end do + $:GPU_LOOP(parallelism='[seq]') + do i = 1, strxe - strxb + 1 + ! Elastic contribution to energy if G large enough + if ((G_L > verysmall) .and. (G_R > verysmall)) then + E_L = E_L + (tau_e_L(i)*tau_e_L(i))/(4._wp*G_L) + E_R = E_R + (tau_e_R(i)*tau_e_R(i))/(4._wp*G_R) + ! Additional terms in 2D and 3D + if ((i == 2) .or. (i == 4) .or. (i == 5)) then + E_L = E_L + (tau_e_L(i)*tau_e_L(i))/(4._wp*G_L) + E_R = E_R + (tau_e_R(i)*tau_e_R(i))/(4._wp*G_R) + end if + end if + end do + end if - ! follows Einfeldt et al. - ! s_M/P = min/max(0.,s_L/R) - s_M = min(0._wp, s_L); s_P = max(0._wp, s_R) + ! ENERGY ADJUSTMENTS FOR HYPERELASTIC ENERGY + if (hyperelasticity) then + $:GPU_LOOP(parallelism='[seq]') + do i = 1, num_dims + xi_field_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, xibeg - 1 + i) + xi_field_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, xibeg - 1 + i) + end do + G_L = 0._wp; G_R = 0._wp; + $:GPU_LOOP(parallelism='[seq]') + do i = 1, num_fluids + ! Mixture left and right shear modulus + G_L = G_L + alpha_L(i)*Gs_rs(i) + G_R = G_R + alpha_R(i)*Gs_rs(i) + end do + ! Elastic contribution to energy if G large enough + if (G_L > verysmall .and. G_R > verysmall) then + E_L = E_L + G_L*qL_prim_rs${XYZ}$_vf(j, k, l, xiend + 1) + E_R = E_R + G_R*qR_prim_rs${XYZ}$_vf(j + 1, k, l, xiend + 1) + end if + $:GPU_LOOP(parallelism='[seq]') + do i = 1, b_size - 1 + tau_e_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, strxb - 1 + i) + tau_e_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, strxb - 1 + i) + end do + end if + + H_L = (E_L + pres_L)/rho_L + H_R = (E_R + pres_R)/rho_R - ! goes with q_star_L/R = xi_L/R * (variable) - ! xi_L/R = ( ( s_L/R - u_L/R )/(s_L/R - s_star) ) - xi_L = (s_L - vel_L(idx1))/(s_L - s_S) - xi_R = (s_R - vel_R(idx1))/(s_R - s_S) + @:compute_average_state() - ! goes with numerical star velocity in x/y/z directions - ! xi_P/M = 0.5 +/m sgn(0.5,s_star) - xi_M = (5.e-1_wp + sign(0.5_wp, s_S)) - xi_P = (5.e-1_wp - sign(0.5_wp, s_S)) + call s_compute_speed_of_sound(pres_L, rho_L, gamma_L, pi_inf_L, H_L, alpha_L, & + vel_L_rms, 0._wp, c_L) - ! goes with the numerical velocity in x/y/z directions - ! xi_P/M (pressure) = min/max(0. sgn(1,sL/sR)) - xi_MP = -min(0._wp, sign(1._wp, s_L)) - xi_PP = max(0._wp, sign(1._wp, s_R)) + call s_compute_speed_of_sound(pres_R, rho_R, gamma_R, pi_inf_R, H_R, alpha_R, & + vel_R_rms, 0._wp, c_R) - E_star = xi_M*(E_L + xi_MP*(xi_L*(E_L + (s_S - vel_L(dir_idx(1)))* & - (rho_L*s_S + pres_L/(s_L - vel_L(dir_idx(1))))) - E_L)) + & - xi_P*(E_R + xi_PP*(xi_R*(E_R + (s_S - vel_R(dir_idx(1)))* & - (rho_R*s_S + pres_R/(s_R - vel_R(dir_idx(1))))) - E_R)) - p_Star = xi_M*(pres_L + xi_MP*(rho_L*(s_L - vel_L(dir_idx(1)))*(s_S - vel_L(dir_idx(1))))) + & - xi_P*(pres_R + xi_PP*(rho_R*(s_R - vel_R(dir_idx(1)))*(s_S - vel_R(dir_idx(1))))) + !> The computation of c_avg does not require all the variables, and therefore the non '_avg' + ! variables are placeholders to call the subroutine. + call s_compute_speed_of_sound(pres_R, rho_avg, gamma_avg, pi_inf_R, H_avg, alpha_R, & + vel_avg_rms, 0._wp, c_avg) - rho_Star = xi_M*(rho_L*(xi_MP*xi_L + 1._wp - xi_MP)) + & - xi_P*(rho_R*(xi_PP*xi_R + 1._wp - xi_PP)) + if (viscous) then + $:GPU_LOOP(parallelism='[seq]') + do i = 1, 2 + Re_avg_rs${XYZ}$_vf(j, k, l, i) = 2._wp/(1._wp/Re_L(i) + 1._wp/Re_R(i)) + end do + end if - vel_K_Star = vel_L(idx1)*(1._wp - xi_MP) + xi_MP*vel_R(idx1) + & - xi_MP*xi_PP*(s_S - vel_R(idx1)) + ! Low Mach correction + if (low_Mach == 2) then + @:compute_low_Mach_correction() + end if - ! Low Mach correction - if (low_Mach == 1) then - @:compute_low_Mach_correction() - else - pcorr = 0._wp - end if + ! COMPUTING THE DIRECT WAVE SPEEDS + if (wave_speeds == 1) then + if (elasticity) then + s_L = min(vel_L(dir_idx(1)) - sqrt(c_L*c_L + & + (((4._wp*G_L)/3._wp) + tau_e_L(dir_idx_tau(1)))/rho_L), vel_R(dir_idx(1)) - sqrt(c_R*c_R + & + (((4._wp*G_R)/3._wp) + tau_e_R(dir_idx_tau(1)))/rho_R)) + s_R = max(vel_R(dir_idx(1)) + sqrt(c_R*c_R + & + (((4._wp*G_R)/3._wp) + tau_e_R(dir_idx_tau(1)))/rho_R), vel_L(dir_idx(1)) + sqrt(c_L*c_L + & + (((4._wp*G_L)/3._wp) + tau_e_L(dir_idx_tau(1)))/rho_L)) + s_S = (pres_R - tau_e_R(dir_idx_tau(1)) - pres_L + & + tau_e_L(dir_idx_tau(1)) + rho_L*vel_L(idx1)*(s_L - vel_L(idx1)) - & + rho_R*vel_R(idx1)*(s_R - vel_R(idx1)))/(rho_L*(s_L - vel_L(idx1)) - & + rho_R*(s_R - vel_R(idx1))) + else + s_L = min(vel_L(dir_idx(1)) - c_L, vel_R(dir_idx(1)) - c_R) + s_R = max(vel_R(dir_idx(1)) + c_R, vel_L(dir_idx(1)) + c_L) + s_S = (pres_R - pres_L + rho_L*vel_L(dir_idx(1))* & + (s_L - vel_L(dir_idx(1))) - rho_R*vel_R(dir_idx(1))*(s_R - vel_R(dir_idx(1)))) & + /(rho_L*(s_L - vel_L(dir_idx(1))) - rho_R*(s_R - vel_R(dir_idx(1)))) - ! COMPUTING FLUXES - ! MASS FLUX. - $:GPU_LOOP(parallelism='[seq]') - do i = 1, contxe - flux_rs${XYZ}$_vf(j, k, l, i) = & - xi_M*qL_prim_rs${XYZ}$_vf(j, k, l, i)*(vel_L(idx1) + s_M*(xi_L - 1._wp)) + & - xi_P*qR_prim_rs${XYZ}$_vf(j + 1, k, l, i)*(vel_R(idx1) + s_P*(xi_R - 1._wp)) - end do + end if + elseif (wave_speeds == 2) then + pres_SL = 5.e-1_wp*(pres_L + pres_R + rho_avg*c_avg* & + (vel_L(dir_idx(1)) - & + vel_R(dir_idx(1)))) + + pres_SR = pres_SL + + Ms_L = max(1._wp, sqrt(1._wp + ((5.e-1_wp + gamma_L)/(1._wp + gamma_L))* & + (pres_SL/pres_L - 1._wp)*pres_L/ & + ((pres_L + pi_inf_L/(1._wp + gamma_L))))) + Ms_R = max(1._wp, sqrt(1._wp + ((5.e-1_wp + gamma_R)/(1._wp + gamma_R))* & + (pres_SR/pres_R - 1._wp)*pres_R/ & + ((pres_R + pi_inf_R/(1._wp + gamma_R))))) + + s_L = vel_L(dir_idx(1)) - c_L*Ms_L + s_R = vel_R(dir_idx(1)) + c_R*Ms_R + + s_S = 5.e-1_wp*((vel_L(dir_idx(1)) + vel_R(dir_idx(1))) + & + (pres_L - pres_R)/ & + (rho_avg*c_avg)) + end if - ! MOMENTUM FLUX. - ! f = \rho u u - \sigma, q = \rho u, q_star = \xi * \rho*(s_star, v, w) - $:GPU_LOOP(parallelism='[seq]') - do i = 1, num_dims - idxi = dir_idx(i) - flux_rs${XYZ}$_vf(j, k, l, contxe + idxi) = rho_Star*vel_K_Star* & - (dir_flg(idxi)*vel_K_Star + (1._wp - dir_flg(idxi))*(xi_M*vel_L(idxi) + xi_P*vel_R(idxi))) + dir_flg(idxi)*p_Star & - + (s_M/s_L)*(s_P/s_R)*dir_flg(idxi)*pcorr - end do + ! follows Einfeldt et al. + ! s_M/P = min/max(0.,s_L/R) + s_M = min(0._wp, s_L); s_P = max(0._wp, s_R) + + ! goes with q_star_L/R = xi_L/R * (variable) + ! xi_L/R = ( ( s_L/R - u_L/R )/(s_L/R - s_star) ) + xi_L = (s_L - vel_L(idx1))/(s_L - s_S) + xi_R = (s_R - vel_R(idx1))/(s_R - s_S) + + ! goes with numerical star velocity in x/y/z directions + ! xi_P/M = 0.5 +/m sgn(0.5,s_star) + xi_M = (5.e-1_wp + sign(0.5_wp, s_S)) + xi_P = (5.e-1_wp - sign(0.5_wp, s_S)) + + ! goes with the numerical velocity in x/y/z directions + ! xi_P/M (pressure) = min/max(0. sgn(1,sL/sR)) + xi_MP = -min(0._wp, sign(1._wp, s_L)) + xi_PP = max(0._wp, sign(1._wp, s_R)) + + E_star = xi_M*(E_L + xi_MP*(xi_L*(E_L + (s_S - vel_L(dir_idx(1)))* & + (rho_L*s_S + pres_L/(s_L - vel_L(dir_idx(1))))) - E_L)) + & + xi_P*(E_R + xi_PP*(xi_R*(E_R + (s_S - vel_R(dir_idx(1)))* & + (rho_R*s_S + pres_R/(s_R - vel_R(dir_idx(1))))) - E_R)) + p_Star = xi_M*(pres_L + xi_MP*(rho_L*(s_L - vel_L(dir_idx(1)))*(s_S - vel_L(dir_idx(1))))) + & + xi_P*(pres_R + xi_PP*(rho_R*(s_R - vel_R(dir_idx(1)))*(s_S - vel_R(dir_idx(1))))) + + rho_Star = xi_M*(rho_L*(xi_MP*xi_L + 1._wp - xi_MP)) + & + xi_P*(rho_R*(xi_PP*xi_R + 1._wp - xi_PP)) + + vel_K_Star = vel_L(idx1)*(1._wp - xi_MP) + xi_MP*vel_R(idx1) + & + xi_MP*xi_PP*(s_S - vel_R(idx1)) + + ! Low Mach correction + if (low_Mach == 1) then + @:compute_low_Mach_correction() + else + pcorr = 0._wp + end if - ! ENERGY FLUX. - ! f = u*(E-\sigma), q = E, q_star = \xi*E+(s-u)(\rho s_star - \sigma/(s-u)) - flux_rs${XYZ}$_vf(j, k, l, E_idx) = (E_star + p_Star)*vel_K_Star & - + (s_M/s_L)*(s_P/s_R)*pcorr*s_S + ! COMPUTING FLUXES + ! MASS FLUX. + $:GPU_LOOP(parallelism='[seq]') + do i = 1, contxe + flux_rs${XYZ}$_vf(j, k, l, i) = & + xi_M*qL_prim_rs${XYZ}$_vf(j, k, l, i)*(vel_L(idx1) + s_M*(xi_L - 1._wp)) + & + xi_P*qR_prim_rs${XYZ}$_vf(j + 1, k, l, i)*(vel_R(idx1) + s_P*(xi_R - 1._wp)) + end do - ! ELASTICITY. Elastic shear stress additions for the momentum and energy flux - if (elasticity) then - flux_ene_e = 0._wp; + ! MOMENTUM FLUX. + ! f = \rho u u - \sigma, q = \rho u, q_star = \xi * \rho*(s_star, v, w) $:GPU_LOOP(parallelism='[seq]') do i = 1, num_dims idxi = dir_idx(i) - ! MOMENTUM ELASTIC FLUX. - flux_rs${XYZ}$_vf(j, k, l, contxe + idxi) = & - flux_rs${XYZ}$_vf(j, k, l, contxe + idxi) & - - xi_M*tau_e_L(dir_idx_tau(i)) - xi_P*tau_e_R(dir_idx_tau(i)) - ! ENERGY ELASTIC FLUX. - flux_ene_e = flux_ene_e - & - xi_M*(vel_L(idxi)*tau_e_L(dir_idx_tau(i)) + & - s_M*(xi_L*((s_S - vel_L(i))*(tau_e_L(dir_idx_tau(i))/(s_L - vel_L(i)))))) - & - xi_P*(vel_R(idxi)*tau_e_R(dir_idx_tau(i)) + & - s_P*(xi_R*((s_S - vel_R(i))*(tau_e_R(dir_idx_tau(i))/(s_R - vel_R(i)))))) + flux_rs${XYZ}$_vf(j, k, l, contxe + idxi) = rho_Star*vel_K_Star* & + (dir_flg(idxi)*vel_K_Star + (1._wp - dir_flg(idxi))*(xi_M*vel_L(idxi) + xi_P*vel_R(idxi))) + dir_flg(idxi)*p_Star & + + (s_M/s_L)*(s_P/s_R)*dir_flg(idxi)*pcorr end do - flux_rs${XYZ}$_vf(j, k, l, E_idx) = flux_rs${XYZ}$_vf(j, k, l, E_idx) + flux_ene_e - end if - - ! VOLUME FRACTION FLUX. - $:GPU_LOOP(parallelism='[seq]') - do i = advxb, advxe - flux_rs${XYZ}$_vf(j, k, l, i) = & - xi_M*qL_prim_rs${XYZ}$_vf(j, k, l, i)*s_S + & - xi_P*qR_prim_rs${XYZ}$_vf(j + 1, k, l, i)*s_S - end do - - ! SOURCE TERM FOR VOLUME FRACTION ADVECTION FLUX. - $:GPU_LOOP(parallelism='[seq]') - do i = 1, num_dims - idxi = dir_idx(i) - vel_src_rs${XYZ}$_vf(j, k, l, idxi) = & - xi_M*(vel_L(idxi) + dir_flg(idxi)*(s_S*(xi_MP*(xi_L - 1) + 1) - vel_L(idxi))) + & - xi_P*(vel_R(idxi) + dir_flg(idxi)*(s_S*(xi_PP*(xi_R - 1) + 1) - vel_R(idxi))) - end do - ! INTERNAL ENERGIES ADVECTION FLUX. - ! K-th pressure and velocity in preparation for the internal energy flux - $:GPU_LOOP(parallelism='[seq]') - do i = 1, num_fluids - p_K_Star = xi_M*(xi_MP*((pres_L + pi_infs(i)/(1._wp + gammas(i)))* & - xi_L**(1._wp/gammas(i) + 1._wp) - pi_infs(i)/(1._wp + gammas(i)) - pres_L) + pres_L) + & - xi_P*(xi_PP*((pres_R + pi_infs(i)/(1._wp + gammas(i)))* & - xi_R**(1._wp/gammas(i) + 1._wp) - pi_infs(i)/(1._wp + gammas(i)) - pres_R) + pres_R) - - flux_rs${XYZ}$_vf(j, k, l, i + intxb - 1) = & - ((xi_M*qL_prim_rs${XYZ}$_vf(j, k, l, i + advxb - 1) + xi_P*qR_prim_rs${XYZ}$_vf(j + 1, k, l, i + advxb - 1))* & - (gammas(i)*p_K_Star + pi_infs(i)) + & - (xi_M*qL_prim_rs${XYZ}$_vf(j, k, l, i + contxb - 1) + xi_P*qR_prim_rs${XYZ}$_vf(j + 1, k, l, i + contxb - 1))* & - qvs(i))*vel_K_Star & - + (s_M/s_L)*(s_P/s_R)*pcorr*s_S*(xi_M*qL_prim_rs${XYZ}$_vf(j, k, l, i + advxb - 1) + xi_P*qR_prim_rs${XYZ}$_vf(j + 1, k, l, i + advxb - 1)) - end do + ! ENERGY FLUX. + ! f = u*(E-\sigma), q = E, q_star = \xi*E+(s-u)(\rho s_star - \sigma/(s-u)) + flux_rs${XYZ}$_vf(j, k, l, E_idx) = (E_star + p_Star)*vel_K_Star & + + (s_M/s_L)*(s_P/s_R)*pcorr*s_S - flux_src_rs${XYZ}$_vf(j, k, l, advxb) = vel_src_rs${XYZ}$_vf(j, k, l, idx1) + ! ELASTICITY. Elastic shear stress additions for the momentum and energy flux + if (elasticity) then + flux_ene_e = 0._wp; + $:GPU_LOOP(parallelism='[seq]') + do i = 1, num_dims + idxi = dir_idx(i) + ! MOMENTUM ELASTIC FLUX. + flux_rs${XYZ}$_vf(j, k, l, contxe + idxi) = & + flux_rs${XYZ}$_vf(j, k, l, contxe + idxi) & + - xi_M*tau_e_L(dir_idx_tau(i)) - xi_P*tau_e_R(dir_idx_tau(i)) + ! ENERGY ELASTIC FLUX. + flux_ene_e = flux_ene_e - & + xi_M*(vel_L(idxi)*tau_e_L(dir_idx_tau(i)) + & + s_M*(xi_L*((s_S - vel_L(i))*(tau_e_L(dir_idx_tau(i))/(s_L - vel_L(i)))))) - & + xi_P*(vel_R(idxi)*tau_e_R(dir_idx_tau(i)) + & + s_P*(xi_R*((s_S - vel_R(i))*(tau_e_R(dir_idx_tau(i))/(s_R - vel_R(i)))))) + end do + flux_rs${XYZ}$_vf(j, k, l, E_idx) = flux_rs${XYZ}$_vf(j, k, l, E_idx) + flux_ene_e + end if - ! HYPOELASTIC STRESS EVOLUTION FLUX. - if (hypoelasticity) then + ! VOLUME FRACTION FLUX. $:GPU_LOOP(parallelism='[seq]') - do i = 1, strxe - strxb + 1 - flux_rs${XYZ}$_vf(j, k, l, strxb - 1 + i) = & - xi_M*(s_S/(s_L - s_S))*(s_L*rho_L*tau_e_L(i) - rho_L*vel_L(idx1)*tau_e_L(i)) + & - xi_P*(s_S/(s_R - s_S))*(s_R*rho_R*tau_e_R(i) - rho_R*vel_R(idx1)*tau_e_R(i)) + do i = advxb, advxe + flux_rs${XYZ}$_vf(j, k, l, i) = & + xi_M*qL_prim_rs${XYZ}$_vf(j, k, l, i)*s_S + & + xi_P*qR_prim_rs${XYZ}$_vf(j + 1, k, l, i)*s_S end do - end if - ! REFERENCE MAP FLUX. - if (hyperelasticity) then + ! SOURCE TERM FOR VOLUME FRACTION ADVECTION FLUX. $:GPU_LOOP(parallelism='[seq]') do i = 1, num_dims - flux_rs${XYZ}$_vf(j, k, l, xibeg - 1 + i) = & - xi_M*(s_S/(s_L - s_S))*(s_L*rho_L*xi_field_L(i) & - - rho_L*vel_L(idx1)*xi_field_L(i)) + & - xi_P*(s_S/(s_R - s_S))*(s_R*rho_R*xi_field_R(i) & - - rho_R*vel_R(idx1)*xi_field_R(i)) + idxi = dir_idx(i) + vel_src_rs${XYZ}$_vf(j, k, l, idxi) = & + xi_M*(vel_L(idxi) + dir_flg(idxi)*(s_S*(xi_MP*(xi_L - 1) + 1) - vel_L(idxi))) + & + xi_P*(vel_R(idxi) + dir_flg(idxi)*(s_S*(xi_PP*(xi_R - 1) + 1) - vel_R(idxi))) end do - end if - ! COLOR FUNCTION FLUX - if (surface_tension) then - flux_rs${XYZ}$_vf(j, k, l, c_idx) = & - (xi_M*qL_prim_rs${XYZ}$_vf(j, k, l, c_idx) + & - xi_P*qR_prim_rs${XYZ}$_vf(j + 1, k, l, c_idx))*s_S - end if + ! INTERNAL ENERGIES ADVECTION FLUX. + ! K-th pressure and velocity in preparation for the internal energy flux + $:GPU_LOOP(parallelism='[seq]') + do i = 1, num_fluids + p_K_Star = xi_M*(xi_MP*((pres_L + pi_infs(i)/(1._wp + gammas(i)))* & + xi_L**(1._wp/gammas(i) + 1._wp) - pi_infs(i)/(1._wp + gammas(i)) - pres_L) + pres_L) + & + xi_P*(xi_PP*((pres_R + pi_infs(i)/(1._wp + gammas(i)))* & + xi_R**(1._wp/gammas(i) + 1._wp) - pi_infs(i)/(1._wp + gammas(i)) - pres_R) + pres_R) + + flux_rs${XYZ}$_vf(j, k, l, i + intxb - 1) = & + ((xi_M*qL_prim_rs${XYZ}$_vf(j, k, l, i + advxb - 1) + xi_P*qR_prim_rs${XYZ}$_vf(j + 1, k, l, i + advxb - 1))* & + (gammas(i)*p_K_Star + pi_infs(i)) + & + (xi_M*qL_prim_rs${XYZ}$_vf(j, k, l, i + contxb - 1) + xi_P*qR_prim_rs${XYZ}$_vf(j + 1, k, l, i + contxb - 1))* & + qvs(i))*vel_K_Star & + + (s_M/s_L)*(s_P/s_R)*pcorr*s_S*(xi_M*qL_prim_rs${XYZ}$_vf(j, k, l, i + advxb - 1) + xi_P*qR_prim_rs${XYZ}$_vf(j + 1, k, l, i + advxb - 1)) + end do - ! Geometrical source flux for cylindrical coordinates - #:if (NORM_DIR == 2) - if (cyl_coord) then - !Substituting the advective flux into the inviscid geometrical source flux - $:GPU_LOOP(parallelism='[seq]') - do i = 1, E_idx - flux_gsrc_rs${XYZ}$_vf(j, k, l, i) = flux_rs${XYZ}$_vf(j, k, l, i) - end do - $:GPU_LOOP(parallelism='[seq]') - do i = intxb, intxe - flux_gsrc_rs${XYZ}$_vf(j, k, l, i) = flux_rs${XYZ}$_vf(j, k, l, i) - end do - ! Recalculating the radial momentum geometric source flux - flux_gsrc_rs${XYZ}$_vf(j, k, l, momxb - 1 + dir_idx(1)) = & - flux_gsrc_rs${XYZ}$_vf(j, k, l, momxb - 1 + dir_idx(1)) - p_Star - ! Geometrical source of the void fraction(s) is zero + flux_src_rs${XYZ}$_vf(j, k, l, advxb) = vel_src_rs${XYZ}$_vf(j, k, l, idx1) + + ! HYPOELASTIC STRESS EVOLUTION FLUX. + if (hypoelasticity) then $:GPU_LOOP(parallelism='[seq]') - do i = advxb, advxe - flux_gsrc_rs${XYZ}$_vf(j, k, l, i) = 0._wp + do i = 1, strxe - strxb + 1 + flux_rs${XYZ}$_vf(j, k, l, strxb - 1 + i) = & + xi_M*(s_S/(s_L - s_S))*(s_L*rho_L*tau_e_L(i) - rho_L*vel_L(idx1)*tau_e_L(i)) + & + xi_P*(s_S/(s_R - s_S))*(s_R*rho_R*tau_e_R(i) - rho_R*vel_R(idx1)*tau_e_R(i)) end do end if - #:endif - #:if (NORM_DIR == 3) - if (grid_geometry == 3) then + + ! REFERENCE MAP FLUX. + if (hyperelasticity) then $:GPU_LOOP(parallelism='[seq]') - do i = 1, sys_size - flux_gsrc_rs${XYZ}$_vf(j, k, l, i) = 0._wp + do i = 1, num_dims + flux_rs${XYZ}$_vf(j, k, l, xibeg - 1 + i) = & + xi_M*(s_S/(s_L - s_S))*(s_L*rho_L*xi_field_L(i) & + - rho_L*vel_L(idx1)*xi_field_L(i)) + & + xi_P*(s_S/(s_R - s_S))*(s_R*rho_R*xi_field_R(i) & + - rho_R*vel_R(idx1)*xi_field_R(i)) end do - flux_gsrc_rs${XYZ}$_vf(j, k, l, momxb - 1 + dir_idx(1)) = & - flux_gsrc_rs${XYZ}$_vf(j, k, l, momxb - 1 + dir_idx(1)) - p_Star + end if - flux_gsrc_rs${XYZ}$_vf(j, k, l, momxe) = flux_rs${XYZ}$_vf(j, k, l, momxb + 1) + ! COLOR FUNCTION FLUX + if (surface_tension) then + flux_rs${XYZ}$_vf(j, k, l, c_idx) = & + (xi_M*qL_prim_rs${XYZ}$_vf(j, k, l, c_idx) + & + xi_P*qR_prim_rs${XYZ}$_vf(j + 1, k, l, c_idx))*s_S end if - #:endif + ! Geometrical source flux for cylindrical coordinates + #:if (NORM_DIR == 2) + if (cyl_coord) then + !Substituting the advective flux into the inviscid geometrical source flux + $:GPU_LOOP(parallelism='[seq]') + do i = 1, E_idx + flux_gsrc_rs${XYZ}$_vf(j, k, l, i) = flux_rs${XYZ}$_vf(j, k, l, i) + end do + $:GPU_LOOP(parallelism='[seq]') + do i = intxb, intxe + flux_gsrc_rs${XYZ}$_vf(j, k, l, i) = flux_rs${XYZ}$_vf(j, k, l, i) + end do + ! Recalculating the radial momentum geometric source flux + flux_gsrc_rs${XYZ}$_vf(j, k, l, momxb - 1 + dir_idx(1)) = & + flux_gsrc_rs${XYZ}$_vf(j, k, l, momxb - 1 + dir_idx(1)) - p_Star + ! Geometrical source of the void fraction(s) is zero + $:GPU_LOOP(parallelism='[seq]') + do i = advxb, advxe + flux_gsrc_rs${XYZ}$_vf(j, k, l, i) = 0._wp + end do + end if + #:endif + #:if (NORM_DIR == 3) + if (grid_geometry == 3) then + $:GPU_LOOP(parallelism='[seq]') + do i = 1, sys_size + flux_gsrc_rs${XYZ}$_vf(j, k, l, i) = 0._wp + end do + flux_gsrc_rs${XYZ}$_vf(j, k, l, momxb - 1 + dir_idx(1)) = & + flux_gsrc_rs${XYZ}$_vf(j, k, l, momxb - 1 + dir_idx(1)) - p_Star + + flux_gsrc_rs${XYZ}$_vf(j, k, l, momxe) = flux_rs${XYZ}$_vf(j, k, l, momxb + 1) + end if + #:endif + + end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP elseif (model_eqns == 4) then !ME4 - $:GPU_PARALLEL_LOOP(collapse=3, private='[alpha_rho_L, & - & alpha_rho_R, vel_L, vel_R, alpha_L, alpha_R, & - & rho_avg, h_avg, gamma_avg, s_L, s_R, s_S, & - & vel_avg_rms, nbub_L, nbub_R, ptilde_L, ptilde_R]') - do l = is3%beg, is3%end - do k = is2%beg, is2%end - do j = is1%beg, is1%end - - vel_L_rms = 0._wp; vel_R_rms = 0._wp - rho_L = 0._wp; rho_R = 0._wp - gamma_L = 0._wp; gamma_R = 0._wp - pi_inf_L = 0._wp; pi_inf_R = 0._wp - qv_L = 0._wp; qv_R = 0._wp - - $:GPU_LOOP(parallelism='[seq]') - do i = 1, contxe - alpha_rho_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, i) - alpha_rho_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, i) - end do - - $:GPU_LOOP(parallelism='[seq]') - do i = 1, num_dims - vel_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, contxe + i) - vel_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, contxe + i) - vel_L_rms = vel_L_rms + vel_L(i)**2._wp - vel_R_rms = vel_R_rms + vel_R(i)**2._wp - end do - - $:GPU_LOOP(parallelism='[seq]') - do i = 1, num_fluids - alpha_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i) - alpha_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + i) - end do - - $:GPU_LOOP(parallelism='[seq]') - do i = 1, num_fluids - rho_L = rho_L + alpha_rho_L(i) - gamma_L = gamma_L + alpha_L(i)*gammas(i) - pi_inf_L = pi_inf_L + alpha_L(i)*pi_infs(i) - qv_L = qv_L + alpha_rho_L(i)*qvs(i) - - rho_R = rho_R + alpha_rho_R(i) - gamma_R = gamma_R + alpha_R(i)*gammas(i) - pi_inf_R = pi_inf_R + alpha_R(i)*pi_infs(i) - qv_R = qv_R + alpha_rho_R(i)*qvs(i) - end do - - pres_L = qL_prim_rs${XYZ}$_vf(j, k, l, E_idx) - pres_R = qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx) - - E_L = gamma_L*pres_L + pi_inf_L + 5.e-1_wp*rho_L*vel_L_rms + qv_L - E_R = gamma_R*pres_R + pi_inf_R + 5.e-1_wp*rho_R*vel_R_rms + qv_R - - H_L = (E_L + pres_L)/rho_L - H_R = (E_R + pres_R)/rho_R - - @:compute_average_state() - - call s_compute_speed_of_sound(pres_L, rho_L, gamma_L, pi_inf_L, H_L, alpha_L, & - vel_L_rms, 0._wp, c_L) - - call s_compute_speed_of_sound(pres_R, rho_R, gamma_R, pi_inf_R, H_R, alpha_R, & - vel_R_rms, 0._wp, c_R) - - !> The computation of c_avg does not require all the variables, and therefore the non '_avg' - ! variables are placeholders to call the subroutine. + #:call GPU_PARALLEL_LOOP(collapse=3, private='[alpha_rho_L, alpha_rho_R, vel_L, vel_R, alpha_L, alpha_R, rho_avg, h_avg, gamma_avg, s_L, s_R, s_S, vel_avg_rms, nbub_L, nbub_R, ptilde_L, ptilde_R]') + do l = is3%beg, is3%end + do k = is2%beg, is2%end + do j = is1%beg, is1%end - call s_compute_speed_of_sound(pres_R, rho_avg, gamma_avg, pi_inf_R, H_avg, alpha_R, & - vel_avg_rms, 0._wp, c_avg) + vel_L_rms = 0._wp; vel_R_rms = 0._wp + rho_L = 0._wp; rho_R = 0._wp + gamma_L = 0._wp; gamma_R = 0._wp + pi_inf_L = 0._wp; pi_inf_R = 0._wp + qv_L = 0._wp; qv_R = 0._wp - if (wave_speeds == 1) then - s_L = min(vel_L(dir_idx(1)) - c_L, vel_R(dir_idx(1)) - c_R) - s_R = max(vel_R(dir_idx(1)) + c_R, vel_L(dir_idx(1)) + c_L) + $:GPU_LOOP(parallelism='[seq]') + do i = 1, contxe + alpha_rho_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, i) + alpha_rho_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, i) + end do - s_S = (pres_R - pres_L + rho_L*vel_L(dir_idx(1))* & - (s_L - vel_L(dir_idx(1))) - & - rho_R*vel_R(dir_idx(1))* & - (s_R - vel_R(dir_idx(1)))) & - /(rho_L*(s_L - vel_L(dir_idx(1))) - & - rho_R*(s_R - vel_R(dir_idx(1)))) - elseif (wave_speeds == 2) then - pres_SL = 5.e-1_wp*(pres_L + pres_R + rho_avg*c_avg* & - (vel_L(dir_idx(1)) - & - vel_R(dir_idx(1)))) + $:GPU_LOOP(parallelism='[seq]') + do i = 1, num_dims + vel_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, contxe + i) + vel_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, contxe + i) + vel_L_rms = vel_L_rms + vel_L(i)**2._wp + vel_R_rms = vel_R_rms + vel_R(i)**2._wp + end do - pres_SR = pres_SL + $:GPU_LOOP(parallelism='[seq]') + do i = 1, num_fluids + alpha_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i) + alpha_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + i) + end do - Ms_L = max(1._wp, sqrt(1._wp + ((5.e-1_wp + gamma_L)/(1._wp + gamma_L))* & - (pres_SL/pres_L - 1._wp)*pres_L/ & - ((pres_L + pi_inf_L/(1._wp + gamma_L))))) - Ms_R = max(1._wp, sqrt(1._wp + ((5.e-1_wp + gamma_R)/(1._wp + gamma_R))* & - (pres_SR/pres_R - 1._wp)*pres_R/ & - ((pres_R + pi_inf_R/(1._wp + gamma_R))))) + $:GPU_LOOP(parallelism='[seq]') + do i = 1, num_fluids + rho_L = rho_L + alpha_rho_L(i) + gamma_L = gamma_L + alpha_L(i)*gammas(i) + pi_inf_L = pi_inf_L + alpha_L(i)*pi_infs(i) + qv_L = qv_L + alpha_rho_L(i)*qvs(i) + + rho_R = rho_R + alpha_rho_R(i) + gamma_R = gamma_R + alpha_R(i)*gammas(i) + pi_inf_R = pi_inf_R + alpha_R(i)*pi_infs(i) + qv_R = qv_R + alpha_rho_R(i)*qvs(i) + end do - s_L = vel_L(dir_idx(1)) - c_L*Ms_L - s_R = vel_R(dir_idx(1)) + c_R*Ms_R + pres_L = qL_prim_rs${XYZ}$_vf(j, k, l, E_idx) + pres_R = qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx) - s_S = 5.e-1_wp*((vel_L(dir_idx(1)) + vel_R(dir_idx(1))) + & - (pres_L - pres_R)/ & - (rho_avg*c_avg)) - end if + E_L = gamma_L*pres_L + pi_inf_L + 5.e-1_wp*rho_L*vel_L_rms + qv_L + E_R = gamma_R*pres_R + pi_inf_R + 5.e-1_wp*rho_R*vel_R_rms + qv_R - ! follows Einfeldt et al. - ! s_M/P = min/max(0.,s_L/R) - s_M = min(0._wp, s_L); s_P = max(0._wp, s_R) + H_L = (E_L + pres_L)/rho_L + H_R = (E_R + pres_R)/rho_R - ! goes with q_star_L/R = xi_L/R * (variable) - ! xi_L/R = ( ( s_L/R - u_L/R )/(s_L/R - s_star) ) - xi_L = (s_L - vel_L(dir_idx(1)))/(s_L - s_S) - xi_R = (s_R - vel_R(dir_idx(1)))/(s_R - s_S) + @:compute_average_state() - ! goes with numerical velocity in x/y/z directions - ! xi_P/M = 0.5 +/m sgn(0.5,s_star) - xi_M = (5.e-1_wp + sign(5.e-1_wp, s_S)) - xi_P = (5.e-1_wp - sign(5.e-1_wp, s_S)) + call s_compute_speed_of_sound(pres_L, rho_L, gamma_L, pi_inf_L, H_L, alpha_L, & + vel_L_rms, 0._wp, c_L) - $:GPU_LOOP(parallelism='[seq]') - do i = 1, contxe - flux_rs${XYZ}$_vf(j, k, l, i) = & - xi_M*alpha_rho_L(i) & - *(vel_L(dir_idx(1)) + s_M*(xi_L - 1._wp)) & - + xi_P*alpha_rho_R(i) & - *(vel_R(dir_idx(1)) + s_P*(xi_R - 1._wp)) - end do + call s_compute_speed_of_sound(pres_R, rho_R, gamma_R, pi_inf_R, H_R, alpha_R, & + vel_R_rms, 0._wp, c_R) - ! Momentum flux. - ! f = \rho u u + p I, q = \rho u, q_star = \xi * \rho*(s_star, v, w) - $:GPU_LOOP(parallelism='[seq]') - do i = 1, num_dims - flux_rs${XYZ}$_vf(j, k, l, contxe + dir_idx(i)) = & - xi_M*(rho_L*(vel_L(dir_idx(1))* & - vel_L(dir_idx(i)) + & - s_M*(xi_L*(dir_flg(dir_idx(i))*s_S + & - (1._wp - dir_flg(dir_idx(i)))* & - vel_L(dir_idx(i))) - vel_L(dir_idx(i)))) + & - dir_flg(dir_idx(i))*pres_L) & - + xi_P*(rho_R*(vel_R(dir_idx(1))* & - vel_R(dir_idx(i)) + & - s_P*(xi_R*(dir_flg(dir_idx(i))*s_S + & - (1._wp - dir_flg(dir_idx(i)))* & - vel_R(dir_idx(i))) - vel_R(dir_idx(i)))) + & - dir_flg(dir_idx(i))*pres_R) - end do + !> The computation of c_avg does not require all the variables, and therefore the non '_avg' + ! variables are placeholders to call the subroutine. - if (bubbles_euler) then - ! Put p_tilde in - $:GPU_LOOP(parallelism='[seq]') - do i = 1, num_dims - flux_rs${XYZ}$_vf(j, k, l, contxe + dir_idx(i)) = & - flux_rs${XYZ}$_vf(j, k, l, contxe + dir_idx(i)) + & - xi_M*(dir_flg(dir_idx(i))*(-1._wp*ptilde_L)) & - + xi_P*(dir_flg(dir_idx(i))*(-1._wp*ptilde_R)) - end do - end if + call s_compute_speed_of_sound(pres_R, rho_avg, gamma_avg, pi_inf_R, H_avg, alpha_R, & + vel_avg_rms, 0._wp, c_avg) - flux_rs${XYZ}$_vf(j, k, l, E_idx) = 0._wp + if (wave_speeds == 1) then + s_L = min(vel_L(dir_idx(1)) - c_L, vel_R(dir_idx(1)) - c_R) + s_R = max(vel_R(dir_idx(1)) + c_R, vel_L(dir_idx(1)) + c_L) - $:GPU_LOOP(parallelism='[seq]') - do i = alf_idx, alf_idx !only advect the void fraction - flux_rs${XYZ}$_vf(j, k, l, i) = & - xi_M*qL_prim_rs${XYZ}$_vf(j, k, l, i) & - *(vel_L(dir_idx(1)) + s_M*(xi_L - 1._wp)) & - + xi_P*qR_prim_rs${XYZ}$_vf(j + 1, k, l, i) & - *(vel_R(dir_idx(1)) + s_P*(xi_R - 1._wp)) - end do + s_S = (pres_R - pres_L + rho_L*vel_L(dir_idx(1))* & + (s_L - vel_L(dir_idx(1))) - & + rho_R*vel_R(dir_idx(1))* & + (s_R - vel_R(dir_idx(1)))) & + /(rho_L*(s_L - vel_L(dir_idx(1))) - & + rho_R*(s_R - vel_R(dir_idx(1)))) + elseif (wave_speeds == 2) then + pres_SL = 5.e-1_wp*(pres_L + pres_R + rho_avg*c_avg* & + (vel_L(dir_idx(1)) - & + vel_R(dir_idx(1)))) + + pres_SR = pres_SL + + Ms_L = max(1._wp, sqrt(1._wp + ((5.e-1_wp + gamma_L)/(1._wp + gamma_L))* & + (pres_SL/pres_L - 1._wp)*pres_L/ & + ((pres_L + pi_inf_L/(1._wp + gamma_L))))) + Ms_R = max(1._wp, sqrt(1._wp + ((5.e-1_wp + gamma_R)/(1._wp + gamma_R))* & + (pres_SR/pres_R - 1._wp)*pres_R/ & + ((pres_R + pi_inf_R/(1._wp + gamma_R))))) + + s_L = vel_L(dir_idx(1)) - c_L*Ms_L + s_R = vel_R(dir_idx(1)) + c_R*Ms_R + + s_S = 5.e-1_wp*((vel_L(dir_idx(1)) + vel_R(dir_idx(1))) + & + (pres_L - pres_R)/ & + (rho_avg*c_avg)) + end if - ! Source for volume fraction advection equation - $:GPU_LOOP(parallelism='[seq]') - do i = 1, num_dims + ! follows Einfeldt et al. + ! s_M/P = min/max(0.,s_L/R) + s_M = min(0._wp, s_L); s_P = max(0._wp, s_R) - vel_src_rs${XYZ}$_vf(j, k, l, dir_idx(i)) = 0._wp - !IF ( (model_eqns == 4) .or. (num_fluids==1) ) vel_src_rs_vf(dir_idx(i))%sf(j,k,l) = 0._wp - end do + ! goes with q_star_L/R = xi_L/R * (variable) + ! xi_L/R = ( ( s_L/R - u_L/R )/(s_L/R - s_star) ) + xi_L = (s_L - vel_L(dir_idx(1)))/(s_L - s_S) + xi_R = (s_R - vel_R(dir_idx(1)))/(s_R - s_S) - flux_src_rs${XYZ}$_vf(j, k, l, advxb) = vel_src_rs${XYZ}$_vf(j, k, l, dir_idx(1)) + ! goes with numerical velocity in x/y/z directions + ! xi_P/M = 0.5 +/m sgn(0.5,s_star) + xi_M = (5.e-1_wp + sign(5.e-1_wp, s_S)) + xi_P = (5.e-1_wp - sign(5.e-1_wp, s_S)) - ! Add advection flux for bubble variables - if (bubbles_euler) then $:GPU_LOOP(parallelism='[seq]') - do i = bubxb, bubxe + do i = 1, contxe flux_rs${XYZ}$_vf(j, k, l, i) = & - xi_M*nbub_L*qL_prim_rs${XYZ}$_vf(j, k, l, i) & + xi_M*alpha_rho_L(i) & *(vel_L(dir_idx(1)) + s_M*(xi_L - 1._wp)) & - + xi_P*nbub_R*qR_prim_rs${XYZ}$_vf(j + 1, k, l, i) & + + xi_P*alpha_rho_R(i) & *(vel_R(dir_idx(1)) + s_P*(xi_R - 1._wp)) end do - end if - ! Geometrical source flux for cylindrical coordinates - - #:if (NORM_DIR == 2) - if (cyl_coord) then - ! Substituting the advective flux into the inviscid geometrical source flux - $:GPU_LOOP(parallelism='[seq]') - do i = 1, E_idx - flux_gsrc_rs${XYZ}$_vf(j, k, l, i) = flux_rs${XYZ}$_vf(j, k, l, i) - end do - ! Recalculating the radial momentum geometric source flux - flux_gsrc_rs${XYZ}$_vf(j, k, l, contxe + dir_idx(1)) = & + ! Momentum flux. + ! f = \rho u u + p I, q = \rho u, q_star = \xi * \rho*(s_star, v, w) + $:GPU_LOOP(parallelism='[seq]') + do i = 1, num_dims + flux_rs${XYZ}$_vf(j, k, l, contxe + dir_idx(i)) = & xi_M*(rho_L*(vel_L(dir_idx(1))* & - vel_L(dir_idx(1)) + & - s_M*(xi_L*(dir_flg(dir_idx(1))*s_S + & - (1._wp - dir_flg(dir_idx(1)))* & - vel_L(dir_idx(1))) - vel_L(dir_idx(1))))) & + vel_L(dir_idx(i)) + & + s_M*(xi_L*(dir_flg(dir_idx(i))*s_S + & + (1._wp - dir_flg(dir_idx(i)))* & + vel_L(dir_idx(i))) - vel_L(dir_idx(i)))) + & + dir_flg(dir_idx(i))*pres_L) & + xi_P*(rho_R*(vel_R(dir_idx(1))* & - vel_R(dir_idx(1)) + & - s_P*(xi_R*(dir_flg(dir_idx(1))*s_S + & - (1._wp - dir_flg(dir_idx(1)))* & - vel_R(dir_idx(1))) - vel_R(dir_idx(1))))) - ! Geometrical source of the void fraction(s) is zero + vel_R(dir_idx(i)) + & + s_P*(xi_R*(dir_flg(dir_idx(i))*s_S + & + (1._wp - dir_flg(dir_idx(i)))* & + vel_R(dir_idx(i))) - vel_R(dir_idx(i)))) + & + dir_flg(dir_idx(i))*pres_R) + end do + + if (bubbles_euler) then + ! Put p_tilde in $:GPU_LOOP(parallelism='[seq]') - do i = advxb, advxe - flux_gsrc_rs${XYZ}$_vf(j, k, l, i) = 0._wp + do i = 1, num_dims + flux_rs${XYZ}$_vf(j, k, l, contxe + dir_idx(i)) = & + flux_rs${XYZ}$_vf(j, k, l, contxe + dir_idx(i)) + & + xi_M*(dir_flg(dir_idx(i))*(-1._wp*ptilde_L)) & + + xi_P*(dir_flg(dir_idx(i))*(-1._wp*ptilde_R)) end do end if - #:endif - #:if (NORM_DIR == 3) - if (grid_geometry == 3) then + + flux_rs${XYZ}$_vf(j, k, l, E_idx) = 0._wp + + $:GPU_LOOP(parallelism='[seq]') + do i = alf_idx, alf_idx !only advect the void fraction + flux_rs${XYZ}$_vf(j, k, l, i) = & + xi_M*qL_prim_rs${XYZ}$_vf(j, k, l, i) & + *(vel_L(dir_idx(1)) + s_M*(xi_L - 1._wp)) & + + xi_P*qR_prim_rs${XYZ}$_vf(j + 1, k, l, i) & + *(vel_R(dir_idx(1)) + s_P*(xi_R - 1._wp)) + end do + + ! Source for volume fraction advection equation + $:GPU_LOOP(parallelism='[seq]') + do i = 1, num_dims + + vel_src_rs${XYZ}$_vf(j, k, l, dir_idx(i)) = 0._wp + !IF ( (model_eqns == 4) .or. (num_fluids==1) ) vel_src_rs_vf(dir_idx(i))%sf(j,k,l) = 0._wp + end do + + flux_src_rs${XYZ}$_vf(j, k, l, advxb) = vel_src_rs${XYZ}$_vf(j, k, l, dir_idx(1)) + + ! Add advection flux for bubble variables + if (bubbles_euler) then $:GPU_LOOP(parallelism='[seq]') - do i = 1, sys_size - flux_gsrc_rs${XYZ}$_vf(j, k, l, i) = 0._wp + do i = bubxb, bubxe + flux_rs${XYZ}$_vf(j, k, l, i) = & + xi_M*nbub_L*qL_prim_rs${XYZ}$_vf(j, k, l, i) & + *(vel_L(dir_idx(1)) + s_M*(xi_L - 1._wp)) & + + xi_P*nbub_R*qR_prim_rs${XYZ}$_vf(j + 1, k, l, i) & + *(vel_R(dir_idx(1)) + s_P*(xi_R - 1._wp)) end do - flux_gsrc_rs${XYZ}$_vf(j, k, l, momxb + 1) = & - -xi_M*(rho_L*(vel_L(dir_idx(1))* & - vel_L(dir_idx(1)) + & - s_M*(xi_L*(dir_flg(dir_idx(1))*s_S + & - (1._wp - dir_flg(dir_idx(1)))* & - vel_L(dir_idx(1))) - vel_L(dir_idx(1))))) & - - xi_P*(rho_R*(vel_R(dir_idx(1))* & - vel_R(dir_idx(1)) + & - s_P*(xi_R*(dir_flg(dir_idx(1))*s_S + & - (1._wp - dir_flg(dir_idx(1)))* & - vel_R(dir_idx(1))) - vel_R(dir_idx(1))))) - flux_gsrc_rs${XYZ}$_vf(j, k, l, momxe) = flux_rs${XYZ}$_vf(j, k, l, momxb + 1) end if - #:endif + + ! Geometrical source flux for cylindrical coordinates + + #:if (NORM_DIR == 2) + if (cyl_coord) then + ! Substituting the advective flux into the inviscid geometrical source flux + $:GPU_LOOP(parallelism='[seq]') + do i = 1, E_idx + flux_gsrc_rs${XYZ}$_vf(j, k, l, i) = flux_rs${XYZ}$_vf(j, k, l, i) + end do + ! Recalculating the radial momentum geometric source flux + flux_gsrc_rs${XYZ}$_vf(j, k, l, contxe + dir_idx(1)) = & + xi_M*(rho_L*(vel_L(dir_idx(1))* & + vel_L(dir_idx(1)) + & + s_M*(xi_L*(dir_flg(dir_idx(1))*s_S + & + (1._wp - dir_flg(dir_idx(1)))* & + vel_L(dir_idx(1))) - vel_L(dir_idx(1))))) & + + xi_P*(rho_R*(vel_R(dir_idx(1))* & + vel_R(dir_idx(1)) + & + s_P*(xi_R*(dir_flg(dir_idx(1))*s_S + & + (1._wp - dir_flg(dir_idx(1)))* & + vel_R(dir_idx(1))) - vel_R(dir_idx(1))))) + ! Geometrical source of the void fraction(s) is zero + $:GPU_LOOP(parallelism='[seq]') + do i = advxb, advxe + flux_gsrc_rs${XYZ}$_vf(j, k, l, i) = 0._wp + end do + end if + #:endif + #:if (NORM_DIR == 3) + if (grid_geometry == 3) then + $:GPU_LOOP(parallelism='[seq]') + do i = 1, sys_size + flux_gsrc_rs${XYZ}$_vf(j, k, l, i) = 0._wp + end do + flux_gsrc_rs${XYZ}$_vf(j, k, l, momxb + 1) = & + -xi_M*(rho_L*(vel_L(dir_idx(1))* & + vel_L(dir_idx(1)) + & + s_M*(xi_L*(dir_flg(dir_idx(1))*s_S + & + (1._wp - dir_flg(dir_idx(1)))* & + vel_L(dir_idx(1))) - vel_L(dir_idx(1))))) & + - xi_P*(rho_R*(vel_R(dir_idx(1))* & + vel_R(dir_idx(1)) + & + s_P*(xi_R*(dir_flg(dir_idx(1))*s_S + & + (1._wp - dir_flg(dir_idx(1)))* & + vel_R(dir_idx(1))) - vel_R(dir_idx(1))))) + flux_gsrc_rs${XYZ}$_vf(j, k, l, momxe) = flux_rs${XYZ}$_vf(j, k, l, momxb + 1) + end if + #:endif + end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP elseif (model_eqns == 2 .and. bubbles_euler) then - $:GPU_PARALLEL_LOOP(collapse=3, private='[R0_L, R0_R, V0_L, & - & V0_R, P0_L, P0_R, pbw_L, pbw_R, vel_L, & - & vel_R, rho_avg, alpha_L, alpha_R, h_avg, & - & gamma_avg, s_L, s_R, s_S, nbub_L, nbub_R, & - & ptilde_L, ptilde_R, vel_avg_rms, Re_L, Re_R, & - & pcorr, zcoef, vel_L_tmp, vel_R_tmp]') - do l = is3%beg, is3%end - do k = is2%beg, is2%end - do j = is1%beg, is1%end - - vel_L_rms = 0._wp; vel_R_rms = 0._wp - rho_L = 0._wp; rho_R = 0._wp - gamma_L = 0._wp; gamma_R = 0._wp - pi_inf_L = 0._wp; pi_inf_R = 0._wp - qv_L = 0._wp; qv_R = 0._wp - - $:GPU_LOOP(parallelism='[seq]') - do i = 1, num_fluids - alpha_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i) - alpha_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + i) - end do + #:call GPU_PARALLEL_LOOP(collapse=3, private='[R0_L, R0_R, V0_L, V0_R, P0_L, P0_R, pbw_L, pbw_R, vel_L, vel_R, rho_avg, alpha_L, alpha_R, h_avg, gamma_avg, s_L, s_R, s_S, nbub_L, nbub_R, ptilde_L, ptilde_R, vel_avg_rms, Re_L, Re_R, pcorr, zcoef, vel_L_tmp, vel_R_tmp]') + do l = is3%beg, is3%end + do k = is2%beg, is2%end + do j = is1%beg, is1%end - $:GPU_LOOP(parallelism='[seq]') - do i = 1, num_dims - vel_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, contxe + i) - vel_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, contxe + i) - vel_L_rms = vel_L_rms + vel_L(i)**2._wp - vel_R_rms = vel_R_rms + vel_R(i)**2._wp - end do + vel_L_rms = 0._wp; vel_R_rms = 0._wp + rho_L = 0._wp; rho_R = 0._wp + gamma_L = 0._wp; gamma_R = 0._wp + pi_inf_L = 0._wp; pi_inf_R = 0._wp + qv_L = 0._wp; qv_R = 0._wp - ! Retain this in the refactor - if (mpp_lim .and. (num_fluids > 2)) then $:GPU_LOOP(parallelism='[seq]') do i = 1, num_fluids - rho_L = rho_L + qL_prim_rs${XYZ}$_vf(j, k, l, i) - gamma_L = gamma_L + qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i)*gammas(i) - pi_inf_L = pi_inf_L + qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i)*pi_infs(i) - qv_L = qv_L + qL_prim_rs${XYZ}$_vf(j, k, l, i)*qvs(i) - rho_R = rho_R + qR_prim_rs${XYZ}$_vf(j + 1, k, l, i) - gamma_R = gamma_R + qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + i)*gammas(i) - pi_inf_R = pi_inf_R + qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + i)*pi_infs(i) - qv_R = qv_R + qR_prim_rs${XYZ}$_vf(j + 1, k, l, i)*qvs(i) + alpha_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i) + alpha_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + i) end do - else if (num_fluids > 2) then + $:GPU_LOOP(parallelism='[seq]') - do i = 1, num_fluids - 1 - rho_L = rho_L + qL_prim_rs${XYZ}$_vf(j, k, l, i) - gamma_L = gamma_L + qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i)*gammas(i) - pi_inf_L = pi_inf_L + qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i)*pi_infs(i) - qv_L = qv_L + qL_prim_rs${XYZ}$_vf(j, k, l, i)*qvs(i) - rho_R = rho_R + qR_prim_rs${XYZ}$_vf(j + 1, k, l, i) - gamma_R = gamma_R + qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + i)*gammas(i) - pi_inf_R = pi_inf_R + qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + i)*pi_infs(i) - qv_R = qv_R + qR_prim_rs${XYZ}$_vf(j + 1, k, l, i)*qvs(i) + do i = 1, num_dims + vel_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, contxe + i) + vel_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, contxe + i) + vel_L_rms = vel_L_rms + vel_L(i)**2._wp + vel_R_rms = vel_R_rms + vel_R(i)**2._wp end do - else - rho_L = qL_prim_rs${XYZ}$_vf(j, k, l, 1) - gamma_L = gammas(1) - pi_inf_L = pi_infs(1) - qv_L = qvs(1) - rho_R = qR_prim_rs${XYZ}$_vf(j + 1, k, l, 1) - gamma_R = gammas(1) - pi_inf_R = pi_infs(1) - qv_R = qvs(1) - end if - if (viscous) then - if (num_fluids == 1) then ! Need to consider case with num_fluids >= 2 + ! Retain this in the refactor + if (mpp_lim .and. (num_fluids > 2)) then $:GPU_LOOP(parallelism='[seq]') - do i = 1, 2 - Re_L(i) = dflt_real - Re_R(i) = dflt_real - - if (Re_size(i) > 0) Re_L(i) = 0._wp - if (Re_size(i) > 0) Re_R(i) = 0._wp - - $:GPU_LOOP(parallelism='[seq]') - do q = 1, Re_size(i) - Re_L(i) = (1._wp - qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + Re_idx(i, q)))/Res(i, q) & - + Re_L(i) - Re_R(i) = (1._wp - qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + Re_idx(i, q)))/Res(i, q) & - + Re_R(i) - end do - - Re_L(i) = 1._wp/max(Re_L(i), sgm_eps) - Re_R(i) = 1._wp/max(Re_R(i), sgm_eps) - + do i = 1, num_fluids + rho_L = rho_L + qL_prim_rs${XYZ}$_vf(j, k, l, i) + gamma_L = gamma_L + qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i)*gammas(i) + pi_inf_L = pi_inf_L + qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i)*pi_infs(i) + qv_L = qv_L + qL_prim_rs${XYZ}$_vf(j, k, l, i)*qvs(i) + rho_R = rho_R + qR_prim_rs${XYZ}$_vf(j + 1, k, l, i) + gamma_R = gamma_R + qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + i)*gammas(i) + pi_inf_R = pi_inf_R + qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + i)*pi_infs(i) + qv_R = qv_R + qR_prim_rs${XYZ}$_vf(j + 1, k, l, i)*qvs(i) + end do + else if (num_fluids > 2) then + $:GPU_LOOP(parallelism='[seq]') + do i = 1, num_fluids - 1 + rho_L = rho_L + qL_prim_rs${XYZ}$_vf(j, k, l, i) + gamma_L = gamma_L + qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i)*gammas(i) + pi_inf_L = pi_inf_L + qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i)*pi_infs(i) + qv_L = qv_L + qL_prim_rs${XYZ}$_vf(j, k, l, i)*qvs(i) + rho_R = rho_R + qR_prim_rs${XYZ}$_vf(j + 1, k, l, i) + gamma_R = gamma_R + qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + i)*gammas(i) + pi_inf_R = pi_inf_R + qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + i)*pi_infs(i) + qv_R = qv_R + qR_prim_rs${XYZ}$_vf(j + 1, k, l, i)*qvs(i) end do + else + rho_L = qL_prim_rs${XYZ}$_vf(j, k, l, 1) + gamma_L = gammas(1) + pi_inf_L = pi_infs(1) + qv_L = qvs(1) + rho_R = qR_prim_rs${XYZ}$_vf(j + 1, k, l, 1) + gamma_R = gammas(1) + pi_inf_R = pi_infs(1) + qv_R = qvs(1) end if - end if - pres_L = qL_prim_rs${XYZ}$_vf(j, k, l, E_idx) - pres_R = qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx) + if (viscous) then + if (num_fluids == 1) then ! Need to consider case with num_fluids >= 2 + $:GPU_LOOP(parallelism='[seq]') + do i = 1, 2 + Re_L(i) = dflt_real + Re_R(i) = dflt_real - E_L = gamma_L*pres_L + pi_inf_L + 5.e-1_wp*rho_L*vel_L_rms - E_R = gamma_R*pres_R + pi_inf_R + 5.e-1_wp*rho_R*vel_R_rms + if (Re_size(i) > 0) Re_L(i) = 0._wp + if (Re_size(i) > 0) Re_R(i) = 0._wp - H_L = (E_L + pres_L)/rho_L - H_R = (E_R + pres_R)/rho_R + $:GPU_LOOP(parallelism='[seq]') + do q = 1, Re_size(i) + Re_L(i) = (1._wp - qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + Re_idx(i, q)))/Res_gs(i, q) & + + Re_L(i) + Re_R(i) = (1._wp - qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + Re_idx(i, q)))/Res_gs(i, q) & + + Re_R(i) + end do - if (avg_state == 2) then - $:GPU_LOOP(parallelism='[seq]') - do i = 1, nb - R0_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, rs(i)) - R0_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, rs(i)) - - V0_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, vs(i)) - V0_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, vs(i)) - if (.not. polytropic .and. .not. qbmm) then - P0_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, ps(i)) - P0_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, ps(i)) - end if - end do + Re_L(i) = 1._wp/max(Re_L(i), sgm_eps) + Re_R(i) = 1._wp/max(Re_R(i), sgm_eps) - if (.not. qbmm) then - if (adv_n) then - nbub_L = qL_prim_rs${XYZ}$_vf(j, k, l, n_idx) - nbub_R = qR_prim_rs${XYZ}$_vf(j + 1, k, l, n_idx) - else - nbub_L_denom = 0._wp - nbub_R_denom = 0._wp - $:GPU_LOOP(parallelism='[seq]') - do i = 1, nb - nbub_L_denom = nbub_L_denom + (R0_L(i)**3._wp)*weight(i) - nbub_R_denom = nbub_R_denom + (R0_R(i)**3._wp)*weight(i) end do - nbub_L = (3._wp/(4._wp*pi))*qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + num_fluids)/nbub_L_denom - nbub_R = (3._wp/(4._wp*pi))*qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + num_fluids)/nbub_R_denom end if - else - !nb stored in 0th moment of first R0 bin in variable conversion module - nbub_L = qL_prim_rs${XYZ}$_vf(j, k, l, bubxb) - nbub_R = qR_prim_rs${XYZ}$_vf(j + 1, k, l, bubxb) end if - $:GPU_LOOP(parallelism='[seq]') - do i = 1, nb - if (.not. qbmm) then - if (polytropic) then - pbw_L(i) = f_cpbw_KM(R0(i), R0_L(i), V0_L(i), 0._wp) - pbw_R(i) = f_cpbw_KM(R0(i), R0_R(i), V0_R(i), 0._wp) - else - pbw_L(i) = f_cpbw_KM(R0(i), R0_L(i), V0_L(i), P0_L(i)) - pbw_R(i) = f_cpbw_KM(R0(i), R0_R(i), V0_R(i), P0_R(i)) - end if - end if - end do - - if (qbmm) then - PbwR3Lbar = mom_sp_rs${XYZ}$_vf(j, k, l, 4) - PbwR3Rbar = mom_sp_rs${XYZ}$_vf(j + 1, k, l, 4) - - R3Lbar = mom_sp_rs${XYZ}$_vf(j, k, l, 1) - R3Rbar = mom_sp_rs${XYZ}$_vf(j + 1, k, l, 1) + pres_L = qL_prim_rs${XYZ}$_vf(j, k, l, E_idx) + pres_R = qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx) - R3V2Lbar = mom_sp_rs${XYZ}$_vf(j, k, l, 3) - R3V2Rbar = mom_sp_rs${XYZ}$_vf(j + 1, k, l, 3) - else + E_L = gamma_L*pres_L + pi_inf_L + 5.e-1_wp*rho_L*vel_L_rms + E_R = gamma_R*pres_R + pi_inf_R + 5.e-1_wp*rho_R*vel_R_rms - PbwR3Lbar = 0._wp - PbwR3Rbar = 0._wp + H_L = (E_L + pres_L)/rho_L + H_R = (E_R + pres_R)/rho_R - R3Lbar = 0._wp - R3Rbar = 0._wp + if (avg_state == 2) then + $:GPU_LOOP(parallelism='[seq]') + do i = 1, nb + R0_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, rs(i)) + R0_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, rs(i)) + + V0_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, vs(i)) + V0_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, vs(i)) + if (.not. polytropic .and. .not. qbmm) then + P0_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, ps(i)) + P0_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, ps(i)) + end if + end do - R3V2Lbar = 0._wp - R3V2Rbar = 0._wp + if (.not. qbmm) then + if (adv_n) then + nbub_L = qL_prim_rs${XYZ}$_vf(j, k, l, n_idx) + nbub_R = qR_prim_rs${XYZ}$_vf(j + 1, k, l, n_idx) + else + nbub_L_denom = 0._wp + nbub_R_denom = 0._wp + $:GPU_LOOP(parallelism='[seq]') + do i = 1, nb + nbub_L_denom = nbub_L_denom + (R0_L(i)**3._wp)*weight(i) + nbub_R_denom = nbub_R_denom + (R0_R(i)**3._wp)*weight(i) + end do + nbub_L = (3._wp/(4._wp*pi))*qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + num_fluids)/nbub_L_denom + nbub_R = (3._wp/(4._wp*pi))*qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + num_fluids)/nbub_R_denom + end if + else + !nb stored in 0th moment of first R0 bin in variable conversion module + nbub_L = qL_prim_rs${XYZ}$_vf(j, k, l, bubxb) + nbub_R = qR_prim_rs${XYZ}$_vf(j + 1, k, l, bubxb) + end if $:GPU_LOOP(parallelism='[seq]') do i = 1, nb - PbwR3Lbar = PbwR3Lbar + pbw_L(i)*(R0_L(i)**3._wp)*weight(i) - PbwR3Rbar = PbwR3Rbar + pbw_R(i)*(R0_R(i)**3._wp)*weight(i) + if (.not. qbmm) then + if (polytropic) then + pbw_L(i) = f_cpbw_KM(R0(i), R0_L(i), V0_L(i), 0._wp) + pbw_R(i) = f_cpbw_KM(R0(i), R0_R(i), V0_R(i), 0._wp) + else + pbw_L(i) = f_cpbw_KM(R0(i), R0_L(i), V0_L(i), P0_L(i)) + pbw_R(i) = f_cpbw_KM(R0(i), R0_R(i), V0_R(i), P0_R(i)) + end if + end if + end do - R3Lbar = R3Lbar + (R0_L(i)**3._wp)*weight(i) - R3Rbar = R3Rbar + (R0_R(i)**3._wp)*weight(i) + if (qbmm) then + PbwR3Lbar = mom_sp_rs${XYZ}$_vf(j, k, l, 4) + PbwR3Rbar = mom_sp_rs${XYZ}$_vf(j + 1, k, l, 4) - R3V2Lbar = R3V2Lbar + (R0_L(i)**3._wp)*(V0_L(i)**2._wp)*weight(i) - R3V2Rbar = R3V2Rbar + (R0_R(i)**3._wp)*(V0_R(i)**2._wp)*weight(i) - end do - end if + R3Lbar = mom_sp_rs${XYZ}$_vf(j, k, l, 1) + R3Rbar = mom_sp_rs${XYZ}$_vf(j + 1, k, l, 1) - if (qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + num_fluids) < small_alf .or. R3Lbar < small_alf) then - ptilde_L = qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + num_fluids)*pres_L - else - ptilde_L = qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + num_fluids)*(pres_L - PbwR3Lbar/R3Lbar - & - rho_L*R3V2Lbar/R3Lbar) - end if + R3V2Lbar = mom_sp_rs${XYZ}$_vf(j, k, l, 3) + R3V2Rbar = mom_sp_rs${XYZ}$_vf(j + 1, k, l, 3) + else - if (qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + num_fluids) < small_alf .or. R3Rbar < small_alf) then - ptilde_R = qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + num_fluids)*pres_R - else - ptilde_R = qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + num_fluids)*(pres_R - PbwR3Rbar/R3Rbar - & - rho_R*R3V2Rbar/R3Rbar) - end if + PbwR3Lbar = 0._wp + PbwR3Rbar = 0._wp - if ((.not. f_approx_equal(ptilde_L, ptilde_L)) .or. (.not. f_approx_equal(ptilde_R, ptilde_R))) then - end if + R3Lbar = 0._wp + R3Rbar = 0._wp - rho_avg = 5.e-1_wp*(rho_L + rho_R) - H_avg = 5.e-1_wp*(H_L + H_R) - gamma_avg = 5.e-1_wp*(gamma_L + gamma_R) - vel_avg_rms = 0._wp + R3V2Lbar = 0._wp + R3V2Rbar = 0._wp - $:GPU_LOOP(parallelism='[seq]') - do i = 1, num_dims - vel_avg_rms = vel_avg_rms + (5.e-1_wp*(vel_L(i) + vel_R(i)))**2._wp - end do + $:GPU_LOOP(parallelism='[seq]') + do i = 1, nb + PbwR3Lbar = PbwR3Lbar + pbw_L(i)*(R0_L(i)**3._wp)*weight(i) + PbwR3Rbar = PbwR3Rbar + pbw_R(i)*(R0_R(i)**3._wp)*weight(i) - end if + R3Lbar = R3Lbar + (R0_L(i)**3._wp)*weight(i) + R3Rbar = R3Rbar + (R0_R(i)**3._wp)*weight(i) - call s_compute_speed_of_sound(pres_L, rho_L, gamma_L, pi_inf_L, H_L, alpha_L, & - vel_L_rms, 0._wp, c_L) + R3V2Lbar = R3V2Lbar + (R0_L(i)**3._wp)*(V0_L(i)**2._wp)*weight(i) + R3V2Rbar = R3V2Rbar + (R0_R(i)**3._wp)*(V0_R(i)**2._wp)*weight(i) + end do + end if - call s_compute_speed_of_sound(pres_R, rho_R, gamma_R, pi_inf_R, H_R, alpha_R, & - vel_R_rms, 0._wp, c_R) + if (qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + num_fluids) < small_alf .or. R3Lbar < small_alf) then + ptilde_L = qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + num_fluids)*pres_L + else + ptilde_L = qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + num_fluids)*(pres_L - PbwR3Lbar/R3Lbar - & + rho_L*R3V2Lbar/R3Lbar) + end if - !> The computation of c_avg does not require all the variables, and therefore the non '_avg' - ! variables are placeholders to call the subroutine. - call s_compute_speed_of_sound(pres_R, rho_avg, gamma_avg, pi_inf_R, H_avg, alpha_R, & - vel_avg_rms, 0._wp, c_avg) + if (qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + num_fluids) < small_alf .or. R3Rbar < small_alf) then + ptilde_R = qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + num_fluids)*pres_R + else + ptilde_R = qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + num_fluids)*(pres_R - PbwR3Rbar/R3Rbar - & + rho_R*R3V2Rbar/R3Rbar) + end if - if (viscous) then - $:GPU_LOOP(parallelism='[seq]') - do i = 1, 2 - Re_avg_rs${XYZ}$_vf(j, k, l, i) = 2._wp/(1._wp/Re_L(i) + 1._wp/Re_R(i)) - end do - end if + if ((.not. f_approx_equal(ptilde_L, ptilde_L)) .or. (.not. f_approx_equal(ptilde_R, ptilde_R))) then + end if - ! Low Mach correction - if (low_Mach == 2) then - @:compute_low_Mach_correction() - end if + rho_avg = 5.e-1_wp*(rho_L + rho_R) + H_avg = 5.e-1_wp*(H_L + H_R) + gamma_avg = 5.e-1_wp*(gamma_L + gamma_R) + vel_avg_rms = 0._wp - if (wave_speeds == 1) then - s_L = min(vel_L(dir_idx(1)) - c_L, vel_R(dir_idx(1)) - c_R) - s_R = max(vel_R(dir_idx(1)) + c_R, vel_L(dir_idx(1)) + c_L) + $:GPU_LOOP(parallelism='[seq]') + do i = 1, num_dims + vel_avg_rms = vel_avg_rms + (5.e-1_wp*(vel_L(i) + vel_R(i)))**2._wp + end do - s_S = (pres_R - pres_L + rho_L*vel_L(dir_idx(1))* & - (s_L - vel_L(dir_idx(1))) - & - rho_R*vel_R(dir_idx(1))* & - (s_R - vel_R(dir_idx(1)))) & - /(rho_L*(s_L - vel_L(dir_idx(1))) - & - rho_R*(s_R - vel_R(dir_idx(1)))) - elseif (wave_speeds == 2) then - pres_SL = 5.e-1_wp*(pres_L + pres_R + rho_avg*c_avg* & - (vel_L(dir_idx(1)) - & - vel_R(dir_idx(1)))) + end if - pres_SR = pres_SL + call s_compute_speed_of_sound(pres_L, rho_L, gamma_L, pi_inf_L, H_L, alpha_L, & + vel_L_rms, 0._wp, c_L) - Ms_L = max(1._wp, sqrt(1._wp + ((5.e-1_wp + gamma_L)/(1._wp + gamma_L))* & - (pres_SL/pres_L - 1._wp)*pres_L/ & - ((pres_L + pi_inf_L/(1._wp + gamma_L))))) - Ms_R = max(1._wp, sqrt(1._wp + ((5.e-1_wp + gamma_R)/(1._wp + gamma_R))* & - (pres_SR/pres_R - 1._wp)*pres_R/ & - ((pres_R + pi_inf_R/(1._wp + gamma_R))))) + call s_compute_speed_of_sound(pres_R, rho_R, gamma_R, pi_inf_R, H_R, alpha_R, & + vel_R_rms, 0._wp, c_R) - s_L = vel_L(dir_idx(1)) - c_L*Ms_L - s_R = vel_R(dir_idx(1)) + c_R*Ms_R + !> The computation of c_avg does not require all the variables, and therefore the non '_avg' + ! variables are placeholders to call the subroutine. + call s_compute_speed_of_sound(pres_R, rho_avg, gamma_avg, pi_inf_R, H_avg, alpha_R, & + vel_avg_rms, 0._wp, c_avg) - s_S = 5.e-1_wp*((vel_L(dir_idx(1)) + vel_R(dir_idx(1))) + & - (pres_L - pres_R)/ & - (rho_avg*c_avg)) - end if + if (viscous) then + $:GPU_LOOP(parallelism='[seq]') + do i = 1, 2 + Re_avg_rs${XYZ}$_vf(j, k, l, i) = 2._wp/(1._wp/Re_L(i) + 1._wp/Re_R(i)) + end do + end if - ! follows Einfeldt et al. - ! s_M/P = min/max(0.,s_L/R) - s_M = min(0._wp, s_L); s_P = max(0._wp, s_R) + ! Low Mach correction + if (low_Mach == 2) then + @:compute_low_Mach_correction() + end if - ! goes with q_star_L/R = xi_L/R * (variable) - ! xi_L/R = ( ( s_L/R - u_L/R )/(s_L/R - s_star) ) - xi_L = (s_L - vel_L(dir_idx(1)))/(s_L - s_S) - xi_R = (s_R - vel_R(dir_idx(1)))/(s_R - s_S) + if (wave_speeds == 1) then + s_L = min(vel_L(dir_idx(1)) - c_L, vel_R(dir_idx(1)) - c_R) + s_R = max(vel_R(dir_idx(1)) + c_R, vel_L(dir_idx(1)) + c_L) - ! goes with numerical velocity in x/y/z directions - ! xi_P/M = 0.5 +/m sgn(0.5,s_star) - xi_M = (5.e-1_wp + sign(5.e-1_wp, s_S)) - xi_P = (5.e-1_wp - sign(5.e-1_wp, s_S)) + s_S = (pres_R - pres_L + rho_L*vel_L(dir_idx(1))* & + (s_L - vel_L(dir_idx(1))) - & + rho_R*vel_R(dir_idx(1))* & + (s_R - vel_R(dir_idx(1)))) & + /(rho_L*(s_L - vel_L(dir_idx(1))) - & + rho_R*(s_R - vel_R(dir_idx(1)))) + elseif (wave_speeds == 2) then + pres_SL = 5.e-1_wp*(pres_L + pres_R + rho_avg*c_avg* & + (vel_L(dir_idx(1)) - & + vel_R(dir_idx(1)))) + + pres_SR = pres_SL + + Ms_L = max(1._wp, sqrt(1._wp + ((5.e-1_wp + gamma_L)/(1._wp + gamma_L))* & + (pres_SL/pres_L - 1._wp)*pres_L/ & + ((pres_L + pi_inf_L/(1._wp + gamma_L))))) + Ms_R = max(1._wp, sqrt(1._wp + ((5.e-1_wp + gamma_R)/(1._wp + gamma_R))* & + (pres_SR/pres_R - 1._wp)*pres_R/ & + ((pres_R + pi_inf_R/(1._wp + gamma_R))))) + + s_L = vel_L(dir_idx(1)) - c_L*Ms_L + s_R = vel_R(dir_idx(1)) + c_R*Ms_R + + s_S = 5.e-1_wp*((vel_L(dir_idx(1)) + vel_R(dir_idx(1))) + & + (pres_L - pres_R)/ & + (rho_avg*c_avg)) + end if - ! Low Mach correction - if (low_Mach == 1) then - @:compute_low_Mach_correction() - else - pcorr = 0._wp - end if + ! follows Einfeldt et al. + ! s_M/P = min/max(0.,s_L/R) + s_M = min(0._wp, s_L); s_P = max(0._wp, s_R) - $:GPU_LOOP(parallelism='[seq]') - do i = 1, contxe - flux_rs${XYZ}$_vf(j, k, l, i) = & - xi_M*qL_prim_rs${XYZ}$_vf(j, k, l, i) & - *(vel_L(dir_idx(1)) + s_M*(xi_L - 1._wp)) & - + xi_P*qR_prim_rs${XYZ}$_vf(j + 1, k, l, i) & - *(vel_R(dir_idx(1)) + s_P*(xi_R - 1._wp)) - end do + ! goes with q_star_L/R = xi_L/R * (variable) + ! xi_L/R = ( ( s_L/R - u_L/R )/(s_L/R - s_star) ) + xi_L = (s_L - vel_L(dir_idx(1)))/(s_L - s_S) + xi_R = (s_R - vel_R(dir_idx(1)))/(s_R - s_S) - if (bubbles_euler .and. (num_fluids > 1)) then - ! Kill mass transport @ gas density - flux_rs${XYZ}$_vf(j, k, l, contxe) = 0._wp - end if + ! goes with numerical velocity in x/y/z directions + ! xi_P/M = 0.5 +/m sgn(0.5,s_star) + xi_M = (5.e-1_wp + sign(5.e-1_wp, s_S)) + xi_P = (5.e-1_wp - sign(5.e-1_wp, s_S)) - ! Momentum flux. - ! f = \rho u u + p I, q = \rho u, q_star = \xi * \rho*(s_star, v, w) + ! Low Mach correction + if (low_Mach == 1) then + @:compute_low_Mach_correction() + else + pcorr = 0._wp + end if - ! Include p_tilde + $:GPU_LOOP(parallelism='[seq]') + do i = 1, contxe + flux_rs${XYZ}$_vf(j, k, l, i) = & + xi_M*qL_prim_rs${XYZ}$_vf(j, k, l, i) & + *(vel_L(dir_idx(1)) + s_M*(xi_L - 1._wp)) & + + xi_P*qR_prim_rs${XYZ}$_vf(j + 1, k, l, i) & + *(vel_R(dir_idx(1)) + s_P*(xi_R - 1._wp)) + end do - $:GPU_LOOP(parallelism='[seq]') - do i = 1, num_dims - flux_rs${XYZ}$_vf(j, k, l, contxe + dir_idx(i)) = & - xi_M*(rho_L*(vel_L(dir_idx(1))* & - vel_L(dir_idx(i)) + & - s_M*(xi_L*(dir_flg(dir_idx(i))*s_S + & - (1._wp - dir_flg(dir_idx(i)))* & - vel_L(dir_idx(i))) - vel_L(dir_idx(i)))) + & - dir_flg(dir_idx(i))*(pres_L - ptilde_L)) & - + xi_P*(rho_R*(vel_R(dir_idx(1))* & - vel_R(dir_idx(i)) + & - s_P*(xi_R*(dir_flg(dir_idx(i))*s_S + & - (1._wp - dir_flg(dir_idx(i)))* & - vel_R(dir_idx(i))) - vel_R(dir_idx(i)))) + & - dir_flg(dir_idx(i))*(pres_R - ptilde_R)) & - + (s_M/s_L)*(s_P/s_R)*dir_flg(dir_idx(i))*pcorr - end do + if (bubbles_euler .and. (num_fluids > 1)) then + ! Kill mass transport @ gas density + flux_rs${XYZ}$_vf(j, k, l, contxe) = 0._wp + end if - ! Energy flux. - ! f = u*(E+p), q = E, q_star = \xi*E+(s-u)(\rho s_star + p/(s-u)) - flux_rs${XYZ}$_vf(j, k, l, E_idx) = & - xi_M*(vel_L(dir_idx(1))*(E_L + pres_L - ptilde_L) + & - s_M*(xi_L*(E_L + (s_S - vel_L(dir_idx(1)))* & - (rho_L*s_S + (pres_L - ptilde_L)/ & - (s_L - vel_L(dir_idx(1))))) - E_L)) & - + xi_P*(vel_R(dir_idx(1))*(E_R + pres_R - ptilde_R) + & - s_P*(xi_R*(E_R + (s_S - vel_R(dir_idx(1)))* & - (rho_R*s_S + (pres_R - ptilde_R)/ & - (s_R - vel_R(dir_idx(1))))) - E_R)) & - + (s_M/s_L)*(s_P/s_R)*pcorr*s_S - - ! Volume fraction flux - $:GPU_LOOP(parallelism='[seq]') - do i = advxb, advxe - flux_rs${XYZ}$_vf(j, k, l, i) = & - xi_M*qL_prim_rs${XYZ}$_vf(j, k, l, i) & - *(vel_L(dir_idx(1)) + s_M*(xi_L - 1._wp)) & - + xi_P*qR_prim_rs${XYZ}$_vf(j + 1, k, l, i) & - *(vel_R(dir_idx(1)) + s_P*(xi_R - 1._wp)) - end do + ! Momentum flux. + ! f = \rho u u + p I, q = \rho u, q_star = \xi * \rho*(s_star, v, w) - ! Source for volume fraction advection equation - $:GPU_LOOP(parallelism='[seq]') - do i = 1, num_dims - vel_src_rs${XYZ}$_vf(j, k, l, dir_idx(i)) = & - xi_M*(vel_L(dir_idx(i)) + & - dir_flg(dir_idx(i))* & - s_M*(xi_L - 1._wp)) & - + xi_P*(vel_R(dir_idx(i)) + & - dir_flg(dir_idx(i))* & - s_P*(xi_R - 1._wp)) - - !IF ( (model_eqns == 4) .or. (num_fluids==1) ) vel_src_rs_vf(idxi)%sf(j,k,l) = 0._wp - end do + ! Include p_tilde - flux_src_rs${XYZ}$_vf(j, k, l, advxb) = vel_src_rs${XYZ}$_vf(j, k, l, dir_idx(1)) + $:GPU_LOOP(parallelism='[seq]') + do i = 1, num_dims + flux_rs${XYZ}$_vf(j, k, l, contxe + dir_idx(i)) = & + xi_M*(rho_L*(vel_L(dir_idx(1))* & + vel_L(dir_idx(i)) + & + s_M*(xi_L*(dir_flg(dir_idx(i))*s_S + & + (1._wp - dir_flg(dir_idx(i)))* & + vel_L(dir_idx(i))) - vel_L(dir_idx(i)))) + & + dir_flg(dir_idx(i))*(pres_L - ptilde_L)) & + + xi_P*(rho_R*(vel_R(dir_idx(1))* & + vel_R(dir_idx(i)) + & + s_P*(xi_R*(dir_flg(dir_idx(i))*s_S + & + (1._wp - dir_flg(dir_idx(i)))* & + vel_R(dir_idx(i))) - vel_R(dir_idx(i)))) + & + dir_flg(dir_idx(i))*(pres_R - ptilde_R)) & + + (s_M/s_L)*(s_P/s_R)*dir_flg(dir_idx(i))*pcorr + end do - ! Add advection flux for bubble variables - $:GPU_LOOP(parallelism='[seq]') - do i = bubxb, bubxe - flux_rs${XYZ}$_vf(j, k, l, i) = & - xi_M*nbub_L*qL_prim_rs${XYZ}$_vf(j, k, l, i) & - *(vel_L(dir_idx(1)) + s_M*(xi_L - 1._wp)) & - + xi_P*nbub_R*qR_prim_rs${XYZ}$_vf(j + 1, k, l, i) & - *(vel_R(dir_idx(1)) + s_P*(xi_R - 1._wp)) - end do + ! Energy flux. + ! f = u*(E+p), q = E, q_star = \xi*E+(s-u)(\rho s_star + p/(s-u)) + flux_rs${XYZ}$_vf(j, k, l, E_idx) = & + xi_M*(vel_L(dir_idx(1))*(E_L + pres_L - ptilde_L) + & + s_M*(xi_L*(E_L + (s_S - vel_L(dir_idx(1)))* & + (rho_L*s_S + (pres_L - ptilde_L)/ & + (s_L - vel_L(dir_idx(1))))) - E_L)) & + + xi_P*(vel_R(dir_idx(1))*(E_R + pres_R - ptilde_R) + & + s_P*(xi_R*(E_R + (s_S - vel_R(dir_idx(1)))* & + (rho_R*s_S + (pres_R - ptilde_R)/ & + (s_R - vel_R(dir_idx(1))))) - E_R)) & + + (s_M/s_L)*(s_P/s_R)*pcorr*s_S + + ! Volume fraction flux + $:GPU_LOOP(parallelism='[seq]') + do i = advxb, advxe + flux_rs${XYZ}$_vf(j, k, l, i) = & + xi_M*qL_prim_rs${XYZ}$_vf(j, k, l, i) & + *(vel_L(dir_idx(1)) + s_M*(xi_L - 1._wp)) & + + xi_P*qR_prim_rs${XYZ}$_vf(j + 1, k, l, i) & + *(vel_R(dir_idx(1)) + s_P*(xi_R - 1._wp)) + end do - if (qbmm) then - flux_rs${XYZ}$_vf(j, k, l, bubxb) = & - xi_M*nbub_L & - *(vel_L(dir_idx(1)) + s_M*(xi_L - 1._wp)) & - + xi_P*nbub_R & - *(vel_R(dir_idx(1)) + s_P*(xi_R - 1._wp)) - end if + ! Source for volume fraction advection equation + $:GPU_LOOP(parallelism='[seq]') + do i = 1, num_dims + vel_src_rs${XYZ}$_vf(j, k, l, dir_idx(i)) = & + xi_M*(vel_L(dir_idx(i)) + & + dir_flg(dir_idx(i))* & + s_M*(xi_L - 1._wp)) & + + xi_P*(vel_R(dir_idx(i)) + & + dir_flg(dir_idx(i))* & + s_P*(xi_R - 1._wp)) + + !IF ( (model_eqns == 4) .or. (num_fluids==1) ) vel_src_rs_vf(idxi)%sf(j,k,l) = 0._wp + end do - if (adv_n) then - flux_rs${XYZ}$_vf(j, k, l, n_idx) = & - xi_M*nbub_L & - *(vel_L(dir_idx(1)) + s_M*(xi_L - 1._wp)) & - + xi_P*nbub_R & - *(vel_R(dir_idx(1)) + s_P*(xi_R - 1._wp)) - end if + flux_src_rs${XYZ}$_vf(j, k, l, advxb) = vel_src_rs${XYZ}$_vf(j, k, l, dir_idx(1)) - ! Geometrical source flux for cylindrical coordinates - #:if (NORM_DIR == 2) - if (cyl_coord) then - ! Substituting the advective flux into the inviscid geometrical source flux - $:GPU_LOOP(parallelism='[seq]') - do i = 1, E_idx - flux_gsrc_rs${XYZ}$_vf(j, k, l, i) = flux_rs${XYZ}$_vf(j, k, l, i) - end do - ! Recalculating the radial momentum geometric source flux - flux_gsrc_rs${XYZ}$_vf(j, k, l, contxe + dir_idx(1)) = & - xi_M*(rho_L*(vel_L(dir_idx(1))* & - vel_L(dir_idx(1)) + & - s_M*(xi_L*(dir_flg(dir_idx(1))*s_S + & - (1._wp - dir_flg(dir_idx(1)))* & - vel_L(dir_idx(1))) - vel_L(dir_idx(1))))) & - + xi_P*(rho_R*(vel_R(dir_idx(1))* & - vel_R(dir_idx(1)) + & - s_P*(xi_R*(dir_flg(dir_idx(1))*s_S + & - (1._wp - dir_flg(dir_idx(1)))* & - vel_R(dir_idx(1))) - vel_R(dir_idx(1))))) - ! Geometrical source of the void fraction(s) is zero - $:GPU_LOOP(parallelism='[seq]') - do i = advxb, advxe - flux_gsrc_rs${XYZ}$_vf(j, k, l, i) = 0._wp - end do - end if - #:endif - #:if (NORM_DIR == 3) - if (grid_geometry == 3) then - $:GPU_LOOP(parallelism='[seq]') - do i = 1, sys_size - flux_gsrc_rs${XYZ}$_vf(j, k, l, i) = 0._wp - end do + ! Add advection flux for bubble variables + $:GPU_LOOP(parallelism='[seq]') + do i = bubxb, bubxe + flux_rs${XYZ}$_vf(j, k, l, i) = & + xi_M*nbub_L*qL_prim_rs${XYZ}$_vf(j, k, l, i) & + *(vel_L(dir_idx(1)) + s_M*(xi_L - 1._wp)) & + + xi_P*nbub_R*qR_prim_rs${XYZ}$_vf(j + 1, k, l, i) & + *(vel_R(dir_idx(1)) + s_P*(xi_R - 1._wp)) + end do - flux_gsrc_rs${XYZ}$_vf(j, k, l, momxb + 1) = & - -xi_M*(rho_L*(vel_L(dir_idx(1))* & - vel_L(dir_idx(1)) + & - s_M*(xi_L*(dir_flg(dir_idx(1))*s_S + & - (1._wp - dir_flg(dir_idx(1)))* & - vel_L(dir_idx(1))) - vel_L(dir_idx(1))))) & - - xi_P*(rho_R*(vel_R(dir_idx(1))* & - vel_R(dir_idx(1)) + & - s_P*(xi_R*(dir_flg(dir_idx(1))*s_S + & - (1._wp - dir_flg(dir_idx(1)))* & - vel_R(dir_idx(1))) - vel_R(dir_idx(1))))) - flux_gsrc_rs${XYZ}$_vf(j, k, l, momxe) = flux_rs${XYZ}$_vf(j, k, l, momxb + 1) + if (qbmm) then + flux_rs${XYZ}$_vf(j, k, l, bubxb) = & + xi_M*nbub_L & + *(vel_L(dir_idx(1)) + s_M*(xi_L - 1._wp)) & + + xi_P*nbub_R & + *(vel_R(dir_idx(1)) + s_P*(xi_R - 1._wp)) + end if + if (adv_n) then + flux_rs${XYZ}$_vf(j, k, l, n_idx) = & + xi_M*nbub_L & + *(vel_L(dir_idx(1)) + s_M*(xi_L - 1._wp)) & + + xi_P*nbub_R & + *(vel_R(dir_idx(1)) + s_P*(xi_R - 1._wp)) end if - #:endif - end do - end do - end do - else - ! 5-EQUATION MODEL WITH HLLC - $:GPU_PARALLEL_LOOP(collapse=3, private='[vel_L, vel_R, & - & Re_L, Re_R, rho_avg, h_avg, gamma_avg, & - & alpha_L, alpha_R, s_L, s_R, s_S, & - & vel_avg_rms, pcorr, zcoef, vel_L_tmp, & - & vel_R_tmp, Ys_L, Ys_R, Xs_L, Xs_R, & - & Gamma_iL, Gamma_iR, Cp_iL, Cp_iR, tau_e_L, & - & tau_e_R, xi_field_L, xi_field_R, Yi_avg, & - & Phi_avg, h_iL, h_iR, h_avg_2]', & - & copyin='[is1, is2, is3]') - do l = is3%beg, is3%end - do k = is2%beg, is2%end - do j = is1%beg, is1%end - !idx1 = 1; if (dir_idx(1) == 2) idx1 = 2; if (dir_idx(1) == 3) idx1 = 3 + ! Geometrical source flux for cylindrical coordinates + #:if (NORM_DIR == 2) + if (cyl_coord) then + ! Substituting the advective flux into the inviscid geometrical source flux + $:GPU_LOOP(parallelism='[seq]') + do i = 1, E_idx + flux_gsrc_rs${XYZ}$_vf(j, k, l, i) = flux_rs${XYZ}$_vf(j, k, l, i) + end do + ! Recalculating the radial momentum geometric source flux + flux_gsrc_rs${XYZ}$_vf(j, k, l, contxe + dir_idx(1)) = & + xi_M*(rho_L*(vel_L(dir_idx(1))* & + vel_L(dir_idx(1)) + & + s_M*(xi_L*(dir_flg(dir_idx(1))*s_S + & + (1._wp - dir_flg(dir_idx(1)))* & + vel_L(dir_idx(1))) - vel_L(dir_idx(1))))) & + + xi_P*(rho_R*(vel_R(dir_idx(1))* & + vel_R(dir_idx(1)) + & + s_P*(xi_R*(dir_flg(dir_idx(1))*s_S + & + (1._wp - dir_flg(dir_idx(1)))* & + vel_R(dir_idx(1))) - vel_R(dir_idx(1))))) + ! Geometrical source of the void fraction(s) is zero + $:GPU_LOOP(parallelism='[seq]') + do i = advxb, advxe + flux_gsrc_rs${XYZ}$_vf(j, k, l, i) = 0._wp + end do + end if + #:endif + #:if (NORM_DIR == 3) + if (grid_geometry == 3) then + $:GPU_LOOP(parallelism='[seq]') + do i = 1, sys_size + flux_gsrc_rs${XYZ}$_vf(j, k, l, i) = 0._wp + end do - vel_L_rms = 0._wp; vel_R_rms = 0._wp - rho_L = 0._wp; rho_R = 0._wp - gamma_L = 0._wp; gamma_R = 0._wp - pi_inf_L = 0._wp; pi_inf_R = 0._wp - qv_L = 0._wp; qv_R = 0._wp - alpha_L_sum = 0._wp; alpha_R_sum = 0._wp + flux_gsrc_rs${XYZ}$_vf(j, k, l, momxb + 1) = & + -xi_M*(rho_L*(vel_L(dir_idx(1))* & + vel_L(dir_idx(1)) + & + s_M*(xi_L*(dir_flg(dir_idx(1))*s_S + & + (1._wp - dir_flg(dir_idx(1)))* & + vel_L(dir_idx(1))) - vel_L(dir_idx(1))))) & + - xi_P*(rho_R*(vel_R(dir_idx(1))* & + vel_R(dir_idx(1)) + & + s_P*(xi_R*(dir_flg(dir_idx(1))*s_S + & + (1._wp - dir_flg(dir_idx(1)))* & + vel_R(dir_idx(1))) - vel_R(dir_idx(1))))) + flux_gsrc_rs${XYZ}$_vf(j, k, l, momxe) = flux_rs${XYZ}$_vf(j, k, l, momxb + 1) - $:GPU_LOOP(parallelism='[seq]') - do i = 1, num_fluids - alpha_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i) - alpha_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + i) + end if + #:endif end do + end do + end do + #:endcall GPU_PARALLEL_LOOP + else + ! 5-EQUATION MODEL WITH HLLC + #:call GPU_PARALLEL_LOOP(collapse=3, private='[T_L, T_R, vel_L, vel_R, Re_L, Re_R, rho_avg, h_avg, gamma_avg, alpha_L, alpha_R, s_L, s_R, s_S, vel_avg_rms, pcorr, zcoef, vel_L_tmp, vel_R_tmp, Ys_L, Ys_R, Xs_L, Xs_R, Gamma_iL, Gamma_iR, Cp_iL, Cp_iR, tau_e_L, tau_e_R, xi_field_L, xi_field_R, Yi_avg,Phi_avg, h_iL, h_iR, h_avg_2]', copyin='[is1, is2, is3]') + do l = is3%beg, is3%end + do k = is2%beg, is2%end + do j = is1%beg, is1%end - $:GPU_LOOP(parallelism='[seq]') - do i = 1, num_dims - vel_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, contxe + i) - vel_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, contxe + i) - vel_L_rms = vel_L_rms + vel_L(i)**2._wp - vel_R_rms = vel_R_rms + vel_R(i)**2._wp - end do + !idx1 = 1; if (dir_idx(1) == 2) idx1 = 2; if (dir_idx(1) == 3) idx1 = 3 - pres_L = qL_prim_rs${XYZ}$_vf(j, k, l, E_idx) - pres_R = qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx) + vel_L_rms = 0._wp; vel_R_rms = 0._wp + rho_L = 0._wp; rho_R = 0._wp + gamma_L = 0._wp; gamma_R = 0._wp + pi_inf_L = 0._wp; pi_inf_R = 0._wp + qv_L = 0._wp; qv_R = 0._wp + alpha_L_sum = 0._wp; alpha_R_sum = 0._wp - ! Change this by splitting it into the cases - ! present in the bubbles_euler - if (mpp_lim) then $:GPU_LOOP(parallelism='[seq]') do i = 1, num_fluids - qL_prim_rs${XYZ}$_vf(j, k, l, i) = max(0._wp, qL_prim_rs${XYZ}$_vf(j, k, l, i)) - qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i) = min(max(0._wp, qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i)), 1._wp) - qR_prim_rs${XYZ}$_vf(j + 1, k, l, i) = max(0._wp, qR_prim_rs${XYZ}$_vf(j + 1, k, l, i)) - qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + i) = min(max(0._wp, qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + i)), 1._wp) - alpha_L_sum = alpha_L_sum + qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i) - alpha_R_sum = alpha_R_sum + qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + i) + alpha_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i) + alpha_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + i) end do $:GPU_LOOP(parallelism='[seq]') - do i = 1, num_fluids - qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i) = qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i)/max(alpha_L_sum, sgm_eps) - qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + i)/max(alpha_R_sum, sgm_eps) + do i = 1, num_dims + vel_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, contxe + i) + vel_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, contxe + i) + vel_L_rms = vel_L_rms + vel_L(i)**2._wp + vel_R_rms = vel_R_rms + vel_R(i)**2._wp end do - end if - - $:GPU_LOOP(parallelism='[seq]') - do i = 1, num_fluids - rho_L = rho_L + qL_prim_rs${XYZ}$_vf(j, k, l, i) - gamma_L = gamma_L + qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i)*gammas(i) - pi_inf_L = pi_inf_L + qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i)*pi_infs(i) - qv_L = qv_L + qL_prim_rs${XYZ}$_vf(j, k, l, i)*qvs(i) - - rho_R = rho_R + qR_prim_rs${XYZ}$_vf(j + 1, k, l, i) - gamma_R = gamma_R + qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + i)*gammas(i) - pi_inf_R = pi_inf_R + qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + i)*pi_infs(i) - qv_R = qv_R + qR_prim_rs${XYZ}$_vf(j + 1, k, l, i)*qvs(i) - end do - if (viscous) then - $:GPU_LOOP(parallelism='[seq]') - do i = 1, 2 - Re_L(i) = dflt_real - Re_R(i) = dflt_real + pres_L = qL_prim_rs${XYZ}$_vf(j, k, l, E_idx) + pres_R = qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx) - if (Re_size(i) > 0) Re_L(i) = 0._wp - if (Re_size(i) > 0) Re_R(i) = 0._wp + ! Change this by splitting it into the cases + ! present in the bubbles_euler + if (mpp_lim) then + $:GPU_LOOP(parallelism='[seq]') + do i = 1, num_fluids + qL_prim_rs${XYZ}$_vf(j, k, l, i) = max(0._wp, qL_prim_rs${XYZ}$_vf(j, k, l, i)) + qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i) = min(max(0._wp, qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i)), 1._wp) + qR_prim_rs${XYZ}$_vf(j + 1, k, l, i) = max(0._wp, qR_prim_rs${XYZ}$_vf(j + 1, k, l, i)) + qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + i) = min(max(0._wp, qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + i)), 1._wp) + alpha_L_sum = alpha_L_sum + qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i) + alpha_R_sum = alpha_R_sum + qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + i) + end do $:GPU_LOOP(parallelism='[seq]') - do q = 1, Re_size(i) - Re_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + Re_idx(i, q))/Res(i, q) & - + Re_L(i) - Re_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + Re_idx(i, q))/Res(i, q) & - + Re_R(i) + do i = 1, num_fluids + qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i) = qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i)/max(alpha_L_sum, sgm_eps) + qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + i)/max(alpha_R_sum, sgm_eps) end do - Re_L(i) = 1._wp/max(Re_L(i), sgm_eps) - Re_R(i) = 1._wp/max(Re_R(i), sgm_eps) - end do - end if + end if - if (chemistry) then - c_sum_Yi_Phi = 0.0_wp $:GPU_LOOP(parallelism='[seq]') - do i = chemxb, chemxe - Ys_L(i - chemxb + 1) = qL_prim_rs${XYZ}$_vf(j, k, l, i) - Ys_R(i - chemxb + 1) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, i) + do i = 1, num_fluids + rho_L = rho_L + qL_prim_rs${XYZ}$_vf(j, k, l, i) + gamma_L = gamma_L + qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i)*gammas(i) + pi_inf_L = pi_inf_L + qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i)*pi_infs(i) + qv_L = qv_L + qL_prim_rs${XYZ}$_vf(j, k, l, i)*qvs(i) + + rho_R = rho_R + qR_prim_rs${XYZ}$_vf(j + 1, k, l, i) + gamma_R = gamma_R + qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + i)*gammas(i) + pi_inf_R = pi_inf_R + qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + i)*pi_infs(i) + qv_R = qv_R + qR_prim_rs${XYZ}$_vf(j + 1, k, l, i)*qvs(i) end do - call get_mixture_molecular_weight(Ys_L, MW_L) - call get_mixture_molecular_weight(Ys_R, MW_R) + if (viscous) then + $:GPU_LOOP(parallelism='[seq]') + do i = 1, 2 + Re_L(i) = dflt_real + Re_R(i) = dflt_real - Xs_L(:) = Ys_L(:)*MW_L/molecular_weights(:) - Xs_R(:) = Ys_R(:)*MW_R/molecular_weights(:) + if (Re_size(i) > 0) Re_L(i) = 0._wp + if (Re_size(i) > 0) Re_R(i) = 0._wp - R_gas_L = gas_constant/MW_L - R_gas_R = gas_constant/MW_R + $:GPU_LOOP(parallelism='[seq]') + do q = 1, Re_size(i) + Re_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + Re_idx(i, q))/Res_gs(i, q) & + + Re_L(i) + Re_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + Re_idx(i, q))/Res_gs(i, q) & + + Re_R(i) + end do + Re_L(i) = 1._wp/max(Re_L(i), sgm_eps) + Re_R(i) = 1._wp/max(Re_R(i), sgm_eps) + end do + end if - T_L = pres_L/rho_L/R_gas_L - T_R = pres_R/rho_R/R_gas_R + if (chemistry) then + c_sum_Yi_Phi = 0.0_wp + $:GPU_LOOP(parallelism='[seq]') + do i = chemxb, chemxe + Ys_L(i - chemxb + 1) = qL_prim_rs${XYZ}$_vf(j, k, l, i) + Ys_R(i - chemxb + 1) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, i) + end do - call get_species_specific_heats_r(T_L, Cp_iL) - call get_species_specific_heats_r(T_R, Cp_iR) + call get_mixture_molecular_weight(Ys_L, MW_L) + call get_mixture_molecular_weight(Ys_R, MW_R) - if (chem_params%gamma_method == 1) then - !> gamma_method = 1: Ref. Section 2.3.1 Formulation of doi:10.7907/ZKW8-ES97. - Gamma_iL = Cp_iL/(Cp_iL - 1.0_wp) - Gamma_iR = Cp_iR/(Cp_iR - 1.0_wp) + Xs_L(:) = Ys_L(:)*MW_L/molecular_weights(:) + Xs_R(:) = Ys_R(:)*MW_R/molecular_weights(:) - gamma_L = sum(Xs_L(:)/(Gamma_iL(:) - 1.0_wp)) - gamma_R = sum(Xs_R(:)/(Gamma_iR(:) - 1.0_wp)) - else if (chem_params%gamma_method == 2) then - !> gamma_method = 2: c_p / c_v where c_p, c_v are specific heats. - call get_mixture_specific_heat_cp_mass(T_L, Ys_L, Cp_L) - call get_mixture_specific_heat_cp_mass(T_R, Ys_R, Cp_R) - call get_mixture_specific_heat_cv_mass(T_L, Ys_L, Cv_L) - call get_mixture_specific_heat_cv_mass(T_R, Ys_R, Cv_R) + R_gas_L = gas_constant/MW_L + R_gas_R = gas_constant/MW_R - Gamm_L = Cp_L/Cv_L; Gamm_R = Cp_R/Cv_R - gamma_L = 1.0_wp/(Gamm_L - 1.0_wp); gamma_R = 1.0_wp/(Gamm_R - 1.0_wp) - end if + T_L = pres_L/rho_L/R_gas_L + T_R = pres_R/rho_R/R_gas_R - call get_mixture_energy_mass(T_L, Ys_L, E_L) - call get_mixture_energy_mass(T_R, Ys_R, E_R) + call get_species_specific_heats_r(T_L, Cp_iL) + call get_species_specific_heats_r(T_R, Cp_iR) - E_L = rho_L*E_L + 5.e-1*rho_L*vel_L_rms - E_R = rho_R*E_R + 5.e-1*rho_R*vel_R_rms - H_L = (E_L + pres_L)/rho_L - H_R = (E_R + pres_R)/rho_R - else - E_L = gamma_L*pres_L + pi_inf_L + 5.e-1*rho_L*vel_L_rms + qv_L - E_R = gamma_R*pres_R + pi_inf_R + 5.e-1*rho_R*vel_R_rms + qv_R + if (chem_params%gamma_method == 1) then + !> gamma_method = 1: Ref. Section 2.3.1 Formulation of doi:10.7907/ZKW8-ES97. + Gamma_iL = Cp_iL/(Cp_iL - 1.0_wp) + Gamma_iR = Cp_iR/(Cp_iR - 1.0_wp) - H_L = (E_L + pres_L)/rho_L - H_R = (E_R + pres_R)/rho_R - end if + gamma_L = sum(Xs_L(:)/(Gamma_iL(:) - 1.0_wp)) + gamma_R = sum(Xs_R(:)/(Gamma_iR(:) - 1.0_wp)) + else if (chem_params%gamma_method == 2) then + !> gamma_method = 2: c_p / c_v where c_p, c_v are specific heats. + call get_mixture_specific_heat_cp_mass(T_L, Ys_L, Cp_L) + call get_mixture_specific_heat_cp_mass(T_R, Ys_R, Cp_R) + call get_mixture_specific_heat_cv_mass(T_L, Ys_L, Cv_L) + call get_mixture_specific_heat_cv_mass(T_R, Ys_R, Cv_R) - ! ENERGY ADJUSTMENTS FOR HYPOELASTIC ENERGY - if (hypoelasticity) then - $:GPU_LOOP(parallelism='[seq]') - do i = 1, strxe - strxb + 1 - tau_e_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, strxb - 1 + i) - tau_e_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, strxb - 1 + i) - end do - G_L = 0._wp - G_R = 0._wp - $:GPU_LOOP(parallelism='[seq]') - do i = 1, num_fluids - G_L = G_L + alpha_L(i)*Gs(i) - G_R = G_R + alpha_R(i)*Gs(i) - end do - $:GPU_LOOP(parallelism='[seq]') - do i = 1, strxe - strxb + 1 - ! Elastic contribution to energy if G large enough - if ((G_L > verysmall) .and. (G_R > verysmall)) then - E_L = E_L + (tau_e_L(i)*tau_e_L(i))/(4._wp*G_L) - E_R = E_R + (tau_e_R(i)*tau_e_R(i))/(4._wp*G_R) - ! Additional terms in 2D and 3D - if ((i == 2) .or. (i == 4) .or. (i == 5)) then - E_L = E_L + (tau_e_L(i)*tau_e_L(i))/(4._wp*G_L) - E_R = E_R + (tau_e_R(i)*tau_e_R(i))/(4._wp*G_R) - end if + Gamm_L = Cp_L/Cv_L; Gamm_R = Cp_R/Cv_R + gamma_L = 1.0_wp/(Gamm_L - 1.0_wp); gamma_R = 1.0_wp/(Gamm_R - 1.0_wp) end if - end do - end if - - ! ENERGY ADJUSTMENTS FOR HYPERELASTIC ENERGY - if (hyperelasticity) then - $:GPU_LOOP(parallelism='[seq]') - do i = 1, num_dims - xi_field_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, xibeg - 1 + i) - xi_field_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, xibeg - 1 + i) - end do - G_L = 0._wp - G_R = 0._wp - $:GPU_LOOP(parallelism='[seq]') - do i = 1, num_fluids - ! Mixture left and right shear modulus - G_L = G_L + alpha_L(i)*Gs(i) - G_R = G_R + alpha_R(i)*Gs(i) - end do - ! Elastic contribution to energy if G large enough - if (G_L > verysmall .and. G_R > verysmall) then - E_L = E_L + G_L*qL_prim_rs${XYZ}$_vf(j, k, l, xiend + 1) - E_R = E_R + G_R*qR_prim_rs${XYZ}$_vf(j + 1, k, l, xiend + 1) - end if - $:GPU_LOOP(parallelism='[seq]') - do i = 1, b_size - 1 - tau_e_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, strxb - 1 + i) - tau_e_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, strxb - 1 + i) - end do - end if - - H_L = (E_L + pres_L)/rho_L - H_R = (E_R + pres_R)/rho_R - @:compute_average_state() - - call s_compute_speed_of_sound(pres_L, rho_L, gamma_L, pi_inf_L, H_L, alpha_L, & - vel_L_rms, 0._wp, c_L) + call get_mixture_energy_mass(T_L, Ys_L, E_L) + call get_mixture_energy_mass(T_R, Ys_R, E_R) - call s_compute_speed_of_sound(pres_R, rho_R, gamma_R, pi_inf_R, H_R, alpha_R, & - vel_R_rms, 0._wp, c_R) + E_L = rho_L*E_L + 5.e-1*rho_L*vel_L_rms + E_R = rho_R*E_R + 5.e-1*rho_R*vel_R_rms + H_L = (E_L + pres_L)/rho_L + H_R = (E_R + pres_R)/rho_R + else + E_L = gamma_L*pres_L + pi_inf_L + 5.e-1*rho_L*vel_L_rms + qv_L + E_R = gamma_R*pres_R + pi_inf_R + 5.e-1*rho_R*vel_R_rms + qv_R - !> The computation of c_avg does not require all the variables, and therefore the non '_avg' - ! variables are placeholders to call the subroutine. - call s_compute_speed_of_sound(pres_R, rho_avg, gamma_avg, pi_inf_R, H_avg, alpha_R, & - vel_avg_rms, c_sum_Yi_Phi, c_avg) + H_L = (E_L + pres_L)/rho_L + H_R = (E_R + pres_R)/rho_R + end if - if (viscous) then - if (chemistry) then - call compute_viscosity_and_inversion(T_L, Ys_L, T_R, Ys_R, Re_L(1), Re_R(1)) + ! ENERGY ADJUSTMENTS FOR HYPOELASTIC ENERGY + if (hypoelasticity) then + $:GPU_LOOP(parallelism='[seq]') + do i = 1, strxe - strxb + 1 + tau_e_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, strxb - 1 + i) + tau_e_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, strxb - 1 + i) + end do + G_L = 0._wp + G_R = 0._wp + $:GPU_LOOP(parallelism='[seq]') + do i = 1, num_fluids + G_L = G_L + alpha_L(i)*Gs_rs(i) + G_R = G_R + alpha_R(i)*Gs_rs(i) + end do + $:GPU_LOOP(parallelism='[seq]') + do i = 1, strxe - strxb + 1 + ! Elastic contribution to energy if G large enough + if ((G_L > verysmall) .and. (G_R > verysmall)) then + E_L = E_L + (tau_e_L(i)*tau_e_L(i))/(4._wp*G_L) + E_R = E_R + (tau_e_R(i)*tau_e_R(i))/(4._wp*G_R) + ! Additional terms in 2D and 3D + if ((i == 2) .or. (i == 4) .or. (i == 5)) then + E_L = E_L + (tau_e_L(i)*tau_e_L(i))/(4._wp*G_L) + E_R = E_R + (tau_e_R(i)*tau_e_R(i))/(4._wp*G_R) + end if + end if + end do end if - $:GPU_LOOP(parallelism='[seq]') - do i = 1, 2 - Re_avg_rs${XYZ}$_vf(j, k, l, i) = 2._wp/(1._wp/Re_L(i) + 1._wp/Re_R(i)) - end do - end if - ! Low Mach correction - if (low_Mach == 2) then - @:compute_low_Mach_correction() - end if + ! ENERGY ADJUSTMENTS FOR HYPERELASTIC ENERGY + if (hyperelasticity) then + $:GPU_LOOP(parallelism='[seq]') + do i = 1, num_dims + xi_field_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, xibeg - 1 + i) + xi_field_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, xibeg - 1 + i) + end do + G_L = 0._wp + G_R = 0._wp + $:GPU_LOOP(parallelism='[seq]') + do i = 1, num_fluids + ! Mixture left and right shear modulus + G_L = G_L + alpha_L(i)*Gs_rs(i) + G_R = G_R + alpha_R(i)*Gs_rs(i) + end do + ! Elastic contribution to energy if G large enough + if (G_L > verysmall .and. G_R > verysmall) then + E_L = E_L + G_L*qL_prim_rs${XYZ}$_vf(j, k, l, xiend + 1) + E_R = E_R + G_R*qR_prim_rs${XYZ}$_vf(j + 1, k, l, xiend + 1) + end if + $:GPU_LOOP(parallelism='[seq]') + do i = 1, b_size - 1 + tau_e_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, strxb - 1 + i) + tau_e_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, strxb - 1 + i) + end do + end if - if (wave_speeds == 1) then - if (elasticity) then - s_L = min(vel_L(dir_idx(1)) - sqrt(c_L*c_L + & - (((4._wp*G_L)/3._wp) + tau_e_L(dir_idx_tau(1)))/rho_L), vel_R(dir_idx(1)) - sqrt(c_R*c_R + & - (((4._wp*G_R)/3._wp) + tau_e_R(dir_idx_tau(1)))/rho_R)) - s_R = max(vel_R(dir_idx(1)) + sqrt(c_R*c_R + & - (((4._wp*G_R)/3._wp) + tau_e_R(dir_idx_tau(1)))/rho_R), vel_L(dir_idx(1)) + sqrt(c_L*c_L + & - (((4._wp*G_L)/3._wp) + tau_e_L(dir_idx_tau(1)))/rho_L)) - s_S = (pres_R - tau_e_R(dir_idx_tau(1)) - pres_L + & - tau_e_L(dir_idx_tau(1)) + rho_L*vel_L(idx1)*(s_L - vel_L(idx1)) - & - rho_R*vel_R(idx1)*(s_R - vel_R(idx1)))/(rho_L*(s_L - vel_L(idx1)) - & - rho_R*(s_R - vel_R(idx1))) - else - s_L = min(vel_L(dir_idx(1)) - c_L, vel_R(dir_idx(1)) - c_R) - s_R = max(vel_R(dir_idx(1)) + c_R, vel_L(dir_idx(1)) + c_L) - s_S = (pres_R - pres_L + rho_L*vel_L(dir_idx(1))* & - (s_L - vel_L(dir_idx(1))) - rho_R*vel_R(dir_idx(1))*(s_R - vel_R(dir_idx(1)))) & - /(rho_L*(s_L - vel_L(dir_idx(1))) - rho_R*(s_R - vel_R(dir_idx(1)))) + H_L = (E_L + pres_L)/rho_L + H_R = (E_R + pres_R)/rho_R - end if - elseif (wave_speeds == 2) then - pres_SL = 5.e-1_wp*(pres_L + pres_R + rho_avg*c_avg* & - (vel_L(idx1) - & - vel_R(idx1))) + @:compute_average_state() - pres_SR = pres_SL + call s_compute_speed_of_sound(pres_L, rho_L, gamma_L, pi_inf_L, H_L, alpha_L, & + vel_L_rms, 0._wp, c_L) - Ms_L = max(1._wp, sqrt(1._wp + ((5.e-1_wp + gamma_L)/(1._wp + gamma_L))* & - (pres_SL/pres_L - 1._wp)*pres_L/ & - ((pres_L + pi_inf_L/(1._wp + gamma_L))))) - Ms_R = max(1._wp, sqrt(1._wp + ((5.e-1_wp + gamma_R)/(1._wp + gamma_R))* & - (pres_SR/pres_R - 1._wp)*pres_R/ & - ((pres_R + pi_inf_R/(1._wp + gamma_R))))) + call s_compute_speed_of_sound(pres_R, rho_R, gamma_R, pi_inf_R, H_R, alpha_R, & + vel_R_rms, 0._wp, c_R) - s_L = vel_L(idx1) - c_L*Ms_L - s_R = vel_R(idx1) + c_R*Ms_R + !> The computation of c_avg does not require all the variables, and therefore the non '_avg' + ! variables are placeholders to call the subroutine. + call s_compute_speed_of_sound(pres_R, rho_avg, gamma_avg, pi_inf_R, H_avg, alpha_R, & + vel_avg_rms, c_sum_Yi_Phi, c_avg) - s_S = 5.e-1_wp*((vel_L(idx1) + vel_R(idx1)) + & - (pres_L - pres_R)/ & - (rho_avg*c_avg)) - end if + if (viscous) then + if (chemistry) then + call compute_viscosity_and_inversion(T_L, Ys_L, T_R, Ys_R, Re_L(1), Re_R(1)) + end if + $:GPU_LOOP(parallelism='[seq]') + do i = 1, 2 + Re_avg_rs${XYZ}$_vf(j, k, l, i) = 2._wp/(1._wp/Re_L(i) + 1._wp/Re_R(i)) + end do + end if - ! follows Einfeldt et al. - ! s_M/P = min/max(0.,s_L/R) - s_M = min(0._wp, s_L); s_P = max(0._wp, s_R) + ! Low Mach correction + if (low_Mach == 2) then + @:compute_low_Mach_correction() + end if - ! goes with q_star_L/R = xi_L/R * (variable) - ! xi_L/R = ( ( s_L/R - u_L/R )/(s_L/R - s_star) ) - xi_L = (s_L - vel_L(idx1))/(s_L - s_S) - xi_R = (s_R - vel_R(idx1))/(s_R - s_S) + if (wave_speeds == 1) then + if (elasticity) then + s_L = min(vel_L(dir_idx(1)) - sqrt(c_L*c_L + & + (((4._wp*G_L)/3._wp) + tau_e_L(dir_idx_tau(1)))/rho_L), vel_R(dir_idx(1)) - sqrt(c_R*c_R + & + (((4._wp*G_R)/3._wp) + tau_e_R(dir_idx_tau(1)))/rho_R)) + s_R = max(vel_R(dir_idx(1)) + sqrt(c_R*c_R + & + (((4._wp*G_R)/3._wp) + tau_e_R(dir_idx_tau(1)))/rho_R), vel_L(dir_idx(1)) + sqrt(c_L*c_L + & + (((4._wp*G_L)/3._wp) + tau_e_L(dir_idx_tau(1)))/rho_L)) + s_S = (pres_R - tau_e_R(dir_idx_tau(1)) - pres_L + & + tau_e_L(dir_idx_tau(1)) + rho_L*vel_L(idx1)*(s_L - vel_L(idx1)) - & + rho_R*vel_R(idx1)*(s_R - vel_R(idx1)))/(rho_L*(s_L - vel_L(idx1)) - & + rho_R*(s_R - vel_R(idx1))) + else + s_L = min(vel_L(dir_idx(1)) - c_L, vel_R(dir_idx(1)) - c_R) + s_R = max(vel_R(dir_idx(1)) + c_R, vel_L(dir_idx(1)) + c_L) + s_S = (pres_R - pres_L + rho_L*vel_L(dir_idx(1))* & + (s_L - vel_L(dir_idx(1))) - rho_R*vel_R(dir_idx(1))*(s_R - vel_R(dir_idx(1)))) & + /(rho_L*(s_L - vel_L(dir_idx(1))) - rho_R*(s_R - vel_R(dir_idx(1)))) - ! goes with numerical velocity in x/y/z directions - ! xi_P/M = 0.5 +/m sgn(0.5,s_star) - xi_M = (5.e-1_wp + sign(5.e-1_wp, s_S)) - xi_P = (5.e-1_wp - sign(5.e-1_wp, s_S)) + end if + elseif (wave_speeds == 2) then + pres_SL = 5.e-1_wp*(pres_L + pres_R + rho_avg*c_avg* & + (vel_L(idx1) - & + vel_R(idx1))) + + pres_SR = pres_SL + + Ms_L = max(1._wp, sqrt(1._wp + ((5.e-1_wp + gamma_L)/(1._wp + gamma_L))* & + (pres_SL/pres_L - 1._wp)*pres_L/ & + ((pres_L + pi_inf_L/(1._wp + gamma_L))))) + Ms_R = max(1._wp, sqrt(1._wp + ((5.e-1_wp + gamma_R)/(1._wp + gamma_R))* & + (pres_SR/pres_R - 1._wp)*pres_R/ & + ((pres_R + pi_inf_R/(1._wp + gamma_R))))) + + s_L = vel_L(idx1) - c_L*Ms_L + s_R = vel_R(idx1) + c_R*Ms_R + + s_S = 5.e-1_wp*((vel_L(idx1) + vel_R(idx1)) + & + (pres_L - pres_R)/ & + (rho_avg*c_avg)) + end if - ! Low Mach correction - if (low_Mach == 1) then - @:compute_low_Mach_correction() - else - pcorr = 0._wp - end if + ! follows Einfeldt et al. + ! s_M/P = min/max(0.,s_L/R) + s_M = min(0._wp, s_L); s_P = max(0._wp, s_R) - ! COMPUTING THE HLLC FLUXES - ! MASS FLUX. - $:GPU_LOOP(parallelism='[seq]') - do i = 1, contxe - flux_rs${XYZ}$_vf(j, k, l, i) = & - xi_M*qL_prim_rs${XYZ}$_vf(j, k, l, i) & - *(vel_L(idx1) + s_M*(xi_L - 1._wp)) & - + xi_P*qR_prim_rs${XYZ}$_vf(j + 1, k, l, i) & - *(vel_R(idx1) + s_P*(xi_R - 1._wp)) - end do + ! goes with q_star_L/R = xi_L/R * (variable) + ! xi_L/R = ( ( s_L/R - u_L/R )/(s_L/R - s_star) ) + xi_L = (s_L - vel_L(idx1))/(s_L - s_S) + xi_R = (s_R - vel_R(idx1))/(s_R - s_S) - ! MOMENTUM FLUX. - ! f = \rho u u - \sigma, q = \rho u, q_star = \xi * \rho*(s_star, v, w) - $:GPU_LOOP(parallelism='[seq]') - do i = 1, num_dims - idxi = dir_idx(i) - flux_rs${XYZ}$_vf(j, k, l, contxe + idxi) = & - xi_M*(rho_L*(vel_L(idx1)* & - vel_L(idxi) + & - s_M*(xi_L*(dir_flg(idxi)*s_S + & - (1._wp - dir_flg(idxi))* & - vel_L(idxi)) - vel_L(idxi))) + & - dir_flg(idxi)*(pres_L)) & - + xi_P*(rho_R*(vel_R(idx1)* & - vel_R(idxi) + & - s_P*(xi_R*(dir_flg(idxi)*s_S + & - (1._wp - dir_flg(idxi))* & - vel_R(idxi)) - vel_R(idxi))) + & - dir_flg(idxi)*(pres_R)) & - + (s_M/s_L)*(s_P/s_R)*dir_flg(idxi)*pcorr - end do + ! goes with numerical velocity in x/y/z directions + ! xi_P/M = 0.5 +/m sgn(0.5,s_star) + xi_M = (5.e-1_wp + sign(5.e-1_wp, s_S)) + xi_P = (5.e-1_wp - sign(5.e-1_wp, s_S)) + + ! Low Mach correction + if (low_Mach == 1) then + @:compute_low_Mach_correction() + else + pcorr = 0._wp + end if - ! ENERGY FLUX. - ! f = u*(E-\sigma), q = E, q_star = \xi*E+(s-u)(\rho s_star - \sigma/(s-u)) - flux_rs${XYZ}$_vf(j, k, l, E_idx) = & - xi_M*(vel_L(idx1)*(E_L + pres_L) + & - s_M*(xi_L*(E_L + (s_S - vel_L(idx1))* & - (rho_L*s_S + pres_L/ & - (s_L - vel_L(idx1)))) - E_L)) & - + xi_P*(vel_R(idx1)*(E_R + pres_R) + & - s_P*(xi_R*(E_R + (s_S - vel_R(idx1))* & - (rho_R*s_S + pres_R/ & - (s_R - vel_R(idx1)))) - E_R)) & - + (s_M/s_L)*(s_P/s_R)*pcorr*s_S - - ! ELASTICITY. Elastic shear stress additions for the momentum and energy flux - if (elasticity) then - flux_ene_e = 0._wp + ! COMPUTING THE HLLC FLUXES + ! MASS FLUX. $:GPU_LOOP(parallelism='[seq]') - do i = 1, num_dims - idxi = dir_idx(i) - ! MOMENTUM ELASTIC FLUX. - flux_rs${XYZ}$_vf(j, k, l, contxe + idxi) = & - flux_rs${XYZ}$_vf(j, k, l, contxe + idxi) & - - xi_M*tau_e_L(dir_idx_tau(i)) - xi_P*tau_e_R(dir_idx_tau(i)) - ! ENERGY ELASTIC FLUX. - flux_ene_e = flux_ene_e - & - xi_M*(vel_L(idxi)*tau_e_L(dir_idx_tau(i)) + & - s_M*(xi_L*((s_S - vel_L(i))*(tau_e_L(dir_idx_tau(i))/(s_L - vel_L(i)))))) - & - xi_P*(vel_R(idxi)*tau_e_R(dir_idx_tau(i)) + & - s_P*(xi_R*((s_S - vel_R(i))*(tau_e_R(dir_idx_tau(i))/(s_R - vel_R(i)))))) + do i = 1, contxe + flux_rs${XYZ}$_vf(j, k, l, i) = & + xi_M*qL_prim_rs${XYZ}$_vf(j, k, l, i) & + *(vel_L(idx1) + s_M*(xi_L - 1._wp)) & + + xi_P*qR_prim_rs${XYZ}$_vf(j + 1, k, l, i) & + *(vel_R(idx1) + s_P*(xi_R - 1._wp)) end do - flux_rs${XYZ}$_vf(j, k, l, E_idx) = flux_rs${XYZ}$_vf(j, k, l, E_idx) + flux_ene_e - end if - ! HYPOELASTIC STRESS EVOLUTION FLUX. - if (hypoelasticity) then + ! MOMENTUM FLUX. + ! f = \rho u u - \sigma, q = \rho u, q_star = \xi * \rho*(s_star, v, w) $:GPU_LOOP(parallelism='[seq]') - do i = 1, strxe - strxb + 1 - flux_rs${XYZ}$_vf(j, k, l, strxb - 1 + i) = & - xi_M*(s_S/(s_L - s_S))*(s_L*rho_L*tau_e_L(i) - rho_L*vel_L(idx1)*tau_e_L(i)) + & - xi_P*(s_S/(s_R - s_S))*(s_R*rho_R*tau_e_R(i) - rho_R*vel_R(idx1)*tau_e_R(i)) + do i = 1, num_dims + idxi = dir_idx(i) + flux_rs${XYZ}$_vf(j, k, l, contxe + idxi) = & + xi_M*(rho_L*(vel_L(idx1)* & + vel_L(idxi) + & + s_M*(xi_L*(dir_flg(idxi)*s_S + & + (1._wp - dir_flg(idxi))* & + vel_L(idxi)) - vel_L(idxi))) + & + dir_flg(idxi)*(pres_L)) & + + xi_P*(rho_R*(vel_R(idx1)* & + vel_R(idxi) + & + s_P*(xi_R*(dir_flg(idxi)*s_S + & + (1._wp - dir_flg(idxi))* & + vel_R(idxi)) - vel_R(idxi))) + & + dir_flg(idxi)*(pres_R)) & + + (s_M/s_L)*(s_P/s_R)*dir_flg(idxi)*pcorr end do - end if - - ! VOLUME FRACTION FLUX. - $:GPU_LOOP(parallelism='[seq]') - do i = advxb, advxe - flux_rs${XYZ}$_vf(j, k, l, i) = & - xi_M*qL_prim_rs${XYZ}$_vf(j, k, l, i) & - *(vel_L(idx1) + s_M*(xi_L - 1._wp)) & - + xi_P*qR_prim_rs${XYZ}$_vf(j + 1, k, l, i) & - *(vel_R(idx1) + s_P*(xi_R - 1._wp)) - end do - ! VOLUME FRACTION SOURCE FLUX. - $:GPU_LOOP(parallelism='[seq]') - do i = 1, num_dims - idxi = dir_idx(i) - vel_src_rs${XYZ}$_vf(j, k, l, idxi) = & - xi_M*(vel_L(idxi) + & - dir_flg(idxi)* & - s_M*(xi_L - 1._wp)) & - + xi_P*(vel_R(idxi) + & - dir_flg(idxi)* & - s_P*(xi_R - 1._wp)) - end do + ! ENERGY FLUX. + ! f = u*(E-\sigma), q = E, q_star = \xi*E+(s-u)(\rho s_star - \sigma/(s-u)) + flux_rs${XYZ}$_vf(j, k, l, E_idx) = & + xi_M*(vel_L(idx1)*(E_L + pres_L) + & + s_M*(xi_L*(E_L + (s_S - vel_L(idx1))* & + (rho_L*s_S + pres_L/ & + (s_L - vel_L(idx1)))) - E_L)) & + + xi_P*(vel_R(idx1)*(E_R + pres_R) + & + s_P*(xi_R*(E_R + (s_S - vel_R(idx1))* & + (rho_R*s_S + pres_R/ & + (s_R - vel_R(idx1)))) - E_R)) & + + (s_M/s_L)*(s_P/s_R)*pcorr*s_S + + ! ELASTICITY. Elastic shear stress additions for the momentum and energy flux + if (elasticity) then + flux_ene_e = 0._wp + $:GPU_LOOP(parallelism='[seq]') + do i = 1, num_dims + idxi = dir_idx(i) + ! MOMENTUM ELASTIC FLUX. + flux_rs${XYZ}$_vf(j, k, l, contxe + idxi) = & + flux_rs${XYZ}$_vf(j, k, l, contxe + idxi) & + - xi_M*tau_e_L(dir_idx_tau(i)) - xi_P*tau_e_R(dir_idx_tau(i)) + ! ENERGY ELASTIC FLUX. + flux_ene_e = flux_ene_e - & + xi_M*(vel_L(idxi)*tau_e_L(dir_idx_tau(i)) + & + s_M*(xi_L*((s_S - vel_L(i))*(tau_e_L(dir_idx_tau(i))/(s_L - vel_L(i)))))) - & + xi_P*(vel_R(idxi)*tau_e_R(dir_idx_tau(i)) + & + s_P*(xi_R*((s_S - vel_R(i))*(tau_e_R(dir_idx_tau(i))/(s_R - vel_R(i)))))) + end do + flux_rs${XYZ}$_vf(j, k, l, E_idx) = flux_rs${XYZ}$_vf(j, k, l, E_idx) + flux_ene_e + end if - ! COLOR FUNCTION FLUX - if (surface_tension) then - flux_rs${XYZ}$_vf(j, k, l, c_idx) = & - xi_M*qL_prim_rs${XYZ}$_vf(j, k, l, c_idx) & - *(vel_L(idx1) + s_M*(xi_L - 1._wp)) & - + xi_P*qR_prim_rs${XYZ}$_vf(j + 1, k, l, c_idx) & - *(vel_R(idx1) + s_P*(xi_R - 1._wp)) - end if + ! HYPOELASTIC STRESS EVOLUTION FLUX. + if (hypoelasticity) then + $:GPU_LOOP(parallelism='[seq]') + do i = 1, strxe - strxb + 1 + flux_rs${XYZ}$_vf(j, k, l, strxb - 1 + i) = & + xi_M*(s_S/(s_L - s_S))*(s_L*rho_L*tau_e_L(i) - rho_L*vel_L(idx1)*tau_e_L(i)) + & + xi_P*(s_S/(s_R - s_S))*(s_R*rho_R*tau_e_R(i) - rho_R*vel_R(idx1)*tau_e_R(i)) + end do + end if - ! REFERENCE MAP FLUX. - if (hyperelasticity) then + ! VOLUME FRACTION FLUX. $:GPU_LOOP(parallelism='[seq]') - do i = 1, num_dims - flux_rs${XYZ}$_vf(j, k, l, xibeg - 1 + i) = & - xi_M*(s_S/(s_L - s_S))*(s_L*rho_L*xi_field_L(i) & - - rho_L*vel_L(idx1)*xi_field_L(i)) + & - xi_P*(s_S/(s_R - s_S))*(s_R*rho_R*xi_field_R(i) & - - rho_R*vel_R(idx1)*xi_field_R(i)) + do i = advxb, advxe + flux_rs${XYZ}$_vf(j, k, l, i) = & + xi_M*qL_prim_rs${XYZ}$_vf(j, k, l, i) & + *(vel_L(idx1) + s_M*(xi_L - 1._wp)) & + + xi_P*qR_prim_rs${XYZ}$_vf(j + 1, k, l, i) & + *(vel_R(idx1) + s_P*(xi_R - 1._wp)) end do - end if - flux_src_rs${XYZ}$_vf(j, k, l, advxb) = vel_src_rs${XYZ}$_vf(j, k, l, idx1) - - if (chemistry) then + ! VOLUME FRACTION SOURCE FLUX. $:GPU_LOOP(parallelism='[seq]') - do i = chemxb, chemxe - Y_L = qL_prim_rs${XYZ}$_vf(j, k, l, i) - Y_R = qR_prim_rs${XYZ}$_vf(j + 1, k, l, i) - - flux_rs${XYZ}$_vf(j, k, l, i) = xi_M*rho_L*Y_L*(vel_L(idx1) + s_M*(xi_L - 1._wp)) & - + xi_P*rho_R*Y_R*(vel_R(idx1) + s_P*(xi_R - 1._wp)) - flux_src_rs${XYZ}$_vf(j, k, l, i) = 0.0_wp + do i = 1, num_dims + idxi = dir_idx(i) + vel_src_rs${XYZ}$_vf(j, k, l, idxi) = & + xi_M*(vel_L(idxi) + & + dir_flg(idxi)* & + s_M*(xi_L - 1._wp)) & + + xi_P*(vel_R(idxi) + & + dir_flg(idxi)* & + s_P*(xi_R - 1._wp)) end do - end if - ! Geometrical source flux for cylindrical coordinates - #:if (NORM_DIR == 2) - if (cyl_coord) then - !Substituting the advective flux into the inviscid geometrical source flux - $:GPU_LOOP(parallelism='[seq]') - do i = 1, E_idx - flux_gsrc_rs${XYZ}$_vf(j, k, l, i) = flux_rs${XYZ}$_vf(j, k, l, i) - end do - ! Recalculating the radial momentum geometric source flux - flux_gsrc_rs${XYZ}$_vf(j, k, l, contxe + idx1) = & - xi_M*(rho_L*(vel_L(idx1)* & - vel_L(idx1) + & - s_M*(xi_L*(dir_flg(idx1)*s_S + & - (1._wp - dir_flg(idx1))* & - vel_L(idx1)) - vel_L(idx1)))) & - + xi_P*(rho_R*(vel_R(idx1)* & - vel_R(idx1) + & - s_P*(xi_R*(dir_flg(idx1)*s_S + & - (1._wp - dir_flg(idx1))* & - vel_R(idx1)) - vel_R(idx1)))) - ! Geometrical source of the void fraction(s) is zero + ! COLOR FUNCTION FLUX + if (surface_tension) then + flux_rs${XYZ}$_vf(j, k, l, c_idx) = & + xi_M*qL_prim_rs${XYZ}$_vf(j, k, l, c_idx) & + *(vel_L(idx1) + s_M*(xi_L - 1._wp)) & + + xi_P*qR_prim_rs${XYZ}$_vf(j + 1, k, l, c_idx) & + *(vel_R(idx1) + s_P*(xi_R - 1._wp)) + end if + + ! REFERENCE MAP FLUX. + if (hyperelasticity) then $:GPU_LOOP(parallelism='[seq]') - do i = advxb, advxe - flux_gsrc_rs${XYZ}$_vf(j, k, l, i) = 0._wp + do i = 1, num_dims + flux_rs${XYZ}$_vf(j, k, l, xibeg - 1 + i) = & + xi_M*(s_S/(s_L - s_S))*(s_L*rho_L*xi_field_L(i) & + - rho_L*vel_L(idx1)*xi_field_L(i)) + & + xi_P*(s_S/(s_R - s_S))*(s_R*rho_R*xi_field_R(i) & + - rho_R*vel_R(idx1)*xi_field_R(i)) end do end if - #:endif - #:if (NORM_DIR == 3) - if (grid_geometry == 3) then + + flux_src_rs${XYZ}$_vf(j, k, l, advxb) = vel_src_rs${XYZ}$_vf(j, k, l, idx1) + + if (chemistry) then $:GPU_LOOP(parallelism='[seq]') - do i = 1, sys_size - flux_gsrc_rs${XYZ}$_vf(j, k, l, i) = 0._wp + do i = chemxb, chemxe + Y_L = qL_prim_rs${XYZ}$_vf(j, k, l, i) + Y_R = qR_prim_rs${XYZ}$_vf(j + 1, k, l, i) + + flux_rs${XYZ}$_vf(j, k, l, i) = xi_M*rho_L*Y_L*(vel_L(idx1) + s_M*(xi_L - 1._wp)) & + + xi_P*rho_R*Y_R*(vel_R(idx1) + s_P*(xi_R - 1._wp)) + flux_src_rs${XYZ}$_vf(j, k, l, i) = 0.0_wp end do + end if + + ! Geometrical source flux for cylindrical coordinates + #:if (NORM_DIR == 2) + if (cyl_coord) then + !Substituting the advective flux into the inviscid geometrical source flux + $:GPU_LOOP(parallelism='[seq]') + do i = 1, E_idx + flux_gsrc_rs${XYZ}$_vf(j, k, l, i) = flux_rs${XYZ}$_vf(j, k, l, i) + end do + ! Recalculating the radial momentum geometric source flux + flux_gsrc_rs${XYZ}$_vf(j, k, l, contxe + idx1) = & + xi_M*(rho_L*(vel_L(idx1)* & + vel_L(idx1) + & + s_M*(xi_L*(dir_flg(idx1)*s_S + & + (1._wp - dir_flg(idx1))* & + vel_L(idx1)) - vel_L(idx1)))) & + + xi_P*(rho_R*(vel_R(idx1)* & + vel_R(idx1) + & + s_P*(xi_R*(dir_flg(idx1)*s_S + & + (1._wp - dir_flg(idx1))* & + vel_R(idx1)) - vel_R(idx1)))) + ! Geometrical source of the void fraction(s) is zero + $:GPU_LOOP(parallelism='[seq]') + do i = advxb, advxe + flux_gsrc_rs${XYZ}$_vf(j, k, l, i) = 0._wp + end do + end if + #:endif + #:if (NORM_DIR == 3) + if (grid_geometry == 3) then + $:GPU_LOOP(parallelism='[seq]') + do i = 1, sys_size + flux_gsrc_rs${XYZ}$_vf(j, k, l, i) = 0._wp + end do - flux_gsrc_rs${XYZ}$_vf(j, k, l, momxb + 1) = & - -xi_M*(rho_L*(vel_L(idx1)* & - vel_L(idx1) + & - s_M*(xi_L*(dir_flg(idx1)*s_S + & - (1._wp - dir_flg(idx1))* & - vel_L(idx1)) - vel_L(idx1)))) & - - xi_P*(rho_R*(vel_R(idx1)* & - vel_R(idx1) + & - s_P*(xi_R*(dir_flg(idx1)*s_S + & - (1._wp - dir_flg(idx1))* & - vel_R(idx1)) - vel_R(idx1)))) - flux_gsrc_rs${XYZ}$_vf(j, k, l, momxe) = flux_rs${XYZ}$_vf(j, k, l, momxb + 1) + flux_gsrc_rs${XYZ}$_vf(j, k, l, momxb + 1) = & + -xi_M*(rho_L*(vel_L(idx1)* & + vel_L(idx1) + & + s_M*(xi_L*(dir_flg(idx1)*s_S + & + (1._wp - dir_flg(idx1))* & + vel_L(idx1)) - vel_L(idx1)))) & + - xi_P*(rho_R*(vel_R(idx1)* & + vel_R(idx1) + & + s_P*(xi_R*(dir_flg(idx1)*s_S + & + (1._wp - dir_flg(idx1))* & + vel_R(idx1)) - vel_R(idx1)))) + flux_gsrc_rs${XYZ}$_vf(j, k, l, momxe) = flux_rs${XYZ}$_vf(j, k, l, momxb + 1) - end if - #:endif + end if + #:endif + end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP end if end if #:endfor @@ -2873,183 +2881,181 @@ contains #:for NORM_DIR, XYZ in [(1, 'x'), (2, 'y'), (3, 'z')] if (norm_dir == ${NORM_DIR}$) then - $:GPU_PARALLEL_LOOP(collapse=3, private='[alpha_rho_L, & - & alpha_rho_R, vel, alpha_L, alpha_R, rho, pres, & - & E, H_no_mag, gamma, pi_inf, qv, vel_rms, B, & - & c, c_fast, pres_mag, U_L, U_R, U_starL, & - & U_starR, U_doubleL, U_doubleR, F_L, F_R, & - & F_starL, F_starR, F_hlld]') - do l = is3%beg, is3%end - do k = is2%beg, is2%end - do j = is1%beg, is1%end + #:block UNDEF_AMD + #:call GPU_PARALLEL_LOOP(collapse=3, private='[alpha_rho_L, alpha_rho_R, vel, alpha_L, alpha_R, rho, pres,E, H_no_mag, gamma, pi_inf, qv, vel_rms, B, c, c_fast, pres_mag, U_L, U_R, U_starL, U_starR, U_doubleL, U_doubleR, F_L, F_R, F_starL, F_starR, F_hlld]') + do l = is3%beg, is3%end + do k = is2%beg, is2%end + do j = is1%beg, is1%end - ! (1) Extract the left/right primitive states - do i = 1, contxe - alpha_rho_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, i) - alpha_rho_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, i) - end do + ! (1) Extract the left/right primitive states + do i = 1, contxe + alpha_rho_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, i) + alpha_rho_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, i) + end do - ! NOTE: unlike HLL & HLLC, vel_L here is permutated by dir_idx for simpler logic - do i = 1, num_vels - vel%L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, contxe + dir_idx(i)) - vel%R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, contxe + dir_idx(i)) - end do + ! NOTE: unlike HLL & HLLC, vel_L here is permutated by dir_idx for simpler logic + do i = 1, num_vels + vel%L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, contxe + dir_idx(i)) + vel%R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, contxe + dir_idx(i)) + end do - vel_rms%L = sum(vel%L**2._wp) - vel_rms%R = sum(vel%R**2._wp) + vel_rms%L = sum(vel%L**2._wp) + vel_rms%R = sum(vel%R**2._wp) - do i = 1, num_fluids - alpha_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i) - alpha_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + i) - end do + do i = 1, num_fluids + alpha_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i) + alpha_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + i) + end do - pres%L = qL_prim_rs${XYZ}$_vf(j, k, l, E_idx) - pres%R = qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx) - - ! NOTE: unlike HLL, Bx, By, Bz are permutated by dir_idx for simpler logic - if (mhd) then - if (n == 0) then ! 1D: constant Bx; By, Bz as variables; only in x so not permutated - B%L = [Bx0, qL_prim_rs${XYZ}$_vf(j, k, l, B_idx%beg), qL_prim_rs${XYZ}$_vf(j, k, l, B_idx%beg + 1)] - B%R = [Bx0, qR_prim_rs${XYZ}$_vf(j + 1, k, l, B_idx%beg), qR_prim_rs${XYZ}$_vf(j + 1, k, l, B_idx%beg + 1)] - else ! 2D/3D: Bx, By, Bz as variables - B%L = [qL_prim_rs${XYZ}$_vf(j, k, l, B_idx%beg + dir_idx(1) - 1), & - qL_prim_rs${XYZ}$_vf(j, k, l, B_idx%beg + dir_idx(2) - 1), & - qL_prim_rs${XYZ}$_vf(j, k, l, B_idx%beg + dir_idx(3) - 1)] - B%R = [qR_prim_rs${XYZ}$_vf(j + 1, k, l, B_idx%beg + dir_idx(1) - 1), & - qR_prim_rs${XYZ}$_vf(j + 1, k, l, B_idx%beg + dir_idx(2) - 1), & - qR_prim_rs${XYZ}$_vf(j + 1, k, l, B_idx%beg + dir_idx(3) - 1)] - end if - end if + pres%L = qL_prim_rs${XYZ}$_vf(j, k, l, E_idx) + pres%R = qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx) + + ! NOTE: unlike HLL, Bx, By, Bz are permutated by dir_idx for simpler logic + if (mhd) then + if (n == 0) then ! 1D: constant Bx; By, Bz as variables; only in x so not permutated + B%L = [Bx0, qL_prim_rs${XYZ}$_vf(j, k, l, B_idx%beg), qL_prim_rs${XYZ}$_vf(j, k, l, B_idx%beg + 1)] + B%R = [Bx0, qR_prim_rs${XYZ}$_vf(j + 1, k, l, B_idx%beg), qR_prim_rs${XYZ}$_vf(j + 1, k, l, B_idx%beg + 1)] + else ! 2D/3D: Bx, By, Bz as variables + B%L = [qL_prim_rs${XYZ}$_vf(j, k, l, B_idx%beg + dir_idx(1) - 1), & + qL_prim_rs${XYZ}$_vf(j, k, l, B_idx%beg + dir_idx(2) - 1), & + qL_prim_rs${XYZ}$_vf(j, k, l, B_idx%beg + dir_idx(3) - 1)] + B%R = [qR_prim_rs${XYZ}$_vf(j + 1, k, l, B_idx%beg + dir_idx(1) - 1), & + qR_prim_rs${XYZ}$_vf(j + 1, k, l, B_idx%beg + dir_idx(2) - 1), & + qR_prim_rs${XYZ}$_vf(j + 1, k, l, B_idx%beg + dir_idx(3) - 1)] + end if + end if - ! Sum properties of all fluid components - rho%L = 0._wp; gamma%L = 0._wp; pi_inf%L = 0._wp; qv%L = 0._wp - rho%R = 0._wp; gamma%R = 0._wp; pi_inf%R = 0._wp; qv%R = 0._wp - $:GPU_LOOP(parallelism='[seq]') - do i = 1, num_fluids - rho%L = rho%L + alpha_rho_L(i) - gamma%L = gamma%L + alpha_L(i)*gammas(i) - pi_inf%L = pi_inf%L + alpha_L(i)*pi_infs(i) - qv%L = qv%L + alpha_rho_L(i)*qvs(i) - - rho%R = rho%R + alpha_rho_R(i) - gamma%R = gamma%R + alpha_R(i)*gammas(i) - pi_inf%R = pi_inf%R + alpha_R(i)*pi_infs(i) - qv%R = qv%R + alpha_rho_R(i)*qvs(i) - end do + ! Sum properties of all fluid components + rho%L = 0._wp; gamma%L = 0._wp; pi_inf%L = 0._wp; qv%L = 0._wp + rho%R = 0._wp; gamma%R = 0._wp; pi_inf%R = 0._wp; qv%R = 0._wp + $:GPU_LOOP(parallelism='[seq]') + do i = 1, num_fluids + rho%L = rho%L + alpha_rho_L(i) + gamma%L = gamma%L + alpha_L(i)*gammas(i) + pi_inf%L = pi_inf%L + alpha_L(i)*pi_infs(i) + qv%L = qv%L + alpha_rho_L(i)*qvs(i) + + rho%R = rho%R + alpha_rho_R(i) + gamma%R = gamma%R + alpha_R(i)*gammas(i) + pi_inf%R = pi_inf%R + alpha_R(i)*pi_infs(i) + qv%R = qv%R + alpha_rho_R(i)*qvs(i) + end do - pres_mag%L = 0.5_wp*sum(B%L**2._wp) - pres_mag%R = 0.5_wp*sum(B%R**2._wp) - E%L = gamma%L*pres%L + pi_inf%L + 0.5_wp*rho%L*vel_rms%L + qv%L + pres_mag%L - E%R = gamma%R*pres%R + pi_inf%R + 0.5_wp*rho%R*vel_rms%R + qv%R + pres_mag%R ! includes magnetic energy - H_no_mag%L = (E%L + pres%L - pres_mag%L)/rho%L - H_no_mag%R = (E%R + pres%R - pres_mag%R)/rho%R ! stagnation enthalpy here excludes magnetic energy (only used to find speed of sound) - - ! (2) Compute fast wave speeds - call s_compute_speed_of_sound(pres%L, rho%L, gamma%L, pi_inf%L, H_no_mag%L, alpha_L, vel_rms%L, 0._wp, c%L) - call s_compute_speed_of_sound(pres%R, rho%R, gamma%R, pi_inf%R, H_no_mag%R, alpha_R, vel_rms%R, 0._wp, c%R) - call s_compute_fast_magnetosonic_speed(rho%L, c%L, B%L, norm_dir, c_fast%L, H_no_mag%L) - call s_compute_fast_magnetosonic_speed(rho%R, c%R, B%R, norm_dir, c_fast%R, H_no_mag%R) - - ! (3) Compute contact speed s_M [Miyoshi Equ. (38)] - s_L = min(vel%L(1) - c_fast%L, vel%R(1) - c_fast%R) - s_R = max(vel%R(1) + c_fast%R, vel%L(1) + c_fast%L) - - pTot_L = pres%L + pres_mag%L - pTot_R = pres%R + pres_mag%R - - s_M = (((s_R - vel%R(1))*rho%R*vel%R(1) - & - (s_L - vel%L(1))*rho%L*vel%L(1) - pTot_R + pTot_L)/ & - ((s_R - vel%R(1))*rho%R - (s_L - vel%L(1))*rho%L)) - - ! (4) Compute star state variables - rhoL_star = rho%L*(s_L - vel%L(1))/(s_L - s_M) - rhoR_star = rho%R*(s_R - vel%R(1))/(s_R - s_M) - p_star = pTot_L + rho%L*(s_L - vel%L(1))*(s_M - vel%L(1))/(s_L - s_M) - E_starL = ((s_L - vel%L(1))*E%L - pTot_L*vel%L(1) + p_star*s_M)/(s_L - s_M) - E_starR = ((s_R - vel%R(1))*E%R - pTot_R*vel%R(1) + p_star*s_M)/(s_R - s_M) - - ! (5) Compute left/right state vectors and fluxes - U_L = [rho%L, rho%L*vel%L(1:3), B%L(2:3), E%L] - U_starL = [rhoL_star, rhoL_star*s_M, rhoL_star*vel%L(2:3), B%L(2:3), E_starL] - U_R = [rho%R, rho%R*vel%R(1:3), B%R(2:3), E%R] - U_starR = [rhoR_star, rhoR_star*s_M, rhoR_star*vel%R(2:3), B%R(2:3), E_starR] - - ! Compute the left/right fluxes - F_L(1) = U_L(2) - F_L(2) = U_L(2)*vel%L(1) - B%L(1)*B%L(1) + pTot_L - F_L(3:4) = U_L(2)*vel%L(2:3) - B%L(1)*B%L(2:3) - F_L(5:6) = vel%L(1)*B%L(2:3) - vel%L(2:3)*B%L(1) - F_L(7) = (E%L + pTot_L)*vel%L(1) - B%L(1)*(vel%L(1)*B%L(1) + vel%L(2)*B%L(2) + vel%L(3)*B%L(3)) - - F_R(1) = U_R(2) - F_R(2) = U_R(2)*vel%R(1) - B%R(1)*B%R(1) + pTot_R - F_R(3:4) = U_R(2)*vel%R(2:3) - B%R(1)*B%R(2:3) - F_R(5:6) = vel%R(1)*B%R(2:3) - vel%R(2:3)*B%R(1) - F_R(7) = (E%R + pTot_R)*vel%R(1) - B%R(1)*(vel%R(1)*B%R(1) + vel%R(2)*B%R(2) + vel%R(3)*B%R(3)) - ! Compute the star flux using HLL relation - F_starL = F_L + s_L*(U_starL - U_L) - F_starR = F_R + s_R*(U_starR - U_R) - ! Compute the rotational (Alfvén) speeds - s_starL = s_M - abs(B%L(1))/sqrt(rhoL_star) - s_starR = s_M + abs(B%L(1))/sqrt(rhoR_star) - ! Compute the double–star states [Miyoshi Eqns. (59)-(62)] - sqrt_rhoL_star = sqrt(rhoL_star); sqrt_rhoR_star = sqrt(rhoR_star) - vL_star = vel%L(2); wL_star = vel%L(3) - vR_star = vel%R(2); wR_star = vel%R(3) - - ! (6) Compute the double–star states [Miyoshi Eqns. (59)-(62)] - denom_ds = sqrt_rhoL_star + sqrt_rhoR_star - sign_Bx = sign(1._wp, B%L(1)) - v_double = (sqrt_rhoL_star*vL_star + sqrt_rhoR_star*vR_star + (B%R(2) - B%L(2))*sign_Bx)/denom_ds - w_double = (sqrt_rhoL_star*wL_star + sqrt_rhoR_star*wR_star + (B%R(3) - B%L(3))*sign_Bx)/denom_ds - By_double = (sqrt_rhoL_star*B%R(2) + sqrt_rhoR_star*B%L(2) + sqrt_rhoL_star*sqrt_rhoR_star*(vR_star - vL_star)*sign_Bx)/denom_ds - Bz_double = (sqrt_rhoL_star*B%R(3) + sqrt_rhoR_star*B%L(3) + sqrt_rhoL_star*sqrt_rhoR_star*(wR_star - wL_star)*sign_Bx)/denom_ds - - E_doubleL = E_starL - sqrt_rhoL_star*((vL_star*B%L(2) + wL_star*B%L(3)) - (v_double*By_double + w_double*Bz_double))*sign_Bx - E_doubleR = E_starR + sqrt_rhoR_star*((vR_star*B%R(2) + wR_star*B%R(3)) - (v_double*By_double + w_double*Bz_double))*sign_Bx - E_double = 0.5_wp*(E_doubleL + E_doubleR) - - U_doubleL = [rhoL_star, rhoL_star*s_M, rhoL_star*v_double, rhoL_star*w_double, By_double, Bz_double, E_double] - U_doubleR = [rhoR_star, rhoR_star*s_M, rhoR_star*v_double, rhoR_star*w_double, By_double, Bz_double, E_double] - - ! (11) Choose HLLD flux based on wave-speed regions - if (0.0_wp <= s_L) then - F_hlld = F_L - else if (0.0_wp <= s_starL) then - F_hlld = F_L + s_L*(U_starL - U_L) - else if (0.0_wp <= s_M) then - F_hlld = F_starL + s_starL*(U_doubleL - U_starL) - else if (0.0_wp <= s_starR) then - F_hlld = F_starR + s_starR*(U_doubleR - U_starR) - else if (0.0_wp <= s_R) then - F_hlld = F_R + s_R*(U_starR - U_R) - else - F_hlld = F_R - end if + pres_mag%L = 0.5_wp*sum(B%L**2._wp) + pres_mag%R = 0.5_wp*sum(B%R**2._wp) + E%L = gamma%L*pres%L + pi_inf%L + 0.5_wp*rho%L*vel_rms%L + qv%L + pres_mag%L + E%R = gamma%R*pres%R + pi_inf%R + 0.5_wp*rho%R*vel_rms%R + qv%R + pres_mag%R ! includes magnetic energy + H_no_mag%L = (E%L + pres%L - pres_mag%L)/rho%L + H_no_mag%R = (E%R + pres%R - pres_mag%R)/rho%R ! stagnation enthalpy here excludes magnetic energy (only used to find speed of sound) + + ! (2) Compute fast wave speeds + call s_compute_speed_of_sound(pres%L, rho%L, gamma%L, pi_inf%L, H_no_mag%L, alpha_L, vel_rms%L, 0._wp, c%L) + call s_compute_speed_of_sound(pres%R, rho%R, gamma%R, pi_inf%R, H_no_mag%R, alpha_R, vel_rms%R, 0._wp, c%R) + call s_compute_fast_magnetosonic_speed(rho%L, c%L, B%L, norm_dir, c_fast%L, H_no_mag%L) + call s_compute_fast_magnetosonic_speed(rho%R, c%R, B%R, norm_dir, c_fast%R, H_no_mag%R) + + ! (3) Compute contact speed s_M [Miyoshi Equ. (38)] + s_L = min(vel%L(1) - c_fast%L, vel%R(1) - c_fast%R) + s_R = max(vel%R(1) + c_fast%R, vel%L(1) + c_fast%L) + + pTot_L = pres%L + pres_mag%L + pTot_R = pres%R + pres_mag%R + + s_M = (((s_R - vel%R(1))*rho%R*vel%R(1) - & + (s_L - vel%L(1))*rho%L*vel%L(1) - pTot_R + pTot_L)/ & + ((s_R - vel%R(1))*rho%R - (s_L - vel%L(1))*rho%L)) + + ! (4) Compute star state variables + rhoL_star = rho%L*(s_L - vel%L(1))/(s_L - s_M) + rhoR_star = rho%R*(s_R - vel%R(1))/(s_R - s_M) + p_star = pTot_L + rho%L*(s_L - vel%L(1))*(s_M - vel%L(1))/(s_L - s_M) + E_starL = ((s_L - vel%L(1))*E%L - pTot_L*vel%L(1) + p_star*s_M)/(s_L - s_M) + E_starR = ((s_R - vel%R(1))*E%R - pTot_R*vel%R(1) + p_star*s_M)/(s_R - s_M) + + ! (5) Compute left/right state vectors and fluxes + U_L = [rho%L, rho%L*vel%L(1:3), B%L(2:3), E%L] + U_starL = [rhoL_star, rhoL_star*s_M, rhoL_star*vel%L(2:3), B%L(2:3), E_starL] + U_R = [rho%R, rho%R*vel%R(1:3), B%R(2:3), E%R] + U_starR = [rhoR_star, rhoR_star*s_M, rhoR_star*vel%R(2:3), B%R(2:3), E_starR] + + ! Compute the left/right fluxes + F_L(1) = U_L(2) + F_L(2) = U_L(2)*vel%L(1) - B%L(1)*B%L(1) + pTot_L + F_L(3:4) = U_L(2)*vel%L(2:3) - B%L(1)*B%L(2:3) + F_L(5:6) = vel%L(1)*B%L(2:3) - vel%L(2:3)*B%L(1) + F_L(7) = (E%L + pTot_L)*vel%L(1) - B%L(1)*(vel%L(1)*B%L(1) + vel%L(2)*B%L(2) + vel%L(3)*B%L(3)) + + F_R(1) = U_R(2) + F_R(2) = U_R(2)*vel%R(1) - B%R(1)*B%R(1) + pTot_R + F_R(3:4) = U_R(2)*vel%R(2:3) - B%R(1)*B%R(2:3) + F_R(5:6) = vel%R(1)*B%R(2:3) - vel%R(2:3)*B%R(1) + F_R(7) = (E%R + pTot_R)*vel%R(1) - B%R(1)*(vel%R(1)*B%R(1) + vel%R(2)*B%R(2) + vel%R(3)*B%R(3)) + ! Compute the star flux using HLL relation + F_starL = F_L + s_L*(U_starL - U_L) + F_starR = F_R + s_R*(U_starR - U_R) + ! Compute the rotational (Alfvén) speeds + s_starL = s_M - abs(B%L(1))/sqrt(rhoL_star) + s_starR = s_M + abs(B%L(1))/sqrt(rhoR_star) + ! Compute the double–star states [Miyoshi Eqns. (59)-(62)] + sqrt_rhoL_star = sqrt(rhoL_star); sqrt_rhoR_star = sqrt(rhoR_star) + vL_star = vel%L(2); wL_star = vel%L(3) + vR_star = vel%R(2); wR_star = vel%R(3) + + ! (6) Compute the double–star states [Miyoshi Eqns. (59)-(62)] + denom_ds = sqrt_rhoL_star + sqrt_rhoR_star + sign_Bx = sign(1._wp, B%L(1)) + v_double = (sqrt_rhoL_star*vL_star + sqrt_rhoR_star*vR_star + (B%R(2) - B%L(2))*sign_Bx)/denom_ds + w_double = (sqrt_rhoL_star*wL_star + sqrt_rhoR_star*wR_star + (B%R(3) - B%L(3))*sign_Bx)/denom_ds + By_double = (sqrt_rhoL_star*B%R(2) + sqrt_rhoR_star*B%L(2) + sqrt_rhoL_star*sqrt_rhoR_star*(vR_star - vL_star)*sign_Bx)/denom_ds + Bz_double = (sqrt_rhoL_star*B%R(3) + sqrt_rhoR_star*B%L(3) + sqrt_rhoL_star*sqrt_rhoR_star*(wR_star - wL_star)*sign_Bx)/denom_ds + + E_doubleL = E_starL - sqrt_rhoL_star*((vL_star*B%L(2) + wL_star*B%L(3)) - (v_double*By_double + w_double*Bz_double))*sign_Bx + E_doubleR = E_starR + sqrt_rhoR_star*((vR_star*B%R(2) + wR_star*B%R(3)) - (v_double*By_double + w_double*Bz_double))*sign_Bx + E_double = 0.5_wp*(E_doubleL + E_doubleR) + + U_doubleL = [rhoL_star, rhoL_star*s_M, rhoL_star*v_double, rhoL_star*w_double, By_double, Bz_double, E_double] + U_doubleR = [rhoR_star, rhoR_star*s_M, rhoR_star*v_double, rhoR_star*w_double, By_double, Bz_double, E_double] + + ! (11) Choose HLLD flux based on wave-speed regions + if (0.0_wp <= s_L) then + F_hlld = F_L + else if (0.0_wp <= s_starL) then + F_hlld = F_L + s_L*(U_starL - U_L) + else if (0.0_wp <= s_M) then + F_hlld = F_starL + s_starL*(U_doubleL - U_starL) + else if (0.0_wp <= s_starR) then + F_hlld = F_starR + s_starR*(U_doubleR - U_starR) + else if (0.0_wp <= s_R) then + F_hlld = F_R + s_R*(U_starR - U_R) + else + F_hlld = F_R + end if - ! (12) Reorder and write temporary variables to the flux array - ! Mass - flux_rs${XYZ}$_vf(j, k, l, 1) = F_hlld(1) ! TODO multi-component - ! Momentum - flux_rs${XYZ}$_vf(j, k, l, [contxe + dir_idx(1), contxe + dir_idx(2), contxe + dir_idx(3)]) = F_hlld([2, 3, 4]) - ! Magnetic field - if (n == 0) then - flux_rs${XYZ}$_vf(j, k, l, [B_idx%beg, B_idx%beg + 1]) = F_hlld([5, 6]) - else - flux_rs${XYZ}$_vf(j, k, l, [B_idx%beg + dir_idx(2) - 1, B_idx%beg + dir_idx(3) - 1]) = F_hlld([5, 6]) - end if - ! Energy - flux_rs${XYZ}$_vf(j, k, l, E_idx) = F_hlld(7) - ! Partial fraction - $:GPU_LOOP(parallelism='[seq]') - do i = advxb, advxe - flux_rs${XYZ}$_vf(j, k, l, i) = 0._wp ! TODO multi-component (zero for now) - end do + ! (12) Reorder and write temporary variables to the flux array + ! Mass + flux_rs${XYZ}$_vf(j, k, l, 1) = F_hlld(1) ! TODO multi-component + ! Momentum + flux_rs${XYZ}$_vf(j, k, l, [contxe + dir_idx(1), contxe + dir_idx(2), contxe + dir_idx(3)]) = F_hlld([2, 3, 4]) + ! Magnetic field + if (n == 0) then + flux_rs${XYZ}$_vf(j, k, l, [B_idx%beg, B_idx%beg + 1]) = F_hlld([5, 6]) + else + flux_rs${XYZ}$_vf(j, k, l, [B_idx%beg + dir_idx(2) - 1, B_idx%beg + dir_idx(3) - 1]) = F_hlld([5, 6]) + end if + ! Energy + flux_rs${XYZ}$_vf(j, k, l, E_idx) = F_hlld(7) + ! Partial fraction + $:GPU_LOOP(parallelism='[seq]') + do i = advxb, advxe + flux_rs${XYZ}$_vf(j, k, l, i) = 0._wp ! TODO multi-component (zero for now) + end do - flux_src_rs${XYZ}$_vf(j, k, l, advxb) = 0._wp + flux_src_rs${XYZ}$_vf(j, k, l, advxb) = 0._wp + end do + end do end do - end do - end do + #:endcall GPU_PARALLEL_LOOP + #:endblock UNDEF_AMD end if #:endfor @@ -3067,24 +3073,24 @@ contains ! the Riemann problem solution integer :: i, j - @:ALLOCATE(Gs(1:num_fluids)) + @:ALLOCATE(Gs_rs(1:num_fluids)) do i = 1, num_fluids - Gs(i) = fluid_pp(i)%G + Gs_rs(i) = fluid_pp(i)%G end do - $:GPU_UPDATE(device='[Gs]') + $:GPU_UPDATE(device='[Gs_rs]') if (viscous) then - @:ALLOCATE(Res(1:2, 1:Re_size_max)) + @:ALLOCATE(Res_gs(1:2, 1:Re_size_max)) end if if (viscous) then do i = 1, 2 do j = 1, Re_size(i) - Res(i, j) = fluid_pp(Re_idx(i, j))%Re(i) + Res_gs(i, j) = fluid_pp(Re_idx(i, j))%Re(i) end do end do - $:GPU_UPDATE(device='[Res,Re_idx,Re_size]') + $:GPU_UPDATE(device='[Res_gs,Re_idx,Re_size]') end if $:GPU_ENTER_DATA(copyin='[is1,is2,is3,isx,isy,isz]') @@ -3252,51 +3258,55 @@ contains if (norm_dir == 1) then if (bc_x%beg == BC_RIEMANN_EXTRAP) then ! Riemann state extrap. BC at beginning - $:GPU_PARALLEL_LOOP(collapse=3) - do i = 1, sys_size - do l = is3%beg, is3%end - do k = is2%beg, is2%end - qL_prim_rsx_vf(-1, k, l, i) = & - qR_prim_rsx_vf(0, k, l, i) - end do - end do - end do - - if (viscous) then - $:GPU_PARALLEL_LOOP(collapse=3) - do i = momxb, momxe - do l = isz%beg, isz%end - do k = isy%beg, isy%end - - dqL_prim_dx_vf(i)%sf(-1, k, l) = & - dqR_prim_dx_vf(i)%sf(0, k, l) + #:call GPU_PARALLEL_LOOP(collapse=3) + do i = 1, sys_size + do l = is3%beg, is3%end + do k = is2%beg, is2%end + qL_prim_rsx_vf(-1, k, l, i) = & + qR_prim_rsx_vf(0, k, l, i) end do end do end do + #:endcall GPU_PARALLEL_LOOP - if (n > 0) then - $:GPU_PARALLEL_LOOP(collapse=3) + if (viscous) then + #:call GPU_PARALLEL_LOOP(collapse=3) do i = momxb, momxe do l = isz%beg, isz%end do k = isy%beg, isy%end - dqL_prim_dy_vf(i)%sf(-1, k, l) = & - dqR_prim_dy_vf(i)%sf(0, k, l) + dqL_prim_dx_vf(i)%sf(-1, k, l) = & + dqR_prim_dx_vf(i)%sf(0, k, l) end do end do end do + #:endcall GPU_PARALLEL_LOOP - if (p > 0) then - $:GPU_PARALLEL_LOOP(collapse=3) + if (n > 0) then + #:call GPU_PARALLEL_LOOP(collapse=3) do i = momxb, momxe do l = isz%beg, isz%end do k = isy%beg, isy%end - dqL_prim_dz_vf(i)%sf(-1, k, l) = & - dqR_prim_dz_vf(i)%sf(0, k, l) + dqL_prim_dy_vf(i)%sf(-1, k, l) = & + dqR_prim_dy_vf(i)%sf(0, k, l) end do end do end do + #:endcall GPU_PARALLEL_LOOP + + if (p > 0) then + #:call GPU_PARALLEL_LOOP(collapse=3) + do i = momxb, momxe + do l = isz%beg, isz%end + do k = isy%beg, isy%end + + dqL_prim_dz_vf(i)%sf(-1, k, l) = & + dqR_prim_dz_vf(i)%sf(0, k, l) + end do + end do + end do + #:endcall GPU_PARALLEL_LOOP end if end if @@ -3307,52 +3317,56 @@ contains if (bc_x%end == BC_RIEMANN_EXTRAP) then ! Riemann state extrap. BC at end - $:GPU_PARALLEL_LOOP(collapse=3) - do i = 1, sys_size - do l = is3%beg, is3%end - do k = is2%beg, is2%end - qR_prim_rsx_vf(m + 1, k, l, i) = & - qL_prim_rsx_vf(m, k, l, i) + #:call GPU_PARALLEL_LOOP(collapse=3) + do i = 1, sys_size + do l = is3%beg, is3%end + do k = is2%beg, is2%end + qR_prim_rsx_vf(m + 1, k, l, i) = & + qL_prim_rsx_vf(m, k, l, i) + end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP if (viscous) then - $:GPU_PARALLEL_LOOP(collapse=3) - do i = momxb, momxe - do l = isz%beg, isz%end - do k = isy%beg, isy%end - - dqR_prim_dx_vf(i)%sf(m + 1, k, l) = & - dqL_prim_dx_vf(i)%sf(m, k, l) - end do - end do - end do - - if (n > 0) then - $:GPU_PARALLEL_LOOP(collapse=3) + #:call GPU_PARALLEL_LOOP(collapse=3) do i = momxb, momxe do l = isz%beg, isz%end do k = isy%beg, isy%end - dqR_prim_dy_vf(i)%sf(m + 1, k, l) = & - dqL_prim_dy_vf(i)%sf(m, k, l) + dqR_prim_dx_vf(i)%sf(m + 1, k, l) = & + dqL_prim_dx_vf(i)%sf(m, k, l) end do end do end do + #:endcall GPU_PARALLEL_LOOP - if (p > 0) then - $:GPU_PARALLEL_LOOP(collapse=3) + if (n > 0) then + #:call GPU_PARALLEL_LOOP(collapse=3) do i = momxb, momxe do l = isz%beg, isz%end do k = isy%beg, isy%end - dqR_prim_dz_vf(i)%sf(m + 1, k, l) = & - dqL_prim_dz_vf(i)%sf(m, k, l) + dqR_prim_dy_vf(i)%sf(m + 1, k, l) = & + dqL_prim_dy_vf(i)%sf(m, k, l) end do end do end do + #:endcall GPU_PARALLEL_LOOP + + if (p > 0) then + #:call GPU_PARALLEL_LOOP(collapse=3) + do i = momxb, momxe + do l = isz%beg, isz%end + do k = isy%beg, isy%end + + dqR_prim_dz_vf(i)%sf(m + 1, k, l) = & + dqL_prim_dz_vf(i)%sf(m, k, l) + end do + end do + end do + #:endcall GPU_PARALLEL_LOOP end if end if @@ -3366,48 +3380,52 @@ contains elseif (norm_dir == 2) then if (bc_y%beg == BC_RIEMANN_EXTRAP) then ! Riemann state extrap. BC at beginning - $:GPU_PARALLEL_LOOP(collapse=3) - do i = 1, sys_size - do l = is3%beg, is3%end - do k = is2%beg, is2%end - qL_prim_rsy_vf(-1, k, l, i) = & - qR_prim_rsy_vf(0, k, l, i) + #:call GPU_PARALLEL_LOOP(collapse=3) + do i = 1, sys_size + do l = is3%beg, is3%end + do k = is2%beg, is2%end + qL_prim_rsy_vf(-1, k, l, i) = & + qR_prim_rsy_vf(0, k, l, i) + end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP if (viscous) then - $:GPU_PARALLEL_LOOP(collapse=3) - do i = momxb, momxe - do l = isz%beg, isz%end - do j = isx%beg, isx%end - dqL_prim_dx_vf(i)%sf(j, -1, l) = & - dqR_prim_dx_vf(i)%sf(j, 0, l) - end do - end do - end do - - $:GPU_PARALLEL_LOOP(collapse=3) - do i = momxb, momxe - do l = isz%beg, isz%end - do j = isx%beg, isx%end - dqL_prim_dy_vf(i)%sf(j, -1, l) = & - dqR_prim_dy_vf(i)%sf(j, 0, l) + #:call GPU_PARALLEL_LOOP(collapse=3) + do i = momxb, momxe + do l = isz%beg, isz%end + do j = isx%beg, isx%end + dqL_prim_dx_vf(i)%sf(j, -1, l) = & + dqR_prim_dx_vf(i)%sf(j, 0, l) + end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP - if (p > 0) then - $:GPU_PARALLEL_LOOP(collapse=3) + #:call GPU_PARALLEL_LOOP(collapse=3) do i = momxb, momxe do l = isz%beg, isz%end do j = isx%beg, isx%end - dqL_prim_dz_vf(i)%sf(j, -1, l) = & - dqR_prim_dz_vf(i)%sf(j, 0, l) + dqL_prim_dy_vf(i)%sf(j, -1, l) = & + dqR_prim_dy_vf(i)%sf(j, 0, l) end do end do end do + #:endcall GPU_PARALLEL_LOOP + + if (p > 0) then + #:call GPU_PARALLEL_LOOP(collapse=3) + do i = momxb, momxe + do l = isz%beg, isz%end + do j = isx%beg, isx%end + dqL_prim_dz_vf(i)%sf(j, -1, l) = & + dqR_prim_dz_vf(i)%sf(j, 0, l) + end do + end do + end do + #:endcall GPU_PARALLEL_LOOP end if end if @@ -3416,48 +3434,52 @@ contains if (bc_y%end == BC_RIEMANN_EXTRAP) then ! Riemann state extrap. BC at end - $:GPU_PARALLEL_LOOP(collapse=3) - do i = 1, sys_size - do l = is3%beg, is3%end - do k = is2%beg, is2%end - qR_prim_rsy_vf(n + 1, k, l, i) = & - qL_prim_rsy_vf(n, k, l, i) + #:call GPU_PARALLEL_LOOP(collapse=3) + do i = 1, sys_size + do l = is3%beg, is3%end + do k = is2%beg, is2%end + qR_prim_rsy_vf(n + 1, k, l, i) = & + qL_prim_rsy_vf(n, k, l, i) + end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP if (viscous) then - $:GPU_PARALLEL_LOOP(collapse=3) - do i = momxb, momxe - do l = isz%beg, isz%end - do j = isx%beg, isx%end - dqR_prim_dx_vf(i)%sf(j, n + 1, l) = & - dqL_prim_dx_vf(i)%sf(j, n, l) - end do - end do - end do - - $:GPU_PARALLEL_LOOP(collapse=3) - do i = momxb, momxe - do l = isz%beg, isz%end - do j = isx%beg, isx%end - dqR_prim_dy_vf(i)%sf(j, n + 1, l) = & - dqL_prim_dy_vf(i)%sf(j, n, l) + #:call GPU_PARALLEL_LOOP(collapse=3) + do i = momxb, momxe + do l = isz%beg, isz%end + do j = isx%beg, isx%end + dqR_prim_dx_vf(i)%sf(j, n + 1, l) = & + dqL_prim_dx_vf(i)%sf(j, n, l) + end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP - if (p > 0) then - $:GPU_PARALLEL_LOOP(collapse=3) + #:call GPU_PARALLEL_LOOP(collapse=3) do i = momxb, momxe do l = isz%beg, isz%end do j = isx%beg, isx%end - dqR_prim_dz_vf(i)%sf(j, n + 1, l) = & - dqL_prim_dz_vf(i)%sf(j, n, l) + dqR_prim_dy_vf(i)%sf(j, n + 1, l) = & + dqL_prim_dy_vf(i)%sf(j, n, l) end do end do end do + #:endcall GPU_PARALLEL_LOOP + + if (p > 0) then + #:call GPU_PARALLEL_LOOP(collapse=3) + do i = momxb, momxe + do l = isz%beg, isz%end + do j = isx%beg, isx%end + dqR_prim_dz_vf(i)%sf(j, n + 1, l) = & + dqL_prim_dz_vf(i)%sf(j, n, l) + end do + end do + end do + #:endcall GPU_PARALLEL_LOOP end if end if @@ -3469,90 +3491,98 @@ contains else if (bc_z%beg == BC_RIEMANN_EXTRAP) then ! Riemann state extrap. BC at beginning - $:GPU_PARALLEL_LOOP(collapse=3) - do i = 1, sys_size - do l = is3%beg, is3%end - do k = is2%beg, is2%end - qL_prim_rsz_vf(-1, k, l, i) = & - qR_prim_rsz_vf(0, k, l, i) + #:call GPU_PARALLEL_LOOP(collapse=3) + do i = 1, sys_size + do l = is3%beg, is3%end + do k = is2%beg, is2%end + qL_prim_rsz_vf(-1, k, l, i) = & + qR_prim_rsz_vf(0, k, l, i) + end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP if (viscous) then - $:GPU_PARALLEL_LOOP(collapse=3) - do i = momxb, momxe - do k = isy%beg, isy%end - do j = isx%beg, isx%end - dqL_prim_dx_vf(i)%sf(j, k, -1) = & - dqR_prim_dx_vf(i)%sf(j, k, 0) + #:call GPU_PARALLEL_LOOP(collapse=3) + do i = momxb, momxe + do k = isy%beg, isy%end + do j = isx%beg, isx%end + dqL_prim_dx_vf(i)%sf(j, k, -1) = & + dqR_prim_dx_vf(i)%sf(j, k, 0) + end do end do end do - end do - $:GPU_PARALLEL_LOOP(collapse=3) - do i = momxb, momxe - do k = isy%beg, isy%end - do j = isx%beg, isx%end - dqL_prim_dy_vf(i)%sf(j, k, -1) = & - dqR_prim_dy_vf(i)%sf(j, k, 0) + #:endcall GPU_PARALLEL_LOOP + #:call GPU_PARALLEL_LOOP(collapse=3) + do i = momxb, momxe + do k = isy%beg, isy%end + do j = isx%beg, isx%end + dqL_prim_dy_vf(i)%sf(j, k, -1) = & + dqR_prim_dy_vf(i)%sf(j, k, 0) + end do end do end do - end do - $:GPU_PARALLEL_LOOP(collapse=3) - do i = momxb, momxe - do k = isy%beg, isy%end - do j = isx%beg, isx%end - dqL_prim_dz_vf(i)%sf(j, k, -1) = & - dqR_prim_dz_vf(i)%sf(j, k, 0) + #:endcall GPU_PARALLEL_LOOP + #:call GPU_PARALLEL_LOOP(collapse=3) + do i = momxb, momxe + do k = isy%beg, isy%end + do j = isx%beg, isx%end + dqL_prim_dz_vf(i)%sf(j, k, -1) = & + dqR_prim_dz_vf(i)%sf(j, k, 0) + end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP end if end if if (bc_z%end == BC_RIEMANN_EXTRAP) then ! Riemann state extrap. BC at end - $:GPU_PARALLEL_LOOP(collapse=3) - do i = 1, sys_size - do l = is3%beg, is3%end - do k = is2%beg, is2%end - qR_prim_rsz_vf(p + 1, k, l, i) = & - qL_prim_rsz_vf(p, k, l, i) + #:call GPU_PARALLEL_LOOP(collapse=3) + do i = 1, sys_size + do l = is3%beg, is3%end + do k = is2%beg, is2%end + qR_prim_rsz_vf(p + 1, k, l, i) = & + qL_prim_rsz_vf(p, k, l, i) + end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP if (viscous) then - $:GPU_PARALLEL_LOOP(collapse=3) - do i = momxb, momxe - do k = isy%beg, isy%end - do j = isx%beg, isx%end - dqR_prim_dx_vf(i)%sf(j, k, p + 1) = & - dqL_prim_dx_vf(i)%sf(j, k, p) + #:call GPU_PARALLEL_LOOP(collapse=3) + do i = momxb, momxe + do k = isy%beg, isy%end + do j = isx%beg, isx%end + dqR_prim_dx_vf(i)%sf(j, k, p + 1) = & + dqL_prim_dx_vf(i)%sf(j, k, p) + end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP - $:GPU_PARALLEL_LOOP(collapse=3) - do i = momxb, momxe - do k = isy%beg, isy%end - do j = isx%beg, isx%end - dqR_prim_dy_vf(i)%sf(j, k, p + 1) = & - dqL_prim_dy_vf(i)%sf(j, k, p) + #:call GPU_PARALLEL_LOOP(collapse=3) + do i = momxb, momxe + do k = isy%beg, isy%end + do j = isx%beg, isx%end + dqR_prim_dy_vf(i)%sf(j, k, p + 1) = & + dqL_prim_dy_vf(i)%sf(j, k, p) + end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP - $:GPU_PARALLEL_LOOP(collapse=3) - do i = momxb, momxe - do k = isy%beg, isy%end - do j = isx%beg, isx%end - dqR_prim_dz_vf(i)%sf(j, k, p + 1) = & - dqL_prim_dz_vf(i)%sf(j, k, p) + #:call GPU_PARALLEL_LOOP(collapse=3) + do i = momxb, momxe + do k = isy%beg, isy%end + do j = isx%beg, isx%end + dqR_prim_dz_vf(i)%sf(j, k, p + 1) = & + dqL_prim_dz_vf(i)%sf(j, k, p) + end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP end if end if @@ -3596,133 +3626,173 @@ contains if (viscous .or. (surface_tension)) then - $:GPU_PARALLEL_LOOP(collapse=4) - do i = momxb, E_idx - do l = is3%beg, is3%end - do k = is2%beg, is2%end - do j = is1%beg, is1%end - flux_src_vf(i)%sf(j, k, l) = 0._wp + #:call GPU_PARALLEL_LOOP(collapse=4) + do i = momxb, E_idx + do l = is3%beg, is3%end + do k = is2%beg, is2%end + do j = is1%beg, is1%end + flux_src_vf(i)%sf(j, k, l) = 0._wp + end do + end do + end do + end do + #:endcall GPU_PARALLEL_LOOP + end if + + if (chem_params%diffusion) then + #:call GPU_PARALLEL_LOOP(collapse=4) + do i = E_idx, chemxe + do l = is3%beg, is3%end + do k = is2%beg, is2%end + do j = is1%beg, is1%end + if (i == E_idx .or. i >= chemxb) then + flux_src_vf(i)%sf(j, k, l) = 0._wp + end if + end do end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP end if if (chem_params%diffusion) then - $:GPU_PARALLEL_LOOP(collapse=4) - do i = E_idx, chemxe - do l = is3%beg, is3%end - do k = is2%beg, is2%end - do j = is1%beg, is1%end - if (i == E_idx .or. i >= chemxb) then - flux_src_vf(i)%sf(j, k, l) = 0._wp - end if + #:call GPU_PARALLEL_LOOP(collapse=4) + do i = E_idx, chemxe + do l = is3%beg, is3%end + do k = is2%beg, is2%end + do j = is1%beg, is1%end + if (i == E_idx .or. i >= chemxb) then + flux_src_vf(i)%sf(j, k, l) = 0._wp + end if + end do end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP end if if (qbmm) then - - $:GPU_PARALLEL_LOOP(collapse=4) - do i = 1, 4 - do l = is3%beg, is3%end - do k = is2%beg, is2%end - do j = is1%beg, is1%end + 1 - mom_sp_rsx_vf(j, k, l, i) = mom_sp(i)%sf(j, k, l) + #:call GPU_PARALLEL_LOOP(collapse=4) + do i = 1, 4 + do l = is3%beg, is3%end + do k = is2%beg, is2%end + do j = is1%beg, is1%end + 1 + mom_sp_rsx_vf(j, k, l, i) = mom_sp(i)%sf(j, k, l) + end do end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP end if ! Reshaping Inputted Data in y-direction elseif (norm_dir == 2) then if (viscous .or. (surface_tension)) then - $:GPU_PARALLEL_LOOP(collapse=4) - do i = momxb, E_idx - do l = is3%beg, is3%end - do j = is1%beg, is1%end - do k = is2%beg, is2%end - flux_src_vf(i)%sf(k, j, l) = 0._wp + #:call GPU_PARALLEL_LOOP(collapse=4) + do i = momxb, E_idx + do l = is3%beg, is3%end + do j = is1%beg, is1%end + do k = is2%beg, is2%end + flux_src_vf(i)%sf(k, j, l) = 0._wp + end do end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP end if if (chem_params%diffusion) then - $:GPU_PARALLEL_LOOP(collapse=4) - do i = E_idx, chemxe - do l = is3%beg, is3%end - do j = is1%beg, is1%end - do k = is2%beg, is2%end - if (i == E_idx .or. i >= chemxb) then - flux_src_vf(i)%sf(k, j, l) = 0._wp - end if + #:call GPU_PARALLEL_LOOP(collapse=4) + do i = E_idx, chemxe + do l = is3%beg, is3%end + do j = is1%beg, is1%end + do k = is2%beg, is2%end + if (i == E_idx .or. i >= chemxb) then + flux_src_vf(i)%sf(k, j, l) = 0._wp + end if + end do end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP end if if (qbmm) then - $:GPU_PARALLEL_LOOP(collapse=4) - do i = 1, 4 - do l = is3%beg, is3%end - do k = is2%beg, is2%end - do j = is1%beg, is1%end + 1 - mom_sp_rsy_vf(j, k, l, i) = mom_sp(i)%sf(k, j, l) + #:call GPU_PARALLEL_LOOP(collapse=4) + do i = 1, 4 + do l = is3%beg, is3%end + do k = is2%beg, is2%end + do j = is1%beg, is1%end + 1 + mom_sp_rsy_vf(j, k, l, i) = mom_sp(i)%sf(k, j, l) + end do end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP end if ! Reshaping Inputted Data in z-direction else if (viscous .or. (surface_tension)) then - $:GPU_PARALLEL_LOOP(collapse=4) - do i = momxb, E_idx - do j = is1%beg, is1%end - do k = is2%beg, is2%end - do l = is3%beg, is3%end - flux_src_vf(i)%sf(l, k, j) = 0._wp + #:call GPU_PARALLEL_LOOP(collapse=4) + do i = momxb, E_idx + do j = is1%beg, is1%end + do k = is2%beg, is2%end + do l = is3%beg, is3%end + flux_src_vf(i)%sf(l, k, j) = 0._wp + end do end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP end if if (chem_params%diffusion) then - $:GPU_PARALLEL_LOOP(collapse=4) - do i = E_idx, chemxe - do j = is1%beg, is1%end - do k = is2%beg, is2%end - do l = is3%beg, is3%end - if (i == E_idx .or. i >= chemxb) then - flux_src_vf(i)%sf(l, k, j) = 0._wp - end if + #:call GPU_PARALLEL_LOOP(collapse=4) + do i = E_idx, chemxe + do j = is1%beg, is1%end + do k = is2%beg, is2%end + do l = is3%beg, is3%end + if (i == E_idx .or. i >= chemxb) then + flux_src_vf(i)%sf(l, k, j) = 0._wp + end if + end do end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP + end if + + if (chem_params%diffusion) then + #:call GPU_PARALLEL_LOOP(collapse=4) + do i = E_idx, chemxe + do j = is1%beg, is1%end + do k = is2%beg, is2%end + do l = is3%beg, is3%end + if (i == E_idx .or. i >= chemxb) then + flux_src_vf(i)%sf(l, k, j) = 0._wp + end if + end do + end do + end do + end do + #:endcall GPU_PARALLEL_LOOP end if if (qbmm) then - $:GPU_PARALLEL_LOOP(collapse=4) - do i = 1, 4 - do l = is3%beg, is3%end - do k = is2%beg, is2%end - do j = is1%beg, is1%end + 1 - mom_sp_rsz_vf(j, k, l, i) = mom_sp(i)%sf(l, k, j) + #:call GPU_PARALLEL_LOOP(collapse=4) + do i = 1, 4 + do l = is3%beg, is3%end + do k = is2%beg, is2%end + do j = is1%beg, is1%end + 1 + mom_sp_rsz_vf(j, k, l, i) = mom_sp(i)%sf(l, k, j) + end do end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP end if end if @@ -3746,11 +3816,11 @@ contains !! @param[in] ix Global X-direction loop bounds (int_bounds_info). !! @param[in] iy Global Y-direction loop bounds (int_bounds_info). !! @param[in] iz Global Z-direction loop bounds (int_bounds_info). - pure subroutine s_compute_cylindrical_viscous_source_flux(velL_vf, & - dvelL_dx_vf, dvelL_dy_vf, dvelL_dz_vf, & - velR_vf, & - dvelR_dx_vf, dvelR_dy_vf, dvelR_dz_vf, & - flux_src_vf, norm_dir, ix, iy, iz) + subroutine s_compute_cylindrical_viscous_source_flux(velL_vf, & + dvelL_dx_vf, dvelL_dy_vf, dvelL_dz_vf, & + velR_vf, & + dvelR_dx_vf, dvelR_dy_vf, dvelR_dz_vf, & + flux_src_vf, norm_dir, ix, iy, iz) type(scalar_field), dimension(num_dims), intent(in) :: velL_vf, velR_vf type(scalar_field), dimension(num_dims), intent(in) :: dvelL_dx_vf, dvelR_dx_vf @@ -3779,115 +3849,113 @@ contains integer :: i_vel !!< Loop iterator for velocity components. integer :: idx_rp(3) !!< Indices $(j,k,l)$ of 'right' point for averaging. - $:GPU_PARALLEL_LOOP(collapse=3, private='[idx_rp, avg_v_int, & - & avg_dvdx_int, avg_dvdy_int, avg_dvdz_int, Re_s, Re_b, & - & vel_src_int, r_eff, divergence_cyl, stress_vector_shear, & - & stress_normal_bulk, div_v_term_const]') - do l = iz%beg, iz%end - do k = iy%beg, iy%end - do j = ix%beg, ix%end - - ! Determine indices for the 'right' state for averaging across the interface - idx_rp = [j, k, l] - idx_rp(norm_dir) = idx_rp(norm_dir) + 1 - - ! Average velocities and their derivatives at the interface - ! For cylindrical: x-dir ~ axial (z_cyl), y-dir ~ radial (r_cyl), z-dir ~ azimuthal (theta_cyl) - $:GPU_LOOP(parallelism='[seq]') - do i_vel = 1, num_dims - avg_v_int(i_vel) = 0.5_wp*(velL_vf(i_vel)%sf(j, k, l) + velR_vf(i_vel)%sf(idx_rp(1), idx_rp(2), idx_rp(3))) - - avg_dvdx_int(i_vel) = 0.5_wp*(dvelL_dx_vf(i_vel)%sf(j, k, l) + & - dvelR_dx_vf(i_vel)%sf(idx_rp(1), idx_rp(2), idx_rp(3))) - if (num_dims > 1) then - avg_dvdy_int(i_vel) = 0.5_wp*(dvelL_dy_vf(i_vel)%sf(j, k, l) + & - dvelR_dy_vf(i_vel)%sf(idx_rp(1), idx_rp(2), idx_rp(3))) - else - avg_dvdy_int(i_vel) = 0.0_wp - end if - if (num_dims > 2) then - avg_dvdz_int(i_vel) = 0.5_wp*(dvelL_dz_vf(i_vel)%sf(j, k, l) + & - dvelR_dz_vf(i_vel)%sf(idx_rp(1), idx_rp(2), idx_rp(3))) - else - avg_dvdz_int(i_vel) = 0.0_wp - end if - end do - - ! Get Re numbers and interface velocity for viscous work - select case (norm_dir) - case (1) ! x-face (axial face in z_cyl direction) - Re_s = Re_avg_rsx_vf(j, k, l, 1) - Re_b = Re_avg_rsx_vf(j, k, l, 2) - vel_src_int = vel_src_rsx_vf(j, k, l, 1:num_dims) - r_eff = y_cc(k) - case (2) ! y-face (radial face in r_cyl direction) - Re_s = Re_avg_rsy_vf(k, j, l, 1) - Re_b = Re_avg_rsy_vf(k, j, l, 2) - vel_src_int = vel_src_rsy_vf(k, j, l, 1:num_dims) - r_eff = y_cb(k) - case (3) ! z-face (azimuthal face in theta_cyl direction) - Re_s = Re_avg_rsz_vf(l, k, j, 1) - Re_b = Re_avg_rsz_vf(l, k, j, 2) - vel_src_int = vel_src_rsz_vf(l, k, j, 1:num_dims) - r_eff = y_cc(k) - end select - - ! Divergence in cylindrical coordinates (vx=vz_cyl, vy=vr_cyl, vz=vtheta_cyl) - divergence_cyl = avg_dvdx_int(1) + avg_dvdy_int(2) + avg_v_int(2)/r_eff - if (num_dims > 2) then - divergence_cyl = divergence_cyl + avg_dvdz_int(3)/r_eff - end if + #:call GPU_PARALLEL_LOOP(collapse=3, private='[idx_rp, avg_v_int, avg_dvdx_int, avg_dvdy_int, avg_dvdz_int, Re_s, Re_b, vel_src_int, r_eff, divergence_cyl, stress_vector_shear, stress_normal_bulk, div_v_term_const]') + do l = iz%beg, iz%end + do k = iy%beg, iy%end + do j = ix%beg, ix%end - stress_vector_shear = 0.0_wp - stress_normal_bulk = 0.0_wp + ! Determine indices for the 'right' state for averaging across the interface + idx_rp = [j, k, l] + idx_rp(norm_dir) = idx_rp(norm_dir) + 1 - if (shear_stress) then - div_v_term_const = -(2.0_wp/3.0_wp)*divergence_cyl/Re_s + ! Average velocities and their derivatives at the interface + ! For cylindrical: x-dir ~ axial (z_cyl), y-dir ~ radial (r_cyl), z-dir ~ azimuthal (theta_cyl) + $:GPU_LOOP(parallelism='[seq]') + do i_vel = 1, num_dims + avg_v_int(i_vel) = 0.5_wp*(velL_vf(i_vel)%sf(j, k, l) + velR_vf(i_vel)%sf(idx_rp(1), idx_rp(2), idx_rp(3))) - select case (norm_dir) - case (1) ! X-face (axial normal, z_cyl) - stress_vector_shear(1) = (2.0_wp*avg_dvdx_int(1))/Re_s + div_v_term_const - if (num_dims > 1) then - stress_vector_shear(2) = (avg_dvdy_int(1) + avg_dvdx_int(2))/Re_s - end if - if (num_dims > 2) then - stress_vector_shear(3) = (avg_dvdz_int(1)/r_eff + avg_dvdx_int(3))/Re_s - end if - case (2) ! Y-face (radial normal, r_cyl) + avg_dvdx_int(i_vel) = 0.5_wp*(dvelL_dx_vf(i_vel)%sf(j, k, l) + & + dvelR_dx_vf(i_vel)%sf(idx_rp(1), idx_rp(2), idx_rp(3))) if (num_dims > 1) then - stress_vector_shear(1) = (avg_dvdy_int(1) + avg_dvdx_int(2))/Re_s - stress_vector_shear(2) = (2.0_wp*avg_dvdy_int(2))/Re_s + div_v_term_const - if (num_dims > 2) then - stress_vector_shear(3) = (avg_dvdz_int(2)/r_eff - avg_v_int(3)/r_eff + avg_dvdy_int(3))/Re_s - end if + avg_dvdy_int(i_vel) = 0.5_wp*(dvelL_dy_vf(i_vel)%sf(j, k, l) + & + dvelR_dy_vf(i_vel)%sf(idx_rp(1), idx_rp(2), idx_rp(3))) else - stress_vector_shear(1) = (2.0_wp*avg_dvdx_int(1))/Re_s + div_v_term_const + avg_dvdy_int(i_vel) = 0.0_wp end if - case (3) ! Z-face (azimuthal normal, theta_cyl) if (num_dims > 2) then - stress_vector_shear(1) = (avg_dvdz_int(1)/r_eff + avg_dvdx_int(3))/Re_s - stress_vector_shear(2) = (avg_dvdz_int(2)/r_eff - avg_v_int(3)/r_eff + avg_dvdy_int(3))/Re_s - stress_vector_shear(3) = (2.0_wp*(avg_dvdz_int(3)/r_eff + avg_v_int(2)/r_eff))/Re_s + div_v_term_const + avg_dvdz_int(i_vel) = 0.5_wp*(dvelL_dz_vf(i_vel)%sf(j, k, l) + & + dvelR_dz_vf(i_vel)%sf(idx_rp(1), idx_rp(2), idx_rp(3))) + else + avg_dvdz_int(i_vel) = 0.0_wp end if + end do + + ! Get Re numbers and interface velocity for viscous work + select case (norm_dir) + case (1) ! x-face (axial face in z_cyl direction) + Re_s = Re_avg_rsx_vf(j, k, l, 1) + Re_b = Re_avg_rsx_vf(j, k, l, 2) + vel_src_int = vel_src_rsx_vf(j, k, l, 1:num_dims) + r_eff = y_cc(k) + case (2) ! y-face (radial face in r_cyl direction) + Re_s = Re_avg_rsy_vf(k, j, l, 1) + Re_b = Re_avg_rsy_vf(k, j, l, 2) + vel_src_int = vel_src_rsy_vf(k, j, l, 1:num_dims) + r_eff = y_cb(k) + case (3) ! z-face (azimuthal face in theta_cyl direction) + Re_s = Re_avg_rsz_vf(l, k, j, 1) + Re_b = Re_avg_rsz_vf(l, k, j, 2) + vel_src_int = vel_src_rsz_vf(l, k, j, 1:num_dims) + r_eff = y_cc(k) end select - $:GPU_LOOP(parallelism='[seq]') - do i_vel = 1, num_dims - flux_src_vf(momxb + i_vel - 1)%sf(j, k, l) = flux_src_vf(momxb + i_vel - 1)%sf(j, k, l) - stress_vector_shear(i_vel) - flux_src_vf(E_idx)%sf(j, k, l) = flux_src_vf(E_idx)%sf(j, k, l) - vel_src_int(i_vel)*stress_vector_shear(i_vel) - end do - end if + ! Divergence in cylindrical coordinates (vx=vz_cyl, vy=vr_cyl, vz=vtheta_cyl) + divergence_cyl = avg_dvdx_int(1) + avg_dvdy_int(2) + avg_v_int(2)/r_eff + if (num_dims > 2) then + divergence_cyl = divergence_cyl + avg_dvdz_int(3)/r_eff + end if - if (bulk_stress) then - stress_normal_bulk = divergence_cyl/Re_b + stress_vector_shear = 0.0_wp + stress_normal_bulk = 0.0_wp - flux_src_vf(momxb + norm_dir - 1)%sf(j, k, l) = flux_src_vf(momxb + norm_dir - 1)%sf(j, k, l) - stress_normal_bulk - flux_src_vf(E_idx)%sf(j, k, l) = flux_src_vf(E_idx)%sf(j, k, l) - vel_src_int(norm_dir)*stress_normal_bulk - end if + if (shear_stress) then + div_v_term_const = -(2.0_wp/3.0_wp)*divergence_cyl/Re_s + + select case (norm_dir) + case (1) ! X-face (axial normal, z_cyl) + stress_vector_shear(1) = (2.0_wp*avg_dvdx_int(1))/Re_s + div_v_term_const + if (num_dims > 1) then + stress_vector_shear(2) = (avg_dvdy_int(1) + avg_dvdx_int(2))/Re_s + end if + if (num_dims > 2) then + stress_vector_shear(3) = (avg_dvdz_int(1)/r_eff + avg_dvdx_int(3))/Re_s + end if + case (2) ! Y-face (radial normal, r_cyl) + if (num_dims > 1) then + stress_vector_shear(1) = (avg_dvdy_int(1) + avg_dvdx_int(2))/Re_s + stress_vector_shear(2) = (2.0_wp*avg_dvdy_int(2))/Re_s + div_v_term_const + if (num_dims > 2) then + stress_vector_shear(3) = (avg_dvdz_int(2)/r_eff - avg_v_int(3)/r_eff + avg_dvdy_int(3))/Re_s + end if + else + stress_vector_shear(1) = (2.0_wp*avg_dvdx_int(1))/Re_s + div_v_term_const + end if + case (3) ! Z-face (azimuthal normal, theta_cyl) + if (num_dims > 2) then + stress_vector_shear(1) = (avg_dvdz_int(1)/r_eff + avg_dvdx_int(3))/Re_s + stress_vector_shear(2) = (avg_dvdz_int(2)/r_eff - avg_v_int(3)/r_eff + avg_dvdy_int(3))/Re_s + stress_vector_shear(3) = (2.0_wp*(avg_dvdz_int(3)/r_eff + avg_v_int(2)/r_eff))/Re_s + div_v_term_const + end if + end select + + $:GPU_LOOP(parallelism='[seq]') + do i_vel = 1, num_dims + flux_src_vf(momxb + i_vel - 1)%sf(j, k, l) = flux_src_vf(momxb + i_vel - 1)%sf(j, k, l) - stress_vector_shear(i_vel) + flux_src_vf(E_idx)%sf(j, k, l) = flux_src_vf(E_idx)%sf(j, k, l) - vel_src_int(i_vel)*stress_vector_shear(i_vel) + end do + end if + + if (bulk_stress) then + stress_normal_bulk = divergence_cyl/Re_b + + flux_src_vf(momxb + norm_dir - 1)%sf(j, k, l) = flux_src_vf(momxb + norm_dir - 1)%sf(j, k, l) - stress_normal_bulk + flux_src_vf(E_idx)%sf(j, k, l) = flux_src_vf(E_idx)%sf(j, k, l) - vel_src_int(norm_dir)*stress_normal_bulk + end if + end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP end subroutine s_compute_cylindrical_viscous_source_flux @@ -3907,14 +3975,14 @@ contains !! @param[in] ix X-direction loop bounds (int_bounds_info). !! @param[in] iy Y-direction loop bounds (int_bounds_info). !! @param[in] iz Z-direction loop bounds (int_bounds_info). - pure subroutine s_compute_cartesian_viscous_source_flux(dvelL_dx_vf, & - dvelL_dy_vf, & - dvelL_dz_vf, & - dvelR_dx_vf, & - dvelR_dy_vf, & - dvelR_dz_vf, & - flux_src_vf, & - norm_dir) + subroutine s_compute_cartesian_viscous_source_flux(dvelL_dx_vf, & + dvelL_dy_vf, & + dvelL_dz_vf, & + dvelR_dx_vf, & + dvelR_dy_vf, & + dvelR_dz_vf, & + flux_src_vf, & + norm_dir) ! Arguments type(scalar_field), dimension(num_dims), intent(in) :: dvelL_dx_vf, dvelR_dx_vf @@ -3941,89 +4009,88 @@ contains real(wp) :: divergence_v !< Velocity divergence at interface. - $:GPU_PARALLEL_LOOP(collapse=3, private='[idx_right_phys, vel_grad_avg, & - & current_tau_shear, current_tau_bulk, vel_src_at_interface, & - & Re_shear, Re_bulk, divergence_v, i_dim, vel_comp_idx]') - do l_loop = isz%beg, isz%end - do k_loop = isy%beg, isy%end - do j_loop = isx%beg, isx%end - - idx_right_phys(1) = j_loop - idx_right_phys(2) = k_loop - idx_right_phys(3) = l_loop - idx_right_phys(norm_dir) = idx_right_phys(norm_dir) + 1 - - vel_grad_avg = 0.0_wp - do vel_comp_idx = 1, num_dims - vel_grad_avg(vel_comp_idx, 1) = 0.5_wp*(dvelL_dx_vf(vel_comp_idx)%sf(j_loop, k_loop, l_loop) + & - dvelR_dx_vf(vel_comp_idx)%sf(idx_right_phys(1), idx_right_phys(2), idx_right_phys(3))) - if (num_dims > 1) then - vel_grad_avg(vel_comp_idx, 2) = 0.5_wp*(dvelL_dy_vf(vel_comp_idx)%sf(j_loop, k_loop, l_loop) + & - dvelR_dy_vf(vel_comp_idx)%sf(idx_right_phys(1), idx_right_phys(2), idx_right_phys(3))) - end if - if (num_dims > 2) then - vel_grad_avg(vel_comp_idx, 3) = 0.5_wp*(dvelL_dz_vf(vel_comp_idx)%sf(j_loop, k_loop, l_loop) + & - dvelR_dz_vf(vel_comp_idx)%sf(idx_right_phys(1), idx_right_phys(2), idx_right_phys(3))) - end if - end do + #:call GPU_PARALLEL_LOOP(collapse=3, private='[idx_right_phys, vel_grad_avg, current_tau_shear, current_tau_bulk, vel_src_at_interface, Re_shear, Re_bulk, divergence_v, i_dim, vel_comp_idx]') + do l_loop = isz%beg, isz%end + do k_loop = isy%beg, isy%end + do j_loop = isx%beg, isx%end - divergence_v = 0.0_wp - do i_dim = 1, num_dims - divergence_v = divergence_v + vel_grad_avg(i_dim, i_dim) - end do + idx_right_phys(1) = j_loop + idx_right_phys(2) = k_loop + idx_right_phys(3) = l_loop + idx_right_phys(norm_dir) = idx_right_phys(norm_dir) + 1 - vel_src_at_interface = 0.0_wp - if (norm_dir == 1) then - Re_shear = Re_avg_rsx_vf(j_loop, k_loop, l_loop, 1) - Re_bulk = Re_avg_rsx_vf(j_loop, k_loop, l_loop, 2) - do i_dim = 1, num_dims - vel_src_at_interface(i_dim) = vel_src_rsx_vf(j_loop, k_loop, l_loop, i_dim) - end do - else if (norm_dir == 2) then - Re_shear = Re_avg_rsy_vf(k_loop, j_loop, l_loop, 1) - Re_bulk = Re_avg_rsy_vf(k_loop, j_loop, l_loop, 2) - do i_dim = 1, num_dims - vel_src_at_interface(i_dim) = vel_src_rsy_vf(k_loop, j_loop, l_loop, i_dim) + vel_grad_avg = 0.0_wp + do vel_comp_idx = 1, num_dims + vel_grad_avg(vel_comp_idx, 1) = 0.5_wp*(dvelL_dx_vf(vel_comp_idx)%sf(j_loop, k_loop, l_loop) + & + dvelR_dx_vf(vel_comp_idx)%sf(idx_right_phys(1), idx_right_phys(2), idx_right_phys(3))) + if (num_dims > 1) then + vel_grad_avg(vel_comp_idx, 2) = 0.5_wp*(dvelL_dy_vf(vel_comp_idx)%sf(j_loop, k_loop, l_loop) + & + dvelR_dy_vf(vel_comp_idx)%sf(idx_right_phys(1), idx_right_phys(2), idx_right_phys(3))) + end if + if (num_dims > 2) then + vel_grad_avg(vel_comp_idx, 3) = 0.5_wp*(dvelL_dz_vf(vel_comp_idx)%sf(j_loop, k_loop, l_loop) + & + dvelR_dz_vf(vel_comp_idx)%sf(idx_right_phys(1), idx_right_phys(2), idx_right_phys(3))) + end if end do - else - Re_shear = Re_avg_rsz_vf(l_loop, k_loop, j_loop, 1) - Re_bulk = Re_avg_rsz_vf(l_loop, k_loop, j_loop, 2) + + divergence_v = 0.0_wp do i_dim = 1, num_dims - vel_src_at_interface(i_dim) = vel_src_rsz_vf(l_loop, k_loop, j_loop, i_dim) + divergence_v = divergence_v + vel_grad_avg(i_dim, i_dim) end do - end if - if (shear_stress) then - ! current_tau_shear = 0.0_wp - call s_calculate_shear_stress_tensor(vel_grad_avg, Re_shear, divergence_v, current_tau_shear) + vel_src_at_interface = 0.0_wp + if (norm_dir == 1) then + Re_shear = Re_avg_rsx_vf(j_loop, k_loop, l_loop, 1) + Re_bulk = Re_avg_rsx_vf(j_loop, k_loop, l_loop, 2) + do i_dim = 1, num_dims + vel_src_at_interface(i_dim) = vel_src_rsx_vf(j_loop, k_loop, l_loop, i_dim) + end do + else if (norm_dir == 2) then + Re_shear = Re_avg_rsy_vf(k_loop, j_loop, l_loop, 1) + Re_bulk = Re_avg_rsy_vf(k_loop, j_loop, l_loop, 2) + do i_dim = 1, num_dims + vel_src_at_interface(i_dim) = vel_src_rsy_vf(k_loop, j_loop, l_loop, i_dim) + end do + else + Re_shear = Re_avg_rsz_vf(l_loop, k_loop, j_loop, 1) + Re_bulk = Re_avg_rsz_vf(l_loop, k_loop, j_loop, 2) + do i_dim = 1, num_dims + vel_src_at_interface(i_dim) = vel_src_rsz_vf(l_loop, k_loop, j_loop, i_dim) + end do + end if - do i_dim = 1, num_dims - flux_src_vf(momxb + i_dim - 1)%sf(j_loop, k_loop, l_loop) = & - flux_src_vf(momxb + i_dim - 1)%sf(j_loop, k_loop, l_loop) - current_tau_shear(norm_dir, i_dim) + if (shear_stress) then + ! current_tau_shear = 0.0_wp + call s_calculate_shear_stress_tensor(vel_grad_avg, Re_shear, divergence_v, current_tau_shear) - flux_src_vf(E_idx)%sf(j_loop, k_loop, l_loop) = & - flux_src_vf(E_idx)%sf(j_loop, k_loop, l_loop) - & - vel_src_at_interface(i_dim)*current_tau_shear(norm_dir, i_dim) - end do - end if + do i_dim = 1, num_dims + flux_src_vf(momxb + i_dim - 1)%sf(j_loop, k_loop, l_loop) = & + flux_src_vf(momxb + i_dim - 1)%sf(j_loop, k_loop, l_loop) - current_tau_shear(norm_dir, i_dim) - if (bulk_stress) then - ! current_tau_bulk = 0.0_wp - call s_calculate_bulk_stress_tensor(Re_bulk, divergence_v, current_tau_bulk) + flux_src_vf(E_idx)%sf(j_loop, k_loop, l_loop) = & + flux_src_vf(E_idx)%sf(j_loop, k_loop, l_loop) - & + vel_src_at_interface(i_dim)*current_tau_shear(norm_dir, i_dim) + end do + end if - do i_dim = 1, num_dims - flux_src_vf(momxb + i_dim - 1)%sf(j_loop, k_loop, l_loop) = & - flux_src_vf(momxb + i_dim - 1)%sf(j_loop, k_loop, l_loop) - current_tau_bulk(norm_dir, i_dim) + if (bulk_stress) then + ! current_tau_bulk = 0.0_wp + call s_calculate_bulk_stress_tensor(Re_bulk, divergence_v, current_tau_bulk) - flux_src_vf(E_idx)%sf(j_loop, k_loop, l_loop) = & - flux_src_vf(E_idx)%sf(j_loop, k_loop, l_loop) - & - vel_src_at_interface(i_dim)*current_tau_bulk(norm_dir, i_dim) - end do - end if + do i_dim = 1, num_dims + flux_src_vf(momxb + i_dim - 1)%sf(j_loop, k_loop, l_loop) = & + flux_src_vf(momxb + i_dim - 1)%sf(j_loop, k_loop, l_loop) - current_tau_bulk(norm_dir, i_dim) + + flux_src_vf(E_idx)%sf(j_loop, k_loop, l_loop) = & + flux_src_vf(E_idx)%sf(j_loop, k_loop, l_loop) - & + vel_src_at_interface(i_dim)*current_tau_bulk(norm_dir, i_dim) + end do + end if + end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP end subroutine s_compute_cartesian_viscous_source_flux @@ -4033,11 +4100,9 @@ contains !! @param[in] Re_shear Shear Reynolds number. !! @param[in] divergence_v Velocity divergence (du/dx + dv/dy + dw/dz). !! @param[out] tau_shear_out Calculated shear stress tensor (stress on i-face, j-direction). - pure subroutine s_calculate_shear_stress_tensor(vel_grad_avg, Re_shear, divergence_v, tau_shear_out) + subroutine s_calculate_shear_stress_tensor(vel_grad_avg, Re_shear, divergence_v, tau_shear_out) $:GPU_ROUTINE(parallelism='[seq]') - implicit none - ! Arguments real(wp), dimension(num_dims, num_dims), intent(in) :: vel_grad_avg real(wp), intent(in) :: Re_shear @@ -4067,11 +4132,9 @@ contains !! @param[in] Re_bulk Bulk Reynolds number. !! @param[in] divergence_v Velocity divergence (du/dx + dv/dy + dw/dz). !! @param[out] tau_bulk_out Calculated bulk stress tensor (stress on i-face, i-direction). - pure subroutine s_calculate_bulk_stress_tensor(Re_bulk, divergence_v, tau_bulk_out) + subroutine s_calculate_bulk_stress_tensor(Re_bulk, divergence_v, tau_bulk_out) $:GPU_ROUTINE(parallelism='[seq]') - implicit none - ! Arguments real(wp), intent(in) :: Re_bulk real(wp), intent(in) :: divergence_v @@ -4094,9 +4157,9 @@ contains !! @param flux_src_vf Intercell source fluxes !! @param flux_gsrc_vf Intercell geometric source fluxes !! @param norm_dir Dimensional splitting coordinate direction - pure subroutine s_finalize_riemann_solver(flux_vf, flux_src_vf, & - flux_gsrc_vf, & - norm_dir) + subroutine s_finalize_riemann_solver(flux_vf, flux_src_vf, & + flux_gsrc_vf, & + norm_dir) type(scalar_field), & dimension(sys_size), & @@ -4108,144 +4171,155 @@ contains ! Reshaping Outputted Data in y-direction if (norm_dir == 2) then - $:GPU_PARALLEL_LOOP(collapse=4) - do i = 1, sys_size - do l = is3%beg, is3%end - do j = is1%beg, is1%end - do k = is2%beg, is2%end - flux_vf(i)%sf(k, j, l) = & - flux_rsy_vf(j, k, l, i) - end do - end do - end do - end do - - if (cyl_coord) then - $:GPU_PARALLEL_LOOP(collapse=4) + #:call GPU_PARALLEL_LOOP(collapse=4) do i = 1, sys_size do l = is3%beg, is3%end do j = is1%beg, is1%end do k = is2%beg, is2%end - flux_gsrc_vf(i)%sf(k, j, l) = & - flux_gsrc_rsy_vf(j, k, l, i) + flux_vf(i)%sf(k, j, l) = & + flux_rsy_vf(j, k, l, i) end do end do end do end do + #:endcall GPU_PARALLEL_LOOP + + if (cyl_coord) then + #:call GPU_PARALLEL_LOOP(collapse=4) + do i = 1, sys_size + do l = is3%beg, is3%end + do j = is1%beg, is1%end + do k = is2%beg, is2%end + flux_gsrc_vf(i)%sf(k, j, l) = & + flux_gsrc_rsy_vf(j, k, l, i) + end do + end do + end do + end do + #:endcall GPU_PARALLEL_LOOP end if - $:GPU_PARALLEL_LOOP(collapse=3) - do l = is3%beg, is3%end - do j = is1%beg, is1%end - do k = is2%beg, is2%end - flux_src_vf(advxb)%sf(k, j, l) = & - flux_src_rsy_vf(j, k, l, advxb) + #:call GPU_PARALLEL_LOOP(collapse=3) + do l = is3%beg, is3%end + do j = is1%beg, is1%end + do k = is2%beg, is2%end + flux_src_vf(advxb)%sf(k, j, l) = & + flux_src_rsy_vf(j, k, l, advxb) + end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP if (riemann_solver == 1 .or. riemann_solver == 4) then - $:GPU_PARALLEL_LOOP(collapse=4) - do i = advxb + 1, advxe - do l = is3%beg, is3%end - do j = is1%beg, is1%end - do k = is2%beg, is2%end - flux_src_vf(i)%sf(k, j, l) = & - flux_src_rsy_vf(j, k, l, i) + #:call GPU_PARALLEL_LOOP(collapse=4) + do i = advxb + 1, advxe + do l = is3%beg, is3%end + do j = is1%beg, is1%end + do k = is2%beg, is2%end + flux_src_vf(i)%sf(k, j, l) = & + flux_src_rsy_vf(j, k, l, i) + end do end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP end if ! Reshaping Outputted Data in z-direction elseif (norm_dir == 3) then - $:GPU_PARALLEL_LOOP(collapse=4) - do i = 1, sys_size - do j = is1%beg, is1%end - do k = is2%beg, is2%end - do l = is3%beg, is3%end - - flux_vf(i)%sf(l, k, j) = & - flux_rsz_vf(j, k, l, i) - end do - end do - end do - end do - if (grid_geometry == 3) then - $:GPU_PARALLEL_LOOP(collapse=4) + #:call GPU_PARALLEL_LOOP(collapse=4) do i = 1, sys_size do j = is1%beg, is1%end do k = is2%beg, is2%end do l = is3%beg, is3%end - flux_gsrc_vf(i)%sf(l, k, j) = & - flux_gsrc_rsz_vf(j, k, l, i) + flux_vf(i)%sf(l, k, j) = & + flux_rsz_vf(j, k, l, i) end do end do end do end do + #:endcall GPU_PARALLEL_LOOP + if (grid_geometry == 3) then + #:call GPU_PARALLEL_LOOP(collapse=4) + do i = 1, sys_size + do j = is1%beg, is1%end + do k = is2%beg, is2%end + do l = is3%beg, is3%end + + flux_gsrc_vf(i)%sf(l, k, j) = & + flux_gsrc_rsz_vf(j, k, l, i) + end do + end do + end do + end do + #:endcall GPU_PARALLEL_LOOP end if - $:GPU_PARALLEL_LOOP(collapse=3) - do j = is1%beg, is1%end - do k = is2%beg, is2%end - do l = is3%beg, is3%end - flux_src_vf(advxb)%sf(l, k, j) = & - flux_src_rsz_vf(j, k, l, advxb) + #:call GPU_PARALLEL_LOOP(collapse=3) + do j = is1%beg, is1%end + do k = is2%beg, is2%end + do l = is3%beg, is3%end + flux_src_vf(advxb)%sf(l, k, j) = & + flux_src_rsz_vf(j, k, l, advxb) + end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP if (riemann_solver == 1 .or. riemann_solver == 4) then - $:GPU_PARALLEL_LOOP(collapse=4) - do i = advxb + 1, advxe - do j = is1%beg, is1%end - do k = is2%beg, is2%end - do l = is3%beg, is3%end - flux_src_vf(i)%sf(l, k, j) = & - flux_src_rsz_vf(j, k, l, i) + #:call GPU_PARALLEL_LOOP(collapse=4) + do i = advxb + 1, advxe + do j = is1%beg, is1%end + do k = is2%beg, is2%end + do l = is3%beg, is3%end + flux_src_vf(i)%sf(l, k, j) = & + flux_src_rsz_vf(j, k, l, i) + end do end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP end if elseif (norm_dir == 1) then - $:GPU_PARALLEL_LOOP(collapse=4) - do i = 1, sys_size - do l = is3%beg, is3%end - do k = is2%beg, is2%end - do j = is1%beg, is1%end - flux_vf(i)%sf(j, k, l) = & - flux_rsx_vf(j, k, l, i) + #:call GPU_PARALLEL_LOOP(collapse=4) + do i = 1, sys_size + do l = is3%beg, is3%end + do k = is2%beg, is2%end + do j = is1%beg, is1%end + flux_vf(i)%sf(j, k, l) = & + flux_rsx_vf(j, k, l, i) + end do end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP - $:GPU_PARALLEL_LOOP(collapse=3) - do l = is3%beg, is3%end - do k = is2%beg, is2%end - do j = is1%beg, is1%end - flux_src_vf(advxb)%sf(j, k, l) = & - flux_src_rsx_vf(j, k, l, advxb) + #:call GPU_PARALLEL_LOOP(collapse=3) + do l = is3%beg, is3%end + do k = is2%beg, is2%end + do j = is1%beg, is1%end + flux_src_vf(advxb)%sf(j, k, l) = & + flux_src_rsx_vf(j, k, l, advxb) + end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP if (riemann_solver == 1 .or. riemann_solver == 4) then - $:GPU_PARALLEL_LOOP(collapse=4) - do i = advxb + 1, advxe - do l = is3%beg, is3%end - do k = is2%beg, is2%end - do j = is1%beg, is1%end - flux_src_vf(i)%sf(j, k, l) = & - flux_src_rsx_vf(j, k, l, i) + #:call GPU_PARALLEL_LOOP(collapse=4) + do i = advxb + 1, advxe + do l = is3%beg, is3%end + do k = is2%beg, is2%end + do j = is1%beg, is1%end + flux_src_vf(i)%sf(j, k, l) = & + flux_src_rsx_vf(j, k, l, i) + end do end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP end if end if diff --git a/src/simulation/m_sim_helpers.fpp b/src/simulation/m_sim_helpers.fpp index df8f6b9eae..8008225e54 100644 --- a/src/simulation/m_sim_helpers.fpp +++ b/src/simulation/m_sim_helpers.fpp @@ -20,7 +20,7 @@ contains !! @param k y coordinate index !! @param l z coordinate index !! @return fltr_dtheta Modified dtheta value for cylindrical coordinates - pure function f_compute_filtered_dtheta(k, l) result(fltr_dtheta) + function f_compute_filtered_dtheta(k, l) result(fltr_dtheta) $:GPU_ROUTINE(parallelism='[seq]') integer, intent(in) :: k, l real(wp) :: fltr_dtheta @@ -47,7 +47,7 @@ contains !! @param k y coordinate index !! @param l z coordinate index !! @return cfl_terms computed CFL terms for 2D/3D cases - pure function f_compute_multidim_cfl_terms(vel, c, j, k, l) result(cfl_terms) + function f_compute_multidim_cfl_terms(vel, c, j, k, l) result(cfl_terms) $:GPU_ROUTINE(parallelism='[seq]') real(wp), dimension(num_vels), intent(in) :: vel real(wp), intent(in) :: c @@ -89,7 +89,7 @@ contains !! @param j x index !! @param k y index !! @param l z index - pure subroutine s_compute_enthalpy(q_prim_vf, pres, rho, gamma, pi_inf, Re, H, alpha, vel, vel_sum, j, k, l) + subroutine s_compute_enthalpy(q_prim_vf, pres, rho, gamma, pi_inf, Re, H, alpha, vel, vel_sum, j, k, l) $:GPU_ROUTINE(function_name='s_compute_enthalpy',parallelism='[seq]', & & cray_inline=True) @@ -181,7 +181,7 @@ contains !! @param icfl_sf cell-centered inviscid cfl number !! @param vcfl_sf (optional) cell-centered viscous CFL number !! @param Rc_sf (optional) cell centered Rc - pure subroutine s_compute_stability_from_dt(vel, c, rho, Re_l, j, k, l, icfl_sf, vcfl_sf, Rc_sf) + subroutine s_compute_stability_from_dt(vel, c, rho, Re_l, j, k, l, icfl_sf, vcfl_sf, Rc_sf) $:GPU_ROUTINE(parallelism='[seq]') real(wp), intent(in), dimension(num_vels) :: vel real(wp), intent(in) :: c, rho @@ -244,7 +244,7 @@ contains !! @param j x coordinate !! @param k y coordinate !! @param l z coordinate - pure subroutine s_compute_dt_from_cfl(vel, c, max_dt, rho, Re_l, j, k, l) + subroutine s_compute_dt_from_cfl(vel, c, max_dt, rho, Re_l, j, k, l) $:GPU_ROUTINE(parallelism='[seq]') real(wp), dimension(num_vels), intent(in) :: vel real(wp), intent(in) :: c, rho diff --git a/src/simulation/m_start_up.fpp b/src/simulation/m_start_up.fpp index 123db558ea..8902162c8d 100644 --- a/src/simulation/m_start_up.fpp +++ b/src/simulation/m_start_up.fpp @@ -71,9 +71,7 @@ module m_start_up use m_helper_basic !< Functions to compare floating point numbers -#ifdef MFC_OpenACC - use openacc -#endif + $:USE_GPU_MODULE() use m_nvtx @@ -115,7 +113,7 @@ module m_start_up contains - !> Read data files. Dispatch subroutine that replaces procedure pointer. + !> Read data files. Dispatch subroutine that replaces procedure pointer. !! @param q_cons_vf Conservative variables impure subroutine s_read_data_files(q_cons_vf) @@ -123,7 +121,6 @@ contains dimension(sys_size), & intent(inout) :: q_cons_vf - if (.not. parallel_io) then call s_read_serial_data_files(q_cons_vf) else @@ -214,7 +211,7 @@ contains if ((bf_x) .or. (bf_y) .or. (bf_z)) then bodyForces = .true. - endif + end if ! Store m,n,p into global m,n,p m_glb = m @@ -228,7 +225,7 @@ contains if (any((/bc_x%beg, bc_x%end, bc_y%beg, bc_y%end, bc_z%beg, bc_z%end/) == -17) .or. & num_bc_patches > 0) then bc_io = .true. - endif + end if else call s_mpi_abort(trim(file_path)//' is missing. Exiting.') @@ -271,7 +268,6 @@ contains type(scalar_field), dimension(sys_size), intent(INOUT) :: q_cons_vf - character(LEN=path_len + 2*name_len) :: t_step_dir !< !! Relative path to the starting time-step directory @@ -437,9 +433,9 @@ contains inquire (FILE=trim(file_path), EXIST=file_exist) if (file_exist) then open (2, FILE=trim(file_path), & - FORM='unformatted', & - ACTION='read', & - STATUS='old') + FORM='unformatted', & + ACTION='read', & + STATUS='old') read (2) ib_markers%sf(0:m, 0:n, 0:p); close (2) else call s_mpi_abort(trim(file_path)//' is missing. Exiting.') @@ -451,9 +447,9 @@ contains inquire (FILE=trim(file_path), EXIST=file_exist) if (file_exist) then open (2, FILE=trim(file_path), & - FORM='unformatted', & - ACTION='read', & - STATUS='old') + FORM='unformatted', & + ACTION='read', & + STATUS='old') read (2) levelset%sf(0:m, 0:n, 0:p, 1:num_ibs); close (2) ! print*, 'check', STL_levelset(106, 50, 0, 1) else @@ -466,9 +462,9 @@ contains inquire (FILE=trim(file_path), EXIST=file_exist) if (file_exist) then open (2, FILE=trim(file_path), & - FORM='unformatted', & - ACTION='read', & - STATUS='old') + FORM='unformatted', & + ACTION='read', & + STATUS='old') read (2) levelset_norm%sf(0:m, 0:n, 0:p, 1:num_ibs, 1:3); close (2) else call s_mpi_abort(trim(file_path)//' is missing. Exiting.') @@ -550,14 +546,14 @@ contains file_loc = trim(case_dir)//'/restart_data'//trim(mpiiofs)//'x_cb.dat' inquire (FILE=trim(file_loc), EXIST=file_exist) - if(down_sample) then - m_ds = INT((m+1)/3) - 1 - n_ds = INT((n+1)/3) - 1 - p_ds = INT((p+1)/3) - 1 + if (down_sample) then + m_ds = int((m + 1)/3) - 1 + n_ds = int((n + 1)/3) - 1 + p_ds = int((p + 1)/3) - 1 - m_glb_ds = INT((m_glb+1)/3) - 1 - n_glb_ds = INT((n_glb+1)/3) - 1 - p_glb_ds = INT((p_glb+1)/3) - 1 + m_glb_ds = int((m_glb + 1)/3) - 1 + n_glb_ds = int((n_glb + 1)/3) - 1 + p_glb_ds = int((p_glb + 1)/3) - 1 end if if (file_exist) then @@ -646,18 +642,18 @@ contains call MPI_FILE_OPEN(MPI_COMM_SELF, file_loc, MPI_MODE_RDONLY, mpi_info_int, ifile, ierr) ! Initialize MPI data I/O - if(down_sample) then + if (down_sample) then call s_initialize_mpi_data_ds(q_cons_vf) else if (ib) then call s_initialize_mpi_data(q_cons_vf, ib_markers, & - levelset, levelset_norm) + levelset, levelset_norm) else call s_initialize_mpi_data(q_cons_vf) end if end if - if(down_sample) then + if (down_sample) then ! Size of local arrays data_size = (m_ds + 3)*(n_ds + 3)*(p_ds + 3) m_glb_read = m_glb_ds + 1 @@ -699,7 +695,7 @@ contains end do end if else - if(down_sample) then + if (down_sample) then do i = 1, sys_size var_MOK = int(i, MPI_OFFSET_KIND) @@ -754,7 +750,7 @@ contains call MPI_FILE_SET_VIEW(ifile, disp, mpi_p, MPI_IO_levelset_DATA%view, & 'native', mpi_info_int, ierr) - call MPI_FILE_READ(ifile, MPI_IO_levelset_DATA%var%sf, data_size * num_ibs, & + call MPI_FILE_READ(ifile, MPI_IO_levelset_DATA%var%sf, data_size*num_ibs, & mpi_p, status, ierr) else @@ -774,7 +770,7 @@ contains call MPI_FILE_SET_VIEW(ifile, disp, mpi_p, MPI_IO_levelsetnorm_DATA%view, & 'native', mpi_info_int, ierr) - call MPI_FILE_READ(ifile, MPI_IO_levelsetnorm_DATA%var%sf, data_size * num_ibs * 3, & + call MPI_FILE_READ(ifile, MPI_IO_levelsetnorm_DATA%var%sf, data_size*num_ibs*3, & mpi_p, status, ierr) else @@ -804,14 +800,13 @@ contains if (ib) then call s_initialize_mpi_data(q_cons_vf, ib_markers, & - levelset, levelset_norm) + levelset, levelset_norm) else call s_initialize_mpi_data(q_cons_vf) end if - ! Size of local arrays data_size = (m + 1)*(n + 1)*(p + 1) @@ -903,7 +898,7 @@ contains call MPI_FILE_SET_VIEW(ifile, disp, mpi_p, MPI_IO_levelset_DATA%view, & 'native', mpi_info_int, ierr) - call MPI_FILE_READ(ifile, MPI_IO_levelset_DATA%var%sf, data_size * num_ibs, & + call MPI_FILE_READ(ifile, MPI_IO_levelset_DATA%var%sf, data_size*num_ibs, & mpi_p, status, ierr) else @@ -923,7 +918,7 @@ contains call MPI_FILE_SET_VIEW(ifile, disp, mpi_p, MPI_IO_levelsetnorm_DATA%view, & 'native', mpi_info_int, ierr) - call MPI_FILE_READ(ifile, MPI_IO_levelsetnorm_DATA%var%sf, data_size * num_ibs * 3, & + call MPI_FILE_READ(ifile, MPI_IO_levelsetnorm_DATA%var%sf, data_size*num_ibs*3, & mpi_p, status, ierr) else @@ -1007,7 +1002,7 @@ contains end subroutine s_read_parallel_data_files - !> The purpose of this procedure is to initialize the + !> The purpose of this procedure is to initialize the !! values of the internal-energy equations of each phase !! from the mass of each phase, the mixture momentum and !! mixture-total-energy equations. @@ -1054,14 +1049,14 @@ contains if (mhd) then if (n == 0) then - pres_mag = 0.5_wp*(Bx0**2 + v_vf(B_idx%beg)%sf(j, k, l)**2 + v_vf(B_idx%beg+1)%sf(j, k, l)**2) + pres_mag = 0.5_wp*(Bx0**2 + v_vf(B_idx%beg)%sf(j, k, l)**2 + v_vf(B_idx%beg + 1)%sf(j, k, l)**2) else - pres_mag = 0.5_wp*(v_vf(B_idx%beg)%sf(j, k, l)**2 + v_vf(B_idx%beg+1)%sf(j, k, l)**2 + v_vf(B_idx%beg+2)%sf(j, k, l)**2) + pres_mag = 0.5_wp*(v_vf(B_idx%beg)%sf(j, k, l)**2 + v_vf(B_idx%beg + 1)%sf(j, k, l)**2 + v_vf(B_idx%beg + 2)%sf(j, k, l)**2) end if end if call s_compute_pressure(v_vf(E_idx)%sf(j, k, l), 0._wp, & - dyn_pres, pi_inf, gamma, rho, qv, rhoYks, pres, T, pres_mag = pres_mag) + dyn_pres, pi_inf, gamma, rho, qv, rhoYks, pres, T, pres_mag=pres_mag) do i = 1, num_fluids v_vf(i + internalEnergies_idx%beg - 1)%sf(j, k, l) = v_vf(i + adv_idx%beg - 1)%sf(j, k, l)* & @@ -1079,7 +1074,6 @@ contains integer, intent(inout) :: t_step real(wp), intent(inout) :: time_avg - integer :: i if (cfl_dt) then @@ -1090,7 +1084,7 @@ contains if (t_step == 0) dt_init = dt if (dt < 1.e-3_wp*dt_init .and. cfl_adap_dt .and. proc_rank == 0) then - print*, "Delta t = ", dt + print *, "Delta t = ", dt call s_mpi_abort("Delta t has become too small") end if end if @@ -1120,7 +1114,7 @@ contains else if (proc_rank == 0 .and. mod(t_step - t_step_start, t_step_print) == 0) then print '(" [", I3, "%] Time step ", I8, " of ", I0, " @ t_step = ", I8, " Time Avg = ", ES12.6, " Time/step= ", ES12.6, "")', & - int(ceiling(100._wp*(real(t_step - t_step_start)/(t_step_stop - t_step_start + 1)))), & + int(ceiling(100._wp*(real(t_step - t_step_start)/(t_step_stop - t_step_start + 1)))), & t_step - t_step_start + 1, & t_step_stop - t_step_start + 1, & t_step, & @@ -1141,7 +1135,6 @@ contains call s_compute_derived_variables(t_step) - #ifdef DEBUG print *, 'Computed derived vars' #endif @@ -1196,9 +1189,9 @@ contains io_time_final = maxval(io_proc_time) end if - grind_time = time_final * 1.0e9_wp / & - (real(sys_size, wp) * real(maxval((/1, m_glb/)), wp) * & - real(maxval((/1, n_glb/)), wp) * real(maxval((/1, p_glb/)), wp)) + grind_time = time_final*1.0e9_wp/ & + (real(sys_size, wp)*real(maxval((/1, m_glb/)), wp)* & + real(maxval((/1, n_glb/)), wp)*real(maxval((/1, p_glb/)), wp)) print *, "Performance:", grind_time, "ns/gp/eq/rhs" inquire (FILE='time_data.dat', EXIST=file_exists) @@ -1324,7 +1317,7 @@ contains call s_initialize_variables_conversion_module() if (grid_geometry == 3) call s_initialize_fftw_module() - if(bubbles_euler) call s_initialize_bubbles_EE_module() + if (bubbles_euler) call s_initialize_bubbles_EE_module() if (ib) call s_initialize_ibm_module() if (qbmm) call s_initialize_qbmm_module() @@ -1348,19 +1341,19 @@ contains call s_initialize_boundary_common_module() - if(down_sample) then - m_ds = INT((m+1)/3) - 1 - n_ds = INT((n+1)/3) - 1 - p_ds = INT((p+1)/3) - 1 + if (down_sample) then + m_ds = int((m + 1)/3) - 1 + n_ds = int((n + 1)/3) - 1 + p_ds = int((p + 1)/3) - 1 - allocate(q_cons_temp(1:sys_size)) + allocate (q_cons_temp(1:sys_size)) do i = 1, sys_size - allocate(q_cons_temp(i)%sf(-1:m_ds+1,-1:n_ds+1,-1:p_ds+1)) + allocate (q_cons_temp(i)%sf(-1:m_ds + 1, -1:n_ds + 1, -1:p_ds + 1)) end do end if ! Reading in the user provided initial condition and grid data - if(down_sample) then + if (down_sample) then call s_read_data_files(q_cons_temp) call s_upsample_data(q_cons_ts(1)%vf, q_cons_temp) do i = 1, sys_size @@ -1408,14 +1401,16 @@ contains impure subroutine s_initialize_mpi_domain integer :: ierr -#ifdef MFC_OpenACC +#ifdef MFC_GPU real(wp) :: starttime, endtime integer :: num_devices, local_size, num_nodes, ppn, my_device_num integer :: dev, devNum, local_rank #ifdef MFC_MPI integer :: local_comm #endif +#if defined(MFC_OpenACC) integer(acc_device_kind) :: devtype +#endif #endif ! Initializing MPI execution environment @@ -1423,7 +1418,7 @@ contains call s_mpi_initialize() ! Bind GPUs if OpenACC is enabled -#ifdef MFC_OpenACC +#ifdef MFC_GPU #ifndef MFC_MPI local_size = 1 local_rank = 0 @@ -1433,12 +1428,17 @@ contains call MPI_Comm_size(local_comm, local_size, ierr) call MPI_Comm_rank(local_comm, local_rank, ierr) #endif - +#if defined(MFC_OpenACC) devtype = acc_get_device_type() devNum = acc_get_num_devices(devtype) dev = mod(local_rank, devNum) call acc_set_device_num(dev, devtype) +#elif defined(MFC_OpenMP) + devNum = omp_get_num_devices() + dev = mod(local_rank, devNum) + call omp_set_default_device(dev) +#endif #endif ! The rank 0 processor assigns default values to the user inputs prior to @@ -1457,12 +1457,12 @@ contains "case-optimized", & #:endif m, n, p, num_procs, & -#ifdef MFC_OpenACC -!&< +#if defined(MFC_OpenACC) "with OpenACC offloading" -!&> +#elif defined(MFC_OpenMP) + "with OpenMP offloading" #else - "on CPUs" + "on CPUs" #endif end if @@ -1495,7 +1495,7 @@ contains end if $:GPU_UPDATE(device='[chem_params]') - + $:GPU_UPDATE(device='[nb,R0ref,Ca,Web,Re_inv,weight,R0, & & bubbles_euler,polytropic,polydisperse,qbmm, & & ptil,bubble_model,thermal,poly_sigma,adv_n,adap_dt, & @@ -1509,7 +1509,7 @@ contains $:GPU_UPDATE(device='[sigma, surface_tension]') $:GPU_UPDATE(device='[dx,dy,dz,x_cb,x_cc,y_cb,y_cc,z_cb,z_cc]') - +! #if defined(MFC_OpenACC) $:GPU_UPDATE(device='[bc_x%vb1,bc_x%vb2,bc_x%vb3,bc_x%ve1,bc_x%ve2,bc_x%ve3]') $:GPU_UPDATE(device='[bc_y%vb1,bc_y%vb2,bc_y%vb3,bc_y%ve1,bc_y%ve2,bc_y%ve3]') $:GPU_UPDATE(device='[bc_z%vb1,bc_z%vb2,bc_z%vb3,bc_z%ve1,bc_z%ve2,bc_z%ve3]') @@ -1517,6 +1517,9 @@ contains $:GPU_UPDATE(device='[bc_x%grcbc_in,bc_x%grcbc_out,bc_x%grcbc_vel_out]') $:GPU_UPDATE(device='[bc_y%grcbc_in,bc_y%grcbc_out,bc_y%grcbc_vel_out]') $:GPU_UPDATE(device='[bc_z%grcbc_in,bc_z%grcbc_out,bc_z%grcbc_vel_out]') +! #elif defined(MFC_OpenMP) +! $:GPU_UPDATE(device='[bc_x,bc_y,bc_z]') +! #endif $:GPU_UPDATE(device='[relax, relax_model]') if (relax) then @@ -1529,6 +1532,15 @@ contains $:GPU_UPDATE(device='[igr, igr_order]') + #:block DEF_AMD + block + use m_thermochem, only: molecular_weights + use m_chemistry, only: molecular_weights_nonparameter + molecular_weights_nonparameter(:) = molecular_weights(:) + $:GPU_UPDATE(device='[molecular_weights_nonparameter]') + end block + #:endblock + end subroutine s_initialize_gpu_vars impure subroutine s_finalize_modules @@ -1562,7 +1574,7 @@ contains end if call s_finalize_mpi_proxy_module() - if (surface_tension) call s_finalize_surface_tension_module() + if (surface_tension) call s_finalize_surface_tension_module() if (bodyForces) call s_finalize_body_forces_module() if (mhd .and. powell) call s_finalize_mhd_powell_module diff --git a/src/simulation/m_surface_tension.fpp b/src/simulation/m_surface_tension.fpp index 0522113c3a..40977b9a18 100644 --- a/src/simulation/m_surface_tension.fpp +++ b/src/simulation/m_surface_tension.fpp @@ -68,7 +68,7 @@ contains end if end subroutine s_initialize_surface_tension_module - pure subroutine s_compute_capillary_source_flux( & + subroutine s_compute_capillary_source_flux( & vSrc_rsx_vf, vSrc_rsy_vf, vSrc_rsz_vf, & flux_src_vf, & id, isx, isy, isz) @@ -88,144 +88,141 @@ contains integer :: j, k, l, i if (id == 1) then - $:GPU_PARALLEL_LOOP(collapse=3, private='[Omega, w1L, w2L, w3L, & - & w1R, w2R, w3R, w1, w2, w3, normWL, & - & normWR, normW]') - do l = isz%beg, isz%end - do k = isy%beg, isy%end - do j = isx%beg, isx%end + #:call GPU_PARALLEL_LOOP(collapse=3, private='[Omega, w1L, w2L, w3L, w1R, w2R, w3R, w1, w2, w3, normWL, normWR, normW]') + do l = isz%beg, isz%end + do k = isy%beg, isy%end + do j = isx%beg, isx%end - w1L = gL_x(j, k, l, 1) - w2L = gL_x(j, k, l, 2) - w3L = 0._wp - if (p > 0) w3L = gL_x(j, k, l, 3) + w1L = gL_x(j, k, l, 1) + w2L = gL_x(j, k, l, 2) + w3L = 0._wp + if (p > 0) w3L = gL_x(j, k, l, 3) - w1R = gR_x(j + 1, k, l, 1) - w2R = gR_x(j + 1, k, l, 2) - w3R = 0._wp - if (p > 0) w3R = gR_x(j + 1, k, l, 3) + w1R = gR_x(j + 1, k, l, 1) + w2R = gR_x(j + 1, k, l, 2) + w3R = 0._wp + if (p > 0) w3R = gR_x(j + 1, k, l, 3) - normWL = gL_x(j, k, l, num_dims + 1) - normWR = gR_x(j + 1, k, l, num_dims + 1) + normWL = gL_x(j, k, l, num_dims + 1) + normWR = gR_x(j + 1, k, l, num_dims + 1) - w1 = (w1L + w1R)/2._wp - w2 = (w2L + w2R)/2._wp - w3 = (w3L + w3R)/2._wp - normW = (normWL + normWR)/2._wp + w1 = (w1L + w1R)/2._wp + w2 = (w2L + w2R)/2._wp + w3 = (w3L + w3R)/2._wp + normW = (normWL + normWR)/2._wp - if (normW > capillary_cutoff) then - @:compute_capillary_stress_tensor() + if (normW > capillary_cutoff) then + @:compute_capillary_stress_tensor() - do i = 1, num_dims + do i = 1, num_dims - flux_src_vf(momxb + i - 1)%sf(j, k, l) = & - flux_src_vf(momxb + i - 1)%sf(j, k, l) + Omega(1, i) + flux_src_vf(momxb + i - 1)%sf(j, k, l) = & + flux_src_vf(momxb + i - 1)%sf(j, k, l) + Omega(1, i) - flux_src_vf(E_idx)%sf(j, k, l) = flux_src_vf(E_idx)%sf(j, k, l) + & - Omega(1, i)*vSrc_rsx_vf(j, k, l, i) + flux_src_vf(E_idx)%sf(j, k, l) = flux_src_vf(E_idx)%sf(j, k, l) + & + Omega(1, i)*vSrc_rsx_vf(j, k, l, i) - end do + end do - flux_src_vf(E_idx)%sf(j, k, l) = flux_src_vf(E_idx)%sf(j, k, l) + & - sigma*c_divs(num_dims + 1)%sf(j, k, l)*vSrc_rsx_vf(j, k, l, 1) - end if + flux_src_vf(E_idx)%sf(j, k, l) = flux_src_vf(E_idx)%sf(j, k, l) + & + sigma*c_divs(num_dims + 1)%sf(j, k, l)*vSrc_rsx_vf(j, k, l, 1) + end if + end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP elseif (id == 2) then - $:GPU_PARALLEL_LOOP(collapse=3, private='[Omega, w1L, w2L, w3L, & - & w1R, w2R, w3R, w1, w2, w3, normWL, normWR, & - & normW]') - do l = isz%beg, isz%end - do k = isy%beg, isy%end - do j = isx%beg, isx%end + #:call GPU_PARALLEL_LOOP(collapse=3, private='[Omega, w1L, w2L, w3L, w1R, w2R, w3R, w1, w2, w3, normWL, normWR, normW]') + do l = isz%beg, isz%end + do k = isy%beg, isy%end + do j = isx%beg, isx%end - w1L = gL_y(k, j, l, 1) - w2L = gL_y(k, j, l, 2) - w3L = 0._wp - if (p > 0) w3L = gL_y(k, j, l, 3) + w1L = gL_y(k, j, l, 1) + w2L = gL_y(k, j, l, 2) + w3L = 0._wp + if (p > 0) w3L = gL_y(k, j, l, 3) - w1R = gR_y(k + 1, j, l, 1) - w2R = gR_y(k + 1, j, l, 2) - w3R = 0._wp - if (p > 0) w3R = gR_y(k + 1, j, l, 3) + w1R = gR_y(k + 1, j, l, 1) + w2R = gR_y(k + 1, j, l, 2) + w3R = 0._wp + if (p > 0) w3R = gR_y(k + 1, j, l, 3) - normWL = gL_y(k, j, l, num_dims + 1) - normWR = gR_y(k + 1, j, l, num_dims + 1) + normWL = gL_y(k, j, l, num_dims + 1) + normWR = gR_y(k + 1, j, l, num_dims + 1) - w1 = (w1L + w1R)/2._wp - w2 = (w2L + w2R)/2._wp - w3 = (w3L + w3R)/2._wp - normW = (normWL + normWR)/2._wp + w1 = (w1L + w1R)/2._wp + w2 = (w2L + w2R)/2._wp + w3 = (w3L + w3R)/2._wp + normW = (normWL + normWR)/2._wp - if (normW > capillary_cutoff) then - @:compute_capillary_stress_tensor() + if (normW > capillary_cutoff) then + @:compute_capillary_stress_tensor() - do i = 1, num_dims + do i = 1, num_dims - flux_src_vf(momxb + i - 1)%sf(j, k, l) = & - flux_src_vf(momxb + i - 1)%sf(j, k, l) + Omega(2, i) + flux_src_vf(momxb + i - 1)%sf(j, k, l) = & + flux_src_vf(momxb + i - 1)%sf(j, k, l) + Omega(2, i) - flux_src_vf(E_idx)%sf(j, k, l) = flux_src_vf(E_idx)%sf(j, k, l) + & - Omega(2, i)*vSrc_rsy_vf(k, j, l, i) + flux_src_vf(E_idx)%sf(j, k, l) = flux_src_vf(E_idx)%sf(j, k, l) + & + Omega(2, i)*vSrc_rsy_vf(k, j, l, i) - end do + end do - flux_src_vf(E_idx)%sf(j, k, l) = flux_src_vf(E_idx)%sf(j, k, l) + & - sigma*c_divs(num_dims + 1)%sf(j, k, l)*vSrc_rsy_vf(k, j, l, 2) - end if + flux_src_vf(E_idx)%sf(j, k, l) = flux_src_vf(E_idx)%sf(j, k, l) + & + sigma*c_divs(num_dims + 1)%sf(j, k, l)*vSrc_rsy_vf(k, j, l, 2) + end if + end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP elseif (id == 3) then - $:GPU_PARALLEL_LOOP(collapse=3, private='[Omega, w1L, w2L, w3L, & - & w1R, w2R, w3R, w1, w2, w3, normWL, normWR, & - & normW]') - do l = isz%beg, isz%end - do k = isy%beg, isy%end - do j = isx%beg, isx%end + #:call GPU_PARALLEL_LOOP(collapse=3, private='[Omega, w1L, w2L, w3L, w1R, w2R, w3R, w1, w2, w3, normWL, normWR, normW]') + do l = isz%beg, isz%end + do k = isy%beg, isy%end + do j = isx%beg, isx%end - w1L = gL_z(l, k, j, 1) - w2L = gL_z(l, k, j, 2) - w3L = 0._wp - if (p > 0) w3L = gL_z(l, k, j, 3) + w1L = gL_z(l, k, j, 1) + w2L = gL_z(l, k, j, 2) + w3L = 0._wp + if (p > 0) w3L = gL_z(l, k, j, 3) - w1R = gR_z(l + 1, k, j, 1) - w2R = gR_z(l + 1, k, j, 2) - w3R = 0._wp - if (p > 0) w3R = gR_z(l + 1, k, j, 3) + w1R = gR_z(l + 1, k, j, 1) + w2R = gR_z(l + 1, k, j, 2) + w3R = 0._wp + if (p > 0) w3R = gR_z(l + 1, k, j, 3) - normWL = gL_z(l, k, j, num_dims + 1) - normWR = gR_z(l + 1, k, j, num_dims + 1) + normWL = gL_z(l, k, j, num_dims + 1) + normWR = gR_z(l + 1, k, j, num_dims + 1) - w1 = (w1L + w1R)/2._wp - w2 = (w2L + w2R)/2._wp - w3 = (w3L + w3R)/2._wp - normW = (normWL + normWR)/2._wp + w1 = (w1L + w1R)/2._wp + w2 = (w2L + w2R)/2._wp + w3 = (w3L + w3R)/2._wp + normW = (normWL + normWR)/2._wp - if (normW > capillary_cutoff) then - @:compute_capillary_stress_tensor() + if (normW > capillary_cutoff) then + @:compute_capillary_stress_tensor() - do i = 1, num_dims + do i = 1, num_dims - flux_src_vf(momxb + i - 1)%sf(j, k, l) = & - flux_src_vf(momxb + i - 1)%sf(j, k, l) + Omega(3, i) + flux_src_vf(momxb + i - 1)%sf(j, k, l) = & + flux_src_vf(momxb + i - 1)%sf(j, k, l) + Omega(3, i) - flux_src_vf(E_idx)%sf(j, k, l) = flux_src_vf(E_idx)%sf(j, k, l) + & - Omega(3, i)*vSrc_rsz_vf(l, k, j, i) + flux_src_vf(E_idx)%sf(j, k, l) = flux_src_vf(E_idx)%sf(j, k, l) + & + Omega(3, i)*vSrc_rsz_vf(l, k, j, i) - end do + end do - flux_src_vf(E_idx)%sf(j, k, l) = flux_src_vf(E_idx)%sf(j, k, l) + & - sigma*c_divs(num_dims + 1)%sf(j, k, l)*vSrc_rsz_vf(l, k, j, 3) - end if + flux_src_vf(E_idx)%sf(j, k, l) = flux_src_vf(E_idx)%sf(j, k, l) + & + sigma*c_divs(num_dims + 1)%sf(j, k, l)*vSrc_rsz_vf(l, k, j, 3) + end if + end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP end if @@ -246,54 +243,58 @@ contains isx%end = m; isy%end = n; isz%end = p ! compute gradient components - $:GPU_PARALLEL_LOOP(collapse=3) - do l = 0, p - do k = 0, n - do j = 0, m - c_divs(1)%sf(j, k, l) = 1._wp/(x_cc(j + 1) - x_cc(j - 1))* & - (q_prim_vf(c_idx)%sf(j + 1, k, l) - q_prim_vf(c_idx)%sf(j - 1, k, l)) - end do - end do - end do - - $:GPU_PARALLEL_LOOP(collapse=3) - do l = 0, p - do k = 0, n - do j = 0, m - c_divs(2)%sf(j, k, l) = 1._wp/(y_cc(k + 1) - y_cc(k - 1))* & - (q_prim_vf(c_idx)%sf(j, k + 1, l) - q_prim_vf(c_idx)%sf(j, k - 1, l)) + #:call GPU_PARALLEL_LOOP(collapse=3) + do l = 0, p + do k = 0, n + do j = 0, m + c_divs(1)%sf(j, k, l) = 1._wp/(x_cc(j + 1) - x_cc(j - 1))* & + (q_prim_vf(c_idx)%sf(j + 1, k, l) - q_prim_vf(c_idx)%sf(j - 1, k, l)) + end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP - if (p > 0) then - $:GPU_PARALLEL_LOOP(collapse=3) + #:call GPU_PARALLEL_LOOP(collapse=3) do l = 0, p do k = 0, n do j = 0, m - c_divs(3)%sf(j, k, l) = 1._wp/(z_cc(l + 1) - z_cc(l - 1))* & - (q_prim_vf(c_idx)%sf(j, k, l + 1) - q_prim_vf(c_idx)%sf(j, k, l - 1)) + c_divs(2)%sf(j, k, l) = 1._wp/(y_cc(k + 1) - y_cc(k - 1))* & + (q_prim_vf(c_idx)%sf(j, k + 1, l) - q_prim_vf(c_idx)%sf(j, k - 1, l)) end do end do end do + #:endcall GPU_PARALLEL_LOOP + + if (p > 0) then + #:call GPU_PARALLEL_LOOP(collapse=3) + do l = 0, p + do k = 0, n + do j = 0, m + c_divs(3)%sf(j, k, l) = 1._wp/(z_cc(l + 1) - z_cc(l - 1))* & + (q_prim_vf(c_idx)%sf(j, k, l + 1) - q_prim_vf(c_idx)%sf(j, k, l - 1)) + end do + end do + end do + #:endcall GPU_PARALLEL_LOOP end if - $:GPU_PARALLEL_LOOP(collapse=3) - do l = 0, p - do k = 0, n - do j = 0, m - c_divs(num_dims + 1)%sf(j, k, l) = 0._wp - $:GPU_LOOP(parallelism='[seq]') - do i = 1, num_dims + #:call GPU_PARALLEL_LOOP(collapse=3) + do l = 0, p + do k = 0, n + do j = 0, m + c_divs(num_dims + 1)%sf(j, k, l) = 0._wp + $:GPU_LOOP(parallelism='[seq]') + do i = 1, num_dims + c_divs(num_dims + 1)%sf(j, k, l) = & + c_divs(num_dims + 1)%sf(j, k, l) + & + c_divs(i)%sf(j, k, l)**2._wp + end do c_divs(num_dims + 1)%sf(j, k, l) = & - c_divs(num_dims + 1)%sf(j, k, l) + & - c_divs(i)%sf(j, k, l)**2._wp + sqrt(c_divs(num_dims + 1)%sf(j, k, l)) end do - c_divs(num_dims + 1)%sf(j, k, l) = & - sqrt(c_divs(num_dims + 1)%sf(j, k, l)) end do end do - end do + #:endcall GPU_PARALLEL_LOOP call s_populate_capillary_buffers(c_divs, bc_type) @@ -342,41 +343,44 @@ contains $:GPU_UPDATE(device='[is1,is2,is3,iv]') if (recon_dir == 1) then - $:GPU_PARALLEL_LOOP(collapse=4) - do i = iv%beg, iv%end - do l = is3%beg, is3%end - do k = is2%beg, is2%end - do j = is1%beg, is1%end - vL_x(j, k, l, i) = v_vf(i)%sf(j, k, l) - vR_x(j, k, l, i) = v_vf(i)%sf(j, k, l) + #:call GPU_PARALLEL_LOOP(collapse=4) + do i = iv%beg, iv%end + do l = is3%beg, is3%end + do k = is2%beg, is2%end + do j = is1%beg, is1%end + vL_x(j, k, l, i) = v_vf(i)%sf(j, k, l) + vR_x(j, k, l, i) = v_vf(i)%sf(j, k, l) + end do end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP else if (recon_dir == 2) then - $:GPU_PARALLEL_LOOP(collapse=4) - do i = iv%beg, iv%end - do l = is3%beg, is3%end - do k = is2%beg, is2%end - do j = is1%beg, is1%end - vL_y(j, k, l, i) = v_vf(i)%sf(k, j, l) - vR_y(j, k, l, i) = v_vf(i)%sf(k, j, l) + #:call GPU_PARALLEL_LOOP(collapse=4) + do i = iv%beg, iv%end + do l = is3%beg, is3%end + do k = is2%beg, is2%end + do j = is1%beg, is1%end + vL_y(j, k, l, i) = v_vf(i)%sf(k, j, l) + vR_y(j, k, l, i) = v_vf(i)%sf(k, j, l) + end do end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP else if (recon_dir == 3) then - $:GPU_PARALLEL_LOOP(collapse=4) - do i = iv%beg, iv%end - do l = is3%beg, is3%end - do k = is2%beg, is2%end - do j = is1%beg, is1%end - vL_z(j, k, l, i) = v_vf(i)%sf(l, k, j) - vR_z(j, k, l, i) = v_vf(i)%sf(l, k, j) + #:call GPU_PARALLEL_LOOP(collapse=4) + do i = iv%beg, iv%end + do l = is3%beg, is3%end + do k = is2%beg, is2%end + do j = is1%beg, is1%end + vL_z(j, k, l, i) = v_vf(i)%sf(l, k, j) + vR_z(j, k, l, i) = v_vf(i)%sf(l, k, j) + end do end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP end if end if #:endfor diff --git a/src/simulation/m_time_steppers.fpp b/src/simulation/m_time_steppers.fpp index e7d4ba6017..a8905822bd 100644 --- a/src/simulation/m_time_steppers.fpp +++ b/src/simulation/m_time_steppers.fpp @@ -487,52 +487,55 @@ contains if (bubbles_lagrange .and. .not. adap_dt) call s_update_lagrange_tdv_rk(stage=1) - $:GPU_PARALLEL_LOOP(collapse=4) - do i = 1, sys_size - do l = 0, p - do k = 0, n - do j = 0, m - q_cons_ts(1)%vf(i)%sf(j, k, l) = & - q_cons_ts(1)%vf(i)%sf(j, k, l) & - + dt*rhs_vf(i)%sf(j, k, l) + #:call GPU_PARALLEL_LOOP(collapse=4) + do i = 1, sys_size + do l = 0, p + do k = 0, n + do j = 0, m + q_cons_ts(1)%vf(i)%sf(j, k, l) = & + q_cons_ts(1)%vf(i)%sf(j, k, l) & + + dt*rhs_vf(i)%sf(j, k, l) + end do end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP !Evolve pb and mv for non-polytropic qbmm if (qbmm .and. (.not. polytropic)) then - $:GPU_PARALLEL_LOOP(collapse=5) - do i = 1, nb - do l = 0, p - do k = 0, n - do j = 0, m - do q = 1, nnode - pb_ts(1)%sf(j, k, l, q, i) = & - pb_ts(1)%sf(j, k, l, q, i) & - + dt*rhs_pb(j, k, l, q, i) + #:call GPU_PARALLEL_LOOP(collapse=5) + do i = 1, nb + do l = 0, p + do k = 0, n + do j = 0, m + do q = 1, nnode + pb_ts(1)%sf(j, k, l, q, i) = & + pb_ts(1)%sf(j, k, l, q, i) & + + dt*rhs_pb(j, k, l, q, i) + end do end do end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP end if if (qbmm .and. (.not. polytropic)) then - $:GPU_PARALLEL_LOOP(collapse=5) - do i = 1, nb - do l = 0, p - do k = 0, n - do j = 0, m - do q = 1, nnode - mv_ts(1)%sf(j, k, l, q, i) = & - mv_ts(1)%sf(j, k, l, q, i) & - + dt*rhs_mv(j, k, l, q, i) + #:call GPU_PARALLEL_LOOP(collapse=5) + do i = 1, nb + do l = 0, p + do k = 0, n + do j = 0, m + do q = 1, nnode + mv_ts(1)%sf(j, k, l, q, i) = & + mv_ts(1)%sf(j, k, l, q, i) & + + dt*rhs_mv(j, k, l, q, i) + end do end do end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP end if if (bodyForces) call s_apply_bodyforces(q_cons_ts(1)%vf, q_prim_vf, rhs_vf, dt) @@ -607,72 +610,75 @@ contains if (bubbles_lagrange .and. .not. adap_dt) call s_update_lagrange_tdv_rk(stage=1) #if defined(__NVCOMPILER_GPU_UNIFIED_MEM) || defined(FRONTIER_UNIFIED) - $:GPU_PARALLEL_LOOP(collapse=4) - do i = 1, sys_size - do l = 0, p - do k = 0, n - do j = 0, m - q_cons_ts(2)%vf(i)%sf(j, k, l) = & - q_cons_ts(1)%vf(i)%sf(j, k, l) - q_cons_ts(1)%vf(i)%sf(j, k, l) = & - q_cons_ts(1)%vf(i)%sf(j, k, l) & - + dt*rhs_vf(i)%sf(j, k, l) + #:call GPU_PARALLEL_LOOP(collapse=4) + do i = 1, sys_size + do l = 0, p + do k = 0, n + do j = 0, m + q_cons_ts(2)%vf(i)%sf(j, k, l) = & + q_cons_ts(1)%vf(i)%sf(j, k, l) + q_cons_ts(1)%vf(i)%sf(j, k, l) = & + q_cons_ts(1)%vf(i)%sf(j, k, l) & + + dt*rhs_vf(i)%sf(j, k, l) + end do end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP dest = 1 ! Result in q_cons_ts(1)%vf #else - $:GPU_PARALLEL_LOOP(collapse=4) - do i = 1, sys_size - do l = 0, p - do k = 0, n - do j = 0, m - q_cons_ts(2)%vf(i)%sf(j, k, l) = & - q_cons_ts(1)%vf(i)%sf(j, k, l) & - + dt*rhs_vf(i)%sf(j, k, l) + #:call GPU_PARALLEL_LOOP(collapse=4) + do i = 1, sys_size + do l = 0, p + do k = 0, n + do j = 0, m + q_cons_ts(2)%vf(i)%sf(j, k, l) = & + q_cons_ts(1)%vf(i)%sf(j, k, l) & + + dt*rhs_vf(i)%sf(j, k, l) + end do end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP dest = 2 ! Result in q_cons_ts(2)%vf #endif - !Evolve pb and mv for non-polytropic qbmm if (qbmm .and. (.not. polytropic)) then - $:GPU_PARALLEL_LOOP(collapse=5) - do i = 1, nb - do l = 0, p - do k = 0, n - do j = 0, m - do q = 1, nnode - pb_ts(2)%sf(j, k, l, q, i) = & - pb_ts(1)%sf(j, k, l, q, i) & - + dt*rhs_pb(j, k, l, q, i) + #:call GPU_PARALLEL_LOOP(collapse=5) + do i = 1, nb + do l = 0, p + do k = 0, n + do j = 0, m + do q = 1, nnode + pb_ts(2)%sf(j, k, l, q, i) = & + pb_ts(1)%sf(j, k, l, q, i) & + + dt*rhs_pb(j, k, l, q, i) + end do end do end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP end if if (qbmm .and. (.not. polytropic)) then - $:GPU_PARALLEL_LOOP(collapse=5) - do i = 1, nb - do l = 0, p - do k = 0, n - do j = 0, m - do q = 1, nnode - mv_ts(2)%sf(j, k, l, q, i) = & - mv_ts(1)%sf(j, k, l, q, i) & - + dt*rhs_mv(j, k, l, q, i) + #:call GPU_PARALLEL_LOOP(collapse=5) + do i = 1, nb + do l = 0, p + do k = 0, n + do j = 0, m + do q = 1, nnode + mv_ts(2)%sf(j, k, l, q, i) = & + mv_ts(1)%sf(j, k, l, q, i) & + + dt*rhs_mv(j, k, l, q, i) + end do end do end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP end if if (bodyForces) call s_apply_bodyforces(q_cons_ts(dest)%vf, q_prim_vf, rhs_vf, dt) @@ -699,73 +705,76 @@ contains if (bubbles_lagrange .and. .not. adap_dt) call s_update_lagrange_tdv_rk(stage=2) #if defined(__NVCOMPILER_GPU_UNIFIED_MEM) || defined(FRONTIER_UNIFIED) - $:GPU_PARALLEL_LOOP(collapse=4) - do i = 1, sys_size - do l = 0, p - do k = 0, n - do j = 0, m - q_cons_ts(1)%vf(i)%sf(j, k, l) = & - (q_cons_ts(2)%vf(i)%sf(j, k, l) & - + q_cons_ts(1)%vf(i)%sf(j, k, l) & - + dt*rhs_vf(i)%sf(j, k, l))/2._wp + #:call GPU_PARALLEL_LOOP(collapse=4) + do i = 1, sys_size + do l = 0, p + do k = 0, n + do j = 0, m + q_cons_ts(1)%vf(i)%sf(j, k, l) = & + (q_cons_ts(2)%vf(i)%sf(j, k, l) & + + q_cons_ts(1)%vf(i)%sf(j, k, l) & + + dt*rhs_vf(i)%sf(j, k, l))/2._wp + end do end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP dest = 1 ! Result in q_cons_ts(1)%vf #else - $:GPU_PARALLEL_LOOP(collapse=4) - do i = 1, sys_size - do l = 0, p - do k = 0, n - do j = 0, m - q_cons_ts(1)%vf(i)%sf(j, k, l) = & - (q_cons_ts(1)%vf(i)%sf(j, k, l) & - + q_cons_ts(2)%vf(i)%sf(j, k, l) & - + dt*rhs_vf(i)%sf(j, k, l))/2._wp + #:call GPU_PARALLEL_LOOP(collapse=4) + do i = 1, sys_size + do l = 0, p + do k = 0, n + do j = 0, m + q_cons_ts(1)%vf(i)%sf(j, k, l) = & + (q_cons_ts(1)%vf(i)%sf(j, k, l) & + + q_cons_ts(2)%vf(i)%sf(j, k, l) & + + dt*rhs_vf(i)%sf(j, k, l))/2._wp + end do end do end do end do - end do - + #:endcall GPU_PARALLEL_LOOP dest = 1 ! Result in q_cons_ts(1)%vf #endif if (qbmm .and. (.not. polytropic)) then - $:GPU_PARALLEL_LOOP(collapse=5) - do i = 1, nb - do l = 0, p - do k = 0, n - do j = 0, m - do q = 1, nnode - pb_ts(1)%sf(j, k, l, q, i) = & - (pb_ts(1)%sf(j, k, l, q, i) & - + pb_ts(2)%sf(j, k, l, q, i) & - + dt*rhs_pb(j, k, l, q, i))/2._wp + #:call GPU_PARALLEL_LOOP(collapse=5) + do i = 1, nb + do l = 0, p + do k = 0, n + do j = 0, m + do q = 1, nnode + pb_ts(1)%sf(j, k, l, q, i) = & + (pb_ts(1)%sf(j, k, l, q, i) & + + pb_ts(2)%sf(j, k, l, q, i) & + + dt*rhs_pb(j, k, l, q, i))/2._wp + end do end do end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP end if if (qbmm .and. (.not. polytropic)) then - $:GPU_PARALLEL_LOOP(collapse=5) - do i = 1, nb - do l = 0, p - do k = 0, n - do j = 0, m - do q = 1, nnode - mv_ts(1)%sf(j, k, l, q, i) = & - (mv_ts(1)%sf(j, k, l, q, i) & - + mv_ts(2)%sf(j, k, l, q, i) & - + dt*rhs_mv(j, k, l, q, i))/2._wp + #:call GPU_PARALLEL_LOOP(collapse=5) + do i = 1, nb + do l = 0, p + do k = 0, n + do j = 0, m + do q = 1, nnode + mv_ts(1)%sf(j, k, l, q, i) = & + (mv_ts(1)%sf(j, k, l, q, i) & + + mv_ts(2)%sf(j, k, l, q, i) & + + dt*rhs_mv(j, k, l, q, i))/2._wp + end do end do end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP end if if (bodyForces) call s_apply_bodyforces(q_cons_ts(dest)%vf, q_prim_vf, rhs_vf, 2._wp*dt/3._wp) @@ -843,72 +852,75 @@ contains if (bubbles_lagrange .and. .not. adap_dt) call s_update_lagrange_tdv_rk(stage=1) #if defined(__NVCOMPILER_GPU_UNIFIED_MEM) || defined(FRONTIER_UNIFIED) - $:GPU_PARALLEL_LOOP(collapse=4) - do i = 1, sys_size - do l = 0, p - do k = 0, n - do j = 0, m - q_cons_ts(2)%vf(i)%sf(j, k, l) = & - q_cons_ts(1)%vf(i)%sf(j, k, l) - q_cons_ts(1)%vf(i)%sf(j, k, l) = & - q_cons_ts(1)%vf(i)%sf(j, k, l) & - + dt*rhs_vf(i)%sf(j, k, l) + #:call GPU_PARALLEL_LOOP(collapse=4) + do i = 1, sys_size + do l = 0, p + do k = 0, n + do j = 0, m + q_cons_ts(2)%vf(i)%sf(j, k, l) = & + q_cons_ts(1)%vf(i)%sf(j, k, l) + q_cons_ts(1)%vf(i)%sf(j, k, l) = & + q_cons_ts(1)%vf(i)%sf(j, k, l) & + + dt*rhs_vf(i)%sf(j, k, l) + end do end do end do end do - end do - + #:endcall GPU_PARALLEL_LOOP dest = 1 ! result in q_cons_ts(1)%vf #else - $:GPU_PARALLEL_LOOP(collapse=4) - do i = 1, sys_size - do l = 0, p - do k = 0, n - do j = 0, m - q_cons_ts(2)%vf(i)%sf(j, k, l) = & - q_cons_ts(1)%vf(i)%sf(j, k, l) & - + dt*rhs_vf(i)%sf(j, k, l) + #:call GPU_PARALLEL_LOOP(collapse=4) + do i = 1, sys_size + do l = 0, p + do k = 0, n + do j = 0, m + q_cons_ts(2)%vf(i)%sf(j, k, l) = & + q_cons_ts(1)%vf(i)%sf(j, k, l) & + + dt*rhs_vf(i)%sf(j, k, l) + end do end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP dest = 2 ! result in q_cons_ts(2)%vf #endif !Evolve pb and mv for non-polytropic qbmm if (qbmm .and. (.not. polytropic)) then - $:GPU_PARALLEL_LOOP(collapse=5) - do i = 1, nb - do l = 0, p - do k = 0, n - do j = 0, m - do q = 1, nnode - pb_ts(2)%sf(j, k, l, q, i) = & - pb_ts(1)%sf(j, k, l, q, i) & - + dt*rhs_pb(j, k, l, q, i) + #:call GPU_PARALLEL_LOOP(collapse=5) + do i = 1, nb + do l = 0, p + do k = 0, n + do j = 0, m + do q = 1, nnode + pb_ts(2)%sf(j, k, l, q, i) = & + pb_ts(1)%sf(j, k, l, q, i) & + + dt*rhs_pb(j, k, l, q, i) + end do end do end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP end if if (qbmm .and. (.not. polytropic)) then - $:GPU_PARALLEL_LOOP(collapse=5) - do i = 1, nb - do l = 0, p - do k = 0, n - do j = 0, m - do q = 1, nnode - mv_ts(2)%sf(j, k, l, q, i) = & - mv_ts(1)%sf(j, k, l, q, i) & - + dt*rhs_mv(j, k, l, q, i) + #:call GPU_PARALLEL_LOOP(collapse=5) + do i = 1, nb + do l = 0, p + do k = 0, n + do j = 0, m + do q = 1, nnode + mv_ts(2)%sf(j, k, l, q, i) = & + mv_ts(1)%sf(j, k, l, q, i) & + + dt*rhs_mv(j, k, l, q, i) + end do end do end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP end if if (bodyForces) call s_apply_bodyforces(q_cons_ts(dest)%vf, q_prim_vf, rhs_vf, dt) @@ -935,73 +947,76 @@ contains if (bubbles_lagrange .and. .not. adap_dt) call s_update_lagrange_tdv_rk(stage=2) #if defined(__NVCOMPILER_GPU_UNIFIED_MEM) || defined(FRONTIER_UNIFIED) - $:GPU_PARALLEL_LOOP(collapse=4) - do i = 1, sys_size - do l = 0, p - do k = 0, n - do j = 0, m - q_cons_ts(1)%vf(i)%sf(j, k, l) = & - (3._wp*q_cons_ts(2)%vf(i)%sf(j, k, l) & - + q_cons_ts(1)%vf(i)%sf(j, k, l) & - + dt*rhs_vf(i)%sf(j, k, l))/4._wp + #:call GPU_PARALLEL_LOOP(collapse=4) + do i = 1, sys_size + do l = 0, p + do k = 0, n + do j = 0, m + q_cons_ts(1)%vf(i)%sf(j, k, l) = & + (3._wp*q_cons_ts(2)%vf(i)%sf(j, k, l) & + + q_cons_ts(1)%vf(i)%sf(j, k, l) & + + dt*rhs_vf(i)%sf(j, k, l))/4._wp + end do end do end do end do - end do - + #:endcall GPU_PARALLEL_LOOP dest = 1 ! Result in q_cons_ts(1)%vf #else - $:GPU_PARALLEL_LOOP(collapse=4) - do i = 1, sys_size - do l = 0, p - do k = 0, n - do j = 0, m - q_cons_ts(2)%vf(i)%sf(j, k, l) = & - (3._wp*q_cons_ts(1)%vf(i)%sf(j, k, l) & - + q_cons_ts(2)%vf(i)%sf(j, k, l) & - + dt*rhs_vf(i)%sf(j, k, l))/4._wp + #:call GPU_PARALLEL_LOOP(collapse=4) + do i = 1, sys_size + do l = 0, p + do k = 0, n + do j = 0, m + q_cons_ts(2)%vf(i)%sf(j, k, l) = & + (3._wp*q_cons_ts(1)%vf(i)%sf(j, k, l) & + + q_cons_ts(2)%vf(i)%sf(j, k, l) & + + dt*rhs_vf(i)%sf(j, k, l))/4._wp + end do end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP dest = 2 ! Result in q_cons_ts(2)%vf #endif if (qbmm .and. (.not. polytropic)) then - $:GPU_PARALLEL_LOOP(collapse=5) - do i = 1, nb - do l = 0, p - do k = 0, n - do j = 0, m - do q = 1, nnode - pb_ts(2)%sf(j, k, l, q, i) = & - (3._wp*pb_ts(1)%sf(j, k, l, q, i) & - + pb_ts(2)%sf(j, k, l, q, i) & - + dt*rhs_pb(j, k, l, q, i))/4._wp + #:call GPU_PARALLEL_LOOP(collapse=5) + do i = 1, nb + do l = 0, p + do k = 0, n + do j = 0, m + do q = 1, nnode + pb_ts(2)%sf(j, k, l, q, i) = & + (3._wp*pb_ts(1)%sf(j, k, l, q, i) & + + pb_ts(2)%sf(j, k, l, q, i) & + + dt*rhs_pb(j, k, l, q, i))/4._wp + end do end do end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP end if if (qbmm .and. (.not. polytropic)) then - $:GPU_PARALLEL_LOOP(collapse=5) - do i = 1, nb - do l = 0, p - do k = 0, n - do j = 0, m - do q = 1, nnode - mv_ts(2)%sf(j, k, l, q, i) = & - (3._wp*mv_ts(1)%sf(j, k, l, q, i) & - + mv_ts(2)%sf(j, k, l, q, i) & - + dt*rhs_mv(j, k, l, q, i))/4._wp + #:call GPU_PARALLEL_LOOP(collapse=5) + do i = 1, nb + do l = 0, p + do k = 0, n + do j = 0, m + do q = 1, nnode + mv_ts(2)%sf(j, k, l, q, i) = & + (3._wp*mv_ts(1)%sf(j, k, l, q, i) & + + mv_ts(2)%sf(j, k, l, q, i) & + + dt*rhs_mv(j, k, l, q, i))/4._wp + end do end do end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP end if if (bodyForces) call s_apply_bodyforces(q_cons_ts(dest)%vf, q_prim_vf, rhs_vf, dt/4._wp) @@ -1028,73 +1043,76 @@ contains if (bubbles_lagrange .and. .not. adap_dt) call s_update_lagrange_tdv_rk(stage=3) #if defined(__NVCOMPILER_GPU_UNIFIED_MEM) || defined(FRONTIER_UNIFIED) - $:GPU_PARALLEL_LOOP(collapse=4) - do i = 1, sys_size - do l = 0, p - do k = 0, n - do j = 0, m - q_cons_ts(1)%vf(i)%sf(j, k, l) = & - (q_cons_ts(2)%vf(i)%sf(j, k, l) & - + 2._wp*q_cons_ts(1)%vf(i)%sf(j, k, l) & - + 2._wp*dt*rhs_vf(i)%sf(j, k, l))/3._wp + #:call GPU_PARALLEL_LOOP(collapse=4) + do i = 1, sys_size + do l = 0, p + do k = 0, n + do j = 0, m + q_cons_ts(1)%vf(i)%sf(j, k, l) = & + (q_cons_ts(2)%vf(i)%sf(j, k, l) & + + 2._wp*q_cons_ts(1)%vf(i)%sf(j, k, l) & + + 2._wp*dt*rhs_vf(i)%sf(j, k, l))/3._wp + end do end do end do end do - end do - + #:endcall GPU_PARALLEL_LOOP dest = 1 ! Result in q_cons_ts(1)%vf #else - $:GPU_PARALLEL_LOOP(collapse=4) - do i = 1, sys_size - do l = 0, p - do k = 0, n - do j = 0, m - q_cons_ts(1)%vf(i)%sf(j, k, l) = & - (q_cons_ts(1)%vf(i)%sf(j, k, l) & - + 2._wp*q_cons_ts(2)%vf(i)%sf(j, k, l) & - + 2._wp*dt*rhs_vf(i)%sf(j, k, l))/3._wp + #:call GPU_PARALLEL_LOOP(collapse=4) + do i = 1, sys_size + do l = 0, p + do k = 0, n + do j = 0, m + q_cons_ts(1)%vf(i)%sf(j, k, l) = & + (q_cons_ts(1)%vf(i)%sf(j, k, l) & + + 2._wp*q_cons_ts(2)%vf(i)%sf(j, k, l) & + + 2._wp*dt*rhs_vf(i)%sf(j, k, l))/3._wp + end do end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP dest = 1 ! Result in q_cons_ts(2)%vf #endif if (qbmm .and. (.not. polytropic)) then - $:GPU_PARALLEL_LOOP(collapse=5) - do i = 1, nb - do l = 0, p - do k = 0, n - do j = 0, m - do q = 1, nnode - pb_ts(1)%sf(j, k, l, q, i) = & - (pb_ts(1)%sf(j, k, l, q, i) & - + 2._wp*pb_ts(2)%sf(j, k, l, q, i) & - + 2._wp*dt*rhs_pb(j, k, l, q, i))/3._wp + #:call GPU_PARALLEL_LOOP(collapse=5) + do i = 1, nb + do l = 0, p + do k = 0, n + do j = 0, m + do q = 1, nnode + pb_ts(1)%sf(j, k, l, q, i) = & + (pb_ts(1)%sf(j, k, l, q, i) & + + 2._wp*pb_ts(2)%sf(j, k, l, q, i) & + + 2._wp*dt*rhs_pb(j, k, l, q, i))/3._wp + end do end do end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP end if if (qbmm .and. (.not. polytropic)) then - $:GPU_PARALLEL_LOOP(collapse=5) - do i = 1, nb - do l = 0, p - do k = 0, n - do j = 0, m - do q = 1, nnode - mv_ts(1)%sf(j, k, l, q, i) = & - (mv_ts(1)%sf(j, k, l, q, i) & - + 2._wp*mv_ts(2)%sf(j, k, l, q, i) & - + 2._wp*dt*rhs_mv(j, k, l, q, i))/3._wp + #:call GPU_PARALLEL_LOOP(collapse=5) + do i = 1, nb + do l = 0, p + do k = 0, n + do j = 0, m + do q = 1, nnode + mv_ts(1)%sf(j, k, l, q, i) = & + (mv_ts(1)%sf(j, k, l, q, i) & + + 2._wp*mv_ts(2)%sf(j, k, l, q, i) & + + 2._wp*dt*rhs_mv(j, k, l, q, i))/3._wp + end do end do end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP end if if (bodyForces) call s_apply_bodyforces(q_cons_ts(dest)%vf, q_prim_vf, rhs_vf, 2._wp*dt/3._wp) @@ -1236,23 +1254,24 @@ contains idwint) end if - $:GPU_PARALLEL_LOOP(collapse=3, private='[vel, alpha, Re]') - do l = 0, p - do k = 0, n - do j = 0, m - if (igr) then - call s_compute_enthalpy(q_cons_ts(1)%vf, pres, rho, gamma, pi_inf, Re, H, alpha, vel, vel_sum, j, k, l) - else - call s_compute_enthalpy(q_prim_vf, pres, rho, gamma, pi_inf, Re, H, alpha, vel, vel_sum, j, k, l) - end if + #:call GPU_PARALLEL_LOOP(collapse=3, private='[vel, alpha, Re]') + do l = 0, p + do k = 0, n + do j = 0, m + if (igr) then + call s_compute_enthalpy(q_cons_ts(1)%vf, pres, rho, gamma, pi_inf, Re, H, alpha, vel, vel_sum, j, k, l) + else + call s_compute_enthalpy(q_prim_vf, pres, rho, gamma, pi_inf, Re, H, alpha, vel, vel_sum, j, k, l) + end if - ! Compute mixture sound speed - call s_compute_speed_of_sound(pres, rho, gamma, pi_inf, H, alpha, vel_sum, 0._wp, c) + ! Compute mixture sound speed + call s_compute_speed_of_sound(pres, rho, gamma, pi_inf, H, alpha, vel_sum, 0._wp, c) - call s_compute_dt_from_cfl(vel, c, max_dt, rho, Re, j, k, l) + call s_compute_dt_from_cfl(vel, c, max_dt, rho, Re, j, k, l) + end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP #:call GPU_PARALLEL(copyout='[dt_local]', copyin='[max_dt]') dt_local = minval(max_dt) @@ -1283,17 +1302,18 @@ contains call nvtxStartRange("RHS-BODYFORCES") call s_compute_body_forces_rhs(q_prim_vf_in, q_cons_vf, rhs_vf_in) - $:GPU_PARALLEL_LOOP(collapse=4) - do i = momxb, E_idx - do l = 0, p - do k = 0, n - do j = 0, m - q_cons_vf(i)%sf(j, k, l) = q_cons_vf(i)%sf(j, k, l) + & - ldt*rhs_vf_in(i)%sf(j, k, l) + #:call GPU_PARALLEL_LOOP(collapse=4) + do i = momxb, E_idx + do l = 0, p + do k = 0, n + do j = 0, m + q_cons_vf(i)%sf(j, k, l) = q_cons_vf(i)%sf(j, k, l) + & + ldt*rhs_vf_in(i)%sf(j, k, l) + end do end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP call nvtxEndRange @@ -1309,63 +1329,68 @@ contains integer :: i, j, k, l !< Generic loop iterator if (t_step == t_step_start) then - $:GPU_PARALLEL_LOOP(collapse=4) - do i = 1, sys_size - do l = 0, p - do k = 0, n - do j = 0, m - q_prim_ts(3)%vf(i)%sf(j, k, l) = q_prim_vf(i)%sf(j, k, l) + #:call GPU_PARALLEL_LOOP(collapse=4) + do i = 1, sys_size + do l = 0, p + do k = 0, n + do j = 0, m + q_prim_ts(3)%vf(i)%sf(j, k, l) = q_prim_vf(i)%sf(j, k, l) + end do end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP elseif (t_step == t_step_start + 1) then - $:GPU_PARALLEL_LOOP(collapse=4) - do i = 1, sys_size - do l = 0, p - do k = 0, n - do j = 0, m - q_prim_ts(2)%vf(i)%sf(j, k, l) = q_prim_vf(i)%sf(j, k, l) + #:call GPU_PARALLEL_LOOP(collapse=4) + do i = 1, sys_size + do l = 0, p + do k = 0, n + do j = 0, m + q_prim_ts(2)%vf(i)%sf(j, k, l) = q_prim_vf(i)%sf(j, k, l) + end do end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP elseif (t_step == t_step_start + 2) then - $:GPU_PARALLEL_LOOP(collapse=4) - do i = 1, sys_size - do l = 0, p - do k = 0, n - do j = 0, m - q_prim_ts(1)%vf(i)%sf(j, k, l) = q_prim_vf(i)%sf(j, k, l) + #:call GPU_PARALLEL_LOOP(collapse=4) + do i = 1, sys_size + do l = 0, p + do k = 0, n + do j = 0, m + q_prim_ts(1)%vf(i)%sf(j, k, l) = q_prim_vf(i)%sf(j, k, l) + end do end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP elseif (t_step == t_step_start + 3) then - $:GPU_PARALLEL_LOOP(collapse=4) - do i = 1, sys_size - do l = 0, p - do k = 0, n - do j = 0, m - q_prim_ts(0)%vf(i)%sf(j, k, l) = q_prim_vf(i)%sf(j, k, l) + #:call GPU_PARALLEL_LOOP(collapse=4) + do i = 1, sys_size + do l = 0, p + do k = 0, n + do j = 0, m + q_prim_ts(0)%vf(i)%sf(j, k, l) = q_prim_vf(i)%sf(j, k, l) + end do end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP else ! All other timesteps - $:GPU_PARALLEL_LOOP(collapse=4) - do i = 1, sys_size - do l = 0, p - do k = 0, n - do j = 0, m - q_prim_ts(3)%vf(i)%sf(j, k, l) = q_prim_ts(2)%vf(i)%sf(j, k, l) - q_prim_ts(2)%vf(i)%sf(j, k, l) = q_prim_ts(1)%vf(i)%sf(j, k, l) - q_prim_ts(1)%vf(i)%sf(j, k, l) = q_prim_ts(0)%vf(i)%sf(j, k, l) - q_prim_ts(0)%vf(i)%sf(j, k, l) = q_prim_vf(i)%sf(j, k, l) + #:call GPU_PARALLEL_LOOP(collapse=4) + do i = 1, sys_size + do l = 0, p + do k = 0, n + do j = 0, m + q_prim_ts(3)%vf(i)%sf(j, k, l) = q_prim_ts(2)%vf(i)%sf(j, k, l) + q_prim_ts(2)%vf(i)%sf(j, k, l) = q_prim_ts(1)%vf(i)%sf(j, k, l) + q_prim_ts(1)%vf(i)%sf(j, k, l) = q_prim_ts(0)%vf(i)%sf(j, k, l) + q_prim_ts(0)%vf(i)%sf(j, k, l) = q_prim_vf(i)%sf(j, k, l) + end do end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP end if end subroutine s_time_step_cycling diff --git a/src/simulation/m_viscous.fpp b/src/simulation/m_viscous.fpp index c69f99330c..f6d8cb6b2d 100644 --- a/src/simulation/m_viscous.fpp +++ b/src/simulation/m_viscous.fpp @@ -80,438 +80,440 @@ contains $:GPU_UPDATE(device='[is1_viscous,is2_viscous,is3_viscous]') - $:GPU_PARALLEL_LOOP(collapse=3) - do l = is3_viscous%beg, is3_viscous%end - do k = is2_viscous%beg, is2_viscous%end - do j = is1_viscous%beg, is1_viscous%end - $:GPU_LOOP(parallelism='[seq]') - do i = momxb, E_idx - tau_Re_vf(i)%sf(j, k, l) = 0._wp + #:call GPU_PARALLEL_LOOP(collapse=3) + do l = is3_viscous%beg, is3_viscous%end + do k = is2_viscous%beg, is2_viscous%end + do j = is1_viscous%beg, is1_viscous%end + $:GPU_LOOP(parallelism='[seq]') + do i = momxb, E_idx + tau_Re_vf(i)%sf(j, k, l) = 0._wp + end do end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP + if (shear_stress) then ! Shear stresses - $:GPU_PARALLEL_LOOP(collapse=3, private='[alpha_visc, & - & alpha_rho_visc, Re_visc, tau_Re]') - do l = is3_viscous%beg, is3_viscous%end - do k = -1, 1 - do j = is1_viscous%beg, is1_viscous%end + #:call GPU_PARALLEL_LOOP(collapse=3, private='[alpha_visc, alpha_rho_visc, Re_visc, tau_Re]') + do l = is3_viscous%beg, is3_viscous%end + do k = -1, 1 + do j = is1_viscous%beg, is1_viscous%end - $:GPU_LOOP(parallelism='[seq]') - do i = 1, num_fluids - alpha_rho_visc(i) = q_prim_vf(i)%sf(j, k, l) - if (bubbles_euler .and. num_fluids == 1) then - alpha_visc(i) = 1._wp - q_prim_vf(E_idx + i)%sf(j, k, l) - else - alpha_visc(i) = q_prim_vf(E_idx + i)%sf(j, k, l) - end if - end do + $:GPU_LOOP(parallelism='[seq]') + do i = 1, num_fluids + alpha_rho_visc(i) = q_prim_vf(i)%sf(j, k, l) + if (bubbles_euler .and. num_fluids == 1) then + alpha_visc(i) = 1._wp - q_prim_vf(E_idx + i)%sf(j, k, l) + else + alpha_visc(i) = q_prim_vf(E_idx + i)%sf(j, k, l) + end if + end do - if (bubbles_euler) then - rho_visc = 0._wp - gamma_visc = 0._wp - pi_inf_visc = 0._wp + if (bubbles_euler) then + rho_visc = 0._wp + gamma_visc = 0._wp + pi_inf_visc = 0._wp - if (mpp_lim .and. (model_eqns == 2) .and. (num_fluids > 2)) then - $:GPU_LOOP(parallelism='[seq]') - do i = 1, num_fluids - rho_visc = rho_visc + alpha_rho_visc(i) - gamma_visc = gamma_visc + alpha_visc(i)*gammas(i) - pi_inf_visc = pi_inf_visc + alpha_visc(i)*pi_infs(i) - end do - else if ((model_eqns == 2) .and. (num_fluids > 2)) then - $:GPU_LOOP(parallelism='[seq]') - do i = 1, num_fluids - 1 - rho_visc = rho_visc + alpha_rho_visc(i) - gamma_visc = gamma_visc + alpha_visc(i)*gammas(i) - pi_inf_visc = pi_inf_visc + alpha_visc(i)*pi_infs(i) - end do + if (mpp_lim .and. (model_eqns == 2) .and. (num_fluids > 2)) then + $:GPU_LOOP(parallelism='[seq]') + do i = 1, num_fluids + rho_visc = rho_visc + alpha_rho_visc(i) + gamma_visc = gamma_visc + alpha_visc(i)*gammas(i) + pi_inf_visc = pi_inf_visc + alpha_visc(i)*pi_infs(i) + end do + else if ((model_eqns == 2) .and. (num_fluids > 2)) then + $:GPU_LOOP(parallelism='[seq]') + do i = 1, num_fluids - 1 + rho_visc = rho_visc + alpha_rho_visc(i) + gamma_visc = gamma_visc + alpha_visc(i)*gammas(i) + pi_inf_visc = pi_inf_visc + alpha_visc(i)*pi_infs(i) + end do + else + rho_visc = alpha_rho_visc(1) + gamma_visc = gammas(1) + pi_inf_visc = pi_infs(1) + end if else - rho_visc = alpha_rho_visc(1) - gamma_visc = gammas(1) - pi_inf_visc = pi_infs(1) - end if - else - rho_visc = 0._wp - gamma_visc = 0._wp - pi_inf_visc = 0._wp - - alpha_visc_sum = 0._wp + rho_visc = 0._wp + gamma_visc = 0._wp + pi_inf_visc = 0._wp - if (mpp_lim) then - $:GPU_LOOP(parallelism='[seq]') - do i = 1, num_fluids - alpha_rho_visc(i) = max(0._wp, alpha_rho_visc(i)) - alpha_visc(i) = min(max(0._wp, alpha_visc(i)), 1._wp) - alpha_visc_sum = alpha_visc_sum + alpha_visc(i) - end do + alpha_visc_sum = 0._wp - alpha_visc = alpha_visc/max(alpha_visc_sum, sgm_eps) + if (mpp_lim) then + $:GPU_LOOP(parallelism='[seq]') + do i = 1, num_fluids + alpha_rho_visc(i) = max(0._wp, alpha_rho_visc(i)) + alpha_visc(i) = min(max(0._wp, alpha_visc(i)), 1._wp) + alpha_visc_sum = alpha_visc_sum + alpha_visc(i) + end do - end if + alpha_visc = alpha_visc/max(alpha_visc_sum, sgm_eps) - $:GPU_LOOP(parallelism='[seq]') - do i = 1, num_fluids - rho_visc = rho_visc + alpha_rho_visc(i) - gamma_visc = gamma_visc + alpha_visc(i)*gammas(i) - pi_inf_visc = pi_inf_visc + alpha_visc(i)*pi_infs(i) - end do + end if - if (viscous) then $:GPU_LOOP(parallelism='[seq]') - do i = 1, 2 - Re_visc(i) = dflt_real + do i = 1, num_fluids + rho_visc = rho_visc + alpha_rho_visc(i) + gamma_visc = gamma_visc + alpha_visc(i)*gammas(i) + pi_inf_visc = pi_inf_visc + alpha_visc(i)*pi_infs(i) + end do - if (Re_size(i) > 0) Re_visc(i) = 0._wp + if (viscous) then $:GPU_LOOP(parallelism='[seq]') - do q = 1, Re_size(i) - Re_visc(i) = alpha_visc(Re_idx(i, q))/Res_viscous(i, q) & - + Re_visc(i) - end do + do i = 1, 2 + Re_visc(i) = dflt_real + + if (Re_size(i) > 0) Re_visc(i) = 0._wp + $:GPU_LOOP(parallelism='[seq]') + do q = 1, Re_size(i) + Re_visc(i) = alpha_visc(Re_idx(i, q))/Res_viscous(i, q) & + + Re_visc(i) + end do - Re_visc(i) = 1._wp/max(Re_visc(i), sgm_eps) + Re_visc(i) = 1._wp/max(Re_visc(i), sgm_eps) - end do + end do + end if end if - end if - - tau_Re(2, 1) = (grad_y_vf(1)%sf(j, k, l) + & - grad_x_vf(2)%sf(j, k, l))/ & - Re_visc(1) - tau_Re(2, 2) = (4._wp*grad_y_vf(2)%sf(j, k, l) & - - 2._wp*grad_x_vf(1)%sf(j, k, l) & - - 2._wp*q_prim_vf(momxb + 1)%sf(j, k, l)/y_cc(k))/ & - (3._wp*Re_visc(1)) - $:GPU_LOOP(parallelism='[seq]') - do i = 1, 2 - tau_Re_vf(contxe + i)%sf(j, k, l) = & - tau_Re_vf(contxe + i)%sf(j, k, l) - & - tau_Re(2, i) + tau_Re(2, 1) = (grad_y_vf(1)%sf(j, k, l) + & + grad_x_vf(2)%sf(j, k, l))/ & + Re_visc(1) - tau_Re_vf(E_idx)%sf(j, k, l) = & - tau_Re_vf(E_idx)%sf(j, k, l) - & - q_prim_vf(contxe + i)%sf(j, k, l)*tau_Re(2, i) + tau_Re(2, 2) = (4._wp*grad_y_vf(2)%sf(j, k, l) & + - 2._wp*grad_x_vf(1)%sf(j, k, l) & + - 2._wp*q_prim_vf(momxb + 1)%sf(j, k, l)/y_cc(k))/ & + (3._wp*Re_visc(1)) + $:GPU_LOOP(parallelism='[seq]') + do i = 1, 2 + tau_Re_vf(contxe + i)%sf(j, k, l) = & + tau_Re_vf(contxe + i)%sf(j, k, l) - & + tau_Re(2, i) + + tau_Re_vf(E_idx)%sf(j, k, l) = & + tau_Re_vf(E_idx)%sf(j, k, l) - & + q_prim_vf(contxe + i)%sf(j, k, l)*tau_Re(2, i) + end do end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP end if if (bulk_stress) then ! Bulk stresses - $:GPU_PARALLEL_LOOP(collapse=3, private='[alpha_visc, & - & alpha_rho_visc, Re_visc, tau_Re]') - do l = is3_viscous%beg, is3_viscous%end - do k = -1, 1 - do j = is1_viscous%beg, is1_viscous%end + #:call GPU_PARALLEL_LOOP(collapse=3, private='[alpha_visc, alpha_rho_visc, Re_visc, tau_Re]') + do l = is3_viscous%beg, is3_viscous%end + do k = -1, 1 + do j = is1_viscous%beg, is1_viscous%end - $:GPU_LOOP(parallelism='[seq]') - do i = 1, num_fluids - alpha_rho_visc(i) = q_prim_vf(i)%sf(j, k, l) - if (bubbles_euler .and. num_fluids == 1) then - alpha_visc(i) = 1._wp - q_prim_vf(E_idx + i)%sf(j, k, l) - else - alpha_visc(i) = q_prim_vf(E_idx + i)%sf(j, k, l) - end if - end do + $:GPU_LOOP(parallelism='[seq]') + do i = 1, num_fluids + alpha_rho_visc(i) = q_prim_vf(i)%sf(j, k, l) + if (bubbles_euler .and. num_fluids == 1) then + alpha_visc(i) = 1._wp - q_prim_vf(E_idx + i)%sf(j, k, l) + else + alpha_visc(i) = q_prim_vf(E_idx + i)%sf(j, k, l) + end if + end do - if (bubbles_euler) then - rho_visc = 0._wp - gamma_visc = 0._wp - pi_inf_visc = 0._wp + if (bubbles_euler) then + rho_visc = 0._wp + gamma_visc = 0._wp + pi_inf_visc = 0._wp - if (mpp_lim .and. (model_eqns == 2) .and. (num_fluids > 2)) then - $:GPU_LOOP(parallelism='[seq]') - do i = 1, num_fluids - rho_visc = rho_visc + alpha_rho_visc(i) - gamma_visc = gamma_visc + alpha_visc(i)*gammas(i) - pi_inf_visc = pi_inf_visc + alpha_visc(i)*pi_infs(i) - end do - else if ((model_eqns == 2) .and. (num_fluids > 2)) then - $:GPU_LOOP(parallelism='[seq]') - do i = 1, num_fluids - 1 - rho_visc = rho_visc + alpha_rho_visc(i) - gamma_visc = gamma_visc + alpha_visc(i)*gammas(i) - pi_inf_visc = pi_inf_visc + alpha_visc(i)*pi_infs(i) - end do + if (mpp_lim .and. (model_eqns == 2) .and. (num_fluids > 2)) then + $:GPU_LOOP(parallelism='[seq]') + do i = 1, num_fluids + rho_visc = rho_visc + alpha_rho_visc(i) + gamma_visc = gamma_visc + alpha_visc(i)*gammas(i) + pi_inf_visc = pi_inf_visc + alpha_visc(i)*pi_infs(i) + end do + else if ((model_eqns == 2) .and. (num_fluids > 2)) then + $:GPU_LOOP(parallelism='[seq]') + do i = 1, num_fluids - 1 + rho_visc = rho_visc + alpha_rho_visc(i) + gamma_visc = gamma_visc + alpha_visc(i)*gammas(i) + pi_inf_visc = pi_inf_visc + alpha_visc(i)*pi_infs(i) + end do + else + rho_visc = alpha_rho_visc(1) + gamma_visc = gammas(1) + pi_inf_visc = pi_infs(1) + end if else - rho_visc = alpha_rho_visc(1) - gamma_visc = gammas(1) - pi_inf_visc = pi_infs(1) - end if - else - rho_visc = 0._wp - gamma_visc = 0._wp - pi_inf_visc = 0._wp - - alpha_visc_sum = 0._wp + rho_visc = 0._wp + gamma_visc = 0._wp + pi_inf_visc = 0._wp - if (mpp_lim) then - $:GPU_LOOP(parallelism='[seq]') - do i = 1, num_fluids - alpha_rho_visc(i) = max(0._wp, alpha_rho_visc(i)) - alpha_visc(i) = min(max(0._wp, alpha_visc(i)), 1._wp) - alpha_visc_sum = alpha_visc_sum + alpha_visc(i) - end do + alpha_visc_sum = 0._wp - alpha_visc = alpha_visc/max(alpha_visc_sum, sgm_eps) + if (mpp_lim) then + $:GPU_LOOP(parallelism='[seq]') + do i = 1, num_fluids + alpha_rho_visc(i) = max(0._wp, alpha_rho_visc(i)) + alpha_visc(i) = min(max(0._wp, alpha_visc(i)), 1._wp) + alpha_visc_sum = alpha_visc_sum + alpha_visc(i) + end do - end if + alpha_visc = alpha_visc/max(alpha_visc_sum, sgm_eps) - $:GPU_LOOP(parallelism='[seq]') - do i = 1, num_fluids - rho_visc = rho_visc + alpha_rho_visc(i) - gamma_visc = gamma_visc + alpha_visc(i)*gammas(i) - pi_inf_visc = pi_inf_visc + alpha_visc(i)*pi_infs(i) - end do + end if - if (viscous) then $:GPU_LOOP(parallelism='[seq]') - do i = 1, 2 - Re_visc(i) = dflt_real + do i = 1, num_fluids + rho_visc = rho_visc + alpha_rho_visc(i) + gamma_visc = gamma_visc + alpha_visc(i)*gammas(i) + pi_inf_visc = pi_inf_visc + alpha_visc(i)*pi_infs(i) + end do - if (Re_size(i) > 0) Re_visc(i) = 0._wp + if (viscous) then $:GPU_LOOP(parallelism='[seq]') - do q = 1, Re_size(i) - Re_visc(i) = alpha_visc(Re_idx(i, q))/Res_viscous(i, q) & - + Re_visc(i) - end do + do i = 1, 2 + Re_visc(i) = dflt_real + + if (Re_size(i) > 0) Re_visc(i) = 0._wp + $:GPU_LOOP(parallelism='[seq]') + do q = 1, Re_size(i) + Re_visc(i) = alpha_visc(Re_idx(i, q))/Res_viscous(i, q) & + + Re_visc(i) + end do - Re_visc(i) = 1._wp/max(Re_visc(i), sgm_eps) + Re_visc(i) = 1._wp/max(Re_visc(i), sgm_eps) - end do + end do + end if end if - end if - tau_Re(2, 2) = (grad_x_vf(1)%sf(j, k, l) + & - grad_y_vf(2)%sf(j, k, l) + & - q_prim_vf(momxb + 1)%sf(j, k, l)/y_cc(k))/ & - Re_visc(2) + tau_Re(2, 2) = (grad_x_vf(1)%sf(j, k, l) + & + grad_y_vf(2)%sf(j, k, l) + & + q_prim_vf(momxb + 1)%sf(j, k, l)/y_cc(k))/ & + Re_visc(2) - tau_Re_vf(momxb + 1)%sf(j, k, l) = & - tau_Re_vf(momxb + 1)%sf(j, k, l) - & - tau_Re(2, 2) + tau_Re_vf(momxb + 1)%sf(j, k, l) = & + tau_Re_vf(momxb + 1)%sf(j, k, l) - & + tau_Re(2, 2) - tau_Re_vf(E_idx)%sf(j, k, l) = & - tau_Re_vf(E_idx)%sf(j, k, l) - & - q_prim_vf(momxb + 1)%sf(j, k, l)*tau_Re(2, 2) + tau_Re_vf(E_idx)%sf(j, k, l) = & + tau_Re_vf(E_idx)%sf(j, k, l) - & + q_prim_vf(momxb + 1)%sf(j, k, l)*tau_Re(2, 2) + end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP end if if (p == 0) return if (shear_stress) then ! Shear stresses - $:GPU_PARALLEL_LOOP(collapse=3, private='[alpha_visc, & - & alpha_rho_visc, Re_visc, tau_Re]') - do l = is3_viscous%beg, is3_viscous%end - do k = -1, 1 - do j = is1_viscous%beg, is1_viscous%end + #:call GPU_PARALLEL_LOOP(collapse=3, private='[alpha_visc, alpha_rho_visc, Re_visc, tau_Re]') + do l = is3_viscous%beg, is3_viscous%end + do k = -1, 1 + do j = is1_viscous%beg, is1_viscous%end - $:GPU_LOOP(parallelism='[seq]') - do i = 1, num_fluids - alpha_rho_visc(i) = q_prim_vf(i)%sf(j, k, l) - if (bubbles_euler .and. num_fluids == 1) then - alpha_visc(i) = 1._wp - q_prim_vf(E_idx + i)%sf(j, k, l) - else - alpha_visc(i) = q_prim_vf(E_idx + i)%sf(j, k, l) - end if - end do + $:GPU_LOOP(parallelism='[seq]') + do i = 1, num_fluids + alpha_rho_visc(i) = q_prim_vf(i)%sf(j, k, l) + if (bubbles_euler .and. num_fluids == 1) then + alpha_visc(i) = 1._wp - q_prim_vf(E_idx + i)%sf(j, k, l) + else + alpha_visc(i) = q_prim_vf(E_idx + i)%sf(j, k, l) + end if + end do - if (bubbles_euler) then - rho_visc = 0._wp - gamma_visc = 0._wp - pi_inf_visc = 0._wp + if (bubbles_euler) then + rho_visc = 0._wp + gamma_visc = 0._wp + pi_inf_visc = 0._wp - if (mpp_lim .and. (model_eqns == 2) .and. (num_fluids > 2)) then - $:GPU_LOOP(parallelism='[seq]') - do i = 1, num_fluids - rho_visc = rho_visc + alpha_rho_visc(i) - gamma_visc = gamma_visc + alpha_visc(i)*gammas(i) - pi_inf_visc = pi_inf_visc + alpha_visc(i)*pi_infs(i) - end do - else if ((model_eqns == 2) .and. (num_fluids > 2)) then - $:GPU_LOOP(parallelism='[seq]') - do i = 1, num_fluids - 1 - rho_visc = rho_visc + alpha_rho_visc(i) - gamma_visc = gamma_visc + alpha_visc(i)*gammas(i) - pi_inf_visc = pi_inf_visc + alpha_visc(i)*pi_infs(i) - end do + if (mpp_lim .and. (model_eqns == 2) .and. (num_fluids > 2)) then + $:GPU_LOOP(parallelism='[seq]') + do i = 1, num_fluids + rho_visc = rho_visc + alpha_rho_visc(i) + gamma_visc = gamma_visc + alpha_visc(i)*gammas(i) + pi_inf_visc = pi_inf_visc + alpha_visc(i)*pi_infs(i) + end do + else if ((model_eqns == 2) .and. (num_fluids > 2)) then + $:GPU_LOOP(parallelism='[seq]') + do i = 1, num_fluids - 1 + rho_visc = rho_visc + alpha_rho_visc(i) + gamma_visc = gamma_visc + alpha_visc(i)*gammas(i) + pi_inf_visc = pi_inf_visc + alpha_visc(i)*pi_infs(i) + end do + else + rho_visc = alpha_rho_visc(1) + gamma_visc = gammas(1) + pi_inf_visc = pi_infs(1) + end if else - rho_visc = alpha_rho_visc(1) - gamma_visc = gammas(1) - pi_inf_visc = pi_infs(1) - end if - else - rho_visc = 0._wp - gamma_visc = 0._wp - pi_inf_visc = 0._wp + rho_visc = 0._wp + gamma_visc = 0._wp + pi_inf_visc = 0._wp - alpha_visc_sum = 0._wp - - if (mpp_lim) then - $:GPU_LOOP(parallelism='[seq]') - do i = 1, num_fluids - alpha_rho_visc(i) = max(0._wp, alpha_rho_visc(i)) - alpha_visc(i) = min(max(0._wp, alpha_visc(i)), 1._wp) - alpha_visc_sum = alpha_visc_sum + alpha_visc(i) - end do + alpha_visc_sum = 0._wp - alpha_visc = alpha_visc/max(alpha_visc_sum, sgm_eps) + if (mpp_lim) then + $:GPU_LOOP(parallelism='[seq]') + do i = 1, num_fluids + alpha_rho_visc(i) = max(0._wp, alpha_rho_visc(i)) + alpha_visc(i) = min(max(0._wp, alpha_visc(i)), 1._wp) + alpha_visc_sum = alpha_visc_sum + alpha_visc(i) + end do - end if + alpha_visc = alpha_visc/max(alpha_visc_sum, sgm_eps) - $:GPU_LOOP(parallelism='[seq]') - do i = 1, num_fluids - rho_visc = rho_visc + alpha_rho_visc(i) - gamma_visc = gamma_visc + alpha_visc(i)*gammas(i) - pi_inf_visc = pi_inf_visc + alpha_visc(i)*pi_infs(i) - end do + end if - if (viscous) then $:GPU_LOOP(parallelism='[seq]') - do i = 1, 2 - Re_visc(i) = dflt_real + do i = 1, num_fluids + rho_visc = rho_visc + alpha_rho_visc(i) + gamma_visc = gamma_visc + alpha_visc(i)*gammas(i) + pi_inf_visc = pi_inf_visc + alpha_visc(i)*pi_infs(i) + end do - if (Re_size(i) > 0) Re_visc(i) = 0._wp + if (viscous) then $:GPU_LOOP(parallelism='[seq]') - do q = 1, Re_size(i) - Re_visc(i) = alpha_visc(Re_idx(i, q))/Res_viscous(i, q) & - + Re_visc(i) - end do + do i = 1, 2 + Re_visc(i) = dflt_real + + if (Re_size(i) > 0) Re_visc(i) = 0._wp + $:GPU_LOOP(parallelism='[seq]') + do q = 1, Re_size(i) + Re_visc(i) = alpha_visc(Re_idx(i, q))/Res_viscous(i, q) & + + Re_visc(i) + end do - Re_visc(i) = 1._wp/max(Re_visc(i), sgm_eps) + Re_visc(i) = 1._wp/max(Re_visc(i), sgm_eps) - end do + end do + end if end if - end if - tau_Re(2, 2) = -(2._wp/3._wp)*grad_z_vf(3)%sf(j, k, l)/y_cc(k)/ & - Re_visc(1) + tau_Re(2, 2) = -(2._wp/3._wp)*grad_z_vf(3)%sf(j, k, l)/y_cc(k)/ & + Re_visc(1) - tau_Re(2, 3) = ((grad_z_vf(2)%sf(j, k, l) - & - q_prim_vf(momxe)%sf(j, k, l))/ & - y_cc(k) + grad_y_vf(3)%sf(j, k, l))/ & - Re_visc(1) + tau_Re(2, 3) = ((grad_z_vf(2)%sf(j, k, l) - & + q_prim_vf(momxe)%sf(j, k, l))/ & + y_cc(k) + grad_y_vf(3)%sf(j, k, l))/ & + Re_visc(1) - $:GPU_LOOP(parallelism='[seq]') - do i = 2, 3 - tau_Re_vf(contxe + i)%sf(j, k, l) = & - tau_Re_vf(contxe + i)%sf(j, k, l) - & - tau_Re(2, i) + $:GPU_LOOP(parallelism='[seq]') + do i = 2, 3 + tau_Re_vf(contxe + i)%sf(j, k, l) = & + tau_Re_vf(contxe + i)%sf(j, k, l) - & + tau_Re(2, i) + + tau_Re_vf(E_idx)%sf(j, k, l) = & + tau_Re_vf(E_idx)%sf(j, k, l) - & + q_prim_vf(contxe + i)%sf(j, k, l)*tau_Re(2, i) + end do - tau_Re_vf(E_idx)%sf(j, k, l) = & - tau_Re_vf(E_idx)%sf(j, k, l) - & - q_prim_vf(contxe + i)%sf(j, k, l)*tau_Re(2, i) end do - end do end do - end do + #:endcall GPU_PARALLEL_LOOP end if if (bulk_stress) then ! Bulk stresses - $:GPU_PARALLEL_LOOP(collapse=3, private='[alpha_visc, & - & alpha_rho_visc, Re_visc, tau_Re]') - do l = is3_viscous%beg, is3_viscous%end - do k = -1, 1 - do j = is1_viscous%beg, is1_viscous%end + #:call GPU_PARALLEL_LOOP(collapse=3, private='[alpha_visc, alpha_rho_visc, Re_visc, tau_Re]') + do l = is3_viscous%beg, is3_viscous%end + do k = -1, 1 + do j = is1_viscous%beg, is1_viscous%end - $:GPU_LOOP(parallelism='[seq]') - do i = 1, num_fluids - alpha_rho_visc(i) = q_prim_vf(i)%sf(j, k, l) - if (bubbles_euler .and. num_fluids == 1) then - alpha_visc(i) = 1._wp - q_prim_vf(E_idx + i)%sf(j, k, l) - else - alpha_visc(i) = q_prim_vf(E_idx + i)%sf(j, k, l) - end if - end do + $:GPU_LOOP(parallelism='[seq]') + do i = 1, num_fluids + alpha_rho_visc(i) = q_prim_vf(i)%sf(j, k, l) + if (bubbles_euler .and. num_fluids == 1) then + alpha_visc(i) = 1._wp - q_prim_vf(E_idx + i)%sf(j, k, l) + else + alpha_visc(i) = q_prim_vf(E_idx + i)%sf(j, k, l) + end if + end do - if (bubbles_euler) then - rho_visc = 0._wp - gamma_visc = 0._wp - pi_inf_visc = 0._wp + if (bubbles_euler) then + rho_visc = 0._wp + gamma_visc = 0._wp + pi_inf_visc = 0._wp - if (mpp_lim .and. (model_eqns == 2) .and. (num_fluids > 2)) then - $:GPU_LOOP(parallelism='[seq]') - do i = 1, num_fluids - rho_visc = rho_visc + alpha_rho_visc(i) - gamma_visc = gamma_visc + alpha_visc(i)*gammas(i) - pi_inf_visc = pi_inf_visc + alpha_visc(i)*pi_infs(i) - end do - else if ((model_eqns == 2) .and. (num_fluids > 2)) then - $:GPU_LOOP(parallelism='[seq]') - do i = 1, num_fluids - 1 - rho_visc = rho_visc + alpha_rho_visc(i) - gamma_visc = gamma_visc + alpha_visc(i)*gammas(i) - pi_inf_visc = pi_inf_visc + alpha_visc(i)*pi_infs(i) - end do + if (mpp_lim .and. (model_eqns == 2) .and. (num_fluids > 2)) then + $:GPU_LOOP(parallelism='[seq]') + do i = 1, num_fluids + rho_visc = rho_visc + alpha_rho_visc(i) + gamma_visc = gamma_visc + alpha_visc(i)*gammas(i) + pi_inf_visc = pi_inf_visc + alpha_visc(i)*pi_infs(i) + end do + else if ((model_eqns == 2) .and. (num_fluids > 2)) then + $:GPU_LOOP(parallelism='[seq]') + do i = 1, num_fluids - 1 + rho_visc = rho_visc + alpha_rho_visc(i) + gamma_visc = gamma_visc + alpha_visc(i)*gammas(i) + pi_inf_visc = pi_inf_visc + alpha_visc(i)*pi_infs(i) + end do + else + rho_visc = alpha_rho_visc(1) + gamma_visc = gammas(1) + pi_inf_visc = pi_infs(1) + end if else - rho_visc = alpha_rho_visc(1) - gamma_visc = gammas(1) - pi_inf_visc = pi_infs(1) - end if - else - rho_visc = 0._wp - gamma_visc = 0._wp - pi_inf_visc = 0._wp + rho_visc = 0._wp + gamma_visc = 0._wp + pi_inf_visc = 0._wp - alpha_visc_sum = 0._wp + alpha_visc_sum = 0._wp - if (mpp_lim) then - $:GPU_LOOP(parallelism='[seq]') - do i = 1, num_fluids - alpha_rho_visc(i) = max(0._wp, alpha_rho_visc(i)) - alpha_visc(i) = min(max(0._wp, alpha_visc(i)), 1._wp) - alpha_visc_sum = alpha_visc_sum + alpha_visc(i) - end do - - alpha_visc = alpha_visc/max(alpha_visc_sum, sgm_eps) + if (mpp_lim) then + $:GPU_LOOP(parallelism='[seq]') + do i = 1, num_fluids + alpha_rho_visc(i) = max(0._wp, alpha_rho_visc(i)) + alpha_visc(i) = min(max(0._wp, alpha_visc(i)), 1._wp) + alpha_visc_sum = alpha_visc_sum + alpha_visc(i) + end do - end if + alpha_visc = alpha_visc/max(alpha_visc_sum, sgm_eps) - $:GPU_LOOP(parallelism='[seq]') - do i = 1, num_fluids - rho_visc = rho_visc + alpha_rho_visc(i) - gamma_visc = gamma_visc + alpha_visc(i)*gammas(i) - pi_inf_visc = pi_inf_visc + alpha_visc(i)*pi_infs(i) - end do + end if - if (viscous) then $:GPU_LOOP(parallelism='[seq]') - do i = 1, 2 - Re_visc(i) = dflt_real + do i = 1, num_fluids + rho_visc = rho_visc + alpha_rho_visc(i) + gamma_visc = gamma_visc + alpha_visc(i)*gammas(i) + pi_inf_visc = pi_inf_visc + alpha_visc(i)*pi_infs(i) + end do - if (Re_size(i) > 0) Re_visc(i) = 0._wp + if (viscous) then $:GPU_LOOP(parallelism='[seq]') - do q = 1, Re_size(i) - Re_visc(i) = alpha_visc(Re_idx(i, q))/Res_viscous(i, q) & - + Re_visc(i) - end do + do i = 1, 2 + Re_visc(i) = dflt_real + + if (Re_size(i) > 0) Re_visc(i) = 0._wp + $:GPU_LOOP(parallelism='[seq]') + do q = 1, Re_size(i) + Re_visc(i) = alpha_visc(Re_idx(i, q))/Res_viscous(i, q) & + + Re_visc(i) + end do - Re_visc(i) = 1._wp/max(Re_visc(i), sgm_eps) + Re_visc(i) = 1._wp/max(Re_visc(i), sgm_eps) - end do + end do + end if end if - end if - tau_Re(2, 2) = grad_z_vf(3)%sf(j, k, l)/y_cc(k)/ & - Re_visc(2) + tau_Re(2, 2) = grad_z_vf(3)%sf(j, k, l)/y_cc(k)/ & + Re_visc(2) - tau_Re_vf(momxb + 1)%sf(j, k, l) = & - tau_Re_vf(momxb + 1)%sf(j, k, l) - & - tau_Re(2, 2) + tau_Re_vf(momxb + 1)%sf(j, k, l) = & + tau_Re_vf(momxb + 1)%sf(j, k, l) - & + tau_Re(2, 2) - tau_Re_vf(E_idx)%sf(j, k, l) = & - tau_Re_vf(E_idx)%sf(j, k, l) - & - q_prim_vf(momxb + 1)%sf(j, k, l)*tau_Re(2, 2) + tau_Re_vf(E_idx)%sf(j, k, l) = & + tau_Re_vf(E_idx)%sf(j, k, l) - & + q_prim_vf(momxb + 1)%sf(j, k, l)*tau_Re(2, 2) + end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP end if end subroutine s_compute_viscous_stress_tensor @@ -596,345 +598,363 @@ contains $:GPU_UPDATE(device='[is1_viscous,is2_viscous,is3_viscous]') - $:GPU_PARALLEL_LOOP(collapse=3) - do l = is3_viscous%beg, is3_viscous%end - do k = iy%beg, iy%end - do j = is1_viscous%beg + 1, is1_viscous%end - $:GPU_LOOP(parallelism='[seq]') - do i = iv%beg, iv%end - dqL_prim_dx_n(1)%vf(i)%sf(j, k, l) = & - (q_prim_qp%vf(i)%sf(j, k, l) - & - q_prim_qp%vf(i)%sf(j - 1, k, l))/ & - (x_cc(j) - x_cc(j - 1)) + #:call GPU_PARALLEL_LOOP(collapse=3) + do l = is3_viscous%beg, is3_viscous%end + do k = iy%beg, iy%end + do j = is1_viscous%beg + 1, is1_viscous%end + $:GPU_LOOP(parallelism='[seq]') + do i = iv%beg, iv%end + dqL_prim_dx_n(1)%vf(i)%sf(j, k, l) = & + (q_prim_qp%vf(i)%sf(j, k, l) - & + q_prim_qp%vf(i)%sf(j - 1, k, l))/ & + (x_cc(j) - x_cc(j - 1)) + end do end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP - $:GPU_PARALLEL_LOOP(collapse=3) - do l = is3_viscous%beg, is3_viscous%end - do k = is2_viscous%beg, is2_viscous%end - do j = is1_viscous%beg, is1_viscous%end - 1 - $:GPU_LOOP(parallelism='[seq]') - do i = iv%beg, iv%end - dqR_prim_dx_n(1)%vf(i)%sf(j, k, l) = & - (q_prim_qp%vf(i)%sf(j + 1, k, l) - & - q_prim_qp%vf(i)%sf(j, k, l))/ & - (x_cc(j + 1) - x_cc(j)) + #:call GPU_PARALLEL_LOOP(collapse=3) + do l = is3_viscous%beg, is3_viscous%end + do k = is2_viscous%beg, is2_viscous%end + do j = is1_viscous%beg, is1_viscous%end - 1 + $:GPU_LOOP(parallelism='[seq]') + do i = iv%beg, iv%end + dqR_prim_dx_n(1)%vf(i)%sf(j, k, l) = & + (q_prim_qp%vf(i)%sf(j + 1, k, l) - & + q_prim_qp%vf(i)%sf(j, k, l))/ & + (x_cc(j + 1) - x_cc(j)) + end do end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP if (n > 0) then - $:GPU_PARALLEL_LOOP(collapse=3) - do l = is3_viscous%beg, is3_viscous%end - do j = is2_viscous%beg + 1, is2_viscous%end - do k = is1_viscous%beg, is1_viscous%end - $:GPU_LOOP(parallelism='[seq]') - do i = iv%beg, iv%end - dqL_prim_dy_n(2)%vf(i)%sf(k, j, l) = & - (q_prim_qp%vf(i)%sf(k, j, l) - & - q_prim_qp%vf(i)%sf(k, j - 1, l))/ & - (y_cc(j) - y_cc(j - 1)) + #:call GPU_PARALLEL_LOOP(collapse=3) + do l = is3_viscous%beg, is3_viscous%end + do j = is2_viscous%beg + 1, is2_viscous%end + do k = is1_viscous%beg, is1_viscous%end + $:GPU_LOOP(parallelism='[seq]') + do i = iv%beg, iv%end + dqL_prim_dy_n(2)%vf(i)%sf(k, j, l) = & + (q_prim_qp%vf(i)%sf(k, j, l) - & + q_prim_qp%vf(i)%sf(k, j - 1, l))/ & + (y_cc(j) - y_cc(j - 1)) + end do end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP - $:GPU_PARALLEL_LOOP(collapse=3) - do l = is3_viscous%beg, is3_viscous%end - do j = is2_viscous%beg, is2_viscous%end - 1 - do k = is1_viscous%beg, is1_viscous%end - $:GPU_LOOP(parallelism='[seq]') - do i = iv%beg, iv%end - dqR_prim_dy_n(2)%vf(i)%sf(k, j, l) = & - (q_prim_qp%vf(i)%sf(k, j + 1, l) - & - q_prim_qp%vf(i)%sf(k, j, l))/ & - (y_cc(j + 1) - y_cc(j)) + #:call GPU_PARALLEL_LOOP(collapse=3) + do l = is3_viscous%beg, is3_viscous%end + do j = is2_viscous%beg, is2_viscous%end - 1 + do k = is1_viscous%beg, is1_viscous%end + $:GPU_LOOP(parallelism='[seq]') + do i = iv%beg, iv%end + dqR_prim_dy_n(2)%vf(i)%sf(k, j, l) = & + (q_prim_qp%vf(i)%sf(k, j + 1, l) - & + q_prim_qp%vf(i)%sf(k, j, l))/ & + (y_cc(j + 1) - y_cc(j)) + end do end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP - $:GPU_PARALLEL_LOOP(collapse=3) - do l = is3_viscous%beg, is3_viscous%end - do j = is2_viscous%beg + 1, is2_viscous%end - do k = is1_viscous%beg + 1, is1_viscous%end - 1 - $:GPU_LOOP(parallelism='[seq]') - do i = iv%beg, iv%end - dqL_prim_dx_n(2)%vf(i)%sf(k, j, l) = & - (dqL_prim_dx_n(1)%vf(i)%sf(k, j, l) + & - dqR_prim_dx_n(1)%vf(i)%sf(k, j, l) + & - dqL_prim_dx_n(1)%vf(i)%sf(k, j - 1, l) + & - dqR_prim_dx_n(1)%vf(i)%sf(k, j - 1, l)) - - dqL_prim_dx_n(2)%vf(i)%sf(k, j, l) = 25.e-2_wp* & - dqL_prim_dx_n(2)%vf(i)%sf(k, j, l) + #:call GPU_PARALLEL_LOOP(collapse=3) + do l = is3_viscous%beg, is3_viscous%end + do j = is2_viscous%beg + 1, is2_viscous%end + do k = is1_viscous%beg + 1, is1_viscous%end - 1 + $:GPU_LOOP(parallelism='[seq]') + do i = iv%beg, iv%end + dqL_prim_dx_n(2)%vf(i)%sf(k, j, l) = & + (dqL_prim_dx_n(1)%vf(i)%sf(k, j, l) + & + dqR_prim_dx_n(1)%vf(i)%sf(k, j, l) + & + dqL_prim_dx_n(1)%vf(i)%sf(k, j - 1, l) + & + dqR_prim_dx_n(1)%vf(i)%sf(k, j - 1, l)) + + dqL_prim_dx_n(2)%vf(i)%sf(k, j, l) = 25.e-2_wp* & + dqL_prim_dx_n(2)%vf(i)%sf(k, j, l) + end do end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP - $:GPU_PARALLEL_LOOP(collapse=3) - do l = is3_viscous%beg, is3_viscous%end - do j = is2_viscous%beg, is2_viscous%end - 1 - do k = is1_viscous%beg + 1, is1_viscous%end - 1 - $:GPU_LOOP(parallelism='[seq]') - do i = iv%beg, iv%end - dqR_prim_dx_n(2)%vf(i)%sf(k, j, l) = & - (dqL_prim_dx_n(1)%vf(i)%sf(k, j + 1, l) + & - dqR_prim_dx_n(1)%vf(i)%sf(k, j + 1, l) + & - dqL_prim_dx_n(1)%vf(i)%sf(k, j, l) + & - dqR_prim_dx_n(1)%vf(i)%sf(k, j, l)) + #:call GPU_PARALLEL_LOOP(collapse=3) + do l = is3_viscous%beg, is3_viscous%end + do j = is2_viscous%beg, is2_viscous%end - 1 + do k = is1_viscous%beg + 1, is1_viscous%end - 1 + $:GPU_LOOP(parallelism='[seq]') + do i = iv%beg, iv%end + dqR_prim_dx_n(2)%vf(i)%sf(k, j, l) = & + (dqL_prim_dx_n(1)%vf(i)%sf(k, j + 1, l) + & + dqR_prim_dx_n(1)%vf(i)%sf(k, j + 1, l) + & + dqL_prim_dx_n(1)%vf(i)%sf(k, j, l) + & + dqR_prim_dx_n(1)%vf(i)%sf(k, j, l)) - dqR_prim_dx_n(2)%vf(i)%sf(k, j, l) = 25.e-2_wp* & - dqR_prim_dx_n(2)%vf(i)%sf(k, j, l) + dqR_prim_dx_n(2)%vf(i)%sf(k, j, l) = 25.e-2_wp* & + dqR_prim_dx_n(2)%vf(i)%sf(k, j, l) + end do end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP - $:GPU_PARALLEL_LOOP(collapse=3) - do l = is3_viscous%beg, is3_viscous%end - do k = is2_viscous%beg + 1, is2_viscous%end - 1 - do j = is1_viscous%beg + 1, is1_viscous%end - $:GPU_LOOP(parallelism='[seq]') - do i = iv%beg, iv%end - dqL_prim_dy_n(1)%vf(i)%sf(j, k, l) = & - (dqL_prim_dy_n(2)%vf(i)%sf(j, k, l) + & - dqR_prim_dy_n(2)%vf(i)%sf(j, k, l) + & - dqL_prim_dy_n(2)%vf(i)%sf(j - 1, k, l) + & - dqR_prim_dy_n(2)%vf(i)%sf(j - 1, k, l)) + #:call GPU_PARALLEL_LOOP(collapse=3) + do l = is3_viscous%beg, is3_viscous%end + do k = is2_viscous%beg + 1, is2_viscous%end - 1 + do j = is1_viscous%beg + 1, is1_viscous%end + $:GPU_LOOP(parallelism='[seq]') + do i = iv%beg, iv%end + dqL_prim_dy_n(1)%vf(i)%sf(j, k, l) = & + (dqL_prim_dy_n(2)%vf(i)%sf(j, k, l) + & + dqR_prim_dy_n(2)%vf(i)%sf(j, k, l) + & + dqL_prim_dy_n(2)%vf(i)%sf(j - 1, k, l) + & + dqR_prim_dy_n(2)%vf(i)%sf(j - 1, k, l)) - dqL_prim_dy_n(1)%vf(i)%sf(j, k, l) = 25.e-2_wp* & - dqL_prim_dy_n(1)%vf(i)%sf(j, k, l) + dqL_prim_dy_n(1)%vf(i)%sf(j, k, l) = 25.e-2_wp* & + dqL_prim_dy_n(1)%vf(i)%sf(j, k, l) + end do end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP - $:GPU_PARALLEL_LOOP(collapse=3) - do l = is3_viscous%beg, is3_viscous%end - do k = is2_viscous%beg + 1, is2_viscous%end - 1 - do j = is1_viscous%beg, is1_viscous%end - 1 - $:GPU_LOOP(parallelism='[seq]') - do i = iv%beg, iv%end - dqR_prim_dy_n(1)%vf(i)%sf(j, k, l) = & - (dqL_prim_dy_n(2)%vf(i)%sf(j + 1, k, l) + & - dqR_prim_dy_n(2)%vf(i)%sf(j + 1, k, l) + & - dqL_prim_dy_n(2)%vf(i)%sf(j, k, l) + & - dqR_prim_dy_n(2)%vf(i)%sf(j, k, l)) + #:call GPU_PARALLEL_LOOP(collapse=3) + do l = is3_viscous%beg, is3_viscous%end + do k = is2_viscous%beg + 1, is2_viscous%end - 1 + do j = is1_viscous%beg, is1_viscous%end - 1 + $:GPU_LOOP(parallelism='[seq]') + do i = iv%beg, iv%end + dqR_prim_dy_n(1)%vf(i)%sf(j, k, l) = & + (dqL_prim_dy_n(2)%vf(i)%sf(j + 1, k, l) + & + dqR_prim_dy_n(2)%vf(i)%sf(j + 1, k, l) + & + dqL_prim_dy_n(2)%vf(i)%sf(j, k, l) + & + dqR_prim_dy_n(2)%vf(i)%sf(j, k, l)) - dqR_prim_dy_n(1)%vf(i)%sf(j, k, l) = 25.e-2_wp* & - dqR_prim_dy_n(1)%vf(i)%sf(j, k, l) + dqR_prim_dy_n(1)%vf(i)%sf(j, k, l) = 25.e-2_wp* & + dqR_prim_dy_n(1)%vf(i)%sf(j, k, l) + end do end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP if (p > 0) then - $:GPU_PARALLEL_LOOP(collapse=3) - do j = is3_viscous%beg + 1, is3_viscous%end - do l = is2_viscous%beg, is2_viscous%end - do k = is1_viscous%beg, is1_viscous%end - $:GPU_LOOP(parallelism='[seq]') - do i = iv%beg, iv%end + #:call GPU_PARALLEL_LOOP(collapse=3) + do j = is3_viscous%beg + 1, is3_viscous%end + do l = is2_viscous%beg, is2_viscous%end + do k = is1_viscous%beg, is1_viscous%end + $:GPU_LOOP(parallelism='[seq]') + do i = iv%beg, iv%end - dqL_prim_dz_n(3)%vf(i)%sf(k, l, j) = & - (q_prim_qp%vf(i)%sf(k, l, j) - & - q_prim_qp%vf(i)%sf(k, l, j - 1))/ & - (z_cc(j) - z_cc(j - 1)) + dqL_prim_dz_n(3)%vf(i)%sf(k, l, j) = & + (q_prim_qp%vf(i)%sf(k, l, j) - & + q_prim_qp%vf(i)%sf(k, l, j - 1))/ & + (z_cc(j) - z_cc(j - 1)) + end do end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP - $:GPU_PARALLEL_LOOP(collapse=3) - do j = is3_viscous%beg, is3_viscous%end - 1 - do l = is2_viscous%beg, is2_viscous%end - do k = is1_viscous%beg, is1_viscous%end - $:GPU_LOOP(parallelism='[seq]') - do i = iv%beg, iv%end + #:call GPU_PARALLEL_LOOP(collapse=3) + do j = is3_viscous%beg, is3_viscous%end - 1 + do l = is2_viscous%beg, is2_viscous%end + do k = is1_viscous%beg, is1_viscous%end + $:GPU_LOOP(parallelism='[seq]') + do i = iv%beg, iv%end - dqR_prim_dz_n(3)%vf(i)%sf(k, l, j) = & - (q_prim_qp%vf(i)%sf(k, l, j + 1) - & - q_prim_qp%vf(i)%sf(k, l, j))/ & - (z_cc(j + 1) - z_cc(j)) + dqR_prim_dz_n(3)%vf(i)%sf(k, l, j) = & + (q_prim_qp%vf(i)%sf(k, l, j + 1) - & + q_prim_qp%vf(i)%sf(k, l, j))/ & + (z_cc(j + 1) - z_cc(j)) + end do end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP - $:GPU_PARALLEL_LOOP(collapse=3) - do l = is3_viscous%beg + 1, is3_viscous%end - 1 - do k = is2_viscous%beg, is2_viscous%end - do j = is1_viscous%beg + 1, is1_viscous%end - $:GPU_LOOP(parallelism='[seq]') - do i = iv%beg, iv%end + #:call GPU_PARALLEL_LOOP(collapse=3) + do l = is3_viscous%beg + 1, is3_viscous%end - 1 + do k = is2_viscous%beg, is2_viscous%end + do j = is1_viscous%beg + 1, is1_viscous%end + $:GPU_LOOP(parallelism='[seq]') + do i = iv%beg, iv%end - dqL_prim_dz_n(1)%vf(i)%sf(j, k, l) = & - (dqL_prim_dz_n(3)%vf(i)%sf(j, k, l) + & - dqR_prim_dz_n(3)%vf(i)%sf(j, k, l) + & - dqL_prim_dz_n(3)%vf(i)%sf(j - 1, k, l) + & - dqR_prim_dz_n(3)%vf(i)%sf(j - 1, k, l)) + dqL_prim_dz_n(1)%vf(i)%sf(j, k, l) = & + (dqL_prim_dz_n(3)%vf(i)%sf(j, k, l) + & + dqR_prim_dz_n(3)%vf(i)%sf(j, k, l) + & + dqL_prim_dz_n(3)%vf(i)%sf(j - 1, k, l) + & + dqR_prim_dz_n(3)%vf(i)%sf(j - 1, k, l)) - dqL_prim_dz_n(1)%vf(i)%sf(j, k, l) = 25.e-2_wp* & - dqL_prim_dz_n(1)%vf(i)%sf(j, k, l) + dqL_prim_dz_n(1)%vf(i)%sf(j, k, l) = 25.e-2_wp* & + dqL_prim_dz_n(1)%vf(i)%sf(j, k, l) + end do end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP - $:GPU_PARALLEL_LOOP(collapse=3) - do l = is3_viscous%beg + 1, is3_viscous%end - 1 - do k = is2_viscous%beg, is2_viscous%end - do j = is1_viscous%beg, is1_viscous%end - 1 - $:GPU_LOOP(parallelism='[seq]') - do i = iv%beg, iv%end + #:call GPU_PARALLEL_LOOP(collapse=3) + do l = is3_viscous%beg + 1, is3_viscous%end - 1 + do k = is2_viscous%beg, is2_viscous%end + do j = is1_viscous%beg, is1_viscous%end - 1 + $:GPU_LOOP(parallelism='[seq]') + do i = iv%beg, iv%end - dqR_prim_dz_n(1)%vf(i)%sf(j, k, l) = & - (dqL_prim_dz_n(3)%vf(i)%sf(j + 1, k, l) + & - dqR_prim_dz_n(3)%vf(i)%sf(j + 1, k, l) + & - dqL_prim_dz_n(3)%vf(i)%sf(j, k, l) + & - dqR_prim_dz_n(3)%vf(i)%sf(j, k, l)) + dqR_prim_dz_n(1)%vf(i)%sf(j, k, l) = & + (dqL_prim_dz_n(3)%vf(i)%sf(j + 1, k, l) + & + dqR_prim_dz_n(3)%vf(i)%sf(j + 1, k, l) + & + dqL_prim_dz_n(3)%vf(i)%sf(j, k, l) + & + dqR_prim_dz_n(3)%vf(i)%sf(j, k, l)) - dqR_prim_dz_n(1)%vf(i)%sf(j, k, l) = 25.e-2_wp* & - dqR_prim_dz_n(1)%vf(i)%sf(j, k, l) + dqR_prim_dz_n(1)%vf(i)%sf(j, k, l) = 25.e-2_wp* & + dqR_prim_dz_n(1)%vf(i)%sf(j, k, l) + end do end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP - $:GPU_PARALLEL_LOOP(collapse=3) - do l = is3_viscous%beg + 1, is3_viscous%end - 1 - do j = is2_viscous%beg + 1, is2_viscous%end - do k = is1_viscous%beg, is1_viscous%end - $:GPU_LOOP(parallelism='[seq]') - do i = iv%beg, iv%end + #:call GPU_PARALLEL_LOOP(collapse=3) + do l = is3_viscous%beg + 1, is3_viscous%end - 1 + do j = is2_viscous%beg + 1, is2_viscous%end + do k = is1_viscous%beg, is1_viscous%end + $:GPU_LOOP(parallelism='[seq]') + do i = iv%beg, iv%end - dqL_prim_dz_n(2)%vf(i)%sf(k, j, l) = & - (dqL_prim_dz_n(3)%vf(i)%sf(k, j, l) + & - dqR_prim_dz_n(3)%vf(i)%sf(k, j, l) + & - dqL_prim_dz_n(3)%vf(i)%sf(k, j - 1, l) + & - dqR_prim_dz_n(3)%vf(i)%sf(k, j - 1, l)) + dqL_prim_dz_n(2)%vf(i)%sf(k, j, l) = & + (dqL_prim_dz_n(3)%vf(i)%sf(k, j, l) + & + dqR_prim_dz_n(3)%vf(i)%sf(k, j, l) + & + dqL_prim_dz_n(3)%vf(i)%sf(k, j - 1, l) + & + dqR_prim_dz_n(3)%vf(i)%sf(k, j - 1, l)) - dqL_prim_dz_n(2)%vf(i)%sf(k, j, l) = 25.e-2_wp* & - dqL_prim_dz_n(2)%vf(i)%sf(k, j, l) + dqL_prim_dz_n(2)%vf(i)%sf(k, j, l) = 25.e-2_wp* & + dqL_prim_dz_n(2)%vf(i)%sf(k, j, l) + end do end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP - $:GPU_PARALLEL_LOOP(collapse=3) - do l = is3_viscous%beg + 1, is3_viscous%end - 1 - do j = is2_viscous%beg, is2_viscous%end - 1 - do k = is1_viscous%beg, is1_viscous%end - $:GPU_LOOP(parallelism='[seq]') - do i = iv%beg, iv%end + #:call GPU_PARALLEL_LOOP(collapse=3) + do l = is3_viscous%beg + 1, is3_viscous%end - 1 + do j = is2_viscous%beg, is2_viscous%end - 1 + do k = is1_viscous%beg, is1_viscous%end + $:GPU_LOOP(parallelism='[seq]') + do i = iv%beg, iv%end - dqR_prim_dz_n(2)%vf(i)%sf(k, j, l) = & - (dqL_prim_dz_n(3)%vf(i)%sf(k, j + 1, l) + & - dqR_prim_dz_n(3)%vf(i)%sf(k, j + 1, l) + & - dqL_prim_dz_n(3)%vf(i)%sf(k, j, l) + & - dqR_prim_dz_n(3)%vf(i)%sf(k, j, l)) + dqR_prim_dz_n(2)%vf(i)%sf(k, j, l) = & + (dqL_prim_dz_n(3)%vf(i)%sf(k, j + 1, l) + & + dqR_prim_dz_n(3)%vf(i)%sf(k, j + 1, l) + & + dqL_prim_dz_n(3)%vf(i)%sf(k, j, l) + & + dqR_prim_dz_n(3)%vf(i)%sf(k, j, l)) - dqR_prim_dz_n(2)%vf(i)%sf(k, j, l) = 25.e-2_wp* & - dqR_prim_dz_n(2)%vf(i)%sf(k, j, l) + dqR_prim_dz_n(2)%vf(i)%sf(k, j, l) = 25.e-2_wp* & + dqR_prim_dz_n(2)%vf(i)%sf(k, j, l) + end do end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP - $:GPU_PARALLEL_LOOP(collapse=3) - do j = is3_viscous%beg + 1, is3_viscous%end - do l = is2_viscous%beg + 1, is2_viscous%end - 1 - do k = is1_viscous%beg, is1_viscous%end - $:GPU_LOOP(parallelism='[seq]') - do i = iv%beg, iv%end + #:call GPU_PARALLEL_LOOP(collapse=3) + do j = is3_viscous%beg + 1, is3_viscous%end + do l = is2_viscous%beg + 1, is2_viscous%end - 1 + do k = is1_viscous%beg, is1_viscous%end + $:GPU_LOOP(parallelism='[seq]') + do i = iv%beg, iv%end - dqL_prim_dy_n(3)%vf(i)%sf(k, l, j) = & - (dqL_prim_dy_n(2)%vf(i)%sf(k, l, j) + & - dqR_prim_dy_n(2)%vf(i)%sf(k, l, j) + & - dqL_prim_dy_n(2)%vf(i)%sf(k, l, j - 1) + & - dqR_prim_dy_n(2)%vf(i)%sf(k, l, j - 1)) + dqL_prim_dy_n(3)%vf(i)%sf(k, l, j) = & + (dqL_prim_dy_n(2)%vf(i)%sf(k, l, j) + & + dqR_prim_dy_n(2)%vf(i)%sf(k, l, j) + & + dqL_prim_dy_n(2)%vf(i)%sf(k, l, j - 1) + & + dqR_prim_dy_n(2)%vf(i)%sf(k, l, j - 1)) - dqL_prim_dy_n(3)%vf(i)%sf(k, l, j) = 25.e-2_wp* & - dqL_prim_dy_n(3)%vf(i)%sf(k, l, j) + dqL_prim_dy_n(3)%vf(i)%sf(k, l, j) = 25.e-2_wp* & + dqL_prim_dy_n(3)%vf(i)%sf(k, l, j) + end do end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP - $:GPU_PARALLEL_LOOP(collapse=3) - do j = is3_viscous%beg, is3_viscous%end - 1 - do l = is2_viscous%beg + 1, is2_viscous%end - 1 - do k = is1_viscous%beg, is1_viscous%end - $:GPU_LOOP(parallelism='[seq]') - do i = iv%beg, iv%end + #:call GPU_PARALLEL_LOOP(collapse=3) + do j = is3_viscous%beg, is3_viscous%end - 1 + do l = is2_viscous%beg + 1, is2_viscous%end - 1 + do k = is1_viscous%beg, is1_viscous%end + $:GPU_LOOP(parallelism='[seq]') + do i = iv%beg, iv%end - dqR_prim_dy_n(3)%vf(i)%sf(k, l, j) = & - (dqL_prim_dy_n(2)%vf(i)%sf(k, l, j + 1) + & - dqR_prim_dy_n(2)%vf(i)%sf(k, l, j + 1) + & - dqL_prim_dy_n(2)%vf(i)%sf(k, l, j) + & - dqR_prim_dy_n(2)%vf(i)%sf(k, l, j)) + dqR_prim_dy_n(3)%vf(i)%sf(k, l, j) = & + (dqL_prim_dy_n(2)%vf(i)%sf(k, l, j + 1) + & + dqR_prim_dy_n(2)%vf(i)%sf(k, l, j + 1) + & + dqL_prim_dy_n(2)%vf(i)%sf(k, l, j) + & + dqR_prim_dy_n(2)%vf(i)%sf(k, l, j)) - dqR_prim_dy_n(3)%vf(i)%sf(k, l, j) = 25.e-2_wp* & - dqR_prim_dy_n(3)%vf(i)%sf(k, l, j) + dqR_prim_dy_n(3)%vf(i)%sf(k, l, j) = 25.e-2_wp* & + dqR_prim_dy_n(3)%vf(i)%sf(k, l, j) + end do end do end do end do - end do - $:GPU_PARALLEL_LOOP(collapse=3) - do j = is3_viscous%beg + 1, is3_viscous%end - do l = is2_viscous%beg, is2_viscous%end - do k = is1_viscous%beg + 1, is1_viscous%end - 1 - $:GPU_LOOP(parallelism='[seq]') - do i = iv%beg, iv%end + #:endcall GPU_PARALLEL_LOOP + #:call GPU_PARALLEL_LOOP(collapse=3) + do j = is3_viscous%beg + 1, is3_viscous%end + do l = is2_viscous%beg, is2_viscous%end + do k = is1_viscous%beg + 1, is1_viscous%end - 1 + $:GPU_LOOP(parallelism='[seq]') + do i = iv%beg, iv%end - dqL_prim_dx_n(3)%vf(i)%sf(k, l, j) = & - (dqL_prim_dx_n(1)%vf(i)%sf(k, l, j) + & - dqR_prim_dx_n(1)%vf(i)%sf(k, l, j) + & - dqL_prim_dx_n(1)%vf(i)%sf(k, l, j - 1) + & - dqR_prim_dx_n(1)%vf(i)%sf(k, l, j - 1)) + dqL_prim_dx_n(3)%vf(i)%sf(k, l, j) = & + (dqL_prim_dx_n(1)%vf(i)%sf(k, l, j) + & + dqR_prim_dx_n(1)%vf(i)%sf(k, l, j) + & + dqL_prim_dx_n(1)%vf(i)%sf(k, l, j - 1) + & + dqR_prim_dx_n(1)%vf(i)%sf(k, l, j - 1)) - dqL_prim_dx_n(3)%vf(i)%sf(k, l, j) = 25.e-2_wp* & - dqL_prim_dx_n(3)%vf(i)%sf(k, l, j) + dqL_prim_dx_n(3)%vf(i)%sf(k, l, j) = 25.e-2_wp* & + dqL_prim_dx_n(3)%vf(i)%sf(k, l, j) + end do end do end do end do - end do - $:GPU_PARALLEL_LOOP(collapse=3) - do j = is3_viscous%beg, is3_viscous%end - 1 - do l = is2_viscous%beg, is2_viscous%end - do k = is1_viscous%beg + 1, is1_viscous%end - 1 - $:GPU_LOOP(parallelism='[seq]') - do i = iv%beg, iv%end - dqR_prim_dx_n(3)%vf(i)%sf(k, l, j) = & - (dqL_prim_dx_n(1)%vf(i)%sf(k, l, j + 1) + & - dqR_prim_dx_n(1)%vf(i)%sf(k, l, j + 1) + & - dqL_prim_dx_n(1)%vf(i)%sf(k, l, j) + & - dqR_prim_dx_n(1)%vf(i)%sf(k, l, j)) + #:endcall GPU_PARALLEL_LOOP + #:call GPU_PARALLEL_LOOP(collapse=3) + do j = is3_viscous%beg, is3_viscous%end - 1 + do l = is2_viscous%beg, is2_viscous%end + do k = is1_viscous%beg + 1, is1_viscous%end - 1 + $:GPU_LOOP(parallelism='[seq]') + do i = iv%beg, iv%end + dqR_prim_dx_n(3)%vf(i)%sf(k, l, j) = & + (dqL_prim_dx_n(1)%vf(i)%sf(k, l, j + 1) + & + dqR_prim_dx_n(1)%vf(i)%sf(k, l, j + 1) + & + dqL_prim_dx_n(1)%vf(i)%sf(k, l, j) + & + dqR_prim_dx_n(1)%vf(i)%sf(k, l, j)) - dqR_prim_dx_n(3)%vf(i)%sf(k, l, j) = 25.e-2_wp* & - dqR_prim_dx_n(3)%vf(i)%sf(k, l, j) + dqR_prim_dx_n(3)%vf(i)%sf(k, l, j) = 25.e-2_wp* & + dqR_prim_dx_n(3)%vf(i)%sf(k, l, j) + end do end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP do i = iv%beg, iv%end call s_compute_fd_gradient(q_prim_qp%vf(i), & @@ -1027,46 +1047,50 @@ contains if (viscous) then if (weno_Re_flux) then if (norm_dir == 2) then - $:GPU_PARALLEL_LOOP(collapse=4) - do i = iv%beg, iv%end - do l = is3_viscous%beg, is3_viscous%end - do j = is1_viscous%beg, is1_viscous%end - do k = is2_viscous%beg, is2_viscous%end - vL_prim_vf(i)%sf(k, j, l) = vL_y(j, k, l, i) - vR_prim_vf(i)%sf(k, j, l) = vR_y(j, k, l, i) + #:call GPU_PARALLEL_LOOP(collapse=4) + do i = iv%beg, iv%end + do l = is3_viscous%beg, is3_viscous%end + do j = is1_viscous%beg, is1_viscous%end + do k = is2_viscous%beg, is2_viscous%end + vL_prim_vf(i)%sf(k, j, l) = vL_y(j, k, l, i) + vR_prim_vf(i)%sf(k, j, l) = vR_y(j, k, l, i) + end do end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP elseif (norm_dir == 3) then - $:GPU_PARALLEL_LOOP(collapse=4) - do i = iv%beg, iv%end - do j = is1_viscous%beg, is1_viscous%end - do k = is2_viscous%beg, is2_viscous%end - do l = is3_viscous%beg, is3_viscous%end - vL_prim_vf(i)%sf(l, k, j) = vL_z(j, k, l, i) - vR_prim_vf(i)%sf(l, k, j) = vR_z(j, k, l, i) + #:call GPU_PARALLEL_LOOP(collapse=4) + do i = iv%beg, iv%end + do j = is1_viscous%beg, is1_viscous%end + do k = is2_viscous%beg, is2_viscous%end + do l = is3_viscous%beg, is3_viscous%end + vL_prim_vf(i)%sf(l, k, j) = vL_z(j, k, l, i) + vR_prim_vf(i)%sf(l, k, j) = vR_z(j, k, l, i) + end do end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP elseif (norm_dir == 1) then - $:GPU_PARALLEL_LOOP(collapse=4) - do i = iv%beg, iv%end - do l = is3_viscous%beg, is3_viscous%end - do k = is2_viscous%beg, is2_viscous%end - do j = is1_viscous%beg, is1_viscous%end - vL_prim_vf(i)%sf(j, k, l) = vL_x(j, k, l, i) - vR_prim_vf(i)%sf(j, k, l) = vR_x(j, k, l, i) + #:call GPU_PARALLEL_LOOP(collapse=4) + do i = iv%beg, iv%end + do l = is3_viscous%beg, is3_viscous%end + do k = is2_viscous%beg, is2_viscous%end + do j = is1_viscous%beg, is1_viscous%end + vL_prim_vf(i)%sf(j, k, l) = vL_x(j, k, l, i) + vR_prim_vf(i)%sf(j, k, l) = vR_x(j, k, l, i) + end do end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP end if end if end if end if #:endfor + end subroutine s_reconstruct_cell_boundary_values_visc subroutine s_reconstruct_cell_boundary_values_visc_deriv(v_vf, vL_x, vL_y, vL_z, vR_x, vR_y, vR_z, & @@ -1127,41 +1151,44 @@ contains if (viscous) then if (weno_Re_flux) then if (norm_dir == 2) then - $:GPU_PARALLEL_LOOP(collapse=4) - do i = iv%beg, iv%end - do l = is3_viscous%beg, is3_viscous%end - do j = is1_viscous%beg, is1_viscous%end - do k = is2_viscous%beg, is2_viscous%end - vL_prim_vf(i)%sf(k, j, l) = vL_y(j, k, l, i) - vR_prim_vf(i)%sf(k, j, l) = vR_y(j, k, l, i) + #:call GPU_PARALLEL_LOOP(collapse=4) + do i = iv%beg, iv%end + do l = is3_viscous%beg, is3_viscous%end + do j = is1_viscous%beg, is1_viscous%end + do k = is2_viscous%beg, is2_viscous%end + vL_prim_vf(i)%sf(k, j, l) = vL_y(j, k, l, i) + vR_prim_vf(i)%sf(k, j, l) = vR_y(j, k, l, i) + end do end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP elseif (norm_dir == 3) then - $:GPU_PARALLEL_LOOP(collapse=4) - do i = iv%beg, iv%end - do j = is1_viscous%beg, is1_viscous%end - do k = is2_viscous%beg, is2_viscous%end - do l = is3_viscous%beg, is3_viscous%end - vL_prim_vf(i)%sf(l, k, j) = vL_z(j, k, l, i) - vR_prim_vf(i)%sf(l, k, j) = vR_z(j, k, l, i) + #:call GPU_PARALLEL_LOOP(collapse=4) + do i = iv%beg, iv%end + do j = is1_viscous%beg, is1_viscous%end + do k = is2_viscous%beg, is2_viscous%end + do l = is3_viscous%beg, is3_viscous%end + vL_prim_vf(i)%sf(l, k, j) = vL_z(j, k, l, i) + vR_prim_vf(i)%sf(l, k, j) = vR_z(j, k, l, i) + end do end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP elseif (norm_dir == 1) then - $:GPU_PARALLEL_LOOP(collapse=4) - do i = iv%beg, iv%end - do l = is3_viscous%beg, is3_viscous%end - do k = is2_viscous%beg, is2_viscous%end - do j = is1_viscous%beg, is1_viscous%end - vL_prim_vf(i)%sf(j, k, l) = vL_x(j, k, l, i) - vR_prim_vf(i)%sf(j, k, l) = vR_x(j, k, l, i) + #:call GPU_PARALLEL_LOOP(collapse=4) + do i = iv%beg, iv%end + do l = is3_viscous%beg, is3_viscous%end + do k = is2_viscous%beg, is2_viscous%end + do j = is1_viscous%beg, is1_viscous%end + vL_prim_vf(i)%sf(j, k, l) = vL_x(j, k, l, i) + vR_prim_vf(i)%sf(j, k, l) = vR_x(j, k, l, i) + end do end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP end if end if end if @@ -1217,22 +1244,23 @@ contains ! cell-boundaries, to calculate the cell-averaged first-order ! spatial derivatives inside the cell. - $:GPU_PARALLEL_LOOP(collapse=3) - do l = is3_viscous%beg, is3_viscous%end - do k = is2_viscous%beg, is2_viscous%end - do j = is1_viscous%beg + 1, is1_viscous%end - 1 - $:GPU_LOOP(parallelism='[seq]') - do i = iv%beg, iv%end - dv_ds_vf(i)%sf(j, k, l) = & - 1._wp/((1._wp + wa_flg)*dL(j)) & - *(wa_flg*vL_vf(i)%sf(j + 1, k, l) & - + vR_vf(i)%sf(j, k, l) & - - vL_vf(i)%sf(j, k, l) & - - wa_flg*vR_vf(i)%sf(j - 1, k, l)) + #:call GPU_PARALLEL_LOOP(collapse=3) + do l = is3_viscous%beg, is3_viscous%end + do k = is2_viscous%beg, is2_viscous%end + do j = is1_viscous%beg + 1, is1_viscous%end - 1 + $:GPU_LOOP(parallelism='[seq]') + do i = iv%beg, iv%end + dv_ds_vf(i)%sf(j, k, l) = & + 1._wp/((1._wp + wa_flg)*dL(j)) & + *(wa_flg*vL_vf(i)%sf(j + 1, k, l) & + + vR_vf(i)%sf(j, k, l) & + - vL_vf(i)%sf(j, k, l) & + - wa_flg*vR_vf(i)%sf(j - 1, k, l)) + end do end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP ! END: First-Order Spatial Derivatives in x-direction @@ -1245,22 +1273,23 @@ contains ! cell-boundaries, to calculate the cell-averaged first-order ! spatial derivatives inside the cell. - $:GPU_PARALLEL_LOOP(collapse=3) - do l = is3_viscous%beg, is3_viscous%end - do k = is2_viscous%beg + 1, is2_viscous%end - 1 - do j = is1_viscous%beg, is1_viscous%end - $:GPU_LOOP(parallelism='[seq]') - do i = iv%beg, iv%end - dv_ds_vf(i)%sf(j, k, l) = & - 1._wp/((1._wp + wa_flg)*dL(k)) & - *(wa_flg*vL_vf(i)%sf(j, k + 1, l) & - + vR_vf(i)%sf(j, k, l) & - - vL_vf(i)%sf(j, k, l) & - - wa_flg*vR_vf(i)%sf(j, k - 1, l)) + #:call GPU_PARALLEL_LOOP(collapse=3) + do l = is3_viscous%beg, is3_viscous%end + do k = is2_viscous%beg + 1, is2_viscous%end - 1 + do j = is1_viscous%beg, is1_viscous%end + $:GPU_LOOP(parallelism='[seq]') + do i = iv%beg, iv%end + dv_ds_vf(i)%sf(j, k, l) = & + 1._wp/((1._wp + wa_flg)*dL(k)) & + *(wa_flg*vL_vf(i)%sf(j, k + 1, l) & + + vR_vf(i)%sf(j, k, l) & + - vL_vf(i)%sf(j, k, l) & + - wa_flg*vR_vf(i)%sf(j, k - 1, l)) + end do end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP ! END: First-Order Spatial Derivatives in y-direction @@ -1273,22 +1302,23 @@ contains ! cell-boundaries, to calculate the cell-averaged first-order ! spatial derivatives inside the cell. - $:GPU_PARALLEL_LOOP(collapse=3) - do l = is3_viscous%beg + 1, is3_viscous%end - 1 - do k = is2_viscous%beg, is2_viscous%end - do j = is1_viscous%beg, is1_viscous%end - $:GPU_LOOP(parallelism='[seq]') - do i = iv%beg, iv%end - dv_ds_vf(i)%sf(j, k, l) = & - 1._wp/((1._wp + wa_flg)*dL(l)) & - *(wa_flg*vL_vf(i)%sf(j, k, l + 1) & - + vR_vf(i)%sf(j, k, l) & - - vL_vf(i)%sf(j, k, l) & - - wa_flg*vR_vf(i)%sf(j, k, l - 1)) + #:call GPU_PARALLEL_LOOP(collapse=3) + do l = is3_viscous%beg + 1, is3_viscous%end - 1 + do k = is2_viscous%beg, is2_viscous%end + do j = is1_viscous%beg, is1_viscous%end + $:GPU_LOOP(parallelism='[seq]') + do i = iv%beg, iv%end + dv_ds_vf(i)%sf(j, k, l) = & + 1._wp/((1._wp + wa_flg)*dL(l)) & + *(wa_flg*vL_vf(i)%sf(j, k, l + 1) & + + vR_vf(i)%sf(j, k, l) & + - vL_vf(i)%sf(j, k, l) & + - wa_flg*vR_vf(i)%sf(j, k, l - 1)) + end do end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP end if ! END: First-Order Spatial Derivatives in z-direction @@ -1328,138 +1358,150 @@ contains $:GPU_UPDATE(device='[is1_viscous,is2_viscous,is3_viscous]') - $:GPU_PARALLEL_LOOP(collapse=3) - do l = is3_viscous%beg, is3_viscous%end - do k = is2_viscous%beg, is2_viscous%end - do j = is1_viscous%beg, is1_viscous%end - grad_x%sf(j, k, l) = & - (var%sf(j + 1, k, l) - var%sf(j - 1, k, l))/ & - (x_cc(j + 1) - x_cc(j - 1)) - end do - end do - end do - - if (n > 0) then - $:GPU_PARALLEL_LOOP(collapse=3) + #:call GPU_PARALLEL_LOOP(collapse=3) do l = is3_viscous%beg, is3_viscous%end do k = is2_viscous%beg, is2_viscous%end do j = is1_viscous%beg, is1_viscous%end - grad_y%sf(j, k, l) = & - (var%sf(j, k + 1, l) - var%sf(j, k - 1, l))/ & - (y_cc(k + 1) - y_cc(k - 1)) + grad_x%sf(j, k, l) = & + (var%sf(j + 1, k, l) - var%sf(j - 1, k, l))/ & + (x_cc(j + 1) - x_cc(j - 1)) end do end do end do + #:endcall GPU_PARALLEL_LOOP + + if (n > 0) then + #:call GPU_PARALLEL_LOOP(collapse=3) + do l = is3_viscous%beg, is3_viscous%end + do k = is2_viscous%beg, is2_viscous%end + do j = is1_viscous%beg, is1_viscous%end + grad_y%sf(j, k, l) = & + (var%sf(j, k + 1, l) - var%sf(j, k - 1, l))/ & + (y_cc(k + 1) - y_cc(k - 1)) + end do + end do + end do + #:endcall GPU_PARALLEL_LOOP end if if (p > 0) then - $:GPU_PARALLEL_LOOP(collapse=3) - do l = is3_viscous%beg, is3_viscous%end - do k = is2_viscous%beg, is2_viscous%end - do j = is1_viscous%beg, is1_viscous%end - grad_z%sf(j, k, l) = & - (var%sf(j, k, l + 1) - var%sf(j, k, l - 1))/ & - (z_cc(l + 1) - z_cc(l - 1)) + #:call GPU_PARALLEL_LOOP(collapse=3) + do l = is3_viscous%beg, is3_viscous%end + do k = is2_viscous%beg, is2_viscous%end + do j = is1_viscous%beg, is1_viscous%end + grad_z%sf(j, k, l) = & + (var%sf(j, k, l + 1) - var%sf(j, k, l - 1))/ & + (z_cc(l + 1) - z_cc(l - 1)) + end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP end if - $:GPU_PARALLEL_LOOP(collapse=2) - do l = idwbuff(3)%beg, idwbuff(3)%end - do k = idwbuff(2)%beg, idwbuff(2)%end - grad_x%sf(idwbuff(1)%beg, k, l) = & - (-3._wp*var%sf(idwbuff(1)%beg, k, l) + 4._wp*var%sf(idwbuff(1)%beg + 1, k, l) - var%sf(idwbuff(1)%beg + 2, k, l))/ & - (x_cc(idwbuff(1)%beg + 2) - x_cc(idwbuff(1)%beg)) - grad_x%sf(idwbuff(1)%end, k, l) = & - (+3._wp*var%sf(idwbuff(1)%end, k, l) - 4._wp*var%sf(idwbuff(1)%end - 1, k, l) + var%sf(idwbuff(1)%end - 2, k, l))/ & - (x_cc(idwbuff(1)%end) - x_cc(idwbuff(1)%end - 2)) - end do - end do - if (n > 0) then - $:GPU_PARALLEL_LOOP(collapse=2) + #:call GPU_PARALLEL_LOOP(collapse=2) do l = idwbuff(3)%beg, idwbuff(3)%end - do j = idwbuff(1)%beg, idwbuff(1)%end - grad_y%sf(j, idwbuff(2)%beg, l) = & - (-3._wp*var%sf(j, idwbuff(2)%beg, l) + 4._wp*var%sf(j, idwbuff(2)%beg + 1, l) - var%sf(j, idwbuff(2)%beg + 2, l))/ & - (y_cc(idwbuff(2)%beg + 2) - y_cc(idwbuff(2)%beg)) - grad_y%sf(j, idwbuff(2)%end, l) = & - (+3._wp*var%sf(j, idwbuff(2)%end, l) - 4._wp*var%sf(j, idwbuff(2)%end - 1, l) + var%sf(j, idwbuff(2)%end - 2, l))/ & - (y_cc(idwbuff(2)%end) - y_cc(idwbuff(2)%end - 2)) + do k = idwbuff(2)%beg, idwbuff(2)%end + grad_x%sf(idwbuff(1)%beg, k, l) = & + (-3._wp*var%sf(idwbuff(1)%beg, k, l) + 4._wp*var%sf(idwbuff(1)%beg + 1, k, l) - var%sf(idwbuff(1)%beg + 2, k, l))/ & + (x_cc(idwbuff(1)%beg + 2) - x_cc(idwbuff(1)%beg)) + grad_x%sf(idwbuff(1)%end, k, l) = & + (+3._wp*var%sf(idwbuff(1)%end, k, l) - 4._wp*var%sf(idwbuff(1)%end - 1, k, l) + var%sf(idwbuff(1)%end - 2, k, l))/ & + (x_cc(idwbuff(1)%end) - x_cc(idwbuff(1)%end - 2)) end do end do - if (p > 0) then - $:GPU_PARALLEL_LOOP(collapse=2) - do k = idwbuff(2)%beg, idwbuff(2)%end + #:endcall GPU_PARALLEL_LOOP + if (n > 0) then + #:call GPU_PARALLEL_LOOP(collapse=2) + do l = idwbuff(3)%beg, idwbuff(3)%end do j = idwbuff(1)%beg, idwbuff(1)%end - grad_z%sf(j, k, idwbuff(3)%beg) = & - (-3._wp*var%sf(j, k, idwbuff(3)%beg) + 4._wp*var%sf(j, k, idwbuff(3)%beg + 1) - var%sf(j, k, idwbuff(3)%beg + 2))/ & - (z_cc(idwbuff(3)%beg + 2) - z_cc(is3_viscous%beg)) - grad_z%sf(j, k, idwbuff(3)%end) = & - (+3._wp*var%sf(j, k, idwbuff(3)%end) - 4._wp*var%sf(j, k, idwbuff(3)%end - 1) + var%sf(j, k, idwbuff(3)%end - 2))/ & - (z_cc(idwbuff(3)%end) - z_cc(idwbuff(3)%end - 2)) + grad_y%sf(j, idwbuff(2)%beg, l) = & + (-3._wp*var%sf(j, idwbuff(2)%beg, l) + 4._wp*var%sf(j, idwbuff(2)%beg + 1, l) - var%sf(j, idwbuff(2)%beg + 2, l))/ & + (y_cc(idwbuff(2)%beg + 2) - y_cc(idwbuff(2)%beg)) + grad_y%sf(j, idwbuff(2)%end, l) = & + (+3._wp*var%sf(j, idwbuff(2)%end, l) - 4._wp*var%sf(j, idwbuff(2)%end - 1, l) + var%sf(j, idwbuff(2)%end - 2, l))/ & + (y_cc(idwbuff(2)%end) - y_cc(idwbuff(2)%end - 2)) end do end do + #:endcall GPU_PARALLEL_LOOP + if (p > 0) then + #:call GPU_PARALLEL_LOOP(collapse=2) + do k = idwbuff(2)%beg, idwbuff(2)%end + do j = idwbuff(1)%beg, idwbuff(1)%end + grad_z%sf(j, k, idwbuff(3)%beg) = & + (-3._wp*var%sf(j, k, idwbuff(3)%beg) + 4._wp*var%sf(j, k, idwbuff(3)%beg + 1) - var%sf(j, k, idwbuff(3)%beg + 2))/ & + (z_cc(idwbuff(3)%beg + 2) - z_cc(is3_viscous%beg)) + grad_z%sf(j, k, idwbuff(3)%end) = & + (+3._wp*var%sf(j, k, idwbuff(3)%end) - 4._wp*var%sf(j, k, idwbuff(3)%end - 1) + var%sf(j, k, idwbuff(3)%end - 2))/ & + (z_cc(idwbuff(3)%end) - z_cc(idwbuff(3)%end - 2)) + end do + end do + #:endcall GPU_PARALLEL_LOOP end if end if if (bc_x%beg <= BC_GHOST_EXTRAP) then - $:GPU_PARALLEL_LOOP(collapse=2) - do l = idwbuff(3)%beg, idwbuff(3)%end - do k = idwbuff(2)%beg, idwbuff(2)%end - grad_x%sf(0, k, l) = (-3._wp*var%sf(0, k, l) + 4._wp*var%sf(1, k, l) - var%sf(2, k, l))/ & - (x_cc(2) - x_cc(0)) + #:call GPU_PARALLEL_LOOP(collapse=2) + do l = idwbuff(3)%beg, idwbuff(3)%end + do k = idwbuff(2)%beg, idwbuff(2)%end + grad_x%sf(0, k, l) = (-3._wp*var%sf(0, k, l) + 4._wp*var%sf(1, k, l) - var%sf(2, k, l))/ & + (x_cc(2) - x_cc(0)) + end do end do - end do + #:endcall GPU_PARALLEL_LOOP end if if (bc_x%end <= BC_GHOST_EXTRAP) then - $:GPU_PARALLEL_LOOP(collapse=2) - do l = idwbuff(3)%beg, idwbuff(3)%end - do k = idwbuff(2)%beg, idwbuff(2)%end - grad_x%sf(m, k, l) = (3._wp*var%sf(m, k, l) - 4._wp*var%sf(m - 1, k, l) + var%sf(m - 2, k, l))/ & - (x_cc(m) - x_cc(m - 2)) + #:call GPU_PARALLEL_LOOP(collapse=2) + do l = idwbuff(3)%beg, idwbuff(3)%end + do k = idwbuff(2)%beg, idwbuff(2)%end + grad_x%sf(m, k, l) = (3._wp*var%sf(m, k, l) - 4._wp*var%sf(m - 1, k, l) + var%sf(m - 2, k, l))/ & + (x_cc(m) - x_cc(m - 2)) + end do end do - end do + #:endcall GPU_PARALLEL_LOOP end if if (n > 0) then if (bc_y%beg <= BC_GHOST_EXTRAP .and. bc_y%beg /= BC_NULL) then - $:GPU_PARALLEL_LOOP(collapse=2) - do l = idwbuff(3)%beg, idwbuff(3)%end - do j = idwbuff(1)%beg, idwbuff(1)%end - grad_y%sf(j, 0, l) = (-3._wp*var%sf(j, 0, l) + 4._wp*var%sf(j, 1, l) - var%sf(j, 2, l))/ & - (y_cc(2) - y_cc(0)) + #:call GPU_PARALLEL_LOOP(collapse=2) + do l = idwbuff(3)%beg, idwbuff(3)%end + do j = idwbuff(1)%beg, idwbuff(1)%end + grad_y%sf(j, 0, l) = (-3._wp*var%sf(j, 0, l) + 4._wp*var%sf(j, 1, l) - var%sf(j, 2, l))/ & + (y_cc(2) - y_cc(0)) + end do end do - end do + #:endcall GPU_PARALLEL_LOOP end if if (bc_y%end <= BC_GHOST_EXTRAP) then - $:GPU_PARALLEL_LOOP(collapse=2) - do l = idwbuff(3)%beg, idwbuff(3)%end - do j = idwbuff(1)%beg, idwbuff(1)%end - grad_y%sf(j, n, l) = (3._wp*var%sf(j, n, l) - 4._wp*var%sf(j, n - 1, l) + var%sf(j, n - 2, l))/ & - (y_cc(n) - y_cc(n - 2)) + #:call GPU_PARALLEL_LOOP(collapse=2) + do l = idwbuff(3)%beg, idwbuff(3)%end + do j = idwbuff(1)%beg, idwbuff(1)%end + grad_y%sf(j, n, l) = (3._wp*var%sf(j, n, l) - 4._wp*var%sf(j, n - 1, l) + var%sf(j, n - 2, l))/ & + (y_cc(n) - y_cc(n - 2)) + end do end do - end do + #:endcall GPU_PARALLEL_LOOP end if if (p > 0) then if (bc_z%beg <= BC_GHOST_EXTRAP) then - $:GPU_PARALLEL_LOOP(collapse=2) - do k = idwbuff(2)%beg, idwbuff(2)%end - do j = idwbuff(1)%beg, idwbuff(1)%end - grad_z%sf(j, k, 0) = & - (-3._wp*var%sf(j, k, 0) + 4._wp*var%sf(j, k, 1) - var%sf(j, k, 2))/ & - (z_cc(2) - z_cc(0)) + #:call GPU_PARALLEL_LOOP(collapse=2) + do k = idwbuff(2)%beg, idwbuff(2)%end + do j = idwbuff(1)%beg, idwbuff(1)%end + grad_z%sf(j, k, 0) = & + (-3._wp*var%sf(j, k, 0) + 4._wp*var%sf(j, k, 1) - var%sf(j, k, 2))/ & + (z_cc(2) - z_cc(0)) + end do end do - end do + #:endcall GPU_PARALLEL_LOOP end if if (bc_z%end <= BC_GHOST_EXTRAP) then - $:GPU_PARALLEL_LOOP(collapse=2) - do k = idwbuff(2)%beg, idwbuff(2)%end - do j = idwbuff(1)%beg, idwbuff(1)%end - grad_z%sf(j, k, p) = & - (3._wp*var%sf(j, k, p) - 4._wp*var%sf(j, k, p - 1) + var%sf(j, k, p - 2))/ & - (z_cc(p) - z_cc(p - 2)) + #:call GPU_PARALLEL_LOOP(collapse=2) + do k = idwbuff(2)%beg, idwbuff(2)%end + do j = idwbuff(1)%beg, idwbuff(1)%end + grad_z%sf(j, k, p) = & + (3._wp*var%sf(j, k, p) - 4._wp*var%sf(j, k, p - 1) + var%sf(j, k, p - 2))/ & + (z_cc(p) - z_cc(p - 2)) + end do end do - end do + #:endcall GPU_PARALLEL_LOOP end if end if end if diff --git a/src/simulation/m_weno.fpp b/src/simulation/m_weno.fpp index a9846124ba..6874237a4e 100644 --- a/src/simulation/m_weno.fpp +++ b/src/simulation/m_weno.fpp @@ -24,9 +24,7 @@ module m_weno use m_variables_conversion !< State variables type conversion procedures -#ifdef MFC_OPENACC - use openacc -#endif + ! $:USE_GPU_MODULE() use m_mpi_proxy @@ -672,232 +670,237 @@ contains if (weno_order == 1) then if (weno_dir == 1) then - $:GPU_PARALLEL_LOOP(collapse=4) - do i = 1, ubound(v_vf, 1) - do l = is3_weno%beg, is3_weno%end - do k = is2_weno%beg, is2_weno%end - do j = is1_weno%beg, is1_weno%end - vL_rs_vf_x(j, k, l, i) = v_vf(i)%sf(j, k, l) - vR_rs_vf_x(j, k, l, i) = v_vf(i)%sf(j, k, l) + #:call GPU_PARALLEL_LOOP(collapse=4) + do i = 1, ubound(v_vf, 1) + do l = is3_weno%beg, is3_weno%end + do k = is2_weno%beg, is2_weno%end + do j = is1_weno%beg, is1_weno%end + vL_rs_vf_x(j, k, l, i) = v_vf(i)%sf(j, k, l) + vR_rs_vf_x(j, k, l, i) = v_vf(i)%sf(j, k, l) + end do end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP else if (weno_dir == 2) then - $:GPU_PARALLEL_LOOP(collapse=4) - do i = 1, ubound(v_vf, 1) - do l = is3_weno%beg, is3_weno%end - do k = is2_weno%beg, is2_weno%end - do j = is1_weno%beg, is1_weno%end - vL_rs_vf_y(j, k, l, i) = v_vf(i)%sf(k, j, l) - vR_rs_vf_y(j, k, l, i) = v_vf(i)%sf(k, j, l) + #:call GPU_PARALLEL_LOOP(collapse=4) + do i = 1, ubound(v_vf, 1) + do l = is3_weno%beg, is3_weno%end + do k = is2_weno%beg, is2_weno%end + do j = is1_weno%beg, is1_weno%end + vL_rs_vf_y(j, k, l, i) = v_vf(i)%sf(k, j, l) + vR_rs_vf_y(j, k, l, i) = v_vf(i)%sf(k, j, l) + end do end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP else if (weno_dir == 3) then - $:GPU_PARALLEL_LOOP(collapse=4) - do i = 1, ubound(v_vf, 1) - do l = is3_weno%beg, is3_weno%end - do k = is2_weno%beg, is2_weno%end - do j = is1_weno%beg, is1_weno%end - vL_rs_vf_z(j, k, l, i) = v_vf(i)%sf(l, k, j) - vR_rs_vf_z(j, k, l, i) = v_vf(i)%sf(l, k, j) + #:call GPU_PARALLEL_LOOP(collapse=4) + do i = 1, ubound(v_vf, 1) + do l = is3_weno%beg, is3_weno%end + do k = is2_weno%beg, is2_weno%end + do j = is1_weno%beg, is1_weno%end + vL_rs_vf_z(j, k, l, i) = v_vf(i)%sf(l, k, j) + vR_rs_vf_z(j, k, l, i) = v_vf(i)%sf(l, k, j) + end do end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP end if elseif (weno_order == 3) then #:for WENO_DIR, XYZ in [(1, 'x'), (2, 'y'), (3, 'z')] if (weno_dir == ${WENO_DIR}$) then - $:GPU_PARALLEL_LOOP(collapse=4,private='[beta,dvd,poly,omega,alpha,tau]') - do l = is3_weno%beg, is3_weno%end - do k = is2_weno%beg, is2_weno%end - do j = is1_weno%beg, is1_weno%end - do i = 1, v_size - ! reconstruct from left side - - dvd(0) = v_rs_ws_${XYZ}$ (j + 1, k, l, i) & - - v_rs_ws_${XYZ}$ (j, k, l, i) - dvd(-1) = v_rs_ws_${XYZ}$ (j, k, l, i) & - - v_rs_ws_${XYZ}$ (j - 1, k, l, i) - - poly(0) = v_rs_ws_${XYZ}$ (j, k, l, i) & - + poly_coef_cbL_${XYZ}$ (j, 0, 0)*dvd(0) - poly(1) = v_rs_ws_${XYZ}$ (j, k, l, i) & - + poly_coef_cbL_${XYZ}$ (j, 1, 0)*dvd(-1) - - beta(0) = beta_coef_${XYZ}$ (j, 0, 0)*dvd(0)*dvd(0) & - + weno_eps - beta(1) = beta_coef_${XYZ}$ (j, 1, 0)*dvd(-1)*dvd(-1) & - + weno_eps - - if (wenojs) then - alpha = d_cbL_${XYZ}$ (:, j)/(beta*beta) - - elseif (mapped_weno) then - alpha = d_cbL_${XYZ}$ (:, j)/(beta*beta) - omega = alpha/sum(alpha) - alpha = (d_cbL_${XYZ}$ (:, j)*(1._wp + d_cbL_${XYZ}$ (:, j) - 3._wp*omega) + omega**2._wp) & - *(omega/(d_cbL_${XYZ}$ (:, j)**2._wp + omega*(1._wp - 2._wp*d_cbL_${XYZ}$ (:, j)))) + #:call GPU_PARALLEL_LOOP(collapse=4,private='[beta,dvd,poly,omega,alpha,tau]') + do l = is3_weno%beg, is3_weno%end + do k = is2_weno%beg, is2_weno%end + do j = is1_weno%beg, is1_weno%end + do i = 1, v_size + ! reconstruct from left side - elseif (wenoz) then - ! Borges, et al. (2008) + dvd(0) = v_rs_ws_${XYZ}$ (j + 1, k, l, i) & + - v_rs_ws_${XYZ}$ (j, k, l, i) + dvd(-1) = v_rs_ws_${XYZ}$ (j, k, l, i) & + - v_rs_ws_${XYZ}$ (j - 1, k, l, i) - tau = abs(beta(1) - beta(0)) - alpha = d_cbL_${XYZ}$ (:, j)*(1._wp + tau/beta) + poly(0) = v_rs_ws_${XYZ}$ (j, k, l, i) & + + poly_coef_cbL_${XYZ}$ (j, 0, 0)*dvd(0) + poly(1) = v_rs_ws_${XYZ}$ (j, k, l, i) & + + poly_coef_cbL_${XYZ}$ (j, 1, 0)*dvd(-1) - end if + beta(0) = beta_coef_${XYZ}$ (j, 0, 0)*dvd(0)*dvd(0) & + + weno_eps + beta(1) = beta_coef_${XYZ}$ (j, 1, 0)*dvd(-1)*dvd(-1) & + + weno_eps - omega = alpha/sum(alpha) + if (wenojs) then + alpha = d_cbL_${XYZ}$ (:, j)/(beta*beta) - vL_rs_vf_${XYZ}$ (j, k, l, i) = omega(0)*poly(0) + omega(1)*poly(1) + elseif (mapped_weno) then + alpha = d_cbL_${XYZ}$ (:, j)/(beta*beta) + omega = alpha/sum(alpha) + alpha = (d_cbL_${XYZ}$ (:, j)*(1._wp + d_cbL_${XYZ}$ (:, j) - 3._wp*omega) + omega**2._wp) & + *(omega/(d_cbL_${XYZ}$ (:, j)**2._wp + omega*(1._wp - 2._wp*d_cbL_${XYZ}$ (:, j)))) - ! reconstruct from right side + elseif (wenoz) then + ! Borges, et al. (2008) - poly(0) = v_rs_ws_${XYZ}$ (j, k, l, i) & - + poly_coef_cbR_${XYZ}$ (j, 0, 0)*dvd(0) - poly(1) = v_rs_ws_${XYZ}$ (j, k, l, i) & - + poly_coef_cbR_${XYZ}$ (j, 1, 0)*dvd(-1) + tau = abs(beta(1) - beta(0)) + alpha = d_cbL_${XYZ}$ (:, j)*(1._wp + tau/beta) - if (wenojs) then - alpha = d_cbR_${XYZ}$ (:, j)/(beta*beta) + end if - elseif (mapped_weno) then - alpha = d_cbR_${XYZ}$ (:, j)/(beta*beta) omega = alpha/sum(alpha) - alpha = (d_cbR_${XYZ}$ (:, j)*(1._wp + d_cbR_${XYZ}$ (:, j) - 3._wp*omega) + omega**2._wp) & - *(omega/(d_cbR_${XYZ}$ (:, j)**2._wp + omega*(1._wp - 2._wp*d_cbR_${XYZ}$ (:, j)))) - elseif (wenoz) then + vL_rs_vf_${XYZ}$ (j, k, l, i) = omega(0)*poly(0) + omega(1)*poly(1) + + ! reconstruct from right side + + poly(0) = v_rs_ws_${XYZ}$ (j, k, l, i) & + + poly_coef_cbR_${XYZ}$ (j, 0, 0)*dvd(0) + poly(1) = v_rs_ws_${XYZ}$ (j, k, l, i) & + + poly_coef_cbR_${XYZ}$ (j, 1, 0)*dvd(-1) + + if (wenojs) then + alpha = d_cbR_${XYZ}$ (:, j)/(beta*beta) + + elseif (mapped_weno) then + alpha = d_cbR_${XYZ}$ (:, j)/(beta*beta) + omega = alpha/sum(alpha) + alpha = (d_cbR_${XYZ}$ (:, j)*(1._wp + d_cbR_${XYZ}$ (:, j) - 3._wp*omega) + omega**2._wp) & + *(omega/(d_cbR_${XYZ}$ (:, j)**2._wp + omega*(1._wp - 2._wp*d_cbR_${XYZ}$ (:, j)))) - alpha = d_cbR_${XYZ}$ (:, j)*(1._wp + tau/beta) + elseif (wenoz) then - end if + alpha = d_cbR_${XYZ}$ (:, j)*(1._wp + tau/beta) - omega = alpha/sum(alpha) + end if - vR_rs_vf_${XYZ}$ (j, k, l, i) = omega(0)*poly(0) + omega(1)*poly(1) + omega = alpha/sum(alpha) + + vR_rs_vf_${XYZ}$ (j, k, l, i) = omega(0)*poly(0) + omega(1)*poly(1) + end do end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP end if #:endfor elseif (weno_order == 5) then #:for WENO_DIR, XYZ in [(1, 'x'), (2, 'y'), (3, 'z')] if (weno_dir == ${WENO_DIR}$) then - $:GPU_PARALLEL_LOOP(collapse=3,private='[dvd,poly,beta,alpha,omega,tau,delta]') - do l = is3_weno%beg, is3_weno%end - do k = is2_weno%beg, is2_weno%end - do j = is1_weno%beg, is1_weno%end - $:GPU_LOOP(parallelism='[seq]') - do i = 1, v_size - ! reconstruct from left side - - dvd(1) = v_rs_ws_${XYZ}$ (j + 2, k, l, i) & - - v_rs_ws_${XYZ}$ (j + 1, k, l, i) - dvd(0) = v_rs_ws_${XYZ}$ (j + 1, k, l, i) & - - v_rs_ws_${XYZ}$ (j, k, l, i) - dvd(-1) = v_rs_ws_${XYZ}$ (j, k, l, i) & - - v_rs_ws_${XYZ}$ (j - 1, k, l, i) - dvd(-2) = v_rs_ws_${XYZ}$ (j - 1, k, l, i) & - - v_rs_ws_${XYZ}$ (j - 2, k, l, i) - - poly(0) = v_rs_ws_${XYZ}$ (j, k, l, i) & - + poly_coef_cbL_${XYZ}$ (j, 0, 0)*dvd(1) & - + poly_coef_cbL_${XYZ}$ (j, 0, 1)*dvd(0) - poly(1) = v_rs_ws_${XYZ}$ (j, k, l, i) & - + poly_coef_cbL_${XYZ}$ (j, 1, 0)*dvd(0) & - + poly_coef_cbL_${XYZ}$ (j, 1, 1)*dvd(-1) - poly(2) = v_rs_ws_${XYZ}$ (j, k, l, i) & - + poly_coef_cbL_${XYZ}$ (j, 2, 0)*dvd(-1) & - + poly_coef_cbL_${XYZ}$ (j, 2, 1)*dvd(-2) - - beta(0) = beta_coef_${XYZ}$ (j, 0, 0)*dvd(1)*dvd(1) & - + beta_coef_${XYZ}$ (j, 0, 1)*dvd(1)*dvd(0) & - + beta_coef_${XYZ}$ (j, 0, 2)*dvd(0)*dvd(0) & - + weno_eps - beta(1) = beta_coef_${XYZ}$ (j, 1, 0)*dvd(0)*dvd(0) & - + beta_coef_${XYZ}$ (j, 1, 1)*dvd(0)*dvd(-1) & - + beta_coef_${XYZ}$ (j, 1, 2)*dvd(-1)*dvd(-1) & - + weno_eps - beta(2) = beta_coef_${XYZ}$ (j, 2, 0)*dvd(-1)*dvd(-1) & - + beta_coef_${XYZ}$ (j, 2, 1)*dvd(-1)*dvd(-2) & - + beta_coef_${XYZ}$ (j, 2, 2)*dvd(-2)*dvd(-2) & - + weno_eps - - if (wenojs) then - alpha = d_cbL_${XYZ}$ (:, j)/(beta*beta) - - elseif (mapped_weno) then - alpha = d_cbL_${XYZ}$ (:, j)/(beta*beta) - omega = alpha/sum(alpha) - alpha = (d_cbL_${XYZ}$ (:, j)*(1._wp + d_cbL_${XYZ}$ (:, j) - 3._wp*omega) + omega**2._wp) & - *(omega/(d_cbL_${XYZ}$ (:, j)**2._wp + omega*(1._wp - 2._wp*d_cbL_${XYZ}$ (:, j)))) + #:call GPU_PARALLEL_LOOP(collapse=3,private='[dvd,poly,beta,alpha,omega,tau,delta]') + do l = is3_weno%beg, is3_weno%end + do k = is2_weno%beg, is2_weno%end + do j = is1_weno%beg, is1_weno%end + $:GPU_LOOP(parallelism='[seq]') + do i = 1, v_size + ! reconstruct from left side - elseif (wenoz) then - ! Borges, et al. (2008) + dvd(1) = v_rs_ws_${XYZ}$ (j + 2, k, l, i) & + - v_rs_ws_${XYZ}$ (j + 1, k, l, i) + dvd(0) = v_rs_ws_${XYZ}$ (j + 1, k, l, i) & + - v_rs_ws_${XYZ}$ (j, k, l, i) + dvd(-1) = v_rs_ws_${XYZ}$ (j, k, l, i) & + - v_rs_ws_${XYZ}$ (j - 1, k, l, i) + dvd(-2) = v_rs_ws_${XYZ}$ (j - 1, k, l, i) & + - v_rs_ws_${XYZ}$ (j - 2, k, l, i) - tau = abs(beta(2) - beta(0)) ! Equation 25 - alpha = d_cbL_${XYZ}$ (:, j)*(1._wp + tau/beta) ! Equation 28 (note: weno_eps was already added to beta) + poly(0) = v_rs_ws_${XYZ}$ (j, k, l, i) & + + poly_coef_cbL_${XYZ}$ (j, 0, 0)*dvd(1) & + + poly_coef_cbL_${XYZ}$ (j, 0, 1)*dvd(0) + poly(1) = v_rs_ws_${XYZ}$ (j, k, l, i) & + + poly_coef_cbL_${XYZ}$ (j, 1, 0)*dvd(0) & + + poly_coef_cbL_${XYZ}$ (j, 1, 1)*dvd(-1) + poly(2) = v_rs_ws_${XYZ}$ (j, k, l, i) & + + poly_coef_cbL_${XYZ}$ (j, 2, 0)*dvd(-1) & + + poly_coef_cbL_${XYZ}$ (j, 2, 1)*dvd(-2) - elseif (teno) then - ! Fu, et al. (2016) - ! Fu''s code: https://dx.doi.org/10.13140/RG.2.2.36250.34247 - tau = abs(beta(2) - beta(0)) - alpha = 1._wp + tau/beta ! Equation 22 (reuse alpha as gamma; pick C=1 & q=6) - alpha = (alpha*alpha*alpha)**2._wp ! Equation 22 cont. (some CPU compilers cannot optimize x**6.0) - omega = alpha/sum(alpha) ! Equation 25 (reuse omega as xi) - delta = merge(0._wp, 1._wp, omega < teno_CT)! Equation 26 - alpha = delta*d_cbL_${XYZ}$ (:, j) ! Equation 27 + beta(0) = beta_coef_${XYZ}$ (j, 0, 0)*dvd(1)*dvd(1) & + + beta_coef_${XYZ}$ (j, 0, 1)*dvd(1)*dvd(0) & + + beta_coef_${XYZ}$ (j, 0, 2)*dvd(0)*dvd(0) & + + weno_eps + beta(1) = beta_coef_${XYZ}$ (j, 1, 0)*dvd(0)*dvd(0) & + + beta_coef_${XYZ}$ (j, 1, 1)*dvd(0)*dvd(-1) & + + beta_coef_${XYZ}$ (j, 1, 2)*dvd(-1)*dvd(-1) & + + weno_eps + beta(2) = beta_coef_${XYZ}$ (j, 2, 0)*dvd(-1)*dvd(-1) & + + beta_coef_${XYZ}$ (j, 2, 1)*dvd(-1)*dvd(-2) & + + beta_coef_${XYZ}$ (j, 2, 2)*dvd(-2)*dvd(-2) & + + weno_eps - end if + if (wenojs) then + alpha = d_cbL_${XYZ}$ (:, j)/(beta*beta) - omega = alpha/sum(alpha) + elseif (mapped_weno) then + alpha = d_cbL_${XYZ}$ (:, j)/(beta*beta) + omega = alpha/sum(alpha) + alpha = (d_cbL_${XYZ}$ (:, j)*(1._wp + d_cbL_${XYZ}$ (:, j) - 3._wp*omega) + omega**2._wp) & + *(omega/(d_cbL_${XYZ}$ (:, j)**2._wp + omega*(1._wp - 2._wp*d_cbL_${XYZ}$ (:, j)))) - vL_rs_vf_${XYZ}$ (j, k, l, i) = sum(omega*poly) + elseif (wenoz) then + ! Borges, et al. (2008) - ! reconstruct from right side + tau = abs(beta(2) - beta(0)) ! Equation 25 + alpha = d_cbL_${XYZ}$ (:, j)*(1._wp + tau/beta) ! Equation 28 (note: weno_eps was already added to beta) - poly(0) = v_rs_ws_${XYZ}$ (j, k, l, i) & - + poly_coef_cbR_${XYZ}$ (j, 0, 0)*dvd(1) & - + poly_coef_cbR_${XYZ}$ (j, 0, 1)*dvd(0) - poly(1) = v_rs_ws_${XYZ}$ (j, k, l, i) & - + poly_coef_cbR_${XYZ}$ (j, 1, 0)*dvd(0) & - + poly_coef_cbR_${XYZ}$ (j, 1, 1)*dvd(-1) - poly(2) = v_rs_ws_${XYZ}$ (j, k, l, i) & - + poly_coef_cbR_${XYZ}$ (j, 2, 0)*dvd(-1) & - + poly_coef_cbR_${XYZ}$ (j, 2, 1)*dvd(-2) + elseif (teno) then + ! Fu, et al. (2016) + ! Fu''s code: https://dx.doi.org/10.13140/RG.2.2.36250.34247 + tau = abs(beta(2) - beta(0)) + alpha = 1._wp + tau/beta ! Equation 22 (reuse alpha as gamma; pick C=1 & q=6) + alpha = (alpha*alpha*alpha)**2._wp ! Equation 22 cont. (some CPU compilers cannot optimize x**6.0) + omega = alpha/sum(alpha) ! Equation 25 (reuse omega as xi) + delta = merge(0._wp, 1._wp, omega < teno_CT)! Equation 26 + alpha = delta*d_cbL_${XYZ}$ (:, j) ! Equation 27 - if (wenojs) then - alpha = d_cbR_${XYZ}$ (:, j)/(beta*beta) + end if - elseif (mapped_weno) then - alpha = d_cbR_${XYZ}$ (:, j)/(beta*beta) omega = alpha/sum(alpha) - alpha = (d_cbR_${XYZ}$ (:, j)*(1._wp + d_cbR_${XYZ}$ (:, j) - 3._wp*omega) + omega**2._wp) & - *(omega/(d_cbR_${XYZ}$ (:, j)**2._wp + omega*(1._wp - 2._wp*d_cbR_${XYZ}$ (:, j)))) - elseif (wenoz) then + vL_rs_vf_${XYZ}$ (j, k, l, i) = sum(omega*poly) + + ! reconstruct from right side - alpha = d_cbR_${XYZ}$ (:, j)*(1._wp + tau/beta) + poly(0) = v_rs_ws_${XYZ}$ (j, k, l, i) & + + poly_coef_cbR_${XYZ}$ (j, 0, 0)*dvd(1) & + + poly_coef_cbR_${XYZ}$ (j, 0, 1)*dvd(0) + poly(1) = v_rs_ws_${XYZ}$ (j, k, l, i) & + + poly_coef_cbR_${XYZ}$ (j, 1, 0)*dvd(0) & + + poly_coef_cbR_${XYZ}$ (j, 1, 1)*dvd(-1) + poly(2) = v_rs_ws_${XYZ}$ (j, k, l, i) & + + poly_coef_cbR_${XYZ}$ (j, 2, 0)*dvd(-1) & + + poly_coef_cbR_${XYZ}$ (j, 2, 1)*dvd(-2) - elseif (teno) then - alpha = delta*d_cbR_${XYZ}$ (:, j) + if (wenojs) then + alpha = d_cbR_${XYZ}$ (:, j)/(beta*beta) - end if + elseif (mapped_weno) then + alpha = d_cbR_${XYZ}$ (:, j)/(beta*beta) + omega = alpha/sum(alpha) + alpha = (d_cbR_${XYZ}$ (:, j)*(1._wp + d_cbR_${XYZ}$ (:, j) - 3._wp*omega) + omega**2._wp) & + *(omega/(d_cbR_${XYZ}$ (:, j)**2._wp + omega*(1._wp - 2._wp*d_cbR_${XYZ}$ (:, j)))) - omega = alpha/sum(alpha) + elseif (wenoz) then - vR_rs_vf_${XYZ}$ (j, k, l, i) = sum(omega*poly) + alpha = d_cbR_${XYZ}$ (:, j)*(1._wp + tau/beta) + elseif (teno) then + alpha = delta*d_cbR_${XYZ}$ (:, j) + + end if + + omega = alpha/sum(alpha) + + vR_rs_vf_${XYZ}$ (j, k, l, i) = sum(omega*poly) + + end do end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP if (mp_weno) then call s_preserve_monotonicity(v_rs_ws_${XYZ}$, vL_rs_vf_${XYZ}$, & @@ -908,191 +911,192 @@ contains elseif (weno_order == 7) then #:for WENO_DIR, XYZ in [(1, 'x'), (2, 'y'), (3, 'z')] if (weno_dir == ${WENO_DIR}$) then - $:GPU_PARALLEL_LOOP(collapse=3,private='[poly,beta,alpha,omega,tau,delta,dvd,v]') - do l = is3_weno%beg, is3_weno%end - do k = is2_weno%beg, is2_weno%end - do j = is1_weno%beg, is1_weno%end - $:GPU_LOOP(parallelism='[seq]') - do i = 1, v_size - - if (teno) v = v_rs_ws_${XYZ}$ (j - 3:j + 3, k, l, i) ! temporary field value array for clarity - - if (.not. teno) then - dvd(2) = v_rs_ws_${XYZ}$ (j + 3, k, l, i) & - - v_rs_ws_${XYZ}$ (j + 2, k, l, i) - dvd(1) = v_rs_ws_${XYZ}$ (j + 2, k, l, i) & - - v_rs_ws_${XYZ}$ (j + 1, k, l, i) - dvd(0) = v_rs_ws_${XYZ}$ (j + 1, k, l, i) & - - v_rs_ws_${XYZ}$ (j, k, l, i) - dvd(-1) = v_rs_ws_${XYZ}$ (j, k, l, i) & - - v_rs_ws_${XYZ}$ (j - 1, k, l, i) - dvd(-2) = v_rs_ws_${XYZ}$ (j - 1, k, l, i) & - - v_rs_ws_${XYZ}$ (j - 2, k, l, i) - dvd(-3) = v_rs_ws_${XYZ}$ (j - 2, k, l, i) & - - v_rs_ws_${XYZ}$ (j - 3, k, l, i) + #:call GPU_PARALLEL_LOOP(collapse=3,private='[poly,beta,alpha,omega,tau,delta,dvd,v]') + do l = is3_weno%beg, is3_weno%end + do k = is2_weno%beg, is2_weno%end + do j = is1_weno%beg, is1_weno%end + $:GPU_LOOP(parallelism='[seq]') + do i = 1, v_size + + if (teno) v = v_rs_ws_${XYZ}$ (j - 3:j + 3, k, l, i) ! temporary field value array for clarity + + if (.not. teno) then + dvd(2) = v_rs_ws_${XYZ}$ (j + 3, k, l, i) & + - v_rs_ws_${XYZ}$ (j + 2, k, l, i) + dvd(1) = v_rs_ws_${XYZ}$ (j + 2, k, l, i) & + - v_rs_ws_${XYZ}$ (j + 1, k, l, i) + dvd(0) = v_rs_ws_${XYZ}$ (j + 1, k, l, i) & + - v_rs_ws_${XYZ}$ (j, k, l, i) + dvd(-1) = v_rs_ws_${XYZ}$ (j, k, l, i) & + - v_rs_ws_${XYZ}$ (j - 1, k, l, i) + dvd(-2) = v_rs_ws_${XYZ}$ (j - 1, k, l, i) & + - v_rs_ws_${XYZ}$ (j - 2, k, l, i) + dvd(-3) = v_rs_ws_${XYZ}$ (j - 2, k, l, i) & + - v_rs_ws_${XYZ}$ (j - 3, k, l, i) + + poly(3) = v_rs_ws_${XYZ}$ (j, k, l, i) & + + poly_coef_cbL_${XYZ}$ (j, 0, 0)*dvd(2) & + + poly_coef_cbL_${XYZ}$ (j, 0, 1)*dvd(1) & + + poly_coef_cbL_${XYZ}$ (j, 0, 2)*dvd(0) + poly(2) = v_rs_ws_${XYZ}$ (j, k, l, i) & + + poly_coef_cbL_${XYZ}$ (j, 1, 0)*dvd(1) & + + poly_coef_cbL_${XYZ}$ (j, 1, 1)*dvd(0) & + + poly_coef_cbL_${XYZ}$ (j, 1, 2)*dvd(-1) + poly(1) = v_rs_ws_${XYZ}$ (j, k, l, i) & + + poly_coef_cbL_${XYZ}$ (j, 2, 0)*dvd(0) & + + poly_coef_cbL_${XYZ}$ (j, 2, 1)*dvd(-1) & + + poly_coef_cbL_${XYZ}$ (j, 2, 2)*dvd(-2) + poly(0) = v_rs_ws_${XYZ}$ (j, k, l, i) & + + poly_coef_cbL_${XYZ}$ (j, 3, 0)*dvd(-1) & + + poly_coef_cbL_${XYZ}$ (j, 3, 1)*dvd(-2) & + + poly_coef_cbL_${XYZ}$ (j, 3, 2)*dvd(-3) + + else + ! (Fu, et al., 2016) Table 1 + ! Note: Unlike TENO5, TENO7 stencils differ from WENO7 stencils + ! See Figure 2 (right) for right-sided flux (at i+1/2) + ! Here we need the left-sided flux, so we flip the weights with respect to the x=i point + ! But we need to keep the stencil order to reuse the beta coefficients + poly(0) = ( 2._wp*v(-1) + 5._wp*v( 0) - 1._wp*v( 1)) / 6._wp !& + poly(1) = (11._wp*v( 0) - 7._wp*v( 1) + 2._wp*v( 2)) / 6._wp !& + poly(2) = (-1._wp*v(-2) + 5._wp*v(-1) + 2._wp*v( 0)) / 6._wp !& + poly(3) = (25._wp*v( 0) - 23._wp*v( 1) + 13._wp*v( 2) - 3._wp*v( 3)) / 12._wp !& + poly(4) = ( 1._wp*v(-3) - 5._wp*v(-2) + 13._wp*v(-1) + 3._wp*v( 0)) / 12._wp !& + end if + + if (.not. teno) then + + beta(3) = beta_coef_${XYZ}$ (j, 0, 0)*dvd(2)*dvd(2) & + + beta_coef_${XYZ}$ (j, 0, 1)*dvd(2)*dvd(1) & + + beta_coef_${XYZ}$ (j, 0, 2)*dvd(2)*dvd(0) & + + beta_coef_${XYZ}$ (j, 0, 3)*dvd(1)*dvd(1) & + + beta_coef_${XYZ}$ (j, 0, 4)*dvd(1)*dvd(0) & + + beta_coef_${XYZ}$ (j, 0, 5)*dvd(0)*dvd(0) & + + weno_eps + + beta(2) = beta_coef_${XYZ}$ (j, 1, 0)*dvd(1)*dvd(1) & + + beta_coef_${XYZ}$ (j, 1, 1)*dvd(1)*dvd(0) & + + beta_coef_${XYZ}$ (j, 1, 2)*dvd(1)*dvd(-1) & + + beta_coef_${XYZ}$ (j, 1, 3)*dvd(0)*dvd(0) & + + beta_coef_${XYZ}$ (j, 1, 4)*dvd(0)*dvd(-1) & + + beta_coef_${XYZ}$ (j, 1, 5)*dvd(-1)*dvd(-1) & + + weno_eps + + beta(1) = beta_coef_${XYZ}$ (j, 2, 0)*dvd(0)*dvd(0) & + + beta_coef_${XYZ}$ (j, 2, 1)*dvd(0)*dvd(-1) & + + beta_coef_${XYZ}$ (j, 2, 2)*dvd(0)*dvd(-2) & + + beta_coef_${XYZ}$ (j, 2, 3)*dvd(-1)*dvd(-1) & + + beta_coef_${XYZ}$ (j, 2, 4)*dvd(-1)*dvd(-2) & + + beta_coef_${XYZ}$ (j, 2, 5)*dvd(-2)*dvd(-2) & + + weno_eps + + beta(0) = beta_coef_${XYZ}$ (j, 3, 0)*dvd(-1)*dvd(-1) & + + beta_coef_${XYZ}$ (j, 3, 1)*dvd(-1)*dvd(-2) & + + beta_coef_${XYZ}$ (j, 3, 2)*dvd(-1)*dvd(-3) & + + beta_coef_${XYZ}$ (j, 3, 3)*dvd(-2)*dvd(-2) & + + beta_coef_${XYZ}$ (j, 3, 4)*dvd(-2)*dvd(-3) & + + beta_coef_${XYZ}$ (j, 3, 5)*dvd(-3)*dvd(-3) & + + weno_eps + + else ! TENO + ! High-Order Low-Dissipation Targeted ENO Schemes for Ideal Magnetohydrodynamics (Fu & Tang, 2019) Section 3.2 + beta(0) = 13._wp/12._wp*(v(-1) - 2._wp*v( 0) + v( 1))**2._wp + (( v(-1) - v( 1))**2._wp)/4._wp + weno_eps !& + beta(1) = 13._wp/12._wp*(v( 0) - 2._wp*v( 1) + v( 2))**2._wp + ((3._wp*v( 0) - 4._wp*v( 1) + v( 2))**2._wp)/4._wp + weno_eps !& + beta(2) = 13._wp/12._wp*(v(-2) - 2._wp*v(-1) + v( 0))**2._wp + (( v(-2) - 4._wp*v(-1) + 3._wp*v( 0))**2._wp)/4._wp + weno_eps !& + + beta(3) = ( v( 0)*(2107._wp*v( 0) - 9402._wp*v( 1) + 7042._wp*v( 2) - 1854._wp*v( 3)) & !& + + v( 1)*( 11003._wp*v( 1) - 17246._wp*v( 2) + 4642._wp*v( 3)) & !& + + v( 2)*( 7043._wp*v( 2) - 3882._wp*v( 3)) & !& + + v( 3)*( 547._wp*v( 3)) ) / 240._wp & !& + + weno_eps !& + + beta(4) = ( v(-3)*(547._wp*v(-3) - 3882._wp*v(-2) + 4642._wp*v(-1) - 1854._wp*v( 0)) & !& + + v(-2)*( 7043._wp*v(-2) - 17246._wp*v(-1) + 7042._wp*v( 0)) & !& + + v(-1)*( 11003._wp*v(-1) - 9402._wp*v( 0)) & !& + + v( 0)*( 2107._wp*v( 0)) ) / 240._wp & !& + + weno_eps !& + end if + + if (wenojs) then + alpha = d_cbL_${XYZ}$ (:, j)/(beta*beta) + + elseif (mapped_weno) then + alpha = d_cbL_${XYZ}$ (:, j)/(beta*beta) + omega = alpha/sum(alpha) + alpha = (d_cbL_${XYZ}$ (:, j)*(1._wp + d_cbL_${XYZ}$ (:, j) - 3._wp*omega) + omega**2._wp) & + *(omega/(d_cbL_${XYZ}$ (:, j)**2._wp + omega*(1._wp - 2._wp*d_cbL_${XYZ}$ (:, j)))) + + elseif (wenoz) then + ! Castro, et al. (2010) + ! Don & Borges (2013) also helps + tau = abs(beta(3) - beta(0)) ! Equation 50 + alpha = d_cbL_${XYZ}$ (:, j)*(1._wp + (tau/beta)**wenoz_q) ! q = 2,3,4 for stability + + elseif (teno) then + tau = abs(beta(4) - beta(3)) ! Note the reordering of stencils + alpha = 1._wp + tau/beta + alpha = (alpha*alpha*alpha)**2._wp ! some CPU compilers cannot optimize x**6.0 + omega = alpha/sum(alpha) + delta = merge(0._wp, 1._wp, omega < teno_CT) + alpha = delta*d_cbL_${XYZ}$ (:, j) + + end if - poly(3) = v_rs_ws_${XYZ}$ (j, k, l, i) & - + poly_coef_cbL_${XYZ}$ (j, 0, 0)*dvd(2) & - + poly_coef_cbL_${XYZ}$ (j, 0, 1)*dvd(1) & - + poly_coef_cbL_${XYZ}$ (j, 0, 2)*dvd(0) - poly(2) = v_rs_ws_${XYZ}$ (j, k, l, i) & - + poly_coef_cbL_${XYZ}$ (j, 1, 0)*dvd(1) & - + poly_coef_cbL_${XYZ}$ (j, 1, 1)*dvd(0) & - + poly_coef_cbL_${XYZ}$ (j, 1, 2)*dvd(-1) - poly(1) = v_rs_ws_${XYZ}$ (j, k, l, i) & - + poly_coef_cbL_${XYZ}$ (j, 2, 0)*dvd(0) & - + poly_coef_cbL_${XYZ}$ (j, 2, 1)*dvd(-1) & - + poly_coef_cbL_${XYZ}$ (j, 2, 2)*dvd(-2) - poly(0) = v_rs_ws_${XYZ}$ (j, k, l, i) & - + poly_coef_cbL_${XYZ}$ (j, 3, 0)*dvd(-1) & - + poly_coef_cbL_${XYZ}$ (j, 3, 1)*dvd(-2) & - + poly_coef_cbL_${XYZ}$ (j, 3, 2)*dvd(-3) - - else - ! (Fu, et al., 2016) Table 1 - ! Note: Unlike TENO5, TENO7 stencils differ from WENO7 stencils - ! See Figure 2 (right) for right-sided flux (at i+1/2) - ! Here we need the left-sided flux, so we flip the weights with respect to the x=i point - ! But we need to keep the stencil order to reuse the beta coefficients - poly(0) = ( 2._wp*v(-1) + 5._wp*v( 0) - 1._wp*v( 1)) / 6._wp !& - poly(1) = (11._wp*v( 0) - 7._wp*v( 1) + 2._wp*v( 2)) / 6._wp !& - poly(2) = (-1._wp*v(-2) + 5._wp*v(-1) + 2._wp*v( 0)) / 6._wp !& - poly(3) = (25._wp*v( 0) - 23._wp*v( 1) + 13._wp*v( 2) - 3._wp*v( 3)) / 12._wp !& - poly(4) = ( 1._wp*v(-3) - 5._wp*v(-2) + 13._wp*v(-1) + 3._wp*v( 0)) / 12._wp !& - end if - - if (.not. teno) then - - beta(3) = beta_coef_${XYZ}$ (j, 0, 0)*dvd(2)*dvd(2) & - + beta_coef_${XYZ}$ (j, 0, 1)*dvd(2)*dvd(1) & - + beta_coef_${XYZ}$ (j, 0, 2)*dvd(2)*dvd(0) & - + beta_coef_${XYZ}$ (j, 0, 3)*dvd(1)*dvd(1) & - + beta_coef_${XYZ}$ (j, 0, 4)*dvd(1)*dvd(0) & - + beta_coef_${XYZ}$ (j, 0, 5)*dvd(0)*dvd(0) & - + weno_eps - - beta(2) = beta_coef_${XYZ}$ (j, 1, 0)*dvd(1)*dvd(1) & - + beta_coef_${XYZ}$ (j, 1, 1)*dvd(1)*dvd(0) & - + beta_coef_${XYZ}$ (j, 1, 2)*dvd(1)*dvd(-1) & - + beta_coef_${XYZ}$ (j, 1, 3)*dvd(0)*dvd(0) & - + beta_coef_${XYZ}$ (j, 1, 4)*dvd(0)*dvd(-1) & - + beta_coef_${XYZ}$ (j, 1, 5)*dvd(-1)*dvd(-1) & - + weno_eps - - beta(1) = beta_coef_${XYZ}$ (j, 2, 0)*dvd(0)*dvd(0) & - + beta_coef_${XYZ}$ (j, 2, 1)*dvd(0)*dvd(-1) & - + beta_coef_${XYZ}$ (j, 2, 2)*dvd(0)*dvd(-2) & - + beta_coef_${XYZ}$ (j, 2, 3)*dvd(-1)*dvd(-1) & - + beta_coef_${XYZ}$ (j, 2, 4)*dvd(-1)*dvd(-2) & - + beta_coef_${XYZ}$ (j, 2, 5)*dvd(-2)*dvd(-2) & - + weno_eps - - beta(0) = beta_coef_${XYZ}$ (j, 3, 0)*dvd(-1)*dvd(-1) & - + beta_coef_${XYZ}$ (j, 3, 1)*dvd(-1)*dvd(-2) & - + beta_coef_${XYZ}$ (j, 3, 2)*dvd(-1)*dvd(-3) & - + beta_coef_${XYZ}$ (j, 3, 3)*dvd(-2)*dvd(-2) & - + beta_coef_${XYZ}$ (j, 3, 4)*dvd(-2)*dvd(-3) & - + beta_coef_${XYZ}$ (j, 3, 5)*dvd(-3)*dvd(-3) & - + weno_eps - - else ! TENO - ! High-Order Low-Dissipation Targeted ENO Schemes for Ideal Magnetohydrodynamics (Fu & Tang, 2019) Section 3.2 - beta(0) = 13._wp/12._wp*(v(-1) - 2._wp*v( 0) + v( 1))**2._wp + (( v(-1) - v( 1))**2._wp)/4._wp + weno_eps !& - beta(1) = 13._wp/12._wp*(v( 0) - 2._wp*v( 1) + v( 2))**2._wp + ((3._wp*v( 0) - 4._wp*v( 1) + v( 2))**2._wp)/4._wp + weno_eps !& - beta(2) = 13._wp/12._wp*(v(-2) - 2._wp*v(-1) + v( 0))**2._wp + (( v(-2) - 4._wp*v(-1) + 3._wp*v( 0))**2._wp)/4._wp + weno_eps !& - - beta(3) = ( v( 0)*(2107._wp*v( 0) - 9402._wp*v( 1) + 7042._wp*v( 2) - 1854._wp*v( 3)) & !& - + v( 1)*( 11003._wp*v( 1) - 17246._wp*v( 2) + 4642._wp*v( 3)) & !& - + v( 2)*( 7043._wp*v( 2) - 3882._wp*v( 3)) & !& - + v( 3)*( 547._wp*v( 3)) ) / 240._wp & !& - + weno_eps !& - - beta(4) = ( v(-3)*(547._wp*v(-3) - 3882._wp*v(-2) + 4642._wp*v(-1) - 1854._wp*v( 0)) & !& - + v(-2)*( 7043._wp*v(-2) - 17246._wp*v(-1) + 7042._wp*v( 0)) & !& - + v(-1)*( 11003._wp*v(-1) - 9402._wp*v( 0)) & !& - + v( 0)*( 2107._wp*v( 0)) ) / 240._wp & !& - + weno_eps !& - end if - - if (wenojs) then - alpha = d_cbL_${XYZ}$ (:, j)/(beta*beta) - - elseif (mapped_weno) then - alpha = d_cbL_${XYZ}$ (:, j)/(beta*beta) omega = alpha/sum(alpha) - alpha = (d_cbL_${XYZ}$ (:, j)*(1._wp + d_cbL_${XYZ}$ (:, j) - 3._wp*omega) + omega**2._wp) & - *(omega/(d_cbL_${XYZ}$ (:, j)**2._wp + omega*(1._wp - 2._wp*d_cbL_${XYZ}$ (:, j)))) - - elseif (wenoz) then - ! Castro, et al. (2010) - ! Don & Borges (2013) also helps - tau = abs(beta(3) - beta(0)) ! Equation 50 - alpha = d_cbL_${XYZ}$ (:, j)*(1._wp + (tau/beta)**wenoz_q) ! q = 2,3,4 for stability - - elseif (teno) then - tau = abs(beta(4) - beta(3)) ! Note the reordering of stencils - alpha = 1._wp + tau/beta - alpha = (alpha*alpha*alpha)**2._wp ! some CPU compilers cannot optimize x**6.0 - omega = alpha/sum(alpha) - delta = merge(0._wp, 1._wp, omega < teno_CT) - alpha = delta*d_cbL_${XYZ}$ (:, j) - - end if - - omega = alpha/sum(alpha) - vL_rs_vf_${XYZ}$ (j, k, l, i) = sum(omega*poly) + vL_rs_vf_${XYZ}$ (j, k, l, i) = sum(omega*poly) + + if (.not. teno) then + poly(3) = v_rs_ws_${XYZ}$ (j, k, l, i) & + + poly_coef_cbR_${XYZ}$ (j, 0, 0)*dvd(2) & + + poly_coef_cbR_${XYZ}$ (j, 0, 1)*dvd(1) & + + poly_coef_cbR_${XYZ}$ (j, 0, 2)*dvd(0) + poly(2) = v_rs_ws_${XYZ}$ (j, k, l, i) & + + poly_coef_cbR_${XYZ}$ (j, 1, 0)*dvd(1) & + + poly_coef_cbR_${XYZ}$ (j, 1, 1)*dvd(0) & + + poly_coef_cbR_${XYZ}$ (j, 1, 2)*dvd(-1) + poly(1) = v_rs_ws_${XYZ}$ (j, k, l, i) & + + poly_coef_cbR_${XYZ}$ (j, 2, 0)*dvd(0) & + + poly_coef_cbR_${XYZ}$ (j, 2, 1)*dvd(-1) & + + poly_coef_cbR_${XYZ}$ (j, 2, 2)*dvd(-2) + poly(0) = v_rs_ws_${XYZ}$ (j, k, l, i) & + + poly_coef_cbR_${XYZ}$ (j, 3, 0)*dvd(-1) & + + poly_coef_cbR_${XYZ}$ (j, 3, 1)*dvd(-2) & + + poly_coef_cbR_${XYZ}$ (j, 3, 2)*dvd(-3) + else + poly(0) = (-1._wp*v(-1) + 5._wp*v( 0) + 2._wp*v( 1)) / 6._wp !& + poly(1) = ( 2._wp*v( 0) + 5._wp*v( 1) - 1._wp*v( 2)) / 6._wp !& + poly(2) = ( 2._wp*v(-2) - 7._wp*v(-1) + 11._wp*v( 0)) / 6._wp !& + poly(3) = ( 3._wp*v( 0) + 13._wp*v( 1) - 5._wp*v( 2) + 1._wp*v( 3)) / 12._wp !& + poly(4) = (-3._wp*v(-3) + 13._wp*v(-2) - 23._wp*v(-1) + 25._wp*v( 0)) / 12._wp !& + end if + + if (wenojs) then + alpha = d_cbR_${XYZ}$ (:, j)/(beta*beta) + + elseif (mapped_weno) then + alpha = d_cbR_${XYZ}$ (:, j)/(beta*beta) + omega = alpha/sum(alpha) + alpha = (d_cbR_${XYZ}$ (:, j)*(1._wp + d_cbR_${XYZ}$ (:, j) - 3._wp*omega) + omega**2._wp) & + *(omega/(d_cbR_${XYZ}$ (:, j)**2._wp + omega*(1._wp - 2._wp*d_cbR_${XYZ}$ (:, j)))) + + elseif (wenoz) then + alpha = d_cbR_${XYZ}$ (:, j)*(1._wp + (tau/beta)**wenoz_q) + + elseif (teno) then + alpha = delta*d_cbR_${XYZ}$ (:, j) + + end if - if (.not. teno) then - poly(3) = v_rs_ws_${XYZ}$ (j, k, l, i) & - + poly_coef_cbR_${XYZ}$ (j, 0, 0)*dvd(2) & - + poly_coef_cbR_${XYZ}$ (j, 0, 1)*dvd(1) & - + poly_coef_cbR_${XYZ}$ (j, 0, 2)*dvd(0) - poly(2) = v_rs_ws_${XYZ}$ (j, k, l, i) & - + poly_coef_cbR_${XYZ}$ (j, 1, 0)*dvd(1) & - + poly_coef_cbR_${XYZ}$ (j, 1, 1)*dvd(0) & - + poly_coef_cbR_${XYZ}$ (j, 1, 2)*dvd(-1) - poly(1) = v_rs_ws_${XYZ}$ (j, k, l, i) & - + poly_coef_cbR_${XYZ}$ (j, 2, 0)*dvd(0) & - + poly_coef_cbR_${XYZ}$ (j, 2, 1)*dvd(-1) & - + poly_coef_cbR_${XYZ}$ (j, 2, 2)*dvd(-2) - poly(0) = v_rs_ws_${XYZ}$ (j, k, l, i) & - + poly_coef_cbR_${XYZ}$ (j, 3, 0)*dvd(-1) & - + poly_coef_cbR_${XYZ}$ (j, 3, 1)*dvd(-2) & - + poly_coef_cbR_${XYZ}$ (j, 3, 2)*dvd(-3) - else - poly(0) = (-1._wp*v(-1) + 5._wp*v( 0) + 2._wp*v( 1)) / 6._wp !& - poly(1) = ( 2._wp*v( 0) + 5._wp*v( 1) - 1._wp*v( 2)) / 6._wp !& - poly(2) = ( 2._wp*v(-2) - 7._wp*v(-1) + 11._wp*v( 0)) / 6._wp !& - poly(3) = ( 3._wp*v( 0) + 13._wp*v( 1) - 5._wp*v( 2) + 1._wp*v( 3)) / 12._wp !& - poly(4) = (-3._wp*v(-3) + 13._wp*v(-2) - 23._wp*v(-1) + 25._wp*v( 0)) / 12._wp !& - end if - - if (wenojs) then - alpha = d_cbR_${XYZ}$ (:, j)/(beta*beta) - - elseif (mapped_weno) then - alpha = d_cbR_${XYZ}$ (:, j)/(beta*beta) omega = alpha/sum(alpha) - alpha = (d_cbR_${XYZ}$ (:, j)*(1._wp + d_cbR_${XYZ}$ (:, j) - 3._wp*omega) + omega**2._wp) & - *(omega/(d_cbR_${XYZ}$ (:, j)**2._wp + omega*(1._wp - 2._wp*d_cbR_${XYZ}$ (:, j)))) - - elseif (wenoz) then - alpha = d_cbR_${XYZ}$ (:, j)*(1._wp + (tau/beta)**wenoz_q) - - elseif (teno) then - alpha = delta*d_cbR_${XYZ}$ (:, j) - end if - - omega = alpha/sum(alpha) - - vR_rs_vf_${XYZ}$ (j, k, l, i) = sum(omega*poly) + vR_rs_vf_${XYZ}$ (j, k, l, i) = sum(omega*poly) + end do end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP end if #:endfor @@ -1135,48 +1139,51 @@ contains $:GPU_UPDATE(device='[v_size]') if (weno_dir == 1) then - $:GPU_PARALLEL_LOOP(collapse=4) - do j = 1, v_size - do q = is3_weno%beg, is3_weno%end - do l = is2_weno%beg, is2_weno%end - do k = is1_weno%beg - weno_polyn, is1_weno%end + weno_polyn - v_rs_ws_x(k, l, q, j) = v_vf(j)%sf(k, l, q) + #:call GPU_PARALLEL_LOOP(collapse=4) + do j = 1, v_size + do q = is3_weno%beg, is3_weno%end + do l = is2_weno%beg, is2_weno%end + do k = is1_weno%beg - weno_polyn, is1_weno%end + weno_polyn + v_rs_ws_x(k, l, q, j) = v_vf(j)%sf(k, l, q) + end do end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP end if ! Reshaping/Projecting onto Characteristic Fields in y-direction if (n == 0) return if (weno_dir == 2) then - $:GPU_PARALLEL_LOOP(collapse=4) - do j = 1, v_size - do q = is3_weno%beg, is3_weno%end - do l = is2_weno%beg, is2_weno%end - do k = is1_weno%beg - weno_polyn, is1_weno%end + weno_polyn - v_rs_ws_y(k, l, q, j) = v_vf(j)%sf(l, k, q) + #:call GPU_PARALLEL_LOOP(collapse=4) + do j = 1, v_size + do q = is3_weno%beg, is3_weno%end + do l = is2_weno%beg, is2_weno%end + do k = is1_weno%beg - weno_polyn, is1_weno%end + weno_polyn + v_rs_ws_y(k, l, q, j) = v_vf(j)%sf(l, k, q) + end do end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP end if ! Reshaping/Projecting onto Characteristic Fields in z-direction if (p == 0) return if (weno_dir == 3) then - $:GPU_PARALLEL_LOOP(collapse=4) - do j = 1, v_size - do q = is3_weno%beg, is3_weno%end - do l = is2_weno%beg, is2_weno%end - do k = is1_weno%beg - weno_polyn, is1_weno%end + weno_polyn - v_rs_ws_z(k, l, q, j) = v_vf(j)%sf(q, l, k) + #:call GPU_PARALLEL_LOOP(collapse=4) + do j = 1, v_size + do q = is3_weno%beg, is3_weno%end + do l = is2_weno%beg, is2_weno%end + do k = is1_weno%beg - weno_polyn, is1_weno%end + weno_polyn + v_rs_ws_z(k, l, q, j) = v_vf(j)%sf(q, l, k) + end do end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP end if end subroutine s_initialize_weno @@ -1192,7 +1199,7 @@ contains !! @param j First-coordinate cell index !! @param k Secone-coordinate cell index !! @param l Thire-coordinate cell index - pure subroutine s_preserve_monotonicity(v_rs_ws, vL_rs_vf, vR_rs_vf) + subroutine s_preserve_monotonicity(v_rs_ws, vL_rs_vf, vR_rs_vf) real(wp), dimension(idwbuff(1)%beg:, idwbuff(2)%beg:, idwbuff(3)%beg:, 1:), intent(IN) :: v_rs_ws real(wp), dimension(idwbuff(1)%beg:, idwbuff(2)%beg:, idwbuff(3)%beg:, 1:), intent(INOUT) :: vL_rs_vf, vR_rs_vf @@ -1226,131 +1233,132 @@ contains real(wp), parameter :: alpha_mp = 2._wp real(wp), parameter :: beta_mp = 4._wp/3._wp - $:GPU_PARALLEL_LOOP(collapse=4,private='[d]') - do l = is3_weno%beg, is3_weno%end - do k = is2_weno%beg, is2_weno%end - do j = is1_weno%beg, is1_weno%end - do i = 1, v_size - d(-1) = v_rs_ws(j, k, l, i) & - + v_rs_ws(j - 2, k, l, i) & - - v_rs_ws(j - 1, k, l, i) & - *2._wp - d(0) = v_rs_ws(j + 1, k, l, i) & - + v_rs_ws(j - 1, k, l, i) & - - v_rs_ws(j, k, l, i) & - *2._wp - d(1) = v_rs_ws(j + 2, k, l, i) & - + v_rs_ws(j, k, l, i) & - - v_rs_ws(j + 1, k, l, i) & - *2._wp - - d_MD = (sign(1._wp, 4._wp*d(-1) - d(0)) + sign(1._wp, 4._wp*d(0) - d(-1))) & - *abs((sign(1._wp, 4._wp*d(-1) - d(0)) + sign(1._wp, d(-1))) & - *(sign(1._wp, 4._wp*d(-1) - d(0)) + sign(1._wp, d(0)))) & - *min(abs(4._wp*d(-1) - d(0)), abs(d(-1)), & - abs(4._wp*d(0) - d(-1)), abs(d(0)))/8._wp - - d_LC = (sign(1._wp, 4._wp*d(0) - d(1)) + sign(1._wp, 4._wp*d(1) - d(0))) & - *abs((sign(1._wp, 4._wp*d(0) - d(1)) + sign(1._wp, d(0))) & - *(sign(1._wp, 4._wp*d(0) - d(1)) + sign(1._wp, d(1)))) & - *min(abs(4._wp*d(0) - d(1)), abs(d(0)), & - abs(4._wp*d(1) - d(0)), abs(d(1)))/8._wp - - vL_UL = v_rs_ws(j, k, l, i) & - - (v_rs_ws(j + 1, k, l, i) & - - v_rs_ws(j, k, l, i))*alpha_mp - - vL_MD = (v_rs_ws(j, k, l, i) & - + v_rs_ws(j - 1, k, l, i) & - - d_MD)*5.e-1_wp - - vL_LC = v_rs_ws(j, k, l, i) & - - (v_rs_ws(j + 1, k, l, i) & - - v_rs_ws(j, k, l, i))*5.e-1_wp + beta_mp*d_LC - - vL_min = max(min(v_rs_ws(j, k, l, i), & - v_rs_ws(j - 1, k, l, i), & - vL_MD), & - min(v_rs_ws(j, k, l, i), & - vL_UL, & - vL_LC)) - - vL_max = min(max(v_rs_ws(j, k, l, i), & - v_rs_ws(j - 1, k, l, i), & - vL_MD), & - max(v_rs_ws(j, k, l, i), & - vL_UL, & - vL_LC)) - - vL_rs_vf(j, k, l, i) = vL_rs_vf(j, k, l, i) & - + (sign(5.e-1_wp, vL_min - vL_rs_vf(j, k, l, i)) & - + sign(5.e-1_wp, vL_max - vL_rs_vf(j, k, l, i))) & - *min(abs(vL_min - vL_rs_vf(j, k, l, i)), & - abs(vL_max - vL_rs_vf(j, k, l, i))) - ! END: Left Monotonicity Preserving Bound - - ! Right Monotonicity Preserving Bound - d(-1) = v_rs_ws(j, k, l, i) & - + v_rs_ws(j - 2, k, l, i) & - - v_rs_ws(j - 1, k, l, i) & - *2._wp - d(0) = v_rs_ws(j + 1, k, l, i) & - + v_rs_ws(j - 1, k, l, i) & - - v_rs_ws(j, k, l, i) & - *2._wp - d(1) = v_rs_ws(j + 2, k, l, i) & - + v_rs_ws(j, k, l, i) & - - v_rs_ws(j + 1, k, l, i) & - *2._wp - - d_MD = (sign(1._wp, 4._wp*d(0) - d(1)) + sign(1._wp, 4._wp*d(1) - d(0))) & - *abs((sign(1._wp, 4._wp*d(0) - d(1)) + sign(1._wp, d(0))) & - *(sign(1._wp, 4._wp*d(0) - d(1)) + sign(1._wp, d(1)))) & - *min(abs(4._wp*d(0) - d(1)), abs(d(0)), & - abs(4._wp*d(1) - d(0)), abs(d(1)))/8._wp - - d_LC = (sign(1._wp, 4._wp*d(-1) - d(0)) + sign(1._wp, 4._wp*d(0) - d(-1))) & - *abs((sign(1._wp, 4._wp*d(-1) - d(0)) + sign(1._wp, d(-1))) & - *(sign(1._wp, 4._wp*d(-1) - d(0)) + sign(1._wp, d(0)))) & - *min(abs(4._wp*d(-1) - d(0)), abs(d(-1)), & - abs(4._wp*d(0) - d(-1)), abs(d(0)))/8._wp - - vR_UL = v_rs_ws(j, k, l, i) & - + (v_rs_ws(j, k, l, i) & - - v_rs_ws(j - 1, k, l, i))*alpha_mp - - vR_MD = (v_rs_ws(j, k, l, i) & - + v_rs_ws(j + 1, k, l, i) & - - d_MD)*5.e-1_wp - - vR_LC = v_rs_ws(j, k, l, i) & - + (v_rs_ws(j, k, l, i) & - - v_rs_ws(j - 1, k, l, i))*5.e-1_wp + beta_mp*d_LC - - vR_min = max(min(v_rs_ws(j, k, l, i), & - v_rs_ws(j + 1, k, l, i), & - vR_MD), & - min(v_rs_ws(j, k, l, i), & - vR_UL, & - vR_LC)) - - vR_max = min(max(v_rs_ws(j, k, l, i), & - v_rs_ws(j + 1, k, l, i), & - vR_MD), & - max(v_rs_ws(j, k, l, i), & - vR_UL, & - vR_LC)) - - vR_rs_vf(j, k, l, i) = vR_rs_vf(j, k, l, i) & - + (sign(5.e-1_wp, vR_min - vR_rs_vf(j, k, l, i)) & - + sign(5.e-1_wp, vR_max - vR_rs_vf(j, k, l, i))) & - *min(abs(vR_min - vR_rs_vf(j, k, l, i)), & - abs(vR_max - vR_rs_vf(j, k, l, i))) - ! END: Right Monotonicity Preserving Bound + #:call GPU_PARALLEL_LOOP(collapse=4,private='[d]') + do l = is3_weno%beg, is3_weno%end + do k = is2_weno%beg, is2_weno%end + do j = is1_weno%beg, is1_weno%end + do i = 1, v_size + d(-1) = v_rs_ws(j, k, l, i) & + + v_rs_ws(j - 2, k, l, i) & + - v_rs_ws(j - 1, k, l, i) & + *2._wp + d(0) = v_rs_ws(j + 1, k, l, i) & + + v_rs_ws(j - 1, k, l, i) & + - v_rs_ws(j, k, l, i) & + *2._wp + d(1) = v_rs_ws(j + 2, k, l, i) & + + v_rs_ws(j, k, l, i) & + - v_rs_ws(j + 1, k, l, i) & + *2._wp + + d_MD = (sign(1._wp, 4._wp*d(-1) - d(0)) + sign(1._wp, 4._wp*d(0) - d(-1))) & + *abs((sign(1._wp, 4._wp*d(-1) - d(0)) + sign(1._wp, d(-1))) & + *(sign(1._wp, 4._wp*d(-1) - d(0)) + sign(1._wp, d(0)))) & + *min(abs(4._wp*d(-1) - d(0)), abs(d(-1)), & + abs(4._wp*d(0) - d(-1)), abs(d(0)))/8._wp + + d_LC = (sign(1._wp, 4._wp*d(0) - d(1)) + sign(1._wp, 4._wp*d(1) - d(0))) & + *abs((sign(1._wp, 4._wp*d(0) - d(1)) + sign(1._wp, d(0))) & + *(sign(1._wp, 4._wp*d(0) - d(1)) + sign(1._wp, d(1)))) & + *min(abs(4._wp*d(0) - d(1)), abs(d(0)), & + abs(4._wp*d(1) - d(0)), abs(d(1)))/8._wp + + vL_UL = v_rs_ws(j, k, l, i) & + - (v_rs_ws(j + 1, k, l, i) & + - v_rs_ws(j, k, l, i))*alpha_mp + + vL_MD = (v_rs_ws(j, k, l, i) & + + v_rs_ws(j - 1, k, l, i) & + - d_MD)*5.e-1_wp + + vL_LC = v_rs_ws(j, k, l, i) & + - (v_rs_ws(j + 1, k, l, i) & + - v_rs_ws(j, k, l, i))*5.e-1_wp + beta_mp*d_LC + + vL_min = max(min(v_rs_ws(j, k, l, i), & + v_rs_ws(j - 1, k, l, i), & + vL_MD), & + min(v_rs_ws(j, k, l, i), & + vL_UL, & + vL_LC)) + + vL_max = min(max(v_rs_ws(j, k, l, i), & + v_rs_ws(j - 1, k, l, i), & + vL_MD), & + max(v_rs_ws(j, k, l, i), & + vL_UL, & + vL_LC)) + + vL_rs_vf(j, k, l, i) = vL_rs_vf(j, k, l, i) & + + (sign(5.e-1_wp, vL_min - vL_rs_vf(j, k, l, i)) & + + sign(5.e-1_wp, vL_max - vL_rs_vf(j, k, l, i))) & + *min(abs(vL_min - vL_rs_vf(j, k, l, i)), & + abs(vL_max - vL_rs_vf(j, k, l, i))) + ! END: Left Monotonicity Preserving Bound + + ! Right Monotonicity Preserving Bound + d(-1) = v_rs_ws(j, k, l, i) & + + v_rs_ws(j - 2, k, l, i) & + - v_rs_ws(j - 1, k, l, i) & + *2._wp + d(0) = v_rs_ws(j + 1, k, l, i) & + + v_rs_ws(j - 1, k, l, i) & + - v_rs_ws(j, k, l, i) & + *2._wp + d(1) = v_rs_ws(j + 2, k, l, i) & + + v_rs_ws(j, k, l, i) & + - v_rs_ws(j + 1, k, l, i) & + *2._wp + + d_MD = (sign(1._wp, 4._wp*d(0) - d(1)) + sign(1._wp, 4._wp*d(1) - d(0))) & + *abs((sign(1._wp, 4._wp*d(0) - d(1)) + sign(1._wp, d(0))) & + *(sign(1._wp, 4._wp*d(0) - d(1)) + sign(1._wp, d(1)))) & + *min(abs(4._wp*d(0) - d(1)), abs(d(0)), & + abs(4._wp*d(1) - d(0)), abs(d(1)))/8._wp + + d_LC = (sign(1._wp, 4._wp*d(-1) - d(0)) + sign(1._wp, 4._wp*d(0) - d(-1))) & + *abs((sign(1._wp, 4._wp*d(-1) - d(0)) + sign(1._wp, d(-1))) & + *(sign(1._wp, 4._wp*d(-1) - d(0)) + sign(1._wp, d(0)))) & + *min(abs(4._wp*d(-1) - d(0)), abs(d(-1)), & + abs(4._wp*d(0) - d(-1)), abs(d(0)))/8._wp + + vR_UL = v_rs_ws(j, k, l, i) & + + (v_rs_ws(j, k, l, i) & + - v_rs_ws(j - 1, k, l, i))*alpha_mp + + vR_MD = (v_rs_ws(j, k, l, i) & + + v_rs_ws(j + 1, k, l, i) & + - d_MD)*5.e-1_wp + + vR_LC = v_rs_ws(j, k, l, i) & + + (v_rs_ws(j, k, l, i) & + - v_rs_ws(j - 1, k, l, i))*5.e-1_wp + beta_mp*d_LC + + vR_min = max(min(v_rs_ws(j, k, l, i), & + v_rs_ws(j + 1, k, l, i), & + vR_MD), & + min(v_rs_ws(j, k, l, i), & + vR_UL, & + vR_LC)) + + vR_max = min(max(v_rs_ws(j, k, l, i), & + v_rs_ws(j + 1, k, l, i), & + vR_MD), & + max(v_rs_ws(j, k, l, i), & + vR_UL, & + vR_LC)) + + vR_rs_vf(j, k, l, i) = vR_rs_vf(j, k, l, i) & + + (sign(5.e-1_wp, vR_min - vR_rs_vf(j, k, l, i)) & + + sign(5.e-1_wp, vR_max - vR_rs_vf(j, k, l, i))) & + *min(abs(vR_min - vR_rs_vf(j, k, l, i)), & + abs(vR_max - vR_rs_vf(j, k, l, i))) + ! END: Right Monotonicity Preserving Bound + end do end do end do end do - end do + #:endcall GPU_PARALLEL_LOOP end subroutine s_preserve_monotonicity diff --git a/src/syscheck/syscheck.fpp b/src/syscheck/syscheck.fpp index ca2641057e..75e18efc33 100644 --- a/src/syscheck/syscheck.fpp +++ b/src/syscheck/syscheck.fpp @@ -30,6 +30,15 @@ #endif #:enddef ACCC +#:def OMPC(*args) +#ifdef MFC_OpenMP + @:LOG("[TEST] OMP: ${','.join([ x.replace("'", '') for x in args ])}$") + ${','.join([ x.replace("'", '') for x in args ])}$ +#else + @:LOG("[SKIP] OMP: ${','.join([ x.replace("'", '') for x in args ])}$") +#endif +#:enddef OMPC + #:def MPI(*args) #ifdef MFC_MPI ${','.join([ x.replace("'", '') for x in args ])}$ @@ -42,10 +51,17 @@ #endif #:enddef ACC +#:def OMP(*args) +#ifdef MFC_OpenMP + ${','.join([ x.replace("'", '') for x in args ])}$ +#endif +#:enddef OMP + program syscheck @:MPI(use mpi) @:ACC(use openacc) + @:OMP(use omp_lib) implicit none @@ -55,6 +71,7 @@ program syscheck @:ACC(integer :: i, num_devices) @:ACC(real(8), allocatable, dimension(:) :: arr) @:ACC(integer, parameter :: N = 100) + @:OMP(integer :: num_devices_omp) @:MPIC(call mpi_init(ierr)) @:MPIC(call mpi_comm_rank(MPI_COMM_WORLD, rank, ierr)) @@ -76,6 +93,10 @@ program syscheck @:ACCC('!$acc update host(arr(1:N))') @:ACCC('!$acc exit data delete(arr)') + @:OMPC('num_devices_omp = omp_get_num_devices()') + @:OMPC(call assert(num_devices_omp > 0)) + @:OMPC(call omp_set_default_device(mod(rank, nRanks))) + @:MPIC(call mpi_barrier(MPI_COMM_WORLD, ierr)) @:MPIC(call mpi_finalize(ierr)) diff --git a/toolchain/cmake/configuration.cmake.in b/toolchain/cmake/configuration.cmake.in index a9b9b5399c..4cdd90fcaa 100644 --- a/toolchain/cmake/configuration.cmake.in +++ b/toolchain/cmake/configuration.cmake.in @@ -14,6 +14,7 @@ CMake Configuration: MPI : ${MFC_MPI} OpenACC : ${MFC_OpenACC} + OpenMP : ${MFC_OpenMP} Fypp : ${FYPP_EXE} Doxygen : ${DOXYGEN_EXECUTABLE} diff --git a/toolchain/mfc/args.py b/toolchain/mfc/args.py index 7659132bb6..169ee076a4 100644 --- a/toolchain/mfc/args.py +++ b/toolchain/mfc/args.py @@ -4,9 +4,10 @@ from .build import TARGETS, DEFAULT_TARGETS from .common import MFCException, format_list_to_string from .test.cases import list_cases +from .state import gpuConfigOptions, MFCConfig # pylint: disable=too-many-locals, too-many-branches, too-many-statements -def parse(config): +def parse(config: MFCConfig): parser = argparse.ArgumentParser( prog="./mfc.sh", description="""\ @@ -46,7 +47,7 @@ def parse(config): compare.add_argument("-rel", "--reltol", metavar="RELTOL", type=float, default=1e-12, help="Relative tolerance.") compare.add_argument("-abs", "--abstol", metavar="ABSTOL", type=float, default=1e-12, help="Absolute tolerance.") - def add_common_arguments(p, mask = None): + def add_common_arguments(p: argparse.ArgumentParser, mask = None): if mask is None: mask = "" @@ -57,6 +58,10 @@ def add_common_arguments(p, mask = None): if "m" not in mask: for f in dataclasses.fields(config): + if f.name == 'gpu': + p.add_argument(f"--{f.name}", action="store", nargs='?', const= gpuConfigOptions.ACC.value,default=gpuConfigOptions.ACC.value, dest=f.name, choices=[e.value for e in gpuConfigOptions], help=f"Turn the {f.name} option to OpenACC or OpenMP.") + p.add_argument(f"--no-{f.name}", action="store_const", const = gpuConfigOptions.NONE.value, dest=f.name, help=f"Turn the {f.name} option OFF.") + continue p.add_argument( f"--{f.name}", action="store_true", help=f"Turn the {f.name} option ON.") p.add_argument(f"--no-{f.name}", action="store_false", dest=f.name, help=f"Turn the {f.name} option OFF.") diff --git a/toolchain/mfc/build.py b/toolchain/mfc/build.py index 846763b233..72bfd04686 100644 --- a/toolchain/mfc/build.py +++ b/toolchain/mfc/build.py @@ -6,6 +6,7 @@ format_list_to_string from .state import ARG, CFG from .run import input +from .state import gpuConfigOptions @dataclasses.dataclass class MFCTarget: @@ -17,7 +18,7 @@ class Dependencies: def compute(self) -> typing.Set: r = self.all[:] - r += self.gpu[:] if ARG("gpu") else self.cpu[:] + r += self.gpu[:] if (ARG("gpu") != gpuConfigOptions.NONE.value) else self.cpu[:] return r @@ -144,7 +145,10 @@ def configure(self, case: Case): if not self.isDependency: flags.append(f"-DMFC_MPI={ 'ON' if ARG('mpi') else 'OFF'}") - flags.append(f"-DMFC_OpenACC={'ON' if ARG('gpu') else 'OFF'}") + # flags.append(f"-DMFC_OpenACC={'ON' if ARG('acc') else 'OFF'}") + # flags.append(f"-DMFC_OpenMP={'ON' if ARG('mp') else 'OFF'}") + flags.append(f"-DMFC_OpenACC={'ON' if (ARG('gpu') == gpuConfigOptions.ACC.value) else 'OFF'}") + flags.append(f"-DMFC_OpenMP={'ON' if (ARG('gpu') == gpuConfigOptions.MP.value) else 'OFF'}") flags.append(f"-DMFC_GCov={ 'ON' if ARG('gcov') else 'OFF'}") flags.append(f"-DMFC_Unified={'ON' if ARG('unified') else 'OFF'}") flags.append(f"-DMFC_Fastmath={'ON' if ARG('fastmath') else 'OFF'}") diff --git a/toolchain/mfc/run/input.py b/toolchain/mfc/run/input.py index 5a7022eec7..ac34dae9d4 100644 --- a/toolchain/mfc/run/input.py +++ b/toolchain/mfc/run/input.py @@ -5,7 +5,7 @@ from ..printer import cons from .. import common, build -from ..state import ARGS, ARG +from ..state import ARGS, ARG, gpuConfigOptions from ..case import Case @dataclasses.dataclass(init=False) @@ -73,13 +73,20 @@ def generate_fpp(self, target) -> None: # Determine the real type based on the single precision flag real_type = 'real(sp)' if ARG('single') else 'real(dp)' + if ARG("gpu") == gpuConfigOptions.MP.value: + directive_str = 'mp' + elif ARG("gpu") == gpuConfigOptions.ACC.value: + directive_str = 'acc' + else: + directive_str = None + # Write the generated Fortran code to the m_thermochem.f90 file with the chosen precision common.file_write( os.path.join(modules_dir, "m_thermochem.f90"), pyro.FortranCodeGenerator().generate( "m_thermochem", self.get_cantera_solution(), - pyro.CodeGenerationOptions(scalar_type = real_type, directive_offload="acc") + pyro.CodeGenerationOptions(scalar_type = real_type, directive_offload = directive_str) ), True ) diff --git a/toolchain/mfc/state.py b/toolchain/mfc/state.py index a16f4ec548..05865a803c 100644 --- a/toolchain/mfc/state.py +++ b/toolchain/mfc/state.py @@ -1,10 +1,18 @@ import typing, dataclasses +from enum import Enum, unique +@unique +class gpuConfigOptions(Enum): + NONE = 'no' + ACC = 'acc' + MP = 'mp' @dataclasses.dataclass class MFCConfig: mpi: bool = True - gpu: bool = False + gpu: str = gpuConfigOptions.NONE.value + # mp: bool = False + # acc: bool = False debug: bool = False gcov: bool = False unified: bool = False @@ -22,18 +30,30 @@ def from_dict(d: dict): return r - def items(self) -> typing.List[typing.Tuple[str, bool]]: + def items(self) -> typing.Iterable[typing.Tuple[str, typing.Any]]: return dataclasses.asdict(self).items() def make_options(self) -> typing.List[str]: """ Returns a list of options that could be passed to mfc.sh again. Example: --no-debug --mpi --no-gpu --no-gcov --no-unified""" - return [ f"--{'no-' if not v else ''}{k}" for k, v in self.items() ] + options = [] + for k, v in self.items(): + if k == 'gpu': + options.append(f"--{v}-{k}") + else: + options.append(f"--{'no-' if not v else ''}{k}") + return options def make_slug(self) -> str: """ Sort the items by key, then join them with underscores. This uniquely identifies the configuration. Example: no-debug_no-gpu_no_mpi_no-gcov """ - return '_'.join([ f"{'no-' if not v else ''}{k}" for k, v in sorted(self.items(), key=lambda x: x[0]) ]) + options = [] + for k, v in sorted(self.items(), key=lambda x: x[0]): + if k == 'gpu': + options.append(f"--{v}-{k}") + else: + options.append(f"--{'no-' if not v else ''}{k}") + return '_'.join(options) def __eq__(self, other) -> bool: """ Check if two MFCConfig objects are equal, field by field. """ @@ -45,8 +65,18 @@ def __eq__(self, other) -> bool: def __str__(self) -> str: """ Returns a string like "mpi=No & gpu=No & debug=No & gcov=No & unified=No" """ - - return ' & '.join([ f"{k}={'Yes' if v else 'No'}" for k, v in self.items() ]) + strings = [] + for k,v in self.items(): + if isinstance(v, bool): + strings.append(f"{k}={'Yes' if v else 'No'}") + elif isinstance(v, str): + strings.append(f"{k}={v.capitalize()}") + elif isinstance(v, int): + strings.append(f"{k}={v}") + else: + strings.append(f"{k}={v.__str__()}") + + return ' & '.join(strings) gCFG: MFCConfig = MFCConfig()