diff --git a/docs/documentation/gpuParallelization.md b/docs/documentation/gpuParallelization.md
new file mode 100644
index 0000000000..8579914485
--- /dev/null
+++ b/docs/documentation/gpuParallelization.md
@@ -0,0 +1,566 @@
+# GPU Parallelization
+
+MFC compiles GPU code via OpenACC and, in the future, will support OpenMP as well.
+
+To swap between OpenACC and OpenMP, custom GPU macros are used that translate to equivalent OpenACC and OpenMP directives.
+FYPP is used to process the GPU macros.
+
+[OpenACC Quick Start Guide](https://openacc-best-practices-guide.readthedocs.io/en/latest/01-Introduction.html)
+
+[OpenACC API Documentation](https://www.openacc.org/sites/default/files/inline-files/API%20Guide%202.7.pdf)
+
+------------------------------------------------------------------------------------------
+
+## Macro API Documentation
+
+Note: Argument ordering is not guaranteed or stable, so use key-value pairing when invoking macros
+
+### Data Type Meanings
+
+- Integer is a whole number
+
+- Boolean is a pythonic boolean - Valid options: `True` or `False`
+
+- String List is given as a comma-separated list surrounded by brackets and inside quotations
+  - Ex: ``'[hello, world, Fortran]'``
+
+- 2-level string list is given as a comma-separated list of string lists surrounded by brackets and inside quotations
+  - Ex: ``'[[hello, world], [Fortran, MFC]]'`` or ``'[[hello]]'``
+
+### Data Flow
+
+- Data on the GPU has a reference counter
+- When data is said to be allocated, GPU memory is allocated for it if it is not already present on the GPU. If the variable is already present, the reference counter is just incremented.
+- When data is said to be deallocated, the reference counter is decremented. If the reference counter reaches zero, the data is actually deallocated from GPU memory.
+- When data is said to be attached, the device pointer is attached to its target if it is not already attached. If the pointer is already attached, the attachment counter is just incremented.
+- When data is said to be detached, the attachment counter is decremented. If the attachment counter reaches zero, the pointer is actually detached.
+
+------------------------------------------------------------------------------------------
+
+### Computation Macros
+
+<details>
+ GPU_PARALLEL_LOOP -- (Execute the following loop on the GPU in parallel)
+
+**Macro Invocation**
+
+Uses FYPP eval directive using `$:`
+
+`$:GPU_PARALLEL_LOOP(...)`
+
+**Parameters**
+
+| name             | data type           | Default Value     | description                                                                                 |
+|------------------|---------------------|-------------------|-------------------------------------------------------------------------------------------|
+| `collapse`       | integer             | None              | Number of loops to combine into 1 loop                                                      |
+| `parallelism`    | string list         | '\[gang,vector\]' | Parallelism granularity to use for this loop                                                |
+| `default`        | string              | 'present'         | Implicit assumptions compiler should make                                                   |
+| `private`        | string list         | None              | Variables that are private to each iteration/thread                                         |
+| `firstprivate`   | string list         | None              | Initialized variables that are private to each iteration/thread                             |
+| `reduction`      | 2-level string list | None              | Variables unique to each iteration and reduced at the end                                   |
+| `reductionOp`    | string list         | None              | Operator that each list of reduction will reduce with                                       |
+| `copy`           | string list         | None              | Allocates and copies data to GPU on entrance, then deallocated and copies to CPU on exit    |
+| `copyin`         | string list         | None              | Allocates and copies data to GPU on entrance and then deallocated on exit                   |
+| `copyinReadOnly` | string list         | None              | Allocates and copies readonly data to GPU and then deallocated on exit                      |
+| `copyout`        | string list         | None              | Allocates data on GPU on entrance and then deallocates and copies to CPU on exit            |
+| `create`         | string list         | None              | Allocates data on GPU on entrance and then deallocates on exit                              |
+| `no_create`      | string list         | None              | Use data in CPU memory unless data is already in GPU memory                                 |
+| `present`        | string list         | None              | Data that must be present in GPU memory. Increment counter on entrance, decrement on exit   |
+| `deviceptr`      | string list         | None              | Pointer variables that are already allocated on GPU memory                                  |
+| `attach`         | string list         | None              | Attaches device pointer to device targets on entrance, then detach on exit                  |
+| `extraAccArgs`   | string              | None              | String of any extra arguments added to the OpenACC directive                                |
+
+**Parameter Restrictions**
+
+| name          | Restricted range                                  |
+|---------------|---------------------------------------------------|
+| `collapse`    | Must be greater than 1                            |
+| `parallelism` | Valid elements: 'gang', 'worker', 'vector', 'seq' |
+| `default`     | 'present' or 'none'                               |
+
+**Additional information**
+
+- default present means that any non-scalar data is assumed to be present on the GPU
+- default none means that the compiler should not implicitly determine the data attributes for any variable
+- reduction and reductionOp must match in length
+- With ``reduction='[[sum1, sum2], [largest]]'`` and ``reductionOp='[+, max]'``, `sum1` and `sum2` are each summed across the loop iterations, and `largest` will be the maximum value of `largest` across all the loop iterations
+- A reduction implies a copy, so it does not need to be added for both
+
+**Example**
+
+```python
+    $:GPU_PARALLEL_LOOP(collapse=3, private='[tmp, r]', reduction='[[vol, avg], [max_val]]', reductionOp='[+, MAX]')
+    $:GPU_PARALLEL_LOOP(collapse=2, private='[sum_holder]', copyin='[starting_sum]', copyout='[eigenval,C]')
+```
+
+</details>
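+
+For orientation, the first `GPU_PARALLEL_LOOP` example above expands to roughly the following OpenACC directive (a sketch based on the macro implementation in `parallel_macros.fpp` in this PR; exact spacing may differ):
+
+```Fortran
+!$acc parallel loop collapse(3) gang vector default(present) private(tmp, r) reduction(+:vol, avg) reduction(MAX:max_val)
+```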
+ +
+ GPU_LOOP -- (Execute loop on GPU)
+
+**Macro Invocation**
+
+Uses FYPP eval directive using `$:`
+
+`$:GPU_LOOP(...)`
+
+**Parameters**
+
+| name              | data type           | Default Value | description                                                                                          |
+|-------------------|---------------------|---------------|----------------------------------------------------------------------------------------------------|
+| `collapse`        | integer             | None          | Number of loops to combine into 1 loop                                                               |
+| `parallelism`     | string list         | None          | Parallelism granularity to use for this loop                                                         |
+| `data_dependency` | string              | None          | 'independent' -> assert loop iterations are independent, 'auto' -> let compiler analyze dependencies |
+| `private`         | string list         | None          | Variables that are private to each iteration/thread                                                  |
+| `reduction`       | 2-level string list | None          | Variables unique to each iteration and reduced at the end                                            |
+| `reductionOp`     | string list         | None          | Operator that each list of reduction will reduce with                                                |
+| `extraAccArgs`    | string              | None          | String of any extra arguments added to the OpenACC directive                                         |
+
+**Parameter Restrictions**
+
+| name              | Restricted range                                  |
+|-------------------|---------------------------------------------------|
+| `collapse`        | Must be greater than 1                            |
+| `parallelism`     | Valid elements: 'gang', 'worker', 'vector', 'seq' |
+| `data_dependency` | 'auto' or 'independent'                           |
+
+**Additional information**
+
+- Loop parallelism is most commonly ``'[seq]'``
+- reduction and reductionOp must match in length
+- With ``reduction='[[sum1, sum2], [largest]]'`` and ``reductionOp='[+, max]'``, `sum1` and `sum2` are each summed across the loop iterations, and `largest` will be the maximum value of `largest` across all the loop iterations
+
+**Example**
+
+```python
+    $:GPU_LOOP(parallelism='[seq]')
+    $:GPU_LOOP(collapse=3, parallelism='[seq]',private='[tmp, r]')
+```
+
+</details>
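+
+For reference, ``$:GPU_LOOP(collapse=3, parallelism='[seq]', private='[tmp, r]')`` generates approximately the following OpenACC directive (a sketch based on the macro implementation in `parallel_macros.fpp`; exact spacing may differ):
+
+```Fortran
+!$acc loop collapse(3) seq private(tmp, r)
+```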
+ +
+ GPU_PARALLEL -- (Execute the following on the GPU in parallel)
+
+**Macro Invocation**
+
+Uses FYPP call directive using `#:call`
+
+```C
+#:call GPU_PARALLEL(...)
+    {code}
+#:endcall GPU_PARALLEL
+```
+
+**Parameters**
+
+| name             | data type           | Default Value     | description                                                                                 |
+|------------------|---------------------|-------------------|-------------------------------------------------------------------------------------------|
+| `default`        | string              | 'present'         | Implicit assumptions compiler should make                                                   |
+| `private`        | string list         | None              | Variables that are private to each iteration/thread                                         |
+| `firstprivate`   | string list         | None              | Initialized variables that are private to each iteration/thread                             |
+| `reduction`      | 2-level string list | None              | Variables unique to each iteration and reduced at the end                                   |
+| `reductionOp`    | string list         | None              | Operator that each list of reduction will reduce with                                       |
+| `copy`           | string list         | None              | Allocates and copies data to GPU on entrance, then deallocated and copies to CPU on exit    |
+| `copyin`         | string list         | None              | Allocates and copies data to GPU on entrance and then deallocated on exit                   |
+| `copyinReadOnly` | string list         | None              | Allocates and copies readonly data to GPU and then deallocated on exit                      |
+| `copyout`        | string list         | None              | Allocates data on GPU on entrance and then deallocates and copies to CPU on exit            |
+| `create`         | string list         | None              | Allocates data on GPU on entrance and then deallocates on exit                              |
+| `no_create`      | string list         | None              | Use data in CPU memory unless data is already in GPU memory                                 |
+| `present`        | string list         | None              | Data that must be present in GPU memory. Increment counter on entrance, decrement on exit   |
+| `deviceptr`      | string list         | None              | Pointer variables that are already allocated on GPU memory                                  |
+| `attach`         | string list         | None              | Attaches device pointer to device targets on entrance, then detach on exit                  |
+| `extraAccArgs`   | string              | None              | String of any extra arguments added to the OpenACC directive                                |
+
+**Parameter Restrictions**
+
+| name          | Restricted range                                  |
+|---------------|---------------------------------------------------|
+| `default`     | 'present' or 'none'                               |
+
+**Additional information**
+
+- default present means that any non-scalar data is assumed to be present on the GPU
+- default none means that the compiler should not implicitly determine the data attributes for any variable
+- reduction and reductionOp must match in length
+- With ``reduction='[[sum1, sum2], [largest]]'`` and ``reductionOp='[+, max]'``, `sum1` and `sum2` are each summed across the loop iterations, and `largest` will be the maximum value of `largest` across all the loop iterations
+- A reduction implies a copy, so it does not need to be added for both
+
+**Example**
+
+```C
+  #:call GPU_PARALLEL()
+    {code}
+    ...
+  #:endcall GPU_PARALLEL
+  #:call GPU_PARALLEL(create='[pixel_arr]', copyin='[initial_index]')
+    {code}
+    ...
+  #:endcall
+```
+
+</details>
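+
+As a rough guide, the second `GPU_PARALLEL` example above wraps the enclosed code in an OpenACC parallel region similar to the following (a sketch based on the macro implementation; exact spacing may differ):
+
+```Fortran
+!$acc parallel default(present) copyin(initial_index) create(pixel_arr)
+! ... enclosed code ...
+!$acc end parallel
+```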
+ +------------------------------------------------------------------------------------------ + +### Data Control Macros + +
+ GPU_DATA -- (Make data accessible on GPU in specified region) + +**Macro Invocation** + +Uses FYPP call directive using `#:call` + +```C +#:call GPU_DATA(...) + {code} +#:endcall GPU_DATA +``` + +**Parameters** + +| name | data type | Default Value | description | +|------------------|-------------|---------------|----------------------------------------------------------------------------------------------| +| `code` | code | Required | Region of code where defined data is accessible | +| `copy` | string list | None | Allocates and copies variable to GPU on entrance, then deallocated and copies to CPU on exit | +| `copyin` | string list | None | Allocates and copies data to GPU on entrance and then deallocated on exit | +| `copyinReadOnly` | string list | None | Allocates and copies a readonly variable to GPU and then deallocated on exit | +| `copyout` | string list | None | Allocates data on GPU on entrance and then deallocates and copies to CPU on exit | +| `create` | string list | None | Allocates data on GPU on entrance and then deallocates on exit | +| `no_create` | string list | None | Use data in CPU memory unless data is already in GPU memory | +| `present` | string list | None | Data that must be present in GPU memory. Increment counter on entrance, decrement on exit | +| `deviceptr` | string list | None | Pointer variables that are already allocated on GPU memory | +| `attach` | string list | None | Attaches device pointer to device targets on entrance, then detach on exit | +| `default` | string | None | Implicit assumptions compiler should make | +| `extraAccArgs` | string | None | String of any extra arguments added to the OpenACC directive | + +**Parameter Restrictions** + +| name | Restricted range | +|--------|--------------------------------------------------| +| `code` | Do not assign it manually with key-value pairing | + +**Example** + +```C + #:call GPU_DATA(copy='[pixel_arr]', copyin='[starting_pixels, initial_index]',attach='[p_real, p_cmplx, p_fltr_cmplx]') + {code} + ... + #:endcall GPU_DATA + #:call GPU_DATA(create='[pixel_arr]', copyin='[initial_index]') + {code} + ... + #:endcall +``` + +
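+
+For orientation, the first `GPU_DATA` example above produces a structured data region roughly equivalent to the following (a sketch based on the macro implementation; exact spacing may differ):
+
+```Fortran
+!$acc data copy(pixel_arr) copyin(starting_pixels, initial_index) attach(p_real, p_cmplx, p_fltr_cmplx)
+! ... enclosed code ...
+!$acc end data
+```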
+ +
+ GPU_ENTER_DATA -- (Allocate/move data to GPU until matching GPU_EXIT_DATA or program termination) + +**Macro Invocation** + +Uses FYPP eval directive using `$:` + +`$:GPU_ENTER_DATA(...)` + +**Parameter** + +| name | data type | Default Value | description | +|------------------|-------------|---------------|--------------------------------------------------------------| +| `copyin` | string list | None | Allocates and copies data to GPU on entrance | +| `copyinReadOnly` | string list | None | Allocates and copies a readonly variable to GPU on entrance | +| `create` | string list | None | Allocates data on GPU on entrance | +| `attach` | string list | None | Attaches device pointer to device targets on entrance | +| `extraAccArgs` | string | None | String of any extra arguments added to the OpenACC directive | + +**Example** + +```python + $:GPU_ENTER_DATA(copyin='[pixels_arr]', copyinReadOnly='[starting_pixels, initial_index]') + $:GPU_ENTER_DATA(create='[bc_buffers(1:num_dims, -1:1)]', copyin='[initial_index]') +``` + +
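+
+For reference, the first `GPU_ENTER_DATA` example above expands to roughly the directive below (a sketch based on the macro implementation; note how `copyinReadOnly` becomes a `readonly:` modifier):
+
+```Fortran
+!$acc enter data copyin(pixels_arr) copyin(readonly:starting_pixels, initial_index)
+```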
+ +
+ GPU_EXIT_DATA -- (Deallocate/move data from GPU created by GPU_ENTER_DATA) + +**Macro Invocation** + +Uses FYPP eval directive using `$:` + +`$:GPU_EXIT_DATA(...)` + +**Parameters** + +| name | data type | Default Value | description | +|----------------|-------------|---------------|--------------------------------------------------------------| +| `copyout` | string list | None | Deallocates and copies data from GPU to CPU on exit | +| `delete` | string list | None | Deallocates data on GPU on exit | +| `detach` | string list | None | Detach device pointer from device targets on exit | +| `extraAccArgs` | string | None | String of any extra arguments added to the OpenACC directive | + +**Example** + +```python + $:GPU_EXIT_DATA(copyout='[pixels_arr]', delete='[starting_pixels, initial_index]') + $:GPU_EXIT_DATA(delete='[bc_buffers(1:num_dims, -1:1)]', copyout='[initial_index]') +``` + +
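+
+For reference, the first `GPU_EXIT_DATA` example above expands to roughly the following (a sketch based on the macro implementation; exact spacing may differ):
+
+```Fortran
+!$acc exit data copyout(pixels_arr) delete(starting_pixels, initial_index)
+```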
+ +
+ GPU_DECLARE -- (Allocate module variables on GPU or for implicit data region ) + +**Macro Invocation** + +Uses FYPP eval directive using `$:` + +`$:GPU_DECLARE(...)` + +**Parameters** + +| name | data type | Default Value | description | +|------------------|-------------|---------------|-------------------------------------------------------------------------------------------| +| `copy` | string list | None | Allocates and copies data to GPU on entrance, then deallocated and copies to CPU on exit | +| `copyin` | string list | None | Allocates and copies data to GPU on entrance and then deallocated on exit | +| `copyinReadOnly` | string list | None | Allocates and copies a readonly variable to GPU and then deallocated on exit | +| `copyout` | string list | None | Allocates data on GPU on entrance and then deallocates and copies to CPU on exit | +| `create` | string list | None | Allocates data on GPU on entrance and then deallocates on exit | +| `present` | string list | None | Data that must be present in GPU memory. Increment counter on entrance, decrement on exit | +| `deviceptr` | string list | None | Pointer variables that are already allocated on GPU memory | +| `link` | string list | None | Declare global link, and only allocate when variable used in data clause. | +| `extraAccArgs` | string | None | String of any extra arguments added to the OpenACC directive | + +**Additional information** + +- An implicit data region is created at the start of each procedure and ends after the last executable statement in that procedure. +- Use only create, copyin, device_resident or link clauses for module variables +- GPU_DECLARE exit is the end of the implicit data region +- Link is useful for large global static data objects + +**Example** + +```python + $:GPU_DECLARE(create='[x_cb,y_cb,z_cb,x_cc,y_cc,z_cc,dx,dy,dz,dt,m,n,p]') + $:GPU_DECLARE(create='[x_cb,y_cb,z_cb]', copyin='[x_cc,y_cc,z_cc]', link='[dx,dy,dz,dt,m,n,p]') +``` + +
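+
+For orientation, the second `GPU_DECLARE` example above expands to roughly the following module-level directive (a sketch based on the macro implementation; exact spacing may differ):
+
+```Fortran
+!$acc declare copyin(x_cc, y_cc, z_cc) create(x_cb, y_cb, z_cb) link(dx, dy, dz, dt, m, n, p)
+```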
+ +
+ GPU_UPDATE -- (Updates data from CPU to GPU or GPU to CPU) + +**Macro Invocation** + +Uses FYPP eval directive using `$:` + +`$:GPU_UPDATE(...)` + +**Parameters** + +| name | data type | Default Value | description | +|----------------|-------------|---------------|--------------------------------------------------------------| +| `host` | string list | None | Updates data from GPU to CPU | +| `device` | string list | None | Updates data from CPU to GPU | +| `extraAccArgs` | string | None | String of any extra arguments added to the OpenACC directive | + +**Example** + +```python + $:GPU_UPDATE(host='[arr1, arr2]') + $:GPU_UPDATE(host='[updated_gpu_val]', device='[updated_cpu_val]') +``` + +
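+
+For reference, the second `GPU_UPDATE` example above expands to roughly the following (a sketch based on the macro implementation; exact spacing may differ):
+
+```Fortran
+!$acc update host(updated_gpu_val) device(updated_cpu_val)
+```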
+ +
+ GPU_HOST_DATA -- (Make GPU memory address available on CPU) + +**Macro Invocation** + +Uses FYPP call directive using `#:call` + +```C + #:call GPU_HOST_DATA(...) + {code} + #:endcall GPU_HOST_DATA +``` + +**Parameters** + +| name | data type | Default Value | description | +|----------------|-------------|---------------|------------------------------------------------------------------| +| `code` | code | Required | Region of code where GPU memory addresses is accessible | +| `use_device` | string list | None | Use GPU memory address of variable instead of CPU memory address | +| `extraAccArgs` | string | None | String of any extra arguments added to the OpenACC directive | + +**Parameter Restrictions** + +| name | Restricted range | +|--------|--------------------------------------------------| +| `code` | Do not assign it manually with key-value pairing | + +**Example** + +```C + #:call GPU_HOST_DATA(use_device='[addr1, addr2]') + {code} + ... + #:endcall GPU_HOST_DATA + #:call GPU_HOST_DATA(use_device='[display_arr]') + {code} + ... + #:endcall +``` + +
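+
+As a rough guide, the first `GPU_HOST_DATA` example above wraps the enclosed code like this (a sketch based on the macro implementation; exact spacing may differ):
+
+```Fortran
+!$acc host_data use_device(addr1, addr2)
+! ... enclosed code, which sees the GPU addresses of addr1 and addr2 ...
+!$acc end host_data
+```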
+ +------------------------------------------------------------------------------------------ + +### Synchronization Macros + +
+ GPU_WAIT -- (Makes CPU wait for async GPU activities) + +**Macro Invocation** + +Uses FYPP eval directive using `$:` + +`$:GPU_WAIT(...)` + +**Parameters** + +| name | data type | Default Value | description | +|----------------|-----------|---------------|--------------------------------------------------------------| +| `extraAccArgs` | string | None | String of any extra arguments added to the OpenACC directive | + +**Example** + +```python + $:GPU_WAIT() +``` + +
+ +
+ GPU_ATOMIC -- (Do an atomic operation on the GPU) + +**Macro Invocation** + +Uses FYPP eval directive using `$:` + +`$:GPU_ATOMIC(...)` + +**Parameters** + +| name | data type | Default Value | description | +|----------------|-----------|---------------|--------------------------------------------------------------| +| `atomic` | string | Required | Which atomic operation is performed | +| `extraAccArgs` | string | None | String of any extra arguments added to the OpenACC directive | + +**Parameter Restrictions** + +| name | Restricted range | +|----------|-----------------------------------------| +| `atomic` | 'read', 'write', 'update', or 'capture' | + +**Additional information** + +- read atomic is reading in a value + - Ex: `v=x` +- write atomic is writing a value to a variable + - Ex:`x=square(tmp)` +- update atomic is updating a variable in-place + - Ex:`x= x .and. 1` +- Capture is a pair of read/write/update operations with one dependent on the other + - Ex: + + ```Fortran + x=x .and. 1 + v=x + ``` + +**Example** + +```python + $:GPU_ATOMIC(atomic='update') + x = square(x) + $:GPU_ATOMIC(atomic='capture') + x = square(x) + v = x +``` + +
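+
+For reference, ``$:GPU_ATOMIC(atomic='update')`` expands to roughly the directive below (a sketch based on the macro implementation); the directive applies to the statement that immediately follows it:
+
+```Fortran
+!$acc atomic update
+```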
+ +------------------------------------------------------------------------------------------ + +### Miscellaneous Macros + +
+ GPU_ROUTINE -- (Compile a procedure for the GPU) + +**Macro Invocation** + +Uses FYPP eval directive using `$:` + +`$:GPU_ROUTINE(...)` + +**Parameters** + +| name | data type | Default Value | description | +|-----------------|-------------|---------------|--------------------------------------------------------------| +| `function_name` | string | None | Name of subroutine/function | +| `parallelism` | string list | None | Parallelism granularity to use for this routine | +| `nohost` | boolean | False | Do not compile procedure code for CPU | +| `cray_inline` | boolean | False | Inline procedure on cray compiler | +| `extraAccArgs` | string | None | String of any extra arguments added to the OpenACC directive | + +**Parameter Restrictions** + +| name | Restricted range | +|---------------|---------------------------------------------------| +| `parallelism` | Valid elements: 'gang', 'worker', 'vector', 'seq' | + +**Additional information** + +- Function name only needs to be given when cray_inline is True +- Future capability is to parse function header for function name +- Routine parallelism is most commonly ``'[seq]'`` + +**Example** + +```python + $:GPU_ROUTINE(parallelism='[seq]') + $:GPU_ROUTINE(function_name='s_matmult', parallelism='[seq]', cray_inline=True) +``` + +
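+
+For orientation, the second `GPU_ROUTINE` example above expands to roughly the following (a sketch based on the macro implementation, which switches between a Cray inlining hint and an OpenACC routine directive):
+
+```Fortran
+#ifdef _CRAYFTN
+!DIR$ INLINEALWAYS s_matmult
+#else
+!$acc routine seq
+#endif
+```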
+ +
+ GPU_CACHE -- (Data to be cached in software-managed cache)
+
+**Macro Invocation**
+
+Uses FYPP eval directive using `$:`
+
+`$:GPU_CACHE(...)`
+
+**Parameters**
+
+| name             | data type   | Default Value | description                                                   |
+|------------------|-------------|---------------|--------------------------------------------------------------|
+| `cache`          | string list | Required      | Data that should be stored in cache                           |
+| `extraAccArgs`   | string      | None          | String of any extra arguments added to the OpenACC directive  |
+
+**Example**
+
+```python
+    $:GPU_CACHE(cache='[pixels_arr]')
+```
+
+</details>
+ +------------------------------------------------------------------------------------------ diff --git a/src/common/include/macros.fpp b/src/common/include/macros.fpp index 02ceb8fe29..fea730cbd1 100644 --- a/src/common/include/macros.fpp +++ b/src/common/include/macros.fpp @@ -1,3 +1,5 @@ +#:include 'parallel_macros.fpp' + #:def LOG(expr) #ifdef MFC_DEBUG block @@ -12,14 +14,16 @@ #:def ALLOCATE(*args) @:LOG({'@:ALLOCATE(${re.sub(' +', ' ', ', '.join(args))}$)'}) - allocate (${', '.join(args)}$) - !$acc enter data create(${', '.join(args)}$) + #:set allocated_variables = ', '.join(args) + allocate (${allocated_variables}$) + $:GPU_ENTER_DATA(create=('[' + allocated_variables + ']')) #:enddef ALLOCATE #:def DEALLOCATE(*args) @:LOG({'@:DEALLOCATE(${re.sub(' +', ' ', ', '.join(args))}$)'}) - !$acc exit data delete(${', '.join(args)}$) - deallocate (${', '.join(args)}$) + #:set allocated_variables = ', '.join(args) + $:GPU_EXIT_DATA(delete=('[' + allocated_variables + ']')) + deallocate (${allocated_variables}$) #:enddef DEALLOCATE #:def ACC_SETUP_VFs(*args) @@ -30,13 +34,13 @@ @:LOG({'@:ACC_SETUP_VFs(${', '.join(args)}$)'}) #:for arg in args - !$acc enter data copyin(${arg}$) - !$acc enter data copyin(${arg}$%vf) + $:GPU_ENTER_DATA(copyin=('[' + arg + ']')) + $:GPU_ENTER_DATA(copyin=('[' + arg + '%vf]')) if (allocated(${arg}$%vf)) then do macros_setup_vfs_i = lbound(${arg}$%vf, 1), ubound(${arg}$%vf, 1) if (associated(${arg}$%vf(macros_setup_vfs_i)%sf)) then - !$acc enter data copyin(${arg}$%vf(macros_setup_vfs_i)) - !$acc enter data create(${arg}$%vf(macros_setup_vfs_i)%sf) + $:GPU_ENTER_DATA(copyin=('[' + arg + '%vf(macros_setup_vfs_i)]')) + $:GPU_ENTER_DATA(copyin=('[' + arg + '%vf(macros_setup_vfs_i)%sf]')) end if end do end if @@ -52,9 +56,9 @@ @:LOG({'@:ACC_SETUP_SFs(${', '.join(args)}$)'}) #:for arg in args - !$acc enter data copyin(${arg}$) + $:GPU_ENTER_DATA(copyin=('[' + arg + ']')) if (associated(${arg}$%sf)) then - !$acc enter data create(${arg}$%sf) + $:GPU_ENTER_DATA(copyin=('[' + arg + '%sf]')) end if #:endfor end block @@ -68,18 +72,18 @@ @:LOG({'@:ACC_SETUP_source_spatials(${', '.join(args)}$)'}) #:for arg in args - !$acc enter data copyin(${arg}$) + $:GPU_ENTER_DATA(copyin=('[' + arg + ']')) if (allocated(${arg}$%coord)) then - !$acc enter data create(${arg}$%coord) + $:GPU_ENTER_DATA(copyin=('[' + arg + '%coord]')) end if if (allocated(${arg}$%val)) then - !$acc enter data create(${arg}$%val) + $:GPU_ENTER_DATA(copyin=('[' + arg + '%val]')) end if if (allocated(${arg}$%angle)) then - !$acc enter data create(${arg}$%angle) + $:GPU_ENTER_DATA(copyin=('[' + arg + '%angle]')) end if if (allocated(${arg}$%xyz_to_r_ratios)) then - !$acc enter data create(${arg}$%xyz_to_r_ratios) + $:GPU_ENTER_DATA(copyin=('[' + arg + '%xyz_to_r_ratios]')) end if #:endfor end block @@ -102,3 +106,4 @@ //${message or '"No error description."'}$) end if #:enddef +! 
New line at end of file is required for FYPP diff --git a/src/common/include/parallel_macros.fpp b/src/common/include/parallel_macros.fpp new file mode 100644 index 0000000000..8d0a5a673b --- /dev/null +++ b/src/common/include/parallel_macros.fpp @@ -0,0 +1,425 @@ +#:mute + +#:def ASSERT_LIST(data, datatype) + #:assert data is not None + #:assert isinstance(data, list) + #:assert len(data) != 0 + #:assert all(isinstance(element, datatype) for element in data) +#:enddef + +#:def GEN_PARENTHESES_CLAUSE(clause_name, clause_str) + #:set clause_regex = re.compile(',(?![^(]*\\))') + #:assert isinstance(clause_name, str) + #:if clause_str is not None + #:set count = 0 + #:assert isinstance(clause_str, str) + #:assert clause_str[0] == '[' and clause_str[-1] == ']' + #:for c in clause_str + #:if c == '(' + #:set count = count + 1 + #:elif c == ')' + #:set count = count - 1 + #:endif + #:if c == ',' and count > 1 + #:stop 'Nested parentheses with comma inside is not supported. Incorrect clause: {}'.format(clause_str) + #:elif count < 0 + #:stop 'Missing parentheses. Incorrect clause: {}'.format(clause_str) + #:endif + #:endfor + #:set clause_str = re.sub(clause_regex, ';', clause_str) + #:set clause_list = [x.strip() for x in clause_str.strip('[]').split(';')] + $:ASSERT_LIST(clause_list, str) + #:set clause_str = clause_name + '(' + ', '.join(clause_list) + ') ' + #:else + #:set clause_str = '' + #:endif + $:clause_str +#:enddef + +#:def GEN_PRIVATE_STR(private, initialized_values) + #:assert isinstance(initialized_values, bool) + #:if initialized_values == True + #:set private_val = GEN_PARENTHESES_CLAUSE('firstprivate', private) + #:else + #:set private_val = GEN_PARENTHESES_CLAUSE('private', private) + #:endif + $:private_val +#:enddef + +#:def GEN_COPY_STR(copy) + #:set copy_val = GEN_PARENTHESES_CLAUSE('copy', copy) + $:copy_val +#:enddef + +#:def GEN_COPYIN_STR(copyin, readonly) + #:assert isinstance(readonly, bool) + #:set copyin_val = GEN_PARENTHESES_CLAUSE('copyin', copyin) + #:if copyin is not None and readonly == True + #:set index = copyin_val.find('copyin(') + len('copyin(') + #:set copyin_val = copyin_val[:index] + 'readonly:' + copyin_val[index:] + #:endif + $:copyin_val +#:enddef + +#:def GEN_COPYOUT_STR(copyout) + #:set copyout_val = GEN_PARENTHESES_CLAUSE('copyout', copyout) + $:copyout_val +#:enddef + +#:def GEN_CREATE_STR(create) + #:set create_val = GEN_PARENTHESES_CLAUSE('create', create) + $:create_val +#:enddef + +#:def GEN_NOCREATE_STR(no_create) + #:set nocreate_val = GEN_PARENTHESES_CLAUSE('no_create', no_create) + $:nocreate_val +#:enddef + +#:def GEN_DELETE_STR(delete) + #:set delete_val = GEN_PARENTHESES_CLAUSE('delete', delete) + $:delete_val +#:enddef + +#:def GEN_PRESENT_STR(present) + #:set present_val = GEN_PARENTHESES_CLAUSE('present', present) + $:present_val +#:enddef + +#:def GEN_DEVICEPTR_STR(deviceptr) + #:set deviceptr_val = GEN_PARENTHESES_CLAUSE('deviceptr', deviceptr) + $:deviceptr_val +#:enddef + +#:def GEN_ATTACH_STR(attach) + #:set attach_val = GEN_PARENTHESES_CLAUSE('attach', attach) + $:attach_val +#:enddef + +#:def GEN_DETACH_STR(detach) + #:set detach_val = GEN_PARENTHESES_CLAUSE('detach', detach) + $:detach_val +#:enddef + +#:def GEN_LINK_STR(link) + #:set link_val = GEN_PARENTHESES_CLAUSE('link', link) + $:link_val +#:enddef + +#:def GEN_EXTRA_ARGS_STR(extraArgs) + #:if extraArgs is not None + #:assert isinstance(extraArgs, str) + #:set extraArgs_val = extraArgs + #:else + #:set extraArgs_val = '' + #:endif + $:extraArgs_val +#:enddef + +#:def 
GEN_PARALLELISM_STR(parallelism) + #:if parallelism is not None + #:assert isinstance(parallelism, str) + #:assert parallelism[0] == '[' and parallelism[-1] == ']' + #:set parallelism_list = [x.strip() for x in parallelism.strip('[]').split(',')] + $:ASSERT_LIST(parallelism_list, str) + #:assert all((element == 'gang' or element == 'worker' or & + & element == 'vector' or element == 'seq') for element in parallelism_list) + #:set parallelism_val = ' '.join(parallelism_list) + ' ' + #:else + #:set parallelism_val = '' + #:endif + $:parallelism_val +#:enddef + +#:def GEN_COLLAPSE_STR(collapse) + #:if collapse is not None + #:set collapse = int(collapse) + #:assert isinstance(collapse, int) + #:assert collapse > 1 + #:set collapse_val = 'collapse(' + str(collapse) + ') ' + #:else + #:set collapse_val = '' + #:endif + $:collapse_val +#:enddef + +#:def GEN_DEFAULT_STR(default) + #:if default is not None + #:assert isinstance(default, str) + #:assert (default == 'present' or default == 'none') + #:set default_val = 'default(' + default + ') ' + #:else + #:set default_val = '' + #:endif + $:default_val +#:enddef + +#:def GEN_REDUCTION_STR(reduction, reductionOp) + #:if reduction is not None and reductionOp is not None + #:assert isinstance(reduction, str) + #:assert isinstance(reductionOp, str) + #:assert reduction[0] == '[' and reduction[-1] == ']' + #:assert reductionOp[0] == '[' and reductionOp[-1] == ']' + #:set reduction = reduction.replace(' ', '') + #:set reduction = reduction[1:-1] + #:set reduction_list = reduction.split('],') + #:set reduction_list = [str + ']' for str in reduction_list] + #:assert all(str[0] == '[' and str[-1] == ']' for str in reduction_list) + + #:set reductionOp_list = [x.strip() for x in reductionOp.strip('[]').split(',')] + $:ASSERT_LIST(reduction_list, str) + $:ASSERT_LIST(reductionOp_list, str) + #:assert len(reduction_list) == len(reductionOp_list) + #:set reduction_val = '' + #:for i in range(len(reduction_list)) + #:set temp_clause = GEN_PARENTHESES_CLAUSE('reduction', reduction_list[i]).strip('\n') + #:set ind = temp_clause.find('reduction(') + len('reduction(') + #:set reduction_val = reduction_val.strip('\n') + temp_clause[:ind] + reductionOp_list[i] + ':' + temp_clause[ind:] + #:endfor + #:elif reduction is not None or reductionOp is not None + #:stop 'Cannot set the reduction list or reduction operation without setting the other' + #:else + #:set reduction_val = '' + #:endif + $:reduction_val +#:enddef + +#:def GEN_HOST_STR(host) + #:set host_val = GEN_PARENTHESES_CLAUSE('host', host) + $:host_val +#:enddef + +#:def GEN_DEVICE_STR(device) + #:set device_val = GEN_PARENTHESES_CLAUSE('device', device) + $:device_val +#:enddef + +#:def GEN_USE_DEVICE_STR(use_device) + #:set use_device_val = GEN_PARENTHESES_CLAUSE('use_device', use_device) + $:use_device_val +#:enddef + +#:def GPU_PARALLEL(code, private=None, default='present', firstprivate=None, reduction=None, reductionOp=None, & + & copy=None, copyin=None, copyinReadOnly=None, copyout=None, create=None, & + & no_create=None, present=None, deviceptr=None, attach=None, extraAccArgs=None) + #:set default_val = GEN_DEFAULT_STR(default) + #:set private_val = GEN_PRIVATE_STR(private, False).strip('\n') + GEN_PRIVATE_STR(firstprivate, True).strip('\n') + #:set reduction_val = GEN_REDUCTION_STR(reduction, reductionOp) + #:set copy_val = GEN_COPY_STR(copy) + #:set copyin_val = GEN_COPYIN_STR(copyin, False).strip('\n') + GEN_COPYIN_STR(copyinReadOnly, True).strip('\n') + #:set copyout_val = GEN_COPYOUT_STR(copyout) 
+ #:set create_val = GEN_CREATE_STR(create) + #:set no_create_val = GEN_NOCREATE_STR(no_create) + #:set present_val = GEN_PRESENT_STR(present) + #:set deviceptr_val = GEN_DEVICEPTR_STR(deviceptr) + #:set attach_val = GEN_ATTACH_STR(attach) + #:set extraAccArgs_val = GEN_EXTRA_ARGS_STR(extraAccArgs) + #:set clause_val = default_val.strip('\n') + private_val.strip('\n') + reduction_val.strip('\n') + & + & copy_val.strip('\n') + copyin_val.strip('\n') + & + & copyout_val.strip('\n') + create_val.strip('\n') + & + & no_create_val.strip('\n') + present_val.strip('\n') + & + & deviceptr_val.strip('\n') + attach_val.strip('\n') + #:set acc_directive = '!$acc parallel ' + & + & clause_val + extraAccArgs_val.strip('\n') + #:set end_acc_directive = '!$acc end parallel' + $:acc_directive + $:code + $:end_acc_directive +#:enddef + + +#:def GPU_PARALLEL_LOOP(collapse=None, private=None, parallelism='[gang, vector]', & + & default='present', firstprivate=None, reduction=None, reductionOp=None, & + & copy=None, copyin=None, copyinReadOnly=None, copyout=None, create=None, & + & no_create=None, present=None, deviceptr=None, attach=None, extraAccArgs=None) + #:set collapse_val = GEN_COLLAPSE_STR(collapse) + #:set parallelism_val = GEN_PARALLELISM_STR(parallelism) + #:set default_val = GEN_DEFAULT_STR(default) + #:set private_val = GEN_PRIVATE_STR(private, False).strip('\n') + GEN_PRIVATE_STR(firstprivate, True).strip('\n') + #:set reduction_val = GEN_REDUCTION_STR(reduction, reductionOp) + #:set copy_val = GEN_COPY_STR(copy) + #:set copyin_val = GEN_COPYIN_STR(copyin, False).strip('\n') + GEN_COPYIN_STR(copyinReadOnly, True).strip('\n') + #:set copyout_val = GEN_COPYOUT_STR(copyout) + #:set create_val = GEN_CREATE_STR(create) + #:set no_create_val = GEN_NOCREATE_STR(no_create) + #:set present_val = GEN_PRESENT_STR(present) + #:set deviceptr_val = GEN_DEVICEPTR_STR(deviceptr) + #:set attach_val = GEN_ATTACH_STR(attach) + #:set extraAccArgs_val = GEN_EXTRA_ARGS_STR(extraAccArgs) + #:set clause_val = collapse_val.strip('\n') + parallelism_val.strip('\n') + & + & default_val.strip('\n') + private_val.strip('\n') + reduction_val.strip('\n') + & + & copy_val.strip('\n') + copyin_val.strip('\n') + & + & copyout_val.strip('\n') + create_val.strip('\n') + & + & no_create_val.strip('\n') + present_val.strip('\n') + & + & deviceptr_val.strip('\n') + attach_val.strip('\n') + #:set acc_directive = '!$acc parallel loop ' + & + & clause_val + extraAccArgs_val.strip('\n') + $:acc_directive +#:enddef + +#:def GPU_ROUTINE(function_name=None, parallelism=None, nohost=False, cray_inline=False, extraAccArgs=None) + #:assert isinstance(cray_inline, bool) + #:set parallelism_val = GEN_PARALLELISM_STR(parallelism) + #:assert isinstance(nohost, bool) + #:if nohost == True + #:set nohost_val = 'nohost' + #:else + #:set nohost_val = '' + #:endif + #:set extraAccArgs_val = GEN_EXTRA_ARGS_STR(extraAccArgs) + #:set clause_val = parallelism_val.strip('\n') + nohost_val.strip('\n') + #:set acc_directive = '!$acc routine ' + & + & clause_val + extraAccArgs_val.strip('\n') + #:if cray_inline == True + #:if not isinstance(function_name, str) + #:stop "When inlining for Cray Compiler, function name must be given and given as a string" + #:endif + #:set cray_directive = ('!DIR$ INLINEALWAYS ' + function_name).strip('\n') +#ifdef _CRAYFTN + $:cray_directive +#else + $:acc_directive +#endif + #:else + $:acc_directive + #:endif +#:enddef + +#:def GPU_DECLARE(copy=None, copyin=None, copyinReadOnly=None, copyout=None, create=None, present=None, 
deviceptr=None, link=None, extraAccArgs=None) + #:set copy_val = GEN_COPY_STR(copy) + #:set copyin_val = GEN_COPYIN_STR(copyin, False).strip('\n') + GEN_COPYIN_STR(copyinReadOnly, True).strip('\n') + #:set copyout_val = GEN_COPYOUT_STR(copyout) + #:set create_val = GEN_CREATE_STR(create) + #:set present_val = GEN_PRESENT_STR(present) + #:set deviceptr_val = GEN_DEVICEPTR_STR(deviceptr) + #:set link_val = GEN_LINK_STR(link) + #:set extraAccArgs_val = GEN_EXTRA_ARGS_STR(extraAccArgs) + #:set clause_val = copy_val.strip('\n') + copyin_val.strip('\n') + & + & copyout_val.strip('\n') + create_val.strip('\n') + & + & present_val.strip('\n') + deviceptr_val.strip('\n') + & + & link_val.strip('\n') + #:set acc_directive = '!$acc declare ' + clause_val + extraAccArgs_val.strip('\n') + $:acc_directive +#:enddef + +#:def GPU_LOOP(collapse=None, parallelism=None, data_dependency=None, reduction=None, reductionOp=None, private=None, extraAccArgs=None) + #:set collapse_val = GEN_COLLAPSE_STR(collapse) + #:set parallelism_val = GEN_PARALLELISM_STR(parallelism) + #:if data_dependency is not None + #:assert isinstance(data_dependency, str) + #:assert (data_dependency == 'auto' or data_dependency == 'independent') + #:set data_dependency_val = data_dependency + #:else + #:set data_dependency_val = '' + #:endif + #:set private_val = GEN_PRIVATE_STR(private, False) + #:set reduction_val = GEN_REDUCTION_STR(reduction, reductionOp) + #:set extraAccArgs_val = GEN_EXTRA_ARGS_STR(extraAccArgs) + #:set clause_val = collapse_val.strip('\n') + parallelism_val.strip('\n') + & + & data_dependency_val.strip('\n') + private_val.strip('\n') + & + & reduction_val.strip('\n') + #:set acc_directive = '!$acc loop ' + & + & clause_val + extraAccArgs_val.strip('\n') + $:acc_directive +#:enddef + +#:def GPU_DATA(code, copy=None, copyin=None, copyinReadOnly=None, copyout=None, create=None, no_create=None, present=None, deviceptr=None, attach=None, default=None, extraAccArgs=None) + #:assert code is not None + #:assert isinstance(code, str) + #:if code == '' or code.isspace() + #:stop 'GPU_DATA macro has no effect on the code as it is not surrounding any code' + #:endif + #:set copy_val = GEN_COPY_STR(copy) + #:set copyin_val = GEN_COPYIN_STR(copyin, False).strip('\n') + GEN_COPYIN_STR(copyinReadOnly, True).strip('\n') + #:set copyout_val = GEN_COPYOUT_STR(copyout) + #:set create_val = GEN_CREATE_STR(create) + #:set no_create_val = GEN_NOCREATE_STR(no_create) + #:set present_val = GEN_PRESENT_STR(present) + #:set deviceptr_val = GEN_DEVICEPTR_STR(deviceptr) + #:set attach_val = GEN_ATTACH_STR(attach) + #:set default_val = GEN_DEFAULT_STR(default) + #:set extraAccArgs_val = GEN_EXTRA_ARGS_STR(extraAccArgs) + #:set clause_val = copy_val.strip('\n') + copyin_val.strip('\n') + & + & copyout_val.strip('\n') + create_val.strip('\n') + & + & no_create_val.strip('\n') + present_val.strip('\n') + & + & deviceptr_val.strip('\n') + attach_val.strip('\n') + & + & default_val.strip('\n') + #:set acc_directive = '!$acc data ' + clause_val + extraAccArgs_val.strip('\n') + #:set end_acc_directive = '!$acc end data' + $:acc_directive + $:code + $:end_acc_directive +#:enddef + +#:def GPU_HOST_DATA(code, use_device=None, extraAccArgs=None) + #:assert code is not None + #:assert isinstance(code, str) + #:if code == '' or code.isspace() + #:stop 'GPU_HOST_DATA macro has no effect on the code as it is not surrounding any code' + #:endif + #:set use_device_val = GEN_USE_DEVICE_STR(use_device) + #:set extraAccArgs_val = GEN_EXTRA_ARGS_STR(extraAccArgs) + 
#:set clause_val = use_device_val.strip('\n') + #:set acc_directive = '!$acc host_data ' + clause_val + extraAccArgs_val.strip('\n') + #:set end_acc_directive = '!$acc end host_data' + $:acc_directive + $:code + $:end_acc_directive +#:enddef + +#:def GPU_ENTER_DATA(copyin=None, copyinReadOnly=None, create=None, attach=None, extraAccArgs=None) + #:set copyin_val = GEN_COPYIN_STR(copyin, False).strip('\n') + GEN_COPYIN_STR(copyinReadOnly, True).strip('\n') + #:set create_val = GEN_CREATE_STR(create) + #:set attach_val = GEN_ATTACH_STR(attach) + #:set extraAccArgs_val = GEN_EXTRA_ARGS_STR(extraAccArgs) + #:set clause_val = copyin_val.strip('\n') + create_val.strip('\n') + attach_val.strip('\n') + #:set acc_directive = '!$acc enter data ' + clause_val + extraAccArgs_val.strip('\n') + $:acc_directive +#:enddef + +#:def GPU_EXIT_DATA(copyout=None, delete=None, detach=None, extraAccArgs=None) + #:set copyout_val = GEN_COPYOUT_STR(copyout) + #:set delete_val = GEN_DELETE_STR(delete) + #:set detach_val = GEN_DETACH_STR(detach) + #:set extraAccArgs_val = GEN_EXTRA_ARGS_STR(extraAccArgs) + #:set clause_val = copyout_val.strip('\n') + delete_val.strip('\n') + detach_val.strip('\n') + #:set acc_directive = '!$acc exit data ' + clause_val + extraAccArgs_val.strip('\n') + $:acc_directive +#:enddef + +#:def GPU_CACHE(cache, extraAccArgs=None) + #:set cache_val = GEN_PARENTHESES_CLAUSE('cache', cache) + #:set extraAccArgs_val = GEN_EXTRA_ARGS_STR(extraAccArgs) + #:set clause_val = cache_val.strip('\n') + #:set acc_directive = '!$acc ' + clause_val + extraAccArgs_val.strip('\n') + $:acc_directive +#:enddef + +#:def GPU_ATOMIC(atomic, extraAccArgs=None) + #:assert isinstance(atomic, str) + #:assert (atomic == 'read' or atomic == 'write' or atomic == 'update' or atomic == 'capture') + #:set atomic_val = atomic + #:set extraAccArgs_val = GEN_EXTRA_ARGS_STR(extraAccArgs) + #:set clause_val = atomic_val.strip('\n') + #:set acc_directive = '!$acc atomic ' + clause_val + extraAccArgs_val.strip('\n') + $:acc_directive +#:enddef + +#:def GPU_UPDATE(host=None, device=None, extraAccArgs=None) + #:set host_val = GEN_HOST_STR(host) + #:set device_val = GEN_DEVICE_STR(device) + #:set extraAccArgs_val = GEN_EXTRA_ARGS_STR(extraAccArgs) + #:set clause_val = host_val.strip('\n') + device_val.strip('\n') + #:set acc_directive = '!$acc update ' + clause_val + extraAccArgs_val.strip('\n') + $:acc_directive +#:enddef + +#:def GPU_WAIT(extraAccArgs=None) + #:set extraAccArgs_val = GEN_EXTRA_ARGS_STR(extraAccArgs) + #:set clause_val = '' + #:set acc_directive = '!$acc wait ' + clause_val + extraAccArgs_val.strip('\n') + $:acc_directive +#:enddef + +#:endmute +! 
New line at end of file is required for FYPP diff --git a/src/common/m_boundary_common.fpp b/src/common/m_boundary_common.fpp index 4087a0045e..2c48c760f2 100644 --- a/src/common/m_boundary_common.fpp +++ b/src/common/m_boundary_common.fpp @@ -24,7 +24,7 @@ module m_boundary_common implicit none type(scalar_field), dimension(:, :), allocatable :: bc_buffers -!$acc declare create(bc_buffers) + $:GPU_DECLARE(create='[bc_buffers]') #ifdef MFC_MPI integer, dimension(1:3, -1:1) :: MPI_BC_TYPE_TYPE, MPI_BC_BUFFER_TYPE @@ -87,7 +87,7 @@ contains if (bc_x%beg >= 0) then call s_mpi_sendrecv_variables_buffers(q_prim_vf, 1, -1, sys_size, pb, mv) else - !$acc parallel loop collapse(2) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=2) do l = 0, p do k = 0, n select case (int(bc_type(1, -1)%sf(0, k, l))) @@ -116,7 +116,7 @@ contains if (bc_x%end >= 0) then call s_mpi_sendrecv_variables_buffers(q_prim_vf, 1, 1, sys_size, pb, mv) else - !$acc parallel loop collapse(2) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=2) do l = 0, p do k = 0, n select case (int(bc_type(1, 1)%sf(0, k, l))) @@ -149,7 +149,7 @@ contains if (bc_y%beg >= 0) then call s_mpi_sendrecv_variables_buffers(q_prim_vf, 2, -1, sys_size, pb, mv) else - !$acc parallel loop collapse(2) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=2) do l = 0, p do k = -buff_size, m + buff_size select case (int(bc_type(2, -1)%sf(k, 0, l))) @@ -181,7 +181,7 @@ contains if (bc_y%end >= 0) then call s_mpi_sendrecv_variables_buffers(q_prim_vf, 2, 1, sys_size, pb, mv) else - !$acc parallel loop collapse(2) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=2) do l = 0, p do k = -buff_size, m + buff_size select case (int(bc_type(2, 1)%sf(k, 0, l))) @@ -214,7 +214,7 @@ contains if (bc_z%beg >= 0) then call s_mpi_sendrecv_variables_buffers(q_prim_vf, 3, -1, sys_size, pb, mv) else - !$acc parallel loop collapse(2) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=2) do l = -buff_size, n + buff_size do k = -buff_size, m + buff_size select case (int(bc_type(3, -1)%sf(k, l, 0))) @@ -243,7 +243,7 @@ contains if (bc_z%end >= 0) then call s_mpi_sendrecv_variables_buffers(q_prim_vf, 3, 1, sys_size, pb, mv) else - !$acc parallel loop collapse(2) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=2) do l = -buff_size, n + buff_size do k = -buff_size, m + buff_size select case (int(bc_type(3, 1)%sf(k, l, 0))) @@ -273,11 +273,8 @@ contains end subroutine s_populate_variables_buffers pure subroutine s_ghost_cell_extrapolation(q_prim_vf, bc_dir, bc_loc, k, l) -#ifdef _CRAYFTN - !DIR$ INLINEALWAYS s_ghost_cell_extrapolation -#else - !$acc routine seq -#endif + $:GPU_ROUTINE(function_name='s_ghost_cell_extrapolation', & + & parallelism='[seq]', cray_inline=True) type(scalar_field), dimension(sys_size), intent(inout) :: q_prim_vf integer, intent(in) :: bc_dir, bc_loc integer, intent(in) :: k, l @@ -337,7 +334,7 @@ contains end subroutine s_ghost_cell_extrapolation pure subroutine s_symmetry(q_prim_vf, bc_dir, bc_loc, k, l, pb, mv) - !$acc routine seq + $:GPU_ROUTINE(parallelism='[seq]') type(scalar_field), dimension(sys_size), intent(inout) :: q_prim_vf real(wp), optional, dimension(idwbuff(1)%beg:, idwbuff(2)%beg:, idwbuff(3)%beg:, 1:, 1:), intent(inout) :: pb, mv integer, intent(in) :: bc_dir, bc_loc @@ -597,7 +594,7 @@ contains end subroutine s_symmetry pure subroutine s_periodic(q_prim_vf, bc_dir, bc_loc, k, l, pb, mv) - !$acc routine seq + $:GPU_ROUTINE(parallelism='[seq]') type(scalar_field), 
dimension(sys_size), intent(inout) :: q_prim_vf real(wp), optional, dimension(idwbuff(1)%beg:, idwbuff(2)%beg:, idwbuff(3)%beg:, 1:, 1:), intent(inout) :: pb, mv integer, intent(in) :: bc_dir, bc_loc @@ -736,7 +733,7 @@ contains end subroutine s_periodic pure subroutine s_axis(q_prim_vf, pb, mv, k, l) - !$acc routine seq + $:GPU_ROUTINE(parallelism='[seq]') type(scalar_field), dimension(sys_size), intent(inout) :: q_prim_vf real(wp), dimension(idwbuff(1)%beg:, idwbuff(2)%beg:, idwbuff(3)%beg:, 1:, 1:), intent(inout) :: pb, mv integer, intent(in) :: k, l @@ -795,11 +792,8 @@ contains end subroutine s_axis pure subroutine s_slip_wall(q_prim_vf, bc_dir, bc_loc, k, l) -#ifdef _CRAYFTN - !DIR$ INLINEALWAYS s_slip_wall -#else - !$acc routine seq -#endif + $:GPU_ROUTINE(function_name='s_slip_wall',parallelism='[seq]', & + & cray_inline=True) type(scalar_field), dimension(sys_size), intent(inout) :: q_prim_vf integer, intent(in) :: bc_dir, bc_loc integer, intent(in) :: k, l @@ -889,11 +883,9 @@ contains end subroutine s_slip_wall pure subroutine s_no_slip_wall(q_prim_vf, bc_dir, bc_loc, k, l) -#ifdef _CRAYFTN - !DIR$ INLINEALWAYS s_no_slip_wall -#else - !$acc routine seq -#endif + $:GPU_ROUTINE(function_name='s_no_slip_wall',parallelism='[seq]', & + & cray_inline=True) + type(scalar_field), dimension(sys_size), intent(inout) :: q_prim_vf integer, intent(in) :: bc_dir, bc_loc integer, intent(in) :: k, l @@ -1019,11 +1011,8 @@ contains end subroutine s_no_slip_wall pure subroutine s_dirichlet(q_prim_vf, bc_dir, bc_loc, k, l) -#ifdef _CRAYFTN - !DIR$ INLINEALWAYS s_dirichlet -#else - !$acc routine seq -#endif + $:GPU_ROUTINE(function_name='s_dirichlet',parallelism='[seq]', & + & cray_inline=True) type(scalar_field), dimension(sys_size), intent(inout) :: q_prim_vf integer, intent(in) :: bc_dir, bc_loc integer, intent(in) :: k, l @@ -1087,7 +1076,7 @@ contains end subroutine s_dirichlet pure subroutine s_qbmm_extrapolation(bc_dir, bc_loc, k, l, pb, mv) - !$acc routine seq + $:GPU_ROUTINE(parallelism='[seq]') real(wp), optional, dimension(idwbuff(1)%beg:, idwbuff(2)%beg:, idwbuff(3)%beg:, 1:, 1:), intent(inout) :: pb, mv integer, intent(in) :: bc_dir, bc_loc integer, intent(in) :: k, l @@ -1169,7 +1158,7 @@ contains if (bc_x%beg >= 0) then call s_mpi_sendrecv_variables_buffers(c_divs, 1, -1, num_dims + 1) else - !$acc parallel loop collapse(2) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=2) do l = 0, p do k = 0, n select case (bc_type(1, -1)%sf(0, k, l)) @@ -1187,7 +1176,7 @@ contains if (bc_x%end >= 0) then call s_mpi_sendrecv_variables_buffers(c_divs, 1, 1, num_dims + 1) else - !$acc parallel loop collapse(2) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=2) do l = 0, p do k = 0, n select case (bc_type(1, 1)%sf(0, k, l)) @@ -1208,7 +1197,7 @@ contains if (bc_y%beg >= 0) then call s_mpi_sendrecv_variables_buffers(c_divs, 2, -1, num_dims + 1) else - !$acc parallel loop collapse(2) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=2) do l = 0, p do k = -buff_size, m + buff_size select case (bc_type(2, -1)%sf(k, 0, l)) @@ -1226,7 +1215,7 @@ contains if (bc_y%end >= 0) then call s_mpi_sendrecv_variables_buffers(c_divs, 2, 1, num_dims + 1) else - !$acc parallel loop collapse(2) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=2) do l = 0, p do k = -buff_size, m + buff_size select case (bc_type(2, 1)%sf(k, 0, l)) @@ -1247,7 +1236,7 @@ contains if (bc_z%beg >= 0) then call s_mpi_sendrecv_variables_buffers(c_divs, 3, -1, num_dims + 1) else - !$acc parallel loop 
collapse(2) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=2) do l = -buff_size, n + buff_size do k = -buff_size, m + buff_size select case (bc_type(3, -1)%sf(k, l, 0)) @@ -1265,7 +1254,7 @@ contains if (bc_z%end >= 0) then call s_mpi_sendrecv_variables_buffers(c_divs, 3, 1, num_dims + 1) else - !$acc parallel loop collapse(2) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=2) do l = -buff_size, n + buff_size do k = -buff_size, m + buff_size select case (bc_type(3, 1)%sf(k, l, 0)) @@ -1282,11 +1271,8 @@ contains end subroutine s_populate_capillary_buffers pure subroutine s_color_function_periodic(c_divs, bc_dir, bc_loc, k, l) -#ifdef _CRAYFTN - !DIR$ INLINEALWAYS s_color_function_periodic -#else - !$acc routine seq -#endif + $:GPU_ROUTINE(function_name='s_color_function_periodic', & + & parallelism='[seq]', cray_inline=True) type(scalar_field), dimension(num_dims + 1), intent(inout) :: c_divs integer, intent(in) :: bc_dir, bc_loc integer, intent(in) :: k, l @@ -1340,11 +1326,8 @@ contains end subroutine s_color_function_periodic pure subroutine s_color_function_reflective(c_divs, bc_dir, bc_loc, k, l) -#ifdef _CRAYFTN - !DIR$ INLINEALWAYS s_color_function_reflective -#else - !$acc routine seq -#endif + $:GPU_ROUTINE(function_name='s_color_function_reflective', & + & parallelism='[seq]', cray_inline=True) type(scalar_field), dimension(num_dims + 1), intent(inout) :: c_divs integer, intent(in) :: bc_dir, bc_loc integer, intent(in) :: k, l @@ -1422,11 +1405,8 @@ contains end subroutine s_color_function_reflective pure subroutine s_color_function_ghost_cell_extrapolation(c_divs, bc_dir, bc_loc, k, l) -#ifdef _CRAYFTN - !DIR$ INLINEALWAYS s_color_function_ghost_cell_extrapolation -#else - !$acc routine seq -#endif + $:GPU_ROUTINE(function_name='s_color_function_ghost_cell_extrapolation', & + & parallelism='[seq]', cray_inline=True) type(scalar_field), dimension(num_dims + 1), intent(inout) :: c_divs integer, intent(in) :: bc_dir, bc_loc integer, intent(in) :: k, l @@ -1638,7 +1618,7 @@ contains do dir = 1, num_dims do loc = -1, 1, 2 read (1) bc_type(dir, loc)%sf - !$acc update device(bc_type(dir, loc)%sf) + $:GPU_UPDATE(device='[bc_type(dir, loc)%sf]') end do end do close (1) @@ -1654,7 +1634,7 @@ contains do dir = 1, num_dims do loc = -1, 1, 2 read (1) bc_buffers(dir, loc)%sf - !$acc update device(bc_buffers(dir, loc)%sf) + $:GPU_UPDATE(device='[bc_buffers(dir, loc)%sf]') end do end do close (1) @@ -1704,7 +1684,7 @@ contains call MPI_File_set_view(file_id, int(offset, KIND=MPI_ADDRESS_KIND), MPI_INTEGER, MPI_BC_TYPE_TYPE(dir, loc), 'native', MPI_INFO_NULL, ierr) call MPI_File_read_all(file_id, bc_type(dir, loc)%sf, 1, MPI_BC_TYPE_TYPE(dir, loc), MPI_STATUS_IGNORE, ierr) offset = offset + sizeof(bc_type(dir, loc)%sf) - !$acc update device(bc_type(dir, loc)%sf) + $:GPU_UPDATE(device='[bc_type(dir, loc)%sf]') end do end do @@ -1714,7 +1694,7 @@ contains call MPI_File_set_view(file_id, int(offset, KIND=MPI_ADDRESS_KIND), mpi_p, MPI_BC_BUFFER_TYPE(dir, loc), 'native', MPI_INFO_NULL, ierr) call MPI_File_read_all(file_id, bc_buffers(dir, loc)%sf, 1, MPI_BC_BUFFER_TYPE(dir, loc), MPI_STATUS_IGNORE, ierr) offset = offset + sizeof(bc_buffers(dir, loc)%sf) - !$acc update device(bc_buffers(dir, loc)%sf) + $:GPU_UPDATE(device='[bc_buffers(dir, loc)%sf]') end do end do @@ -1767,17 +1747,17 @@ contains bc_type(1, -1)%sf(:, :, :) = bc_x%beg bc_type(1, 1)%sf(:, :, :) = bc_x%end - !$acc update device(bc_type(1,-1)%sf, bc_type(1,1)%sf) + 
$:GPU_UPDATE(device='[bc_type(1,-1)%sf,bc_type(1,1)%sf]') if (n > 0) then bc_type(2, -1)%sf(:, :, :) = bc_y%beg bc_type(2, 1)%sf(:, :, :) = bc_y%end - !$acc update device(bc_type(2,-1)%sf, bc_type(2,1)%sf) + $:GPU_UPDATE(device='[bc_type(2,-1)%sf,bc_type(2,1)%sf]') if (p > 0) then bc_type(3, -1)%sf(:, :, :) = bc_z%beg bc_type(3, 1)%sf(:, :, :) = bc_z%end - !$acc update device(bc_type(3,-1)%sf, bc_type(3,1)%sf) + $:GPU_UPDATE(device='[bc_type(3,-1)%sf,bc_type(3,1)%sf]') end if end if diff --git a/src/common/m_chemistry.fpp b/src/common/m_chemistry.fpp index d61c42b1af..71aa890e87 100644 --- a/src/common/m_chemistry.fpp +++ b/src/common/m_chemistry.fpp @@ -35,7 +35,7 @@ contains do z = bounds(3)%beg, bounds(3)%end do y = bounds(2)%beg, bounds(2)%end do x = bounds(1)%beg, bounds(1)%end - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do eqn = chemxb, chemxe Ys(eqn - chemxb + 1) = & q_cons_vf(eqn)%sf(x, y, z)/q_cons_vf(contxb)%sf(x, y, z) @@ -46,7 +46,7 @@ contains ! cons. contxb = \rho (1-fluid model) ! cons. momxb + i = \rho u_i energy = q_cons_vf(E_idx)%sf(x, y, z)/q_cons_vf(contxb)%sf(x, y, z) - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do eqn = momxb, momxe energy = energy - & 0.5_wp*(q_cons_vf(eqn)%sf(x, y, z)/q_cons_vf(contxb)%sf(x, y, z))**2._wp @@ -72,7 +72,7 @@ contains do z = bounds(3)%beg, bounds(3)%end do y = bounds(2)%beg, bounds(2)%end do x = bounds(1)%beg, bounds(1)%end - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = chemxb, chemxe Ys(i - chemxb + 1) = q_prim_vf(i)%sf(x, y, z) end do @@ -99,13 +99,12 @@ contains real(wp), dimension(num_species) :: Ys real(wp), dimension(num_species) :: omega - !$acc parallel loop collapse(3) gang vector default(present) & - !$acc private(Ys, omega) + $:GPU_PARALLEL_LOOP(collapse=3, private='[Ys, omega]') do z = bounds(3)%beg, bounds(3)%end do y = bounds(2)%beg, bounds(2)%end do x = bounds(1)%beg, bounds(1)%end - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do eqn = chemxb, chemxe Ys(eqn - chemxb + 1) = q_prim_qp(eqn)%sf(x, y, z) end do @@ -115,7 +114,7 @@ contains call get_net_production_rates(rho, T, Ys, omega) - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do eqn = chemxb, chemxe omega_m = molecular_weights(eqn - chemxb + 1)*omega(eqn - chemxb + 1) diff --git a/src/common/m_finite_differences.fpp b/src/common/m_finite_differences.fpp index 2eb7920422..1857a31cd8 100644 --- a/src/common/m_finite_differences.fpp +++ b/src/common/m_finite_differences.fpp @@ -1,3 +1,5 @@ +#:include 'macros.fpp' + module m_finite_differences use m_global_parameters @@ -15,7 +17,8 @@ contains integer :: x, y, z !< Generic loop iterators real(wp) :: divergence - !$acc parallel loop collapse(3) gang vector default(present) private(divergence) + + $:GPU_PARALLEL_LOOP(collapse=3, private='[divergence]') do x = ix_s%beg, ix_s%end do y = iy_s%beg, iy_s%end do z = iz_s%beg, iz_s%end diff --git a/src/common/m_helper.fpp b/src/common/m_helper.fpp index 6a376d7da5..f222e24d50 100644 --- a/src/common/m_helper.fpp +++ b/src/common/m_helper.fpp @@ -1,4 +1,5 @@ #:include 'macros.fpp' + !> !! @file m_helper.f90 !! @brief Contains module m_helper @@ -43,7 +44,7 @@ contains !! @param Rtmp is the bubble radii !! 
@param ntmp is the output number bubble density pure subroutine s_comp_n_from_prim(vftmp, Rtmp, ntmp, weights) - !$acc routine seq + $:GPU_ROUTINE(parallelism='[seq]') real(wp), intent(in) :: vftmp real(wp), dimension(nb), intent(in) :: Rtmp real(wp), intent(out) :: ntmp @@ -57,7 +58,7 @@ contains end subroutine s_comp_n_from_prim pure subroutine s_comp_n_from_cons(vftmp, nRtmp, ntmp, weights) - !$acc routine seq + $:GPU_ROUTINE(parallelism='[seq]') real(wp), intent(in) :: vftmp real(wp), dimension(nb), intent(in) :: nRtmp real(wp), intent(out) :: ntmp diff --git a/src/common/m_helper_basic.f90 b/src/common/m_helper_basic.fpp similarity index 96% rename from src/common/m_helper_basic.f90 rename to src/common/m_helper_basic.fpp index 74cb61f2ab..c78140c94b 100644 --- a/src/common/m_helper_basic.f90 +++ b/src/common/m_helper_basic.fpp @@ -2,6 +2,8 @@ !! @file m_helper_basic.f90 !! @brief Contains module m_helper_basic +#:include 'macros.fpp' + module m_helper_basic use m_derived_types !< Definitions of the derived types @@ -24,7 +26,7 @@ module m_helper_basic !! @param tol_input Relative error (default = 1.e-10_wp). !! @return Result of the comparison. logical pure elemental function f_approx_equal(a, b, tol_input) result(res) - !$acc routine seq + $:GPU_ROUTINE(parallelism='[seq]') real(wp), intent(in) :: a, b real(wp), optional, intent(in) :: tol_input real(wp) :: tol @@ -50,7 +52,7 @@ end function f_approx_equal !! @param tol_input Relative error (default = 1e-10_wp). !! @return Result of the comparison. logical pure function f_approx_in_array(a, b, tol_input) result(res) - !$acc routine seq + $:GPU_ROUTINE(parallelism='[seq]') real(wp), intent(in) :: a real(wp), intent(in) :: b(:) real(wp), optional, intent(in) :: tol_input @@ -76,7 +78,7 @@ end function f_approx_in_array !> Checks if a real(wp) variable is of default value. !! @param var Variable to check. logical pure elemental function f_is_default(var) result(res) - !$acc routine seq + $:GPU_ROUTINE(parallelism='[seq]') real(wp), intent(in) :: var res = f_approx_equal(var, dflt_real) @@ -101,7 +103,7 @@ end function f_all_default !> Checks if a real(wp) variable is an integer. !! @param var Variable to check. logical pure elemental function f_is_integer(var) result(res) - !$acc routine seq + $:GPU_ROUTINE(parallelism='[seq]') real(wp), intent(in) :: var res = f_approx_equal(var, real(nint(var), wp)) diff --git a/src/common/m_mpi_common.fpp b/src/common/m_mpi_common.fpp index b920151488..568dddb299 100644 --- a/src/common/m_mpi_common.fpp +++ b/src/common/m_mpi_common.fpp @@ -25,7 +25,7 @@ module m_mpi_common implicit none integer, private :: ierr, v_size !< - !$acc declare create(v_size) + $:GPU_DECLARE(create='[v_size]') !! Generic flags used to identify and report MPI errors real(wp), private, allocatable, dimension(:) :: buff_send !< @@ -38,10 +38,10 @@ module m_mpi_common !! average primitive variables, for a single computational domain boundary !! at the time, from the relevant neighboring processor. 
- !$acc declare create(buff_send, buff_recv) + $:GPU_DECLARE(create='[buff_send, buff_recv]') integer :: halo_size - !$acc declare create(halo_size) + $:GPU_DECLARE(create='[halo_size]') contains @@ -76,7 +76,7 @@ contains halo_size = -1 + buff_size*(v_size) end if - !$acc update device(halo_size, v_size) + $:GPU_UPDATE(device='[halo_size, v_size]') @:ALLOCATE(buff_send(0:halo_size), buff_recv(0:halo_size)) #endif @@ -631,7 +631,7 @@ contains /) end if - !$acc update device(v_size) + $:GPU_UPDATE(device='[v_size]') buffer_count = buffer_counts(mpi_dir) boundary_conditions = (/bc_x, bc_y, bc_z/) @@ -667,7 +667,7 @@ contains #:for mpi_dir in [1, 2, 3] if (mpi_dir == ${mpi_dir}$) then #:if mpi_dir == 1 - !$acc parallel loop collapse(4) gang vector default(present) private(r) + $:GPU_PARALLEL_LOOP(collapse=4,private='[r]') do l = 0, p do k = 0, n do j = 0, buff_size - 1 @@ -680,7 +680,7 @@ contains end do if (qbmm_comm) then - !$acc parallel loop collapse(4) gang vector default(present) private(r) + $:GPU_PARALLEL_LOOP(collapse=4,private='[r]') do l = 0, p do k = 0, n do j = 0, buff_size - 1 @@ -695,7 +695,7 @@ contains end do end do - !$acc parallel loop collapse(5) gang vector default(present) private(r) + $:GPU_PARALLEL_LOOP(collapse=5,private='[r]') do l = 0, p do k = 0, n do j = 0, buff_size - 1 @@ -711,7 +711,7 @@ contains end do end if #:elif mpi_dir == 2 - !$acc parallel loop collapse(4) gang vector default(present) private(r) + $:GPU_PARALLEL_LOOP(collapse=4,private='[r]') do i = 1, nVar do l = 0, p do k = 0, buff_size - 1 @@ -726,7 +726,7 @@ contains end do if (qbmm_comm) then - !$acc parallel loop collapse(5) gang vector default(present) private(r) + $:GPU_PARALLEL_LOOP(collapse=5,private='[r]') do i = nVar + 1, nVar + 4 do l = 0, p do k = 0, buff_size - 1 @@ -742,7 +742,7 @@ contains end do end do - !$acc parallel loop collapse(5) gang vector default(present) private(r) + $:GPU_PARALLEL_LOOP(collapse=5,private='[r]') do i = nVar + 1, nVar + 4 do l = 0, p do k = 0, buff_size - 1 @@ -759,7 +759,7 @@ contains end do end if #:else - !$acc parallel loop collapse(4) gang vector default(present) private(r) + $:GPU_PARALLEL_LOOP(collapse=4,private='[r]') do i = 1, nVar do l = 0, buff_size - 1 do k = -buff_size, n + buff_size @@ -774,7 +774,7 @@ contains end do if (qbmm_comm) then - !$acc parallel loop collapse(5) gang vector default(present) private(r) + $:GPU_PARALLEL_LOOP(collapse=5,private='[r]') do i = nVar + 1, nVar + 4 do l = 0, buff_size - 1 do k = -buff_size, n + buff_size @@ -790,7 +790,7 @@ contains end do end do - !$acc parallel loop collapse(5) gang vector default(present) private(r) + $:GPU_PARALLEL_LOOP(collapse=5,private='[r]') do i = nVar + 1, nVar + 4 do l = 0, buff_size - 1 do k = -buff_size, n + buff_size @@ -816,28 +816,33 @@ contains #:for rdma_mpi in [False, True] if (rdma_mpi .eqv. ${'.true.' if rdma_mpi else '.false.'}$) then #:if rdma_mpi - !$acc host_data use_device(buff_send, buff_recv) - call nvtxStartRange("RHS-COMM-SENDRECV-RDMA") + #:call GPU_HOST_DATA(use_device='[buff_send, buff_recv]') + call nvtxStartRange("RHS-COMM-SENDRECV-RDMA") + + call MPI_SENDRECV( & + buff_send, buffer_count, mpi_p, dst_proc, send_tag, & + buff_recv, buffer_count, mpi_p, src_proc, recv_tag, & + MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) + + call nvtxEndRange ! 
RHS-MPI-SENDRECV-(NO)-RDMA + + #:endcall GPU_HOST_DATA + $:GPU_WAIT() #:else call nvtxStartRange("RHS-COMM-DEV2HOST") - !$acc update host(buff_send) + $:GPU_UPDATE(host='[buff_send]') call nvtxEndRange call nvtxStartRange("RHS-COMM-SENDRECV-NO-RMDA") - #:endif - call MPI_SENDRECV( & - buff_send, buffer_count, mpi_p, dst_proc, send_tag, & - buff_recv, buffer_count, mpi_p, src_proc, recv_tag, & - MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) + call MPI_SENDRECV( & + buff_send, buffer_count, mpi_p, dst_proc, send_tag, & + buff_recv, buffer_count, mpi_p, src_proc, recv_tag, & + MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) - call nvtxEndRange ! RHS-MPI-SENDRECV-(NO)-RDMA + call nvtxEndRange ! RHS-MPI-SENDRECV-(NO)-RDMA - #:if rdma_mpi - !$acc end host_data - !$acc wait - #:else call nvtxStartRange("RHS-COMM-HOST2DEV") - !$acc update device(buff_recv) + $:GPU_UPDATE(device='[buff_recv]') call nvtxEndRange #:endif end if @@ -854,7 +859,7 @@ contains #:for mpi_dir in [1, 2, 3] if (mpi_dir == ${mpi_dir}$) then #:if mpi_dir == 1 - !$acc parallel loop collapse(4) gang vector default(present) private(r) + $:GPU_PARALLEL_LOOP(collapse=4,private='[r]') do l = 0, p do k = 0, n do j = -buff_size, -1 @@ -874,7 +879,7 @@ contains end do if (qbmm_comm) then - !$acc parallel loop collapse(5) gang vector default(present) private(r) + $:GPU_PARALLEL_LOOP(collapse=5,private='[r]') do l = 0, p do k = 0, n do j = -buff_size, -1 @@ -889,7 +894,7 @@ contains end do end do - !$acc parallel loop collapse(5) gang vector default(present) private(r) + $:GPU_PARALLEL_LOOP(collapse=5,private='[r]') do l = 0, p do k = 0, n do j = -buff_size, -1 @@ -905,7 +910,7 @@ contains end do end if #:elif mpi_dir == 2 - !$acc parallel loop collapse(4) gang vector default(present) private(r) + $:GPU_PARALLEL_LOOP(collapse=4,private='[r]') do i = 1, nVar do l = 0, p do k = -buff_size, -1 @@ -926,7 +931,7 @@ contains end do if (qbmm_comm) then - !$acc parallel loop collapse(5) gang vector default(present) private(r) + $:GPU_PARALLEL_LOOP(collapse=5,private='[r]') do i = nVar + 1, nVar + 4 do l = 0, p do k = -buff_size, -1 @@ -942,7 +947,7 @@ contains end do end do - !$acc parallel loop collapse(5) gang vector default(present) private(r) + $:GPU_PARALLEL_LOOP(collapse=5,private='[r]') do i = nVar + 1, nVar + 4 do l = 0, p do k = -buff_size, -1 @@ -960,7 +965,7 @@ contains end if #:else ! 
Unpacking buffer from bc_z%beg - !$acc parallel loop collapse(4) gang vector default(present) private(r) + $:GPU_PARALLEL_LOOP(collapse=4,private='[r]') do i = 1, nVar do l = -buff_size, -1 do k = -buff_size, n + buff_size @@ -982,7 +987,7 @@ contains end do if (qbmm_comm) then - !$acc parallel loop collapse(5) gang vector default(present) private(r) + $:GPU_PARALLEL_LOOP(collapse=5,private='[r]') do i = nVar + 1, nVar + 4 do l = -buff_size, -1 do k = -buff_size, n + buff_size @@ -999,7 +1004,7 @@ contains end do end do - !$acc parallel loop collapse(5) gang vector default(present) private(r) + $:GPU_PARALLEL_LOOP(collapse=5,private='[r]') do i = nVar + 1, nVar + 4 do l = -buff_size, -1 do k = -buff_size, n + buff_size diff --git a/src/common/m_phase_change.fpp b/src/common/m_phase_change.fpp index d1dca0f6ca..e04242a787 100644 --- a/src/common/m_phase_change.fpp +++ b/src/common/m_phase_change.fpp @@ -42,7 +42,7 @@ module m_phase_change real(wp) :: A, B, C, D !> @} - !$acc declare create(max_iter,pCr,TCr,mixM,lp,vp,A,B,C,D) + $:GPU_DECLARE(create='[max_iter,pCr,TCr,mixM,lp,vp,A,B,C,D]') contains @@ -89,23 +89,26 @@ contains real(wp) :: rho, rM, m1, m2, MCT !< total density, total reacting mass, individual reacting masses real(wp) :: TvF !< total volume fraction - !$acc declare create(pS, pSOV, pSSL, TS, TSOV, TSatOV, TSatSL, TSSL, rhoe, dynE, rhos, rho, rM, m1, m2, MCT, TvF) + $:GPU_DECLARE(create='[pS,pSOV,pSSL,TS,TSOV,TSSL,TSatOV,TSatSL]') + $:GPU_DECLARE(create='[rhoe,dynE,rhos,rho,rM,m1,m2,MCT,TvF]') real(wp), dimension(num_fluids) :: p_infOV, p_infpT, p_infSL, sk, hk, gk, ek, rhok + $:GPU_DECLARE(create='[p_infOV,p_infpT,p_infSL,sk,hk,gk,ek,rhok]') !< Generic loop iterators integer :: i, j, k, l - !$acc declare create(p_infOV, p_infpT, p_infSL, sk, hk, gk, ek, rhok) - ! starting equilibrium solver - !$acc parallel loop collapse(3) gang vector default(present) private(p_infOV, p_infpT, p_infSL, sk, hk, gk, ek, rhok,pS, pSOV, pSSL, TS, TSOV, TSatOV, TSatSL, TSSL, rhoe, dynE, rhos, rho, rM, m1, m2, MCT, TvF) + $:GPU_PARALLEL_LOOP(collapse=3, private='[p_infOV, p_infpT, p_infSL, & + & sk, hk, gk, ek, rhok,pS, pSOV, pSSL, & + & TS, TSOV, TSatOV, TSatSL, TSSL, rhoe, & + & dynE, rhos, rho, rM, m1, m2, MCT, TvF]') do j = 0, m do k = 0, n do l = 0, p rho = 0.0_wp; TvF = 0.0_wp - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, num_fluids ! Mixture density @@ -131,7 +134,7 @@ contains ! kinetic energy as an auxiliary variable to the calculation of the total internal energy dynE = 0.0_wp - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = momxb, momxe dynE = dynE + 5.0e-1_wp*q_cons_vf(i)%sf(j, k, l)**2/rho @@ -253,7 +256,7 @@ contains ! calculating volume fractions, internal energies, and total entropy rhos = 0.0_wp - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, num_fluids ! volume fractions @@ -284,12 +287,8 @@ contains !! @param rhoe mixture energy !! @param TS equilibrium temperature at the interface pure subroutine s_infinite_pt_relaxation_k(j, k, l, MFL, pS, p_infpT, q_cons_vf, rhoe, TS) - -#ifdef _CRAYFTN - !DIR$ INLINEALWAYS s_infinite_pt_relaxation_k -#else - !$acc routine seq -#endif + $:GPU_ROUTINE(function_name='s_infinite_pt_relaxation_k', & + & parallelism='[seq]', cray_inline=True) ! initializing variables integer, intent(in) :: j, k, l, MFL @@ -305,7 +304,7 @@ contains ! auxiliary variables for the pT-equilibrium solver mCP = 0.0_wp; mQ = 0.0_wp; p_infpT = ps_inf; ! 
Performing tests before initializing the pT-equilibrium - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, num_fluids ! sum of the total alpha*rho*cp of the system @@ -354,7 +353,7 @@ contains ! updating functions used in the Newton's solver gpp = 0.0_wp; gp = 0.0_wp; hp = 0.0_wp - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, num_fluids gp = gp + (gs_min(i) - 1.0_wp)*q_cons_vf(i + contxb - 1)%sf(j, k, l)*cvs(i) & @@ -388,12 +387,8 @@ contains !! @param q_cons_vf Cell-average conservative variables !! @param TS equilibrium temperature at the interface pure subroutine s_infinite_ptg_relaxation_k(j, k, l, pS, p_infpT, rhoe, q_cons_vf, TS) - -#ifdef _CRAYFTN - !DIR$ INLINEALWAYS s_infinite_ptg_relaxation_k -#else - !$acc routine seq -#endif + $:GPU_ROUTINE(function_name='s_infinite_ptg_relaxation_k', & + & parallelism='[seq]', cray_inline=True) integer, intent(in) :: j, k, l real(wp), intent(inout) :: pS @@ -448,7 +443,7 @@ contains mCP = 0.0_wp; mCPD = 0.0_wp; mCVGP = 0.0_wp; mCVGP2 = 0.0_wp; mQ = 0.0_wp; mQD = 0.0_wp ! Those must be updated through the iterations, as they either depend on ! the partial masses for all fluids, or on the equilibrium pressure - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, num_fluids ! sum of the total alpha*rho*cp of the system @@ -513,12 +508,8 @@ contains !! @param k generic loop iterator for y direction !! @param l generic loop iterator for z direction pure subroutine s_correct_partial_densities(MCT, q_cons_vf, rM, j, k, l) - -#ifdef _CRAYFTN - !DIR$ INLINEALWAYS s_correct_partial_densities -#else - !$acc routine seq -#endif + $:GPU_ROUTINE(function_name='s_correct_partial_densities', & + & parallelism='[seq]', cray_inline=True) !> @name variables for the correction of the reacting partial densities !> @{ @@ -576,12 +567,8 @@ contains !! @param q_cons_vf Cell-average conservative variables !! @param TJac Transpose of the Jacobian Matrix pure subroutine s_compute_jacobian_matrix(InvJac, j, Jac, k, l, mCPD, mCVGP, mCVGP2, pS, q_cons_vf, TJac) - -#ifdef _CRAYFTN - !DIR$ INLINEALWAYS s_compute_jacobian_matrix -#else - !$acc routine seq -#endif + $:GPU_ROUTINE(function_name='s_compute_jacobian_matrix', & + & parallelism='[seq]', cray_inline=True) real(wp), dimension(2, 2), intent(out) :: InvJac integer, intent(in) :: j @@ -683,12 +670,8 @@ contains !! @param rhoe mixture energy !! @param R2D (2D) residue array pure subroutine s_compute_pTg_residue(j, k, l, mCPD, mCVGP, mQD, q_cons_vf, pS, rhoe, R2D) - -#ifdef _CRAYFTN - !DIR$ INLINEALWAYS s_compute_pTg_residue -#else - !$acc routine seq -#endif + $:GPU_ROUTINE(function_name='s_compute_pTg_residue', & + & parallelism='[seq]', cray_inline=True) integer, intent(in) :: j, k, l real(wp), intent(in) :: mCPD, mCVGP, mQD @@ -734,12 +717,8 @@ contains !! @param TSat Saturation Temperature !! @param TSIn equilibrium Temperature pure elemental subroutine s_TSat(pSat, TSat, TSIn) - -#ifdef _CRAYFTN - !DIR$ INLINEALWAYS s_TSat -#else - !$acc routine seq -#endif + $:GPU_ROUTINE(function_name='s_TSat',parallelism='[seq]', & + & cray_inline=True) real(wp), intent(in) :: pSat real(wp), intent(out) :: TSat diff --git a/src/common/m_variables_conversion.fpp b/src/common/m_variables_conversion.fpp index 949eac92cb..6a67b56315 100644 --- a/src/common/m_variables_conversion.fpp +++ b/src/common/m_variables_conversion.fpp @@ -50,16 +50,16 @@ module m_variables_conversion !! 
In simulation, gammas, pi_infs, and qvs are already declared in m_global_variables #ifndef MFC_SIMULATION real(wp), allocatable, public, dimension(:) :: gammas, gs_min, pi_infs, ps_inf, cvs, qvs, qvps - !$acc declare create(gammas, gs_min, pi_infs, ps_inf, cvs, qvs, qvps) + $:GPU_DECLARE(create='[gammas,gs_min,pi_infs,ps_inf,cvs,qvs,qvps]') #endif real(wp), allocatable, dimension(:) :: Gs integer, allocatable, dimension(:) :: bubrs real(wp), allocatable, dimension(:, :) :: Res - !$acc declare create(bubrs, Gs, Res) + $:GPU_DECLARE(create='[bubrs,Gs,Res]') integer :: is1b, is2b, is3b, is1e, is2e, is3e - !$acc declare create(is1b, is2b, is3b, is1e, is2e, is3e) + $:GPU_DECLARE(create='[is1b,is2b,is3b,is1e,is2e,is3e]') real(wp), allocatable, dimension(:, :, :), public :: rho_sf !< Scalar density function real(wp), allocatable, dimension(:, :, :), public :: gamma_sf !< Scalar sp. heat ratio function @@ -116,12 +116,8 @@ contains !! @param stress Shear Stress !! @param mom Momentum subroutine s_compute_pressure(energy, alf, dyn_p, pi_inf, gamma, rho, qv, rhoYks, pres, T, stress, mom, G, pres_mag) - -#ifdef _CRAYFTN - !DIR$ INLINEALWAYS s_compute_pressure -#else - !$acc routine seq -#endif + $:GPU_ROUTINE(function_name='s_compute_pressure',parallelism='[seq]', & + & cray_inline=True) real(wp), intent(in) :: energy, alf real(wp), intent(in) :: dyn_p @@ -458,11 +454,8 @@ contains gamma_K, pi_inf_K, qv_K, & alpha_K, alpha_rho_K, Re_K, & G_K, G) -#ifdef _CRAYFTN - !DIR$ INLINEALWAYS s_convert_species_to_mixture_variables_acc -#else - !$acc routine seq -#endif + $:GPU_ROUTINE(function_name='s_convert_species_to_mixture_variables_acc', & + & parallelism='[seq]', cray_inline=True) real(wp), intent(out) :: rho_K, gamma_K, pi_inf_K, qv_K @@ -539,11 +532,8 @@ contains pure subroutine s_convert_species_to_mixture_variables_bubbles_acc(rho_K, & gamma_K, pi_inf_K, qv_K, & alpha_K, alpha_rho_K, Re_K) -#ifdef _CRAYFTN - !DIR$ INLINEALWAYS s_convert_species_to_mixture_variables_bubbles_acc -#else - !$acc routine seq -#endif + $:GPU_ROUTINE(function_name='s_convert_species_to_mixture_variables_bubbles_acc', & + & parallelism='[seq]', cray_inline=True) real(wp), intent(inout) :: rho_K, gamma_K, pi_inf_K, qv_K @@ -610,7 +600,7 @@ contains integer :: i, j -!$acc enter data copyin(is1b, is1e, is2b, is2e, is3b, is3e) + $:GPU_ENTER_DATA(copyin='[is1b,is1e,is2b,is2e,is3b,is3e]') #ifdef MFC_SIMULATION @:ALLOCATE(gammas (1:num_fluids)) @@ -642,7 +632,7 @@ contains qvs(i) = fluid_pp(i)%qv qvps(i) = fluid_pp(i)%qvp end do -!$acc update device(gammas, gs_min, pi_infs, ps_inf, cvs, qvs, qvps, Gs) + $:GPU_UPDATE(device='[gammas,gs_min,pi_infs,ps_inf,cvs,qvs,qvps,Gs]') #ifdef MFC_SIMULATION @@ -654,7 +644,7 @@ contains end do end do - !$acc update device(Res, Re_idx, Re_size) + $:GPU_UPDATE(device='[Res,Re_idx,Re_size]') end if #endif @@ -668,7 +658,7 @@ contains do i = 1, nb bubrs(i) = bub_idx%rs(i) end do - !$acc update device(bubrs) + $:GPU_UPDATE(device='[bubrs]') end if #ifdef MFC_POST_PROCESS @@ -748,7 +738,7 @@ contains nbub_sc = qK_cons_vf(bubxb)%sf(j, k, l) - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, nb mu = qK_cons_vf(bubxb + 1 + (i - 1)*nmom)%sf(j, k, l)/nbub_sc sig = (qK_cons_vf(bubxb + 3 + (i - 1)*nmom)%sf(j, k, l)/nbub_sc - mu**2)**0.5_wp @@ -781,7 +771,7 @@ contains nbub_sc = qK_cons_vf(bubxb)%sf(j, k, l) - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, nb mu = qK_cons_vf(bubxb + 1 + (i - 1)*nmom)%sf(j, k, l)/nbub_sc sig = (qK_cons_vf(bubxb + 3 + (i - 1)*nmom)%sf(j, k, l)/nbub_sc - 
mu**2)**0.5_wp @@ -869,15 +859,15 @@ contains end if #:endif - !$acc parallel loop collapse(3) gang vector default(present) & - !$acc private(alpha_K, alpha_rho_K, Re_K, nRtmp, rho_K, gamma_K, & - !$acc pi_inf_K, qv_K, dyn_pres_K, rhoYks, B) + $:GPU_PARALLEL_LOOP(collapse=3, private='[alpha_K, alpha_rho_K, Re_K, & + & nRtmp, rho_K, gamma_K, pi_inf_K,qv_K, & + & dyn_pres_K, rhoYks, B]') do l = ibounds(3)%beg, ibounds(3)%end do k = ibounds(2)%beg, ibounds(2)%end do j = ibounds(1)%beg, ibounds(1)%end dyn_pres_K = 0._wp - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, num_fluids alpha_rho_K(i) = qK_cons_vf(i)%sf(j, k, l) alpha_K(i) = qK_cons_vf(advxb + i - 1)%sf(j, k, l) @@ -921,13 +911,13 @@ contains B2 = B(1)**2 + B(2)**2 + B(3)**2 m2 = 0._wp - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = momxb, momxe m2 = m2 + qK_cons_vf(i)%sf(j, k, l)**2 end do S = 0._wp - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, 3 S = S + qK_cons_vf(momxb + i - 1)%sf(j, k, l)*B(i) end do @@ -935,14 +925,14 @@ contains E = qK_cons_vf(E_idx)%sf(j, k, l) D = 0._wp - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, contxe D = D + qK_cons_vf(i)%sf(j, k, l) end do ! Newton-Raphson W = E + D - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do iter = 1, relativity_cons_to_prim_max_iter Ga = (W + B2)*W/sqrt((W + B2)**2*W**2 - (m2*W**2 + S**2*(2*W + B2))) pres = (W - D*Ga)/((gamma_K + 1)*Ga**2) ! Thermal pressure from EOS @@ -968,13 +958,13 @@ contains qK_prim_vf(E_idx)%sf(j, k, l) = (W - D*Ga)/((gamma_K + 1)*Ga**2) ! Recover the other primitive variables - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, 3 qK_prim_vf(momxb + i - 1)%sf(j, k, l) = (qK_cons_vf(momxb + i - 1)%sf(j, k, l) + (S/W)*B(i))/(W + B2) end do qK_prim_vf(1)%sf(j, k, l) = D/Ga ! 
Hard-coded for single-component for now - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = B_idx%beg, B_idx%end qK_prim_vf(i)%sf(j, k, l) = qK_cons_vf(i)%sf(j, k, l) end do @@ -984,22 +974,22 @@ contains if (chemistry) then rho_K = 0._wp - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = chemxb, chemxe rho_K = rho_K + max(0._wp, qK_cons_vf(i)%sf(j, k, l)) end do - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, contxe qK_prim_vf(i)%sf(j, k, l) = rho_K end do - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = chemxb, chemxe qK_prim_vf(i)%sf(j, k, l) = max(0._wp, qK_cons_vf(i)%sf(j, k, l)/rho_K) end do else - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, contxe qK_prim_vf(i)%sf(j, k, l) = qK_cons_vf(i)%sf(j, k, l) end do @@ -1009,7 +999,7 @@ contains rho_K = max(rho_K, sgm_eps) #endif - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = momxb, momxe if (model_eqns /= 4) then qK_prim_vf(i)%sf(j, k, l) = qK_cons_vf(i)%sf(j, k, l) & @@ -1023,7 +1013,7 @@ contains end do if (chemistry) then - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, num_species rhoYks(i) = qK_cons_vf(chemxb + i - 1)%sf(j, k, l) end do @@ -1053,7 +1043,7 @@ contains end if if (bubbles_euler) then - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, nb nRtmp(i) = qK_cons_vf(bubrs(i))%sf(j, k, l) end do @@ -1065,7 +1055,7 @@ contains nbub_sc = qK_cons_vf(bubxb)%sf(j, k, l) !Convert cons to prim - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = bubxb, bubxe qK_prim_vf(i)%sf(j, k, l) = qK_cons_vf(i)%sf(j, k, l)/nbub_sc end do @@ -1082,7 +1072,7 @@ contains call s_comp_n_from_cons(vftmp, nRtmp, nbub_sc, weight) end if - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = bubxb, bubxe qK_prim_vf(i)%sf(j, k, l) = qK_cons_vf(i)%sf(j, k, l)/nbub_sc end do @@ -1090,21 +1080,21 @@ contains end if if (mhd) then - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = B_idx%beg, B_idx%end qK_prim_vf(i)%sf(j, k, l) = qK_cons_vf(i)%sf(j, k, l) end do end if if (elasticity) then - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = strxb, strxe qK_prim_vf(i)%sf(j, k, l) = qK_cons_vf(i)%sf(j, k, l)/rho_K end do end if if (hypoelasticity) then - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = strxb, strxe ! subtracting elastic contribution for pressure calculation if (G_K > verysmall) then @@ -1121,13 +1111,13 @@ contains end if if (hyperelasticity) then - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = xibeg, xiend qK_prim_vf(i)%sf(j, k, l) = qK_cons_vf(i)%sf(j, k, l)/rho_K end do end if - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = advxb, advxe qK_prim_vf(i)%sf(j, k, l) = qK_cons_vf(i)%sf(j, k, l) end do @@ -1145,7 +1135,6 @@ contains end do end do end do - !$acc end parallel loop end subroutine s_convert_conservative_to_primitive_variables @@ -1465,32 +1454,33 @@ contains is2b = is2%beg; is2e = is2%end is3b = is3%beg; is3e = is3%end - !$acc update device(is1b, is2b, is3b, is1e, is2e, is3e) + $:GPU_UPDATE(device='[is1b,is2b,is3b,is1e,is2e,is3e]') ! Computing the flux variables from the primitive variables, without ! 
accounting for the contribution of either viscosity or capillarity #ifdef MFC_SIMULATION - !$acc parallel loop collapse(3) gang vector default(present) private(alpha_rho_K, vel_K, alpha_K, Re_K, Y_K) + $:GPU_PARALLEL_LOOP(collapse=3, private='[alpha_rho_K, vel_K, & + & alpha_K, Re_K, Y_K]') do l = is3b, is3e do k = is2b, is2e do j = is1b, is1e - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, contxe alpha_rho_K(i) = qK_prim_vf(j, k, l, i) end do - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = advxb, advxe alpha_K(i - E_idx) = qK_prim_vf(j, k, l, i) end do - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, num_vels vel_K(i) = qK_prim_vf(j, k, l, contxe + i) end do vel_K_sum = 0._wp - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, num_vels vel_K_sum = vel_K_sum + vel_K(i)**2._wp end do @@ -1511,7 +1501,7 @@ contains ! Computing the energy from the pressure if (chemistry) then - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = chemxb, chemxe Y_K(i - chemxb + 1) = qK_prim_vf(j, k, l, i) end do @@ -1528,12 +1518,12 @@ contains end if ! mass flux, this should be \alpha_i \rho_i u_i - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, contxe FK_vf(j, k, l, i) = alpha_rho_K(i)*vel_K(dir_idx(1)) end do - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, num_vels FK_vf(j, k, l, contxe + dir_idx(i)) = & rho_K*vel_K(dir_idx(1)) & @@ -1546,14 +1536,14 @@ contains ! Species advection Flux, \rho*u*Y if (chemistry) then - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, num_species FK_vf(j, k, l, i - 1 + chemxb) = vel_K(dir_idx(1))*(rho_K*Y_K(i)) end do end if if (riemann_solver == 1 .or. riemann_solver == 4) then - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = advxb, advxe FK_vf(j, k, l, i) = 0._wp FK_src_vf(j, k, l, i) = alpha_K(i - E_idx) @@ -1561,12 +1551,12 @@ contains else ! Could be bubbles_euler! - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = advxb, advxe FK_vf(j, k, l, i) = vel_K(dir_idx(1))*alpha_K(i - E_idx) end do - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = advxb, advxe FK_src_vf(j, k, l, i) = vel_K(dir_idx(1)) end do @@ -1603,11 +1593,8 @@ contains #ifndef MFC_PRE_PROCESS pure subroutine s_compute_speed_of_sound(pres, rho, gamma, pi_inf, H, adv, vel_sum, c_c, c) -#ifdef _CRAYFTN - !DIR$ INLINEALWAYS s_compute_speed_of_sound -#else - !$acc routine seq -#endif + $:GPU_ROUTINE(function_name='s_compute_speed_of_sound', & + & parallelism='[seq]', cray_inline=True) real(wp), intent(in) :: pres real(wp), intent(in) :: rho, gamma, pi_inf @@ -1640,7 +1627,7 @@ contains c = (1._wp/(rho*(adv(1)/blkmod1 + adv(2)/blkmod2))) elseif (model_eqns == 3) then c = 0._wp - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do q = 1, num_fluids c = c + adv(q)*(1._wp/gammas(q) + 1._wp)* & (pres + pi_infs(q)/(gammas(q) + 1._wp)) @@ -1673,11 +1660,8 @@ contains #ifndef MFC_PRE_PROCESS pure subroutine s_compute_fast_magnetosonic_speed(rho, c, B, norm, c_fast, h) -#ifdef _CRAYFTN - !DIR$ INLINEALWAYS s_compute_fast_magnetosonic_speed -#else - !$acc routine seq -#endif + $:GPU_ROUTINE(function_name='s_compute_fast_magnetosonic_speed', & + & parallelism='[seq]', cray_inline=True) real(wp), intent(in) :: B(3), rho, c real(wp), intent(in) :: h ! only used for relativity diff --git a/src/pre_process/m_assign_variables.fpp b/src/pre_process/m_assign_variables.fpp index 3cf39533e4..f1bfe06fa3 100644 --- a/src/pre_process/m_assign_variables.fpp +++ b/src/pre_process/m_assign_variables.fpp @@ -3,6 +3,7 @@ !! 
@brief Contains module m_assign_variables #:include 'case.fpp' +#:include 'macros.fpp' module m_assign_variables @@ -103,7 +104,7 @@ contains !! @param patch_id_fp Array to track patch ids pure subroutine s_assign_patch_mixture_primitive_variables(patch_id, j, k, l, & eta, q_prim_vf, patch_id_fp) - !$acc routine seq + $:GPU_ROUTINE(parallelism='[seq]') integer, intent(in) :: patch_id integer, intent(in) :: j, k, l @@ -276,7 +277,7 @@ contains !! @param patch_id_fp Array to track patch ids impure subroutine s_assign_patch_species_primitive_variables(patch_id, j, k, l, & eta, q_prim_vf, patch_id_fp) - !$acc routine seq + $:GPU_ROUTINE(parallelism='[seq]') integer, intent(in) :: patch_id integer, intent(in) :: j, k, l diff --git a/src/pre_process/m_patches.fpp b/src/pre_process/m_patches.fpp index b3f6b48feb..f7700e84ac 100644 --- a/src/pre_process/m_patches.fpp +++ b/src/pre_process/m_patches.fpp @@ -2389,7 +2389,7 @@ contains end subroutine s_model subroutine s_convert_cylindrical_to_cartesian_coord(cyl_y, cyl_z) - !$acc routine seq + $:GPU_ROUTINE(parallelism='[seq]') real(wp), intent(in) :: cyl_y, cyl_z @@ -2400,7 +2400,7 @@ contains pure function f_convert_cyl_to_cart(cyl) result(cart) - !$acc routine seq + $:GPU_ROUTINE(parallelism='[seq]') t_vec3, intent(in) :: cyl t_vec3 :: cart @@ -2412,7 +2412,7 @@ contains end function f_convert_cyl_to_cart subroutine s_convert_cylindrical_to_spherical_coord(cyl_x, cyl_y) - !$acc routine seq + $:GPU_ROUTINE(parallelism='[seq]') real(wp), intent(IN) :: cyl_x, cyl_y @@ -2425,7 +2425,7 @@ contains !! @param offset Thickness !! @param a Starting position pure elemental function f_r(myth, offset, a) - !$acc routine seq + $:GPU_ROUTINE(parallelism='[seq]') real(wp), intent(in) :: myth, offset, a real(wp) :: b real(wp) :: f_r diff --git a/src/simulation/include/inline_riemann.fpp b/src/simulation/include/inline_riemann.fpp index 714ec3ef21..9972799b02 100644 --- a/src/simulation/include/inline_riemann.fpp +++ b/src/simulation/include/inline_riemann.fpp @@ -1,7 +1,7 @@ #:def arithmetic_avg() rho_avg = 5.e-1_wp*(rho_L + rho_R) vel_avg_rms = 0._wp - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, num_vels vel_avg_rms = vel_avg_rms + (5.e-1_wp*(vel_L(i) + vel_R(i)))**2._wp end do @@ -17,7 +17,7 @@ vel_avg_rms = 0._wp - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, num_vels vel_avg_rms = vel_avg_rms + (sqrt(rho_L)*vel_L(i) + sqrt(rho_R)*vel_R(i))**2._wp/ & (sqrt(rho_L) + sqrt(rho_R))**2._wp diff --git a/src/simulation/m_acoustic_src.fpp b/src/simulation/m_acoustic_src.fpp index 5f7e986d89..b14528b9d5 100644 --- a/src/simulation/m_acoustic_src.fpp +++ b/src/simulation/m_acoustic_src.fpp @@ -23,41 +23,43 @@ module m_acoustic_src private; public :: s_initialize_acoustic_src, s_precalculate_acoustic_spatial_sources, s_acoustic_src_calculations integer, allocatable, dimension(:) :: pulse, support - !$acc declare create(pulse, support) + $:GPU_DECLARE(create='[pulse,support]') logical, allocatable, dimension(:) :: dipole - !$acc declare create(dipole) + $:GPU_DECLARE(create='[dipole]') real(wp), allocatable, target, dimension(:, :) :: loc_acoustic - !$acc declare create(loc_acoustic) + $:GPU_DECLARE(create='[loc_acoustic]') - real(wp), allocatable, dimension(:) :: mag, length, height, wavelength, frequency, gauss_sigma_dist, gauss_sigma_time, npulse, dir, delay - !$acc declare create(mag, length, height, wavelength, frequency, gauss_sigma_dist, gauss_sigma_time, npulse, dir, delay) + real(wp), allocatable, dimension(:) :: mag, length, height, 
wavelength, frequency + real(wp), allocatable, dimension(:) :: gauss_sigma_dist, gauss_sigma_time, npulse, dir, delay + $:GPU_DECLARE(create='[mag,length,height,wavelength,frequency]') + $:GPU_DECLARE(create='[gauss_sigma_dist,gauss_sigma_time,npulse,dir,delay]') real(wp), allocatable, dimension(:) :: foc_length, aperture - !$acc declare create(foc_length, aperture) + $:GPU_DECLARE(create='[foc_length,aperture]') real(wp), allocatable, dimension(:) :: element_spacing_angle, element_polygon_ratio, rotate_angle - !$acc declare create(element_spacing_angle, element_polygon_ratio, rotate_angle) + $:GPU_DECLARE(create='[element_spacing_angle,element_polygon_ratio,rotate_angle]') real(wp), allocatable, dimension(:) :: bb_bandwidth, bb_lowest_freq - !$acc declare create(bb_bandwidth, bb_lowest_freq) + $:GPU_DECLARE(create='[bb_bandwidth,bb_lowest_freq]') integer, allocatable, dimension(:) :: num_elements, element_on, bb_num_freq - !$acc declare create(num_elements, element_on, bb_num_freq) + $:GPU_DECLARE(create='[num_elements,element_on,bb_num_freq]') !> @name Acoustic source terms !> @{ real(wp), allocatable, dimension(:, :, :) :: mass_src, e_src real(wp), allocatable, dimension(:, :, :, :) :: mom_src !> @} - !$acc declare create(mass_src, e_src, mom_src) + $:GPU_DECLARE(create='[mass_src,e_src,mom_src]') integer, dimension(:), allocatable :: source_spatials_num_points !< Number of non-zero source grid points for each source - !$acc declare create(source_spatials_num_points) + $:GPU_DECLARE(create='[source_spatials_num_points]') type(source_spatial_type), dimension(:), allocatable :: source_spatials !< Data of non-zero source grid points for each source - !$acc declare create(source_spatials) + $:GPU_DECLARE(create='[source_spatials]') contains @@ -108,7 +110,12 @@ contains delay(i) = acoustic(i)%delay end if end do - !$acc update device(loc_acoustic, mag, dipole, support, length, height, wavelength, frequency, gauss_sigma_dist, gauss_sigma_time, foc_length, aperture, npulse, pulse, dir, delay, element_polygon_ratio, rotate_angle, element_spacing_angle, num_elements, element_on, bb_num_freq, bb_bandwidth, bb_lowest_freq) + $:GPU_UPDATE(device='[loc_acoustic,mag,dipole,support,length, & + & height,wavelength,frequency,gauss_sigma_dist, & + & gauss_sigma_time,foc_length,aperture,npulse,pulse, & + & dir,delay,element_polygon_ratio,rotate_angle, & + & element_spacing_angle,num_elements,element_on, & + & bb_num_freq,bb_bandwidth,bb_lowest_freq]') @:ALLOCATE(mass_src(0:m, 0:n, 0:p)) @:ALLOCATE(mom_src(1:num_vels, 0:m, 0:n, 0:p)) @@ -159,7 +166,7 @@ contains sim_time = t_step*dt - !$acc parallel loop collapse(3) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=3) do l = 0, p do k = 0, n do j = 0, m @@ -198,7 +205,7 @@ contains call s_mpi_send_random_number(phi_rn, bb_num_freq(ai)) end if - !$acc loop reduction(+:sum_BB) + $:GPU_LOOP(reduction='[[sum_BB]]', reductionOp='[+]') do k = 1, bb_num_freq(ai) ! 
Acoustic period of the wave at each discrete frequency period_BB = 1._wp/(bb_lowest_freq(ai) + k*bb_bandwidth(ai)) @@ -212,7 +219,7 @@ contains deallocate (phi_rn) - !$acc parallel loop gang vector default(present) private(myalpha, myalpha_rho) + $:GPU_PARALLEL_LOOP(private='[myalpha,myalpha_rho]') do i = 1, num_points j = source_spatials(ai)%coord(1, i) k = source_spatials(ai)%coord(2, i) @@ -223,7 +230,7 @@ contains B_tait = 0._wp small_gamma = 0._wp - !$acc loop + $:GPU_LOOP(parallelism='[seq]') do q = 1, num_fluids myalpha_rho(q) = q_cons_vf(q)%sf(j, k, l) myalpha(q) = q_cons_vf(advxb + q - 1)%sf(j, k, l) @@ -231,7 +238,7 @@ contains if (bubbles_euler) then if (num_fluids > 2) then - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do q = 1, num_fluids - 1 myRho = myRho + myalpha_rho(q) B_tait = B_tait + myalpha(q)*pi_infs(q) @@ -245,7 +252,7 @@ contains end if if ((.not. bubbles_euler) .or. (mpp_lim .and. (num_fluids > 2))) then - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do q = 1, num_fluids myRho = myRho + myalpha_rho(q) B_tait = B_tait + myalpha(q)*pi_infs(q) @@ -312,15 +319,15 @@ contains end do ! Update the rhs variables - !$acc parallel loop collapse(3) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=3) do l = 0, p do k = 0, n do j = 0, m - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do q = contxb, contxe rhs_vf(q)%sf(j, k, l) = rhs_vf(q)%sf(j, k, l) + mass_src(j, k, l) end do - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do q = momxb, momxe rhs_vf(q)%sf(j, k, l) = rhs_vf(q)%sf(j, k, l) + mom_src(q - contxe, j, k, l) end do @@ -339,7 +346,7 @@ contains !! @param gauss_sigma_time_local sigma in time for Gaussian pulse !! @param source Source term amplitude pure elemental subroutine s_source_temporal(sim_time, c, ai, term_index, frequency_local, gauss_sigma_time_local, source, sum_BB) - !$acc routine seq + $:GPU_ROUTINE(parallelism='[seq]') integer, intent(in) :: ai, term_index real(wp), intent(in) :: sim_time, c, sum_BB real(wp), intent(in) :: frequency_local, gauss_sigma_time_local @@ -466,14 +473,14 @@ contains call s_mpi_abort('Fatal Error: Inconsistent allocation of source_spatials') end if - !$acc update device(source_spatials(ai)%coord) - !$acc update device(source_spatials(ai)%val) + $:GPU_UPDATE(device='[source_spatials(ai)%coord]') + $:GPU_UPDATE(device='[source_spatials(ai)%val]') if (support(ai) >= 5) then if (dim == 2) then - !$acc update device(source_spatials(ai)%angle) + $:GPU_UPDATE(device='[source_spatials(ai)%angle]') end if if (dim == 3) then - !$acc update device(source_spatials(ai)%xyz_to_r_ratios) + $:GPU_UPDATE(device='[source_spatials(ai)%xyz_to_r_ratios]') end if end if @@ -691,7 +698,7 @@ contains !! @param c Speed of sound !! @return frequency_local Converted frequency pure elemental function f_frequency_local(freq_conv_flag, ai, c) - !$acc routine seq + $:GPU_ROUTINE(parallelism='[seq]') logical, intent(in) :: freq_conv_flag integer, intent(in) :: ai real(wp), intent(in) :: c @@ -710,7 +717,7 @@ contains !! @param ai Acoustic source index !! 
@return gauss_sigma_time_local Converted Gaussian sigma time pure elemental function f_gauss_sigma_time_local(gauss_conv_flag, ai, c) - !$acc routine seq + $:GPU_ROUTINE(parallelism='[seq]') logical, intent(in) :: gauss_conv_flag integer, intent(in) :: ai real(wp), intent(in) :: c diff --git a/src/simulation/m_body_forces.fpp b/src/simulation/m_body_forces.fpp index 1ef74cbcee..d5f811d273 100644 --- a/src/simulation/m_body_forces.fpp +++ b/src/simulation/m_body_forces.fpp @@ -22,7 +22,7 @@ module m_body_forces s_finalize_body_forces_module real(wp), allocatable, dimension(:, :, :) :: rhoM - !$acc declare create(rhoM) + $:GPU_DECLARE(create='[rhoM]') contains @@ -67,7 +67,7 @@ contains end if end if - !$acc update device(accel_bf) + $:GPU_UPDATE(device='[accel_bf]') end subroutine s_compute_acceleration @@ -79,7 +79,7 @@ contains type(scalar_field), dimension(sys_size), intent(in) :: q_cons_vf integer :: i, j, k, l !< standard iterators - !$acc parallel loop collapse(3) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=3) do l = 0, p do k = 0, n do j = 0, m @@ -109,7 +109,7 @@ contains call s_compute_acceleration(mytime) call s_compute_mixture_density(q_cons_vf) - !$acc parallel loop collapse(4) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=4) do i = momxb, E_idx do l = 0, p do k = 0, n @@ -122,7 +122,7 @@ contains if (bf_x) then ! x-direction body forces - !$acc parallel loop collapse(3) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=3) do l = 0, p do k = 0, n do j = 0, m @@ -137,7 +137,7 @@ contains if (bf_y) then ! y-direction body forces - !$acc parallel loop collapse(3) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=3) do l = 0, p do k = 0, n do j = 0, m @@ -152,7 +152,7 @@ contains if (bf_z) then ! z-direction body forces - !$acc parallel loop collapse(3) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=3) do l = 0, p do k = 0, n do j = 0, m diff --git a/src/simulation/m_bubbles.fpp b/src/simulation/m_bubbles.fpp index 79ca5fb982..6ee19c210c 100644 --- a/src/simulation/m_bubbles.fpp +++ b/src/simulation/m_bubbles.fpp @@ -22,7 +22,7 @@ module m_bubbles real(wp) :: chi_vw !< Bubble wall properties (Ando 2010) real(wp) :: k_mw !< Bubble wall properties (Ando 2010) real(wp) :: rho_mw !< Bubble wall properties (Ando 2010) - !$acc declare create(chi_vw, k_mw, rho_mw) + $:GPU_DECLARE(create='[chi_vw,k_mw,rho_mw]') contains @@ -41,7 +41,7 @@ contains !! @param f_divu Divergence of velocity !! @param fCson Speed of sound from fP (EL) pure elemental function f_rddot(fRho, fP, fR, fV, fR0, fpb, fpbdot, alf, fntait, fBtait, f_bub_adv_src, f_divu, fCson) - !$acc routine seq + $:GPU_ROUTINE(parallelism='[seq]') real(wp), intent(in) :: fRho, fP, fR, fV, fR0, fpb, fpbdot, alf real(wp), intent(in) :: fntait, fBtait, f_bub_adv_src, f_divu real(wp), intent(in) :: fCson @@ -82,7 +82,7 @@ contains !! @param fV Current bubble velocity !! @param fpb Internal bubble pressure pure elemental function f_cpbw(fR0, fR, fV, fpb) - !$acc routine seq + $:GPU_ROUTINE(parallelism='[seq]') real(wp), intent(in) :: fR0, fR, fV, fpb real(wp) :: f_cpbw @@ -101,7 +101,7 @@ contains !! @param fntait Tait EOS parameter !! @param fBtait Tait EOS parameter pure elemental function f_H(fCpbw, fCpinf, fntait, fBtait) - !$acc routine seq + $:GPU_ROUTINE(parallelism='[seq]') real(wp), intent(in) :: fCpbw, fCpinf, fntait, fBtait real(wp) :: tmp1, tmp2, tmp3 @@ -121,7 +121,7 @@ contains !! @param fBtait Tait EOS parameter !! 
@param fH Bubble enthalpy pure elemental function f_cgas(fCpinf, fntait, fBtait, fH) - !$acc routine seq + $:GPU_ROUTINE(parallelism='[seq]') real(wp), intent(in) :: fCpinf, fntait, fBtait, fH real(wp) :: tmp @@ -144,7 +144,7 @@ contains !! @param advsrc Advection equation source term !! @param divu Divergence of velocity pure elemental function f_cpinfdot(fRho, fP, falf, fntait, fBtait, advsrc, divu) - !$acc routine seq + $:GPU_ROUTINE(parallelism='[seq]') real(wp), intent(in) :: fRho, fP, falf, fntait, fBtait, advsrc, divu real(wp) :: c2_liquid @@ -174,7 +174,7 @@ contains !! @param fR0 Equilibrium bubble radius !! @param fpbdot Time derivative of the internal bubble pressure pure elemental function f_Hdot(fCpbw, fCpinf, fCpinf_dot, fntait, fBtait, fR, fV, fR0, fpbdot) - !$acc routine seq + $:GPU_ROUTINE(parallelism='[seq]') real(wp), intent(in) :: fCpbw, fCpinf, fCpinf_dot, fntait, fBtait real(wp), intent(in) :: fR, fV, fR0, fpbdot @@ -210,7 +210,7 @@ contains !! @param fR0 Equilibrium bubble radius !! @param fCpbw Boundary wall pressure pure elemental function f_rddot_RP(fCp, fRho, fR, fV, fCpbw) - !$acc routine seq + $:GPU_ROUTINE(parallelism='[seq]') real(wp), intent(in) :: fCp, fRho, fR, fV, fCpbw real(wp) :: f_rddot_RP @@ -233,7 +233,7 @@ contains !! @param fntait Tait EOS parameter !! @param fBtait Tait EOS parameter pure elemental function f_rddot_G(fCpbw, fR, fV, fH, fHdot, fcgas, fntait, fBtait) - !$acc routine seq + $:GPU_ROUTINE(parallelism='[seq]') real(wp), intent(in) :: fCpbw, fR, fV, fH, fHdot real(wp), intent(in) :: fcgas, fntait, fBtait @@ -256,7 +256,7 @@ contains !! @param fV Current bubble velocity !! @param fpb Internal bubble pressure pure elemental function f_cpbw_KM(fR0, fR, fV, fpb) - !$acc routine seq + $:GPU_ROUTINE(parallelism='[seq]') real(wp), intent(in) :: fR0, fR, fV, fpb real(wp) :: f_cpbw_KM @@ -283,7 +283,7 @@ contains !! @param fR0 Equilibrium bubble radius !! @param fC Current sound speed pure elemental function f_rddot_KM(fpbdot, fCp, fCpbw, fRho, fR, fV, fR0, fC) - !$acc routine seq + $:GPU_ROUTINE(parallelism='[seq]') real(wp), intent(in) :: fpbdot, fCp, fCpbw real(wp), intent(in) :: fRho, fR, fV, fR0, fC @@ -317,7 +317,7 @@ contains !! @param pb Internal bubble pressure !! @param iR0 Current bubble size index pure elemental subroutine s_bwproperty(pb, iR0, chi_vw, k_mw, rho_mw) - !$acc routine seq + $:GPU_ROUTINE(parallelism='[seq]') real(wp), intent(in) :: pb integer, intent(in) :: iR0 real(wp), intent(out) :: chi_vw !< Bubble wall properties (Ando 2010) @@ -347,7 +347,7 @@ contains !! @param fR_m Mixture gas constant (EL) !! @param fgamma_m Mixture gamma (EL) pure elemental subroutine s_vflux(fR, fV, fpb, fmass_v, iR0, vflux, fmass_n, fbeta_c, fR_m, fgamma_m) - !$acc routine seq + $:GPU_ROUTINE(parallelism='[seq]') real(wp), intent(in) :: fR real(wp), intent(in) :: fV real(wp), intent(in) :: fpb @@ -405,7 +405,7 @@ contains !! @param fR_m Mixture gas constant (EL) !! 
@param fgamma_m Mixture gamma (EL) pure elemental function f_bpres_dot(fvflux, fR, fV, fpb, fmass_v, iR0, fbeta_t, fR_m, fgamma_m) - !$acc routine seq + $:GPU_ROUTINE(parallelism='[seq]') real(wp), intent(in) :: fvflux real(wp), intent(in) :: fR real(wp), intent(in) :: fV @@ -465,11 +465,9 @@ contains fntait, fBtait, f_bub_adv_src, f_divu, & bub_id, fmass_v, fmass_n, fbeta_c, & fbeta_t, fCson, adap_dt_stop) -#ifdef _CRAYFTN - !DIR$ INLINEALWAYS s_advance_step -#else - !$acc routine seq -#endif + $:GPU_ROUTINE(function_name='s_advance_step',parallelism='[seq]', & + & cray_inline=True) + real(wp), intent(inout) :: fR, fV, fpb, fmass_v real(wp), intent(in) :: fRho, fP, fR0, fpbdot, alf real(wp), intent(in) :: fntait, fBtait, f_bub_adv_src, f_divu @@ -599,11 +597,9 @@ contains pure subroutine s_initial_substep_h(fRho, fP, fR, fV, fR0, fpb, fpbdot, alf, & fntait, fBtait, f_bub_adv_src, f_divu, & fCson, h) -#ifdef _CRAYFTN - !DIR$ INLINEALWAYS s_initial_substep_h -#else - !$acc routine seq -#endif + $:GPU_ROUTINE(function_name='s_initial_substep_h',parallelism='[seq]', & + & cray_inline=True) + real(wp), intent(IN) :: fRho, fP, fR, fV, fR0, fpb, fpbdot, alf real(wp), intent(IN) :: fntait, fBtait, f_bub_adv_src, f_divu real(wp), intent(IN) :: fCson @@ -685,11 +681,9 @@ contains bub_id, fmass_v, fmass_n, fbeta_c, & fbeta_t, fCson, h, & myR_tmp, myV_tmp, myPb_tmp, myMv_tmp) -#ifdef _CRAYFTN - !DIR$ INLINEALWAYS s_advance_substep -#else - !$acc routine seq -#endif + $:GPU_ROUTINE(function_name='s_advance_substep',parallelism='[seq]', & + & cray_inline=True) + real(wp), intent(OUT) :: err real(wp), intent(IN) :: fRho, fP, fR, fV, fR0, fpb, fpbdot, alf real(wp), intent(IN) :: fntait, fBtait, f_bub_adv_src, f_divu, h @@ -786,7 +780,7 @@ contains !! @param fdMvdt_tmp Rate of change of the mass of vapor in the bubble pure elemental subroutine s_advance_EL(fR_tmp, fV_tmp, fPb_tmp, fMv_tmp, bub_id, & fmass_n, fbeta_c, fbeta_t, fdPbdt_tmp, advance_EL) - !$acc routine seq + $:GPU_ROUTINE(parallelism='[seq]') real(wp), intent(IN) :: fR_tmp, fV_tmp, fPb_tmp, fMv_tmp real(wp), intent(IN) :: fmass_n, fbeta_c, fbeta_t integer, intent(IN) :: bub_id diff --git a/src/simulation/m_bubbles_EE.fpp b/src/simulation/m_bubbles_EE.fpp index ed3f064551..5fe6d38136 100644 --- a/src/simulation/m_bubbles_EE.fpp +++ b/src/simulation/m_bubbles_EE.fpp @@ -21,13 +21,13 @@ module m_bubbles_EE real(wp), allocatable, dimension(:, :, :) :: bub_adv_src real(wp), allocatable, dimension(:, :, :, :) :: bub_r_src, bub_v_src, bub_p_src, bub_m_src - !$acc declare create(bub_adv_src, bub_r_src, bub_v_src, bub_p_src, bub_m_src) + $:GPU_DECLARE(create='[bub_adv_src,bub_r_src,bub_v_src,bub_p_src,bub_m_src]') type(scalar_field) :: divu !< matrix for div(u) - !$acc declare create(divu) + $:GPU_DECLARE(create='[divu]') integer, allocatable, dimension(:) :: rs, vs, ms, ps - !$acc declare create(rs, vs, ms, ps) + $:GPU_DECLARE(create='[rs,vs,ms,ps]') contains @@ -51,9 +51,9 @@ contains end if end do - !$acc update device(rs, vs) + $:GPU_UPDATE(device='[rs, vs]') if (.not. 
polytropic) then - !$acc update device(ps, ms) + $:GPU_UPDATE(device='[ps, ms]') end if @:ALLOCATE(divu%sf(idwbuff(1)%beg:idwbuff(1)%end, idwbuff(2)%beg:idwbuff(2)%end, idwbuff(3)%beg:idwbuff(3)%end)) @@ -76,12 +76,12 @@ contains real(wp) :: nR3bar integer(wp) :: i, j, k, l - !$acc parallel loop collapse(3) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=3) do l = 0, p do k = 0, n do j = 0, m nR3bar = 0._wp - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, nb nR3bar = nR3bar + weight(i)*(q_cons_vf(rs(i))%sf(j, k, l))**3._wp end do @@ -103,7 +103,7 @@ contains if (idir == 1) then if (.not. qbmm) then - !$acc parallel loop collapse(3) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=3) do l = 0, p do k = 0, n do j = 0, m @@ -119,7 +119,7 @@ contains elseif (idir == 2) then - !$acc parallel loop collapse(3) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=3) do l = 0, p do k = 0, n do j = 0, m @@ -133,7 +133,7 @@ contains elseif (idir == 3) then - !$acc parallel loop collapse(3) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=3) do l = 0, p do k = 0, n do j = 0, m @@ -172,13 +172,13 @@ contains integer :: dmBub_id !< Dummy variables for unified subgrid bubble subroutines real(wp) :: dmMass_v, dmMass_n, dmBeta_c, dmBeta_t, dmCson - !$acc parallel loop collapse(3) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=3) do l = 0, p do k = 0, n do j = 0, m bub_adv_src(j, k, l) = 0._wp - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do q = 1, nb bub_r_src(j, k, l, q) = 0._wp bub_v_src(j, k, l, q) = 0._wp @@ -190,8 +190,9 @@ contains end do adap_dt_stop_max = 0 - !$acc parallel loop collapse(3) gang vector default(present) private(Rtmp, Vtmp, myalpha_rho, myalpha) & - !$acc reduction(MAX:adap_dt_stop_max) copy(adap_dt_stop_max) + $:GPU_PARALLEL_LOOP(collapse=3, private='[Rtmp, Vtmp, myalpha_rho, myalpha]', & + & reduction='[[adap_dt_stop_max]]', reductionOp='[MAX]', & + & copy='[adap_dt_stop_max]') do l = 0, p do k = 0, n do j = 0, m @@ -199,7 +200,7 @@ contains if (adv_n) then nbub = q_prim_vf(n_idx)%sf(j, k, l) else - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do q = 1, nb Rtmp(q) = q_prim_vf(rs(q))%sf(j, k, l) Vtmp(q) = q_prim_vf(vs(q))%sf(j, k, l) @@ -207,7 +208,7 @@ contains R3 = 0._wp - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do q = 1, nb R3 = R3 + weight(q)*Rtmp(q)**3._wp end do @@ -218,7 +219,7 @@ contains if (.not. adap_dt) then R2Vav = 0._wp - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do q = 1, nb R2Vav = R2Vav + weight(q)*Rtmp(q)**2._wp*Vtmp(q) end do @@ -226,10 +227,10 @@ contains bub_adv_src(j, k, l) = 4._wp*pi*nbub*R2Vav end if - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do q = 1, nb - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do ii = 1, num_fluids myalpha_rho(ii) = q_cons_vf(ii)%sf(j, k, l) myalpha(ii) = q_cons_vf(advxb + ii - 1)%sf(j, k, l) @@ -240,14 +241,14 @@ contains B_tait = 0._wp if (mpp_lim .and. (num_fluids > 2)) then - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do ii = 1, num_fluids myRho = myRho + myalpha_rho(ii) n_tait = n_tait + myalpha(ii)*gammas(ii) B_tait = B_tait + myalpha(ii)*pi_infs(ii) end do else if (num_fluids > 2) then - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do ii = 1, num_fluids - 1 myRho = myRho + myalpha_rho(ii) n_tait = n_tait + myalpha(ii)*gammas(ii) @@ -323,14 +324,14 @@ contains if (adap_dt .and. adap_dt_stop_max > 0) call s_mpi_abort("Adaptive time stepping failed to converge.") if (.not. 
adap_dt) then - !$acc parallel loop collapse(3) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=3) do l = 0, p do q = 0, n do i = 0, m rhs_vf(alf_idx)%sf(i, q, l) = rhs_vf(alf_idx)%sf(i, q, l) + bub_adv_src(i, q, l) if (num_fluids > 1) rhs_vf(advxb)%sf(i, q, l) = & rhs_vf(advxb)%sf(i, q, l) - bub_adv_src(i, q, l) - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do k = 1, nb rhs_vf(rs(k))%sf(i, q, l) = rhs_vf(rs(k))%sf(i, q, l) + bub_r_src(i, q, l, k) rhs_vf(vs(k))%sf(i, q, l) = rhs_vf(vs(k))%sf(i, q, l) + bub_v_src(i, q, l, k) diff --git a/src/simulation/m_bubbles_EL.fpp b/src/simulation/m_bubbles_EL.fpp index 854e3f63a0..20e7515c83 100644 --- a/src/simulation/m_bubbles_EL.fpp +++ b/src/simulation/m_bubbles_EL.fpp @@ -34,20 +34,26 @@ module m_bubbles_EL real(wp), allocatable, dimension(:) :: bub_R0 !< Initial bubble radius real(wp), allocatable, dimension(:) :: Rmax_stats !< Maximum radius real(wp), allocatable, dimension(:) :: Rmin_stats !< Minimum radius + $:GPU_DECLARE(create='[lag_id, bub_R0, Rmax_stats, Rmin_stats]') + real(wp), allocatable, dimension(:) :: gas_mg !< Bubble's gas mass real(wp), allocatable, dimension(:) :: gas_betaT !< heatflux model (Preston et al., 2007) real(wp), allocatable, dimension(:) :: gas_betaC !< massflux model (Preston et al., 2007) real(wp), allocatable, dimension(:) :: bub_dphidt !< subgrid velocity potential (Maeda & Colonius, 2018) + $:GPU_DECLARE(create='[gas_mg, gas_betaT, gas_betaC, bub_dphidt]') + !(nBub, 1 -> actual val or 2 -> temp val) real(wp), allocatable, dimension(:, :) :: gas_p !< Pressure in the bubble real(wp), allocatable, dimension(:, :) :: gas_mv !< Vapor mass in the bubble real(wp), allocatable, dimension(:, :) :: intfc_rad !< Bubble radius real(wp), allocatable, dimension(:, :) :: intfc_vel !< Velocity of the bubble interface + $:GPU_DECLARE(create='[gas_p, gas_mv, intfc_rad, intfc_vel]') !(nBub, 1-> x or 2->y or 3 ->z, 1 -> actual or 2 -> temporal val) real(wp), allocatable, dimension(:, :, :) :: mtn_pos !< Bubble's position real(wp), allocatable, dimension(:, :, :) :: mtn_posPrev !< Bubble's previous position real(wp), allocatable, dimension(:, :, :) :: mtn_vel !< Bubble's velocity real(wp), allocatable, dimension(:, :, :) :: mtn_s !< Bubble's computational cell position in real format + $:GPU_DECLARE(create='[mtn_pos, mtn_posPrev, mtn_vel, mtn_s]') !(nBub, 1-> x or 2->y or 3 ->z, time-stage) real(wp), allocatable, dimension(:, :) :: intfc_draddt !< Time derivative of bubble's radius real(wp), allocatable, dimension(:, :) :: intfc_dveldt !< Time derivative of bubble's interface velocity @@ -55,21 +61,18 @@ module m_bubbles_EL real(wp), allocatable, dimension(:, :) :: gas_dmvdt !< Time derivative of the vapor mass in the bubble real(wp), allocatable, dimension(:, :, :) :: mtn_dposdt !< Time derivative of the bubble's position real(wp), allocatable, dimension(:, :, :) :: mtn_dveldt !< Time derivative of the bubble's velocity - - !$acc declare create(lag_id, bub_R0, Rmax_stats, Rmin_stats, gas_mg, gas_betaT, gas_betaC, bub_dphidt, & - !$acc gas_p, gas_mv, intfc_rad, intfc_vel, mtn_pos, mtn_posPrev, mtn_vel, mtn_s, intfc_draddt, intfc_dveldt, & - !$acc gas_dpdt, gas_dmvdt, mtn_dposdt, mtn_dveldt) + $:GPU_DECLARE(create='[intfc_draddt, intfc_dveldt, gas_dpdt, gas_dmvdt, mtn_dposdt, mtn_dveldt]') integer, private :: lag_num_ts !< Number of time stages in the time-stepping scheme - !$acc declare create(lag_num_ts) + $:GPU_DECLARE(create='[lag_num_ts]') integer :: nBubs !< Number of bubbles in the local domain real(wp) :: 
Rmax_glb, Rmin_glb !< Maximum and minimum bubbe size in the local domain type(vector_field) :: q_beta !< Projection of the lagrangian particles in the Eulerian framework integer :: q_beta_idx !< Size of the q_beta vector field - !$acc declare create(nBubs, Rmax_glb, Rmin_glb, q_beta, q_beta_idx) + $:GPU_DECLARE(create='[nBubs,Rmax_glb,Rmin_glb,q_beta,q_beta_idx]') contains @@ -99,7 +102,7 @@ contains call s_mpi_abort('Please check the lag_params%solver_approach input') end if - !$acc update device(lag_num_ts, q_beta_idx) + $:GPU_UPDATE(device='[lag_num_ts, q_beta_idx]') @:ALLOCATE(q_beta%vf(1:q_beta_idx)) @@ -247,17 +250,19 @@ contains print *, " Lagrange bubbles running, in proc", proc_rank, "number:", bub_id, "/", id - !$acc update device(bubbles_lagrange, lag_params) + $:GPU_UPDATE(device='[bubbles_lagrange, lag_params]') - !$acc update device(lag_id, bub_R0, Rmax_stats, Rmin_stats, gas_mg, gas_betaT, gas_betaC, & - !$acc bub_dphidt, gas_p, gas_mv, intfc_rad, intfc_vel, mtn_pos, mtn_posPrev, mtn_vel, & - !$acc mtn_s, intfc_draddt, intfc_dveldt, gas_dpdt, gas_dmvdt, mtn_dposdt, mtn_dveldt, nBubs) + $:GPU_UPDATE(device='[lag_id,bub_R0,Rmax_stats,Rmin_stats,gas_mg, & + & gas_betaT,gas_betaC,bub_dphidt,gas_p,gas_mv, & + & intfc_rad,intfc_vel,mtn_pos,mtn_posPrev,mtn_vel, & + & mtn_s,intfc_draddt,intfc_dveldt,gas_dpdt,gas_dmvdt, & + & mtn_dposdt,mtn_dveldt,nBubs]') Rmax_glb = min(dflt_real, -dflt_real) Rmin_glb = max(dflt_real, -dflt_real) - !$acc update device(Rmax_glb, Rmin_glb) + $:GPU_UPDATE(device='[Rmax_glb, Rmin_glb]') - !$acc update device(dx, dy, dz, x_cb, x_cc, y_cb, y_cc, z_cb, z_cc) + $:GPU_UPDATE(device='[dx,dy,dz,x_cb,x_cc,y_cb,y_cc,z_cb,z_cc]') !Populate temporal variables call s_transfer_data_to_tmp() @@ -524,7 +529,7 @@ contains ! Subgrid p_inf model based on Maeda and Colonius (2018). if (lag_params%pressure_corrector) then ! Calculate velocity potentials (valid for one bubble per cell) - !$acc parallel loop gang vector default(present) private(k, cell) + $:GPU_PARALLEL_LOOP(private='[k,cell]') do k = 1, nBubs call s_get_pinf(k, q_prim_vf, 2, paux, cell, preterm1, term2, Romega) myR0 = bub_R0(k) @@ -545,8 +550,9 @@ contains ! Radial motion model adap_dt_stop_max = 0 - !$acc parallel loop gang vector default(present) private(k, myalpha_rho, myalpha, Re, cell) & - !$acc reduction(MAX:adap_dt_stop_max) copy(adap_dt_stop_max) copyin(stage) + $:GPU_PARALLEL_LOOP(private='[k,myalpha_rho,myalpha,Re,cell]', & + & reduction='[[adap_dt_stop_max]]',reductionOp='[MAX]', & + & copy='[adap_dt_stop_max]',copyin='[stage]') do k = 1, nBubs ! Keller-Miksis model @@ -569,7 +575,7 @@ contains call s_get_pinf(k, q_prim_vf, 1, myPinf, cell, aux1, aux2) ! Obtain liquid density and computing speed of sound from pinf - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, num_fluids myalpha_rho(i) = q_prim_vf(i)%sf(cell(1), cell(2), cell(3)) myalpha(i) = q_prim_vf(E_idx + i)%sf(cell(1), cell(2), cell(3)) @@ -614,7 +620,7 @@ contains if (adap_dt .and. adap_dt_stop_max > 0) call s_mpi_abort("Adaptive time stepping failed to converge.") ! 
Bubbles remain in a fixed position - !$acc parallel loop collapse(2) gang vector default(present) private(k) copyin(stage) + $:GPU_PARALLEL_LOOP(collapse=2, private='[k]', copyin='[stage]') do k = 1, nBubs do l = 1, 3 mtn_dposdt(k, l, stage) = 0._wp @@ -644,7 +650,7 @@ contains if (lag_params%solver_approach == 2) then if (p == 0) then - !$acc parallel loop collapse(4) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=4) do k = 0, p do j = 0, n do i = 0, m @@ -660,7 +666,7 @@ contains end do end do else - !$acc parallel loop collapse(4) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=4) do k = 0, p do j = 0, n do i = 0, m @@ -680,7 +686,7 @@ contains call s_gradient_dir(q_prim_vf(E_idx), q_beta%vf(3), l) - !$acc parallel loop collapse(3) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=3) do k = 0, p do j = 0, n do i = 0, m @@ -695,7 +701,7 @@ contains end do !source in energy - !$acc parallel loop collapse(3) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=3) do k = idwbuff(3)%beg, idwbuff(3)%end do j = idwbuff(2)%beg, idwbuff(2)%end do i = idwbuff(1)%beg, idwbuff(1)%end @@ -706,7 +712,7 @@ contains call s_gradient_dir(q_beta%vf(3), q_beta%vf(4), l) - !$acc parallel loop collapse(3) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=3) do k = 0, p do j = 0, n do i = 0, m @@ -734,11 +740,9 @@ contains !! @param pi_inf Liquid stiffness !! @param cson Calculated speed of sound pure subroutine s_compute_cson_from_pinf(q_prim_vf, pinf, cell, rhol, gamma, pi_inf, cson) -#ifdef _CRAYFTN - !DIR$ INLINEALWAYS s_compute_cson_from_pinf -#else - !$acc routine seq -#endif + $:GPU_ROUTINE(function_name='s_compute_cson_from_pinf', & + & parallelism='[seq]', cray_inline=True) + type(scalar_field), dimension(sys_size), intent(in) :: q_prim_vf real(wp), intent(in) :: pinf, rhol, gamma, pi_inf integer, dimension(3), intent(in) :: cell @@ -748,7 +752,7 @@ contains real(wp), dimension(num_dims) :: vel integer :: i - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, num_dims vel(i) = q_prim_vf(i + contxe)%sf(cell(1), cell(2), cell(3)) end do @@ -765,7 +769,7 @@ contains call nvtxStartRange("BUBBLES-LAGRANGE-KERNELS") - !$acc parallel loop collapse(4) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=4) do i = 1, q_beta_idx do l = idwbuff(3)%beg, idwbuff(3)%end do k = idwbuff(2)%beg, idwbuff(2)%end @@ -780,7 +784,7 @@ contains mtn_s, mtn_pos, q_beta) !Store 1-beta - !$acc parallel loop collapse(3) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=3) do l = idwbuff(3)%beg, idwbuff(3)%end do k = idwbuff(2)%beg, idwbuff(2)%end do j = idwbuff(1)%beg, idwbuff(1)%end @@ -804,11 +808,9 @@ contains !! @param cell Bubble cell !! 
@param Romega Control volume radius pure subroutine s_get_pinf(bub_id, q_prim_vf, ptype, f_pinfl, cell, preterm1, term2, Romega) -#ifdef _CRAYFTN - !DIR$ INLINEALWAYS s_get_pinf -#else - !$acc routine seq -#endif + $:GPU_ROUTINE(function_name='s_get_pinf',parallelism='[seq]', & + & cray_inline=True) + integer, intent(in) :: bub_id, ptype type(scalar_field), dimension(sys_size), intent(in) :: q_prim_vf real(wp), intent(out) :: f_pinfl @@ -829,7 +831,7 @@ contains !< Find current bubble cell cell(:) = int(scoord(:)) - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, num_dims if (scoord(i) < 0._wp) cell(i) = cell(i) - 1 end do @@ -920,11 +922,11 @@ contains charpres2 = 0._wp vol = 0._wp - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, smearGrid - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do j = 1, smearGrid - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do k = 1, smearGridz cellaux(1) = cell(1) + i - (mapCells + 1) cellaux(2) = cell(2) + j - (mapCells + 1) @@ -1023,7 +1025,7 @@ contains integer :: k if (time_stepper == 1) then ! 1st order TVD RK - !$acc parallel loop gang vector default(present) private(k) + $:GPU_PARALLEL_LOOP(private='[k]') do k = 1, nBubs !u{1} = u{n} + dt * RHS{n} intfc_rad(k, 1) = intfc_rad(k, 1) + dt*intfc_draddt(k, 1) @@ -1039,13 +1041,13 @@ contains if (lag_params%write_bubbles_stats) call s_calculate_lag_bubble_stats() if (lag_params%write_bubbles) then - !$acc update host(gas_p, gas_mv, intfc_rad, intfc_vel) + $:GPU_UPDATE(host='[gas_p,gas_mv,intfc_rad,intfc_vel]') call s_write_lag_particles(mytime) end if elseif (time_stepper == 2) then ! 2nd order TVD RK if (stage == 1) then - !$acc parallel loop gang vector default(present) private(k) + $:GPU_PARALLEL_LOOP(private='[k]') do k = 1, nBubs !u{1} = u{n} + dt * RHS{n} intfc_rad(k, 2) = intfc_rad(k, 1) + dt*intfc_draddt(k, 1) @@ -1057,7 +1059,7 @@ contains end do elseif (stage == 2) then - !$acc parallel loop gang vector default(present) private(k) + $:GPU_PARALLEL_LOOP(private='[k]') do k = 1, nBubs !u{1} = u{n} + (1/2) * dt * (RHS{n} + RHS{1}) intfc_rad(k, 1) = intfc_rad(k, 1) + dt*(intfc_draddt(k, 1) + intfc_draddt(k, 2))/2._wp @@ -1073,7 +1075,7 @@ contains if (lag_params%write_bubbles_stats) call s_calculate_lag_bubble_stats() if (lag_params%write_bubbles) then - !$acc update host(gas_p, gas_mv, intfc_rad, intfc_vel) + $:GPU_UPDATE(host='[gas_p,gas_mv,intfc_rad,intfc_vel]') call s_write_lag_particles(mytime) end if @@ -1081,7 +1083,7 @@ contains elseif (time_stepper == 3) then ! 
3rd order TVD RK if (stage == 1) then - !$acc parallel loop gang vector default(present) private(k) + $:GPU_PARALLEL_LOOP(private='[k]') do k = 1, nBubs !u{1} = u{n} + dt * RHS{n} intfc_rad(k, 2) = intfc_rad(k, 1) + dt*intfc_draddt(k, 1) @@ -1093,7 +1095,7 @@ contains end do elseif (stage == 2) then - !$acc parallel loop gang vector default(present) private(k) + $:GPU_PARALLEL_LOOP(private='[k]') do k = 1, nBubs !u{2} = u{n} + (1/4) * dt * [RHS{n} + RHS{1}] intfc_rad(k, 2) = intfc_rad(k, 1) + dt*(intfc_draddt(k, 1) + intfc_draddt(k, 2))/4._wp @@ -1104,7 +1106,7 @@ contains gas_mv(k, 2) = gas_mv(k, 1) + dt*(gas_dmvdt(k, 1) + gas_dmvdt(k, 2))/4._wp end do elseif (stage == 3) then - !$acc parallel loop gang vector default(present) private(k) + $:GPU_PARALLEL_LOOP(private='[k]') do k = 1, nBubs !u{n+1} = u{n} + (2/3) * dt * [(1/4)* RHS{n} + (1/4)* RHS{1} + RHS{2}] intfc_rad(k, 1) = intfc_rad(k, 1) + (2._wp/3._wp)*dt*(intfc_draddt(k, 1)/4._wp + intfc_draddt(k, 2)/4._wp + intfc_draddt(k, 3)) @@ -1120,7 +1122,7 @@ contains if (lag_params%write_bubbles_stats) call s_calculate_lag_bubble_stats() if (lag_params%write_bubbles) then - !$acc update host(gas_p, gas_mv, intfc_rad, intfc_vel) + $:GPU_UPDATE(host='[gas_p,gas_mv,intfc_rad,intfc_vel]') call s_write_lag_particles(mytime) end if @@ -1190,7 +1192,7 @@ contains integer :: k - !$acc parallel loop gang vector default(present) private(k) + $:GPU_PARALLEL_LOOP(private='[k]') do k = 1, nBubs gas_p(k, 2) = gas_p(k, 1) gas_mv(k, 2) = gas_mv(k, 1) @@ -1289,7 +1291,7 @@ contains if (dir == 1) then ! Gradient in x dir. - !$acc parallel loop collapse(3) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=3) do k = 0, p do j = 0, n do i = 0, m @@ -1304,7 +1306,7 @@ contains else if (dir == 2) then ! Gradient in y dir. - !$acc parallel loop collapse(3) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=3) do k = 0, p do j = 0, n do i = 0, m @@ -1318,7 +1320,7 @@ contains end do else ! Gradient in z dir. 
- !$acc parallel loop collapse(3) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=3) do k = 0, p do j = 0, n do i = 0, m @@ -1412,8 +1414,9 @@ contains lag_void_max = 0._wp lag_void_avg = 0._wp lag_vol = 0._wp - !$acc parallel loop collapse(3) gang vector default(present) reduction(+:lag_vol,lag_void_avg) & - !$acc reduction(MAX:lag_void_max) copy(lag_vol, lag_void_avg, lag_void_max) + $:GPU_PARALLEL_LOOP(collapse=3, reduction='[[lag_vol, lag_void_avg], & + & [lag_void_max]]', reductionOp='[+, MAX]', & + & copy='[lag_vol, lag_void_avg, lag_void_max]') do k = 0, p do j = 0, n do i = 0, m @@ -1597,8 +1600,8 @@ contains integer :: k - !$acc parallel loop gang vector default(present) reduction(MAX:Rmax_glb) & - !$acc reduction(MIN: Rmin_glb) copy(Rmax_glb, Rmin_glb) + $:GPU_PARALLEL_LOOP(reduction='[[Rmax_glb], [Rmin_glb]]', & + & reductionOp='[MAX, MIN]', copy='[Rmax_glb,Rmin_glb]') do k = 1, nBubs Rmax_glb = max(Rmax_glb, intfc_rad(k, 1)/bub_R0(k)) Rmin_glb = min(Rmin_glb, intfc_rad(k, 1)/bub_R0(k)) @@ -1617,7 +1620,7 @@ contains write (file_loc, '(A,I0,A)') 'stats_lag_bubbles_', proc_rank, '.dat' file_loc = trim(case_dir)//'/D/'//trim(file_loc) - !$acc update host(Rmax_glb, Rmin_glb) + $:GPU_UPDATE(host='[Rmax_glb,Rmin_glb]') open (13, FILE=trim(file_loc), FORM='formatted', position='rewind') write (13, *) 'proc_rank, particleID, x, y, z, Rmax_glb, Rmin_glb' @@ -1645,7 +1648,7 @@ contains integer :: i - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = bub_id, nBubs - 1 lag_id(i, 1) = lag_id(i + 1, 1) bub_R0(i) = bub_R0(i + 1) @@ -1670,7 +1673,7 @@ contains end do nBubs = nBubs - 1 - !$acc update device(nBubs) + $:GPU_UPDATE(device='[nBubs]') end subroutine s_remove_lag_bubble diff --git a/src/simulation/m_bubbles_EL_kernels.fpp b/src/simulation/m_bubbles_EL_kernels.fpp index 47566eac19..48ea3bad9a 100644 --- a/src/simulation/m_bubbles_EL_kernels.fpp +++ b/src/simulation/m_bubbles_EL_kernels.fpp @@ -55,7 +55,7 @@ contains real(wp), dimension(3) :: s_coord integer :: l - !$acc parallel loop gang vector default(present) private(l, s_coord, cell) + $:GPU_PARALLEL_LOOP(private='[l,s_coord,cell]') do l = 1, nBubs volpart = 4._wp/3._wp*pi*lbk_rad(l, 2)**3._wp @@ -74,19 +74,19 @@ contains !Update void fraction field addFun1 = strength_vol/Vol - !$acc atomic update + $:GPU_ATOMIC(atomic='update') updatedvar%vf(1)%sf(cell(1), cell(2), cell(3)) = updatedvar%vf(1)%sf(cell(1), cell(2), cell(3)) + addFun1 !Update time derivative of void fraction addFun2 = strength_vel/Vol - !$acc atomic update + $:GPU_ATOMIC(atomic='update') updatedvar%vf(2)%sf(cell(1), cell(2), cell(3)) = updatedvar%vf(2)%sf(cell(1), cell(2), cell(3)) + addFun2 !Product of two smeared functions !Update void fraction * time derivative of void fraction if (lag_params%cluster_type >= 4) then addFun3 = (strength_vol*strength_vel)/Vol - !$acc atomic update + $:GPU_ATOMIC(atomic='update') updatedvar%vf(5)%sf(cell(1), cell(2), cell(3)) = updatedvar%vf(5)%sf(cell(1), cell(2), cell(3)) + addFun3 end if end do @@ -120,7 +120,7 @@ contains smearGridz = smearGrid if (p == 0) smearGridz = 1 - !$acc parallel loop gang vector default(present) private(nodecoord, l, s_coord, cell, center) copyin(smearGrid, smearGridz) + $:GPU_PARALLEL_LOOP(private='[nodecoord,l,s_coord,cell,center]', copyin='[smearGrid,smearGridz]') do l = 1, nBubs nodecoord(1:3) = 0 center(1:3) = 0._wp @@ -134,7 +134,7 @@ contains strength_vol = volpart strength_vel = 4._wp*pi*lbk_rad(l, 2)**2._wp*lbk_vel(l, 2) - !$acc loop collapse(3) private(cellaux, nodecoord) + 
$:GPU_LOOP(collapse=3,private='[cellaux,nodecoord]') do i = 1, smearGrid do j = 1, smearGrid do k = 1, smearGridz @@ -170,14 +170,14 @@ contains !Update void fraction field addFun1 = func*strength_vol - !$acc atomic update + $:GPU_ATOMIC(atomic='update') updatedvar%vf(1)%sf(cellaux(1), cellaux(2), cellaux(3)) = & updatedvar%vf(1)%sf(cellaux(1), cellaux(2), cellaux(3)) & + addFun1 !Update time derivative of void fraction addFun2 = func*strength_vel - !$acc atomic update + $:GPU_ATOMIC(atomic='update') updatedvar%vf(2)%sf(cellaux(1), cellaux(2), cellaux(3)) = & updatedvar%vf(2)%sf(cellaux(1), cellaux(2), cellaux(3)) & + addFun2 @@ -186,7 +186,7 @@ contains !Update void fraction * time derivative of void fraction if (lag_params%cluster_type >= 4) then addFun3 = func2*strength_vol*strength_vel - !$acc atomic update + $:GPU_ATOMIC(atomic='update') updatedvar%vf(5)%sf(cellaux(1), cellaux(2), cellaux(3)) = & updatedvar%vf(5)%sf(cellaux(1), cellaux(2), cellaux(3)) & + addFun3 @@ -200,11 +200,9 @@ contains !> The purpose of this subroutine is to apply the gaussian kernel function for each bubble (Maeda and Colonius, 2018)). pure subroutine s_applygaussian(center, cellaux, nodecoord, stddsv, strength_idx, func) -#ifdef _CRAYFTN - !DIR$ INLINEALWAYS s_applygaussian -#else - !$acc routine seq -#endif + $:GPU_ROUTINE(function_name='s_applygaussian',parallelism='[seq]', & + & cray_inline=True) + real(wp), dimension(3), intent(in) :: center integer, dimension(3), intent(in) :: cellaux real(wp), dimension(3), intent(in) :: nodecoord @@ -270,11 +268,9 @@ contains !! @param cellaux Tested cell to smear the bubble effect in. !! @param celloutside If true, then cellaux is outside the computational domain. pure subroutine s_check_celloutside(cellaux, celloutside) -#ifdef _CRAYFTN - !DIR$ INLINEALWAYS s_check_celloutside -#else - !$acc routine seq -#endif + $:GPU_ROUTINE(function_name='s_check_celloutside',parallelism='[seq]', & + & cray_inline=True) + integer, dimension(3), intent(inout) :: cellaux logical, intent(out) :: celloutside @@ -306,11 +302,9 @@ contains !! @param cell Cell of the current bubble !! @param cellaux Cell to map the bubble effect in. pure subroutine s_shift_cell_symmetric_bc(cellaux, cell) -#ifdef _CRAYFTN - !DIR$ INLINEALWAYS s_shift_cell_symmetric_bc -#else - !$acc routine seq -#endif + $:GPU_ROUTINE(function_name='s_shift_cell_symmetric_bc', & + & parallelism='[seq]', cray_inline=True) + integer, dimension(3), intent(inout) :: cellaux integer, dimension(3), intent(in) :: cell @@ -347,11 +341,9 @@ contains !! @param volpart Volume of the bubble !! @param stddsv Standard deviaton pure subroutine s_compute_stddsv(cell, volpart, stddsv) -#ifdef _CRAYFTN - !DIR$ INLINEALWAYS s_compute_stddsv -#else - !$acc routine seq -#endif + $:GPU_ROUTINE(function_name='s_compute_stddsv',parallelism='[seq]', & + & cray_inline=True) + integer, dimension(3), intent(in) :: cell real(wp), intent(in) :: volpart real(wp), intent(out) :: stddsv @@ -388,11 +380,9 @@ contains !! @param cell Computational coordinates (x, y, z) !! @param Charvol Characteristic volume pure elemental subroutine s_get_char_vol(cellx, celly, cellz, Charvol) -#ifdef _CRAYFTN - !DIR$ INLINEALWAYS s_get_char_vol -#else - !$acc routine seq -#endif + $:GPU_ROUTINE(function_name='s_get_char_vol',parallelism='[seq]', & + & cray_inline=True) + integer, intent(in) :: cellx, celly, cellz real(wp), intent(out) :: Charvol @@ -413,11 +403,9 @@ contains !! @param s Computational coordinates of the bubble, real type !! 
@param get_cell Computational coordinates of the bubble, integer type pure subroutine s_get_cell(s_cell, get_cell) -#ifdef _CRAYFTN - !DIR$ INLINEALWAYS s_get_cell -#else - !$acc routine seq -#endif + $:GPU_ROUTINE(function_name='s_get_cell',parallelism='[seq]', & + & cray_inline=True) + real(wp), dimension(3), intent(in) :: s_cell integer, dimension(3), intent(out) :: get_cell integer :: i diff --git a/src/simulation/m_cbc.fpp b/src/simulation/m_cbc.fpp index 6fb5438a2f..d655f1bbf6 100644 --- a/src/simulation/m_cbc.fpp +++ b/src/simulation/m_cbc.fpp @@ -48,6 +48,7 @@ module m_cbc real(wp), allocatable, dimension(:, :, :, :) :: q_prim_rsx_vf real(wp), allocatable, dimension(:, :, :, :) :: q_prim_rsy_vf real(wp), allocatable, dimension(:, :, :, :) :: q_prim_rsz_vf + $:GPU_DECLARE(create='[q_prim_rsx_vf,q_prim_rsy_vf,q_prim_rsz_vf]') !! Cell-average fluxes (src - source). These are directly determined from the !! cell-average primitive variables, q_prims_rs_vf, and not a Riemann solver. @@ -55,6 +56,7 @@ module m_cbc real(wp), allocatable, dimension(:, :, :, :) :: F_rsx_vf, F_src_rsx_vf !< real(wp), allocatable, dimension(:, :, :, :) :: F_rsy_vf, F_src_rsy_vf !< real(wp), allocatable, dimension(:, :, :, :) :: F_rsz_vf, F_src_rsz_vf !< + $:GPU_DECLARE(create='[F_rsx_vf,F_src_rsx_vf,F_rsy_vf,F_src_rsy_vf,F_rsz_vf,F_src_rsz_vf]') !! There is a CCE bug that is causing some subset of these variables to interfere !! with variables of the same name in m_riemann_solvers.fpp, and giving this versions @@ -65,9 +67,10 @@ module m_cbc real(wp), allocatable, dimension(:, :, :, :) :: flux_rsx_vf_l, flux_src_rsx_vf_l !< real(wp), allocatable, dimension(:, :, :, :) :: flux_rsy_vf_l, flux_src_rsy_vf_l real(wp), allocatable, dimension(:, :, :, :) :: flux_rsz_vf_l, flux_src_rsz_vf_l + $:GPU_DECLARE(create='[flux_rsx_vf_l,flux_src_rsx_vf_l,flux_rsy_vf_l,flux_src_rsy_vf_l,flux_rsz_vf_l,flux_src_rsz_vf_l]') real(wp) :: dpres_ds !< Spatial derivatives in s-dir of pressure - !$acc declare create(dpres_ds) + $:GPU_DECLARE(create='[dpres_ds]') real(wp), allocatable, dimension(:) :: ds !< Cell-width distribution in the s-direction @@ -87,18 +90,21 @@ module m_cbc real(wp), allocatable, dimension(:, :, :) :: pi_coef_y !< Polynomial interpolant coefficients in y-dir real(wp), allocatable, dimension(:, :, :) :: pi_coef_z !< Polynomial interpolant coefficients in z-dir + $:GPU_DECLARE(create='[ds,fd_coef_x,fd_coef_y,fd_coef_z,pi_coef_x,pi_coef_y,pi_coef_z]') + !! The first dimension of the array identifies the polynomial, the !! second dimension identifies the position of its coefficients and the last !! dimension denotes the location of the CBC. type(int_bounds_info) :: is1, is2, is3 !< Indical bounds in the s1-, s2- and s3-directions - !$acc declare create(is1, is2, is3) + $:GPU_DECLARE(create='[is1,is2,is3]') integer :: dj integer :: bcxb, bcxe, bcyb, bcye, bczb, bcze integer :: cbc_dir, cbc_loc integer :: flux_cbc_index - !$acc declare create(dj, bcxb, bcxe, bcyb, bcye, bczb, bcze, cbc_dir, cbc_loc, flux_cbc_index) + $:GPU_DECLARE(create='[dj,bcxb,bcxe,bcyb,bcye,bczb,bcze]') + $:GPU_DECLARE(create='[cbc_dir, cbc_loc,flux_cbc_index]') !! GRCBC inputs for subsonic inflow and outflow conditions consisting of !! 
inflow velocities, pressure, density and void fraction as well as @@ -107,14 +113,9 @@ module m_cbc real(wp), allocatable, dimension(:) :: pres_in, pres_out, Del_in, Del_out real(wp), allocatable, dimension(:, :) :: vel_in, vel_out real(wp), allocatable, dimension(:, :) :: alpha_rho_in, alpha_in - !$acc declare create(pres_in, pres_out, Del_in, Del_out) - !$acc declare create(vel_in, vel_out) - !$acc declare create(alpha_rho_in, alpha_in) - - !$acc declare create(q_prim_rsx_vf, q_prim_rsy_vf, q_prim_rsz_vf, F_rsx_vf, F_src_rsx_vf,flux_rsx_vf_l, flux_src_rsx_vf_l, & - !$acc F_rsy_vf, F_src_rsy_vf,flux_rsy_vf_l, flux_src_rsy_vf_l, F_rsz_vf, F_src_rsz_vf,flux_rsz_vf_l, flux_src_rsz_vf_l, & - !$acc ds,fd_coef_x,fd_coef_y,fd_coef_z, & - !$acc pi_coef_x,pi_coef_y,pi_coef_z) + $:GPU_DECLARE(create='[pres_in,pres_out,Del_in,Del_out]') + $:GPU_DECLARE(create='[vel_in,vel_out]') + $:GPU_DECLARE(create='[alpha_rho_in,alpha_in]') contains @@ -131,7 +132,7 @@ contains else flux_cbc_index = adv_idx%end end if - !$acc update device(flux_cbc_index) + $:GPU_UPDATE(device='[flux_cbc_index]') call s_any_cbc_boundaries(is_cbc) @@ -381,7 +382,8 @@ contains end if - !$acc update device(fd_coef_x, fd_coef_y, fd_coef_z, pi_coef_x, pi_coef_y, pi_coef_z) + $:GPU_UPDATE(device='[fd_coef_x,fd_coef_y,fd_coef_z, & + & pi_coef_x,pi_coef_y,pi_coef_z]') ! Associating the procedural pointer to the appropriate subroutine ! that will be utilized in the conversion to the mixture variables @@ -389,20 +391,20 @@ contains bcxb = bc_x%beg bcxe = bc_x%end - !$acc update device(bcxb, bcxe) + $:GPU_UPDATE(device='[bcxb, bcxe]') if (n > 0) then bcyb = bc_y%beg bcye = bc_y%end - !$acc update device(bcyb, bcye) + $:GPU_UPDATE(device='[bcyb, bcye]') end if if (p > 0) then bczb = bc_z%beg bcze = bc_z%end - !$acc update device(bczb, bcze) + $:GPU_UPDATE(device='[bczb, bcze]') end if ! Allocate GRCBC inputs @@ -434,7 +436,8 @@ contains end do end if #:endfor - !$acc update device(vel_in, vel_out, pres_in, pres_out, Del_in, Del_out, alpha_rho_in, alpha_in) + $:GPU_UPDATE(device='[vel_in,vel_out,pres_in,pres_out, & + & Del_in,Del_out,alpha_rho_in,alpha_in]') end subroutine s_initialize_cbc_module @@ -598,7 +601,7 @@ contains end if - !$acc update device(ds) + $:GPU_UPDATE(device='[ds]') end subroutine s_associate_cbc_coefficients_pointers @@ -674,7 +677,7 @@ contains cbc_dir = cbc_dir_norm cbc_loc = cbc_loc_norm - !$acc update device(cbc_dir, cbc_loc) + $:GPU_UPDATE(device='[cbc_dir, cbc_loc]') call s_initialize_cbc(q_prim_vf, flux_vf, flux_src_vf, & ix, iy, iz) @@ -692,7 +695,7 @@ contains F_src_rs${XYZ}$_vf, & is1, is2, is3, idwbuff(2)%beg, idwbuff(3)%beg) - !$acc parallel loop collapse(3) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=3) do i = 1, flux_cbc_index do r = is3%beg, is3%end do k = is2%beg, is2%end @@ -704,7 +707,7 @@ contains end do end do - !$acc parallel loop collapse(3) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=3) do i = advxb, advxe do r = is3%beg, is3%end do k = is2%beg, is2%end @@ -723,7 +726,7 @@ contains F_src_rs${XYZ}$_vf, & is1, is2, is3, idwbuff(2)%beg, idwbuff(3)%beg) - !$acc parallel loop collapse(4) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=4) do i = 1, flux_cbc_index do j = 0, 1 do r = is3%beg, is3%end @@ -743,7 +746,7 @@ contains end do end do - !$acc parallel loop collapse(4) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=4) do i = advxb, advxe do j = 0, 1 do r = is3%beg, is3%end @@ -766,30 +769,33 @@ contains end if ! 
FD2 or FD4 of RHS at j = 0 - !$acc parallel loop collapse(2) gang vector default(present) private(alpha_rho, vel, adv, mf, dvel_ds, dadv_ds, Re_cbc, dalpha_rho_ds,dvel_dt, dadv_dt, dalpha_rho_dt,L, lambda,Ys,dYs_dt,dYs_ds,h_k,Cp_i,Gamma_i,Xs) + $:GPU_PARALLEL_LOOP(collapse=2, private='[alpha_rho, vel, adv, & + & mf, dvel_ds, dadv_ds, Re_cbc, dalpha_rho_ds,dvel_dt, & + & dadv_dt, dalpha_rho_dt, L, lambda, Ys, dYs_dt, & + & dYs_ds, h_k, Cp_i, Gamma_i, Xs]') do r = is3%beg, is3%end do k = is2%beg, is2%end ! Transferring the Primitive Variables - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, contxe alpha_rho(i) = q_prim_rs${XYZ}$_vf(0, k, r, i) end do - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, num_dims vel(i) = q_prim_rs${XYZ}$_vf(0, k, r, contxe + i) end do vel_K_sum = 0._wp - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, num_dims vel_K_sum = vel_K_sum + vel(i)**2._wp end do pres = q_prim_rs${XYZ}$_vf(0, k, r, E_idx) - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, advxe - E_idx adv(i) = q_prim_rs${XYZ}$_vf(0, k, r, E_idx + i) end do @@ -800,13 +806,13 @@ contains call s_convert_species_to_mixture_variables_acc(rho, gamma, pi_inf, qv, adv, alpha_rho, Re_cbc) end if - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, contxe mf(i) = alpha_rho(i)/rho end do if (chemistry) then - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = chemxb, chemxe Ys(i - chemxb + 1) = q_prim_rs${XYZ}$_vf(0, k, r, i) end do @@ -839,39 +845,39 @@ contains ! First-Order Spatial Derivatives of Primitive Variables - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, contxe dalpha_rho_ds(i) = 0._wp end do - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, num_dims dvel_ds(i) = 0._wp end do dpres_ds = 0._wp - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, advxe - E_idx dadv_ds(i) = 0._wp end do if (chemistry) then - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, num_species dYs_ds(i) = 0._wp end do end if - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do j = 0, buff_size - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, contxe dalpha_rho_ds(i) = q_prim_rs${XYZ}$_vf(j, k, r, i)* & fd_coef_${XYZ}$ (j, cbc_loc) + & dalpha_rho_ds(i) end do - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, num_dims dvel_ds(i) = q_prim_rs${XYZ}$_vf(j, k, r, contxe + i)* & fd_coef_${XYZ}$ (j, cbc_loc) + & @@ -881,7 +887,7 @@ contains dpres_ds = q_prim_rs${XYZ}$_vf(j, k, r, E_idx)* & fd_coef_${XYZ}$ (j, cbc_loc) + & dpres_ds - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, advxe - E_idx dadv_ds(i) = q_prim_rs${XYZ}$_vf(j, k, r, E_idx + i)* & fd_coef_${XYZ}$ (j, cbc_loc) + & @@ -889,7 +895,7 @@ contains end do if (chemistry) then - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, num_species dYs_ds(i) = q_prim_rs${XYZ}$_vf(j, k, r, chemxb - 1 + i)* & fd_coef_${XYZ}$ (j, cbc_loc) + & @@ -916,7 +922,7 @@ contains call s_compute_nonreflecting_subsonic_inflow_L(lambda, L, rho, c, dpres_ds, dvel_ds) ! 
Add GRCBC for Subsonic Inflow if (bc_${XYZ}$%grcbc_in) then - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 2, momxb L(2) = c**3._wp*Ma*(alpha_rho(i - 1) - alpha_rho_in(i - 1, ${CBC_DIR}$))/Del_in(${CBC_DIR}$) - c*Ma*(pres - pres_in(${CBC_DIR}$))/Del_in(${CBC_DIR}$) end do @@ -926,7 +932,7 @@ contains L(momxb + 2) = c*Ma*(vel(dir_idx(3)) - vel_in(${CBC_DIR}$, dir_idx(3)))/Del_in(${CBC_DIR}$) end if end if - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = E_idx, advxe - 1 L(i) = c*Ma*(adv(i + 1 - E_idx) - alpha_in(i + 1 - E_idx, ${CBC_DIR}$))/Del_in(${CBC_DIR}$) end do @@ -966,13 +972,13 @@ contains dpres_dt = -5.e-1_wp*(L(advxe) + L(1)) end if - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, contxe dalpha_rho_dt(i) = & -(L(i + 1) - mf(i)*dpres_dt)/(c*c) end do - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, num_dims dvel_dt(dir_idx(i)) = dir_flg(dir_idx(i))* & (L(1) - L(advxe))/(2._wp*rho*c) + & @@ -981,13 +987,13 @@ contains end do vel_dv_dt_sum = 0._wp - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, num_dims vel_dv_dt_sum = vel_dv_dt_sum + vel(i)*dvel_dt(i) end do if (chemistry) then - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, num_species dYs_dt(i) = -1._wp*L(chemxb + i - 1) end do @@ -995,12 +1001,12 @@ contains ! The treatment of void fraction source is unclear if (cyl_coord .and. cbc_dir == 2 .and. cbc_loc == 1) then - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, advxe - E_idx dadv_dt(i) = -L(momxe + i) !+ adv(i) * vel(dir_idx(1))/y_cc(n) end do else - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, advxe - E_idx dadv_dt(i) = -L(momxe + i) end do @@ -1013,7 +1019,7 @@ contains dgamma_dt = dadv_dt(1) dpi_inf_dt = dadv_dt(2) else - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, num_fluids drho_dt = drho_dt + dalpha_rho_dt(i) dgamma_dt = dgamma_dt + dadv_dt(i)*gammas(i) @@ -1023,13 +1029,13 @@ contains end if ! flux_rs_vf_l and flux_src_rs_vf_l at j = -1/2 - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, contxe flux_rs${XYZ}$_vf_l(-1, k, r, i) = flux_rs${XYZ}$_vf_l(0, k, r, i) & + ds(0)*dalpha_rho_dt(i) end do - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = momxb, momxe flux_rs${XYZ}$_vf_l(-1, k, r, i) = flux_rs${XYZ}$_vf_l(0, k, r, i) & + ds(0)*(vel(i - contxe)*drho_dt & @@ -1040,14 +1046,14 @@ contains ! 
Evolution of LODI equation of energy for real gases adjusted to perfect gas, doi:10.1006/jcph.2002.6990 call get_species_enthalpies_rt(T, h_k) sum_Enthalpies = 0._wp - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, num_species h_k(i) = h_k(i)*gas_constant/molecular_weights(i)*T sum_Enthalpies = sum_Enthalpies + (rho*h_k(i) - pres*Mw/molecular_weights(i)*Cp/R_gas)*dYs_dt(i) end do flux_rs${XYZ}$_vf_l(-1, k, r, E_idx) = flux_rs${XYZ}$_vf_l(0, k, r, E_idx) & + ds(0)*((E/rho + pres/rho)*drho_dt + rho*vel_dv_dt_sum + Cp*T*L(2)/(c*c) + sum_Enthalpies) - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, num_species flux_rs${XYZ}$_vf_l(-1, k, r, i - 1 + chemxb) = flux_rs${XYZ}$_vf_l(0, k, r, chemxb + i - 1) & + ds(0)*(drho_dt*Ys(i) + rho*dYs_dt(i)) @@ -1063,12 +1069,12 @@ contains end if if (riemann_solver == 1) then - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = advxb, advxe flux_rs${XYZ}$_vf_l(-1, k, r, i) = 0._wp end do - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = advxb, advxe flux_src_rs${XYZ}$_vf_l(-1, k, r, i) = & 1._wp/max(abs(vel(dir_idx(1))), sgm_eps) & @@ -1081,13 +1087,13 @@ contains else - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = advxb, advxe flux_rs${XYZ}$_vf_l(-1, k, r, i) = flux_rs${XYZ}$_vf_l(0, k, r, i) + & ds(0)*dadv_dt(i - E_idx) end do - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = advxb, advxe flux_src_rs${XYZ}$_vf_l(-1, k, r, i) = flux_src_rs${XYZ}$_vf_l(0, k, r, i) end do @@ -1151,13 +1157,13 @@ contains end if dj = max(0, cbc_loc) - !$acc update device(is1, is2, is3, dj) - !$acc update device( dir_idx, dir_flg) + $:GPU_UPDATE(device='[is1,is2,is3,dj]') + $:GPU_UPDATE(device='[dir_idx,dir_flg]') ! Reshaping Inputted Data in x-direction if (cbc_dir == 1) then - !$acc parallel loop collapse(4) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=4) do i = 1, sys_size do r = is3%beg, is3%end do k = is2%beg, is2%end @@ -1169,7 +1175,7 @@ contains end do end do - !$acc parallel loop collapse(3) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=3) do r = is3%beg, is3%end do k = is2%beg, is2%end do j = 0, buff_size @@ -1180,7 +1186,7 @@ contains end do end do - !$acc parallel loop collapse(4) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=4) do i = 1, flux_cbc_index do r = is3%beg, is3%end do k = is2%beg, is2%end @@ -1193,7 +1199,7 @@ contains end do end do - !$acc parallel loop collapse(3) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=3) do r = is3%beg, is3%end do k = is2%beg, is2%end do j = -1, buff_size @@ -1204,7 +1210,7 @@ contains end do if (riemann_solver == 1) then - !$acc parallel loop collapse(4) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=4) do i = advxb, advxe do r = is3%beg, is3%end do k = is2%beg, is2%end @@ -1216,7 +1222,7 @@ contains end do end do else - !$acc parallel loop collapse(3) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=3) do r = is3%beg, is3%end do k = is2%beg, is2%end do j = -1, buff_size @@ -1233,7 +1239,7 @@ contains ! 
Reshaping Inputted Data in y-direction elseif (cbc_dir == 2) then - !$acc parallel loop collapse(4) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=4) do i = 1, sys_size do r = is3%beg, is3%end do k = is2%beg, is2%end @@ -1245,7 +1251,7 @@ contains end do end do - !$acc parallel loop collapse(3) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=3) do r = is3%beg, is3%end do k = is2%beg, is2%end do j = 0, buff_size @@ -1256,7 +1262,7 @@ contains end do end do - !$acc parallel loop collapse(4) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=4) do i = 1, flux_cbc_index do r = is3%beg, is3%end do k = is2%beg, is2%end @@ -1269,7 +1275,7 @@ contains end do end do - !$acc parallel loop collapse(3) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=3) do r = is3%beg, is3%end do k = is2%beg, is2%end do j = -1, buff_size @@ -1280,7 +1286,7 @@ contains end do if (riemann_solver == 1) then - !$acc parallel loop collapse(4) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=4) do i = advxb, advxe do r = is3%beg, is3%end do k = is2%beg, is2%end @@ -1292,7 +1298,7 @@ contains end do end do else - !$acc parallel loop collapse(3) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=3) do r = is3%beg, is3%end do k = is2%beg, is2%end do j = -1, buff_size @@ -1309,7 +1315,7 @@ contains ! Reshaping Inputted Data in z-direction else - !$acc parallel loop collapse(4) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=4) do i = 1, sys_size do r = is3%beg, is3%end do k = is2%beg, is2%end @@ -1321,7 +1327,7 @@ contains end do end do - !$acc parallel loop collapse(3) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=3) do r = is3%beg, is3%end do k = is2%beg, is2%end do j = 0, buff_size @@ -1332,7 +1338,7 @@ contains end do end do - !$acc parallel loop collapse(4) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=4) do i = 1, flux_cbc_index do r = is3%beg, is3%end do k = is2%beg, is2%end @@ -1345,7 +1351,7 @@ contains end do end do - !$acc parallel loop collapse(3) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=3) do r = is3%beg, is3%end do k = is2%beg, is2%end do j = -1, buff_size @@ -1356,7 +1362,7 @@ contains end do if (riemann_solver == 1) then - !$acc parallel loop collapse(4) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=4) do i = advxb, advxe do r = is3%beg, is3%end do k = is2%beg, is2%end @@ -1368,7 +1374,7 @@ contains end do end do else - !$acc parallel loop collapse(3) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=3) do r = is3%beg, is3%end do k = is2%beg, is2%end do j = -1, buff_size @@ -1402,12 +1408,12 @@ contains ! Determining the indicial shift based on CBC location dj = max(0, cbc_loc) - !$acc update device(dj) + $:GPU_UPDATE(device='[dj]') ! 
Reshaping Outputted Data in x-direction if (cbc_dir == 1) then - !$acc parallel loop collapse(4) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=4) do i = 1, flux_cbc_index do r = is3%beg, is3%end do k = is2%beg, is2%end @@ -1419,7 +1425,7 @@ contains end do end do end do - !$acc parallel loop collapse(3) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=3) do r = is3%beg, is3%end do k = is2%beg, is2%end do j = -1, buff_size @@ -1430,7 +1436,7 @@ contains end do if (riemann_solver == 1) then - !$acc parallel loop collapse(4) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=4) do i = advxb, advxe do r = is3%beg, is3%end do k = is2%beg, is2%end @@ -1442,7 +1448,7 @@ contains end do end do else - !$acc parallel loop collapse(3) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=3) do r = is3%beg, is3%end do k = is2%beg, is2%end do j = -1, buff_size @@ -1458,7 +1464,7 @@ contains ! Reshaping Outputted Data in y-direction elseif (cbc_dir == 2) then - !$acc parallel loop collapse(4) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=4) do i = 1, flux_cbc_index do r = is3%beg, is3%end do k = is2%beg, is2%end @@ -1471,7 +1477,7 @@ contains end do end do - !$acc parallel loop collapse(3) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=3) do r = is3%beg, is3%end do k = is2%beg, is2%end do j = -1, buff_size @@ -1482,7 +1488,7 @@ contains end do if (riemann_solver == 1) then - !$acc parallel loop collapse(4) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=4) do i = advxb, advxe do r = is3%beg, is3%end do k = is2%beg, is2%end @@ -1494,7 +1500,7 @@ contains end do end do else - !$acc parallel loop collapse(3) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=3) do r = is3%beg, is3%end do k = is2%beg, is2%end do j = -1, buff_size @@ -1511,7 +1517,7 @@ contains ! Reshaping Outputted Data in z-direction else - !$acc parallel loop collapse(4) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=4) do i = 1, flux_cbc_index do r = is3%beg, is3%end do k = is2%beg, is2%end @@ -1524,7 +1530,7 @@ contains end do end do - !$acc parallel loop collapse(3) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=3) do r = is3%beg, is3%end do k = is2%beg, is2%end do j = -1, buff_size @@ -1535,7 +1541,7 @@ contains end do if (riemann_solver == 1) then - !$acc parallel loop collapse(4) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=4) do i = advxb, advxe do r = is3%beg, is3%end do k = is2%beg, is2%end @@ -1547,7 +1553,7 @@ contains end do end do else - !$acc parallel loop collapse(3) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=3) do r = is3%beg, is3%end do k = is2%beg, is2%end do j = -1, buff_size diff --git a/src/simulation/m_compute_cbc.fpp b/src/simulation/m_compute_cbc.fpp index 022a06175d..694f6735b2 100644 --- a/src/simulation/m_compute_cbc.fpp +++ b/src/simulation/m_compute_cbc.fpp @@ -2,6 +2,8 @@ !! @file m_compute_cbc.f90 !! 
@brief CBC computation module +#:include 'macros.fpp' + module m_compute_cbc use m_global_parameters implicit none @@ -18,7 +20,7 @@ module m_compute_cbc contains !> Base L1 calculation pure function f_base_L1(lambda, rho, c, dpres_ds, dvel_ds) result(L1) - !$acc routine seq + $:GPU_ROUTINE(parallelism='[seq]') real(wp), dimension(3), intent(in) :: lambda real(wp), intent(in) :: rho, c, dpres_ds real(wp), dimension(num_dims), intent(in) :: dvel_ds @@ -28,7 +30,7 @@ contains !> Fill density L variables pure subroutine s_fill_density_L(L, lambda_factor, lambda2, c, mf, dalpha_rho_ds, dpres_ds) - !$acc routine seq + $:GPU_ROUTINE(parallelism='[seq]') real(wp), dimension(sys_size), intent(inout) :: L real(wp), intent(in) :: lambda_factor, lambda2, c real(wp), dimension(num_fluids), intent(in) :: mf, dalpha_rho_ds @@ -42,7 +44,7 @@ contains !> Fill velocity L variables pure subroutine s_fill_velocity_L(L, lambda_factor, lambda2, dvel_ds) - !$acc routine seq + $:GPU_ROUTINE(parallelism='[seq]') real(wp), dimension(sys_size), intent(inout) :: L real(wp), intent(in) :: lambda_factor, lambda2 real(wp), dimension(num_dims), intent(in) :: dvel_ds @@ -55,7 +57,7 @@ contains !> Fill advection L variables pure subroutine s_fill_advection_L(L, lambda_factor, lambda2, dadv_ds) - !$acc routine seq + $:GPU_ROUTINE(parallelism='[seq]') real(wp), dimension(sys_size), intent(inout) :: L real(wp), intent(in) :: lambda_factor, lambda2 real(wp), dimension(num_fluids), intent(in) :: dadv_ds @@ -68,7 +70,7 @@ contains !> Fill chemistry L variables pure subroutine s_fill_chemistry_L(L, lambda_factor, lambda2, dYs_ds) - !$acc routine seq + $:GPU_ROUTINE(parallelism='[seq]') real(wp), dimension(sys_size), intent(inout) :: L real(wp), intent(in) :: lambda_factor, lambda2 real(wp), dimension(num_species), intent(in) :: dYs_ds @@ -83,11 +85,9 @@ contains !> Slip wall CBC (Thompson 1990, pg. 451) pure subroutine s_compute_slip_wall_L(lambda, L, rho, c, dpres_ds, dvel_ds) -#ifdef _CRAYFTN - !DIR$ INLINEALWAYS s_compute_slip_wall_L -#else - !$acc routine seq -#endif + $:GPU_ROUTINE(function_name='s_compute_slip_wall_L',parallelism='[seq]', & + & cray_inline=True) + real(wp), dimension(3), intent(in) :: lambda real(wp), dimension(sys_size), intent(inout) :: L real(wp), intent(in) :: rho, c, dpres_ds @@ -101,11 +101,9 @@ contains !> Nonreflecting subsonic buffer CBC (Thompson 1987, pg. 13) pure subroutine s_compute_nonreflecting_subsonic_buffer_L(lambda, L, rho, c, mf, dalpha_rho_ds, dpres_ds, dvel_ds, dadv_ds, dYs_ds) -#ifdef _CRAYFTN - !DIR$ INLINEALWAYS s_compute_nonreflecting_subsonic_buffer_L -#else - !$acc routine seq -#endif + $:GPU_ROUTINE(function_name='s_compute_nonreflecting_subsonic_buffer_L', & + & parallelism='[seq]', cray_inline=True) + real(wp), dimension(3), intent(in) :: lambda real(wp), dimension(sys_size), intent(inout) :: L real(wp), intent(in) :: rho, c @@ -131,11 +129,9 @@ contains !> Nonreflecting subsonic inflow CBC (Thompson 1990, pg. 455) pure subroutine s_compute_nonreflecting_subsonic_inflow_L(lambda, L, rho, c, dpres_ds, dvel_ds) -#ifdef _CRAYFTN - !DIR$ INLINEALWAYS s_compute_nonreflecting_subsonic_inflow_L -#else - !$acc routine seq -#endif + $:GPU_ROUTINE(function_name='s_compute_nonreflecting_subsonic_inflow_L', & + & parallelism='[seq]', cray_inline=True) + real(wp), dimension(3), intent(in) :: lambda real(wp), dimension(sys_size), intent(inout) :: L real(wp), intent(in) :: rho, c, dpres_ds @@ -148,11 +144,9 @@ contains !> Nonreflecting subsonic outflow CBC (Thompson 1990, pg. 
454) pure subroutine s_compute_nonreflecting_subsonic_outflow_L(lambda, L, rho, c, mf, dalpha_rho_ds, dpres_ds, dvel_ds, dadv_ds, dYs_ds) -#ifdef _CRAYFTN - !DIR$ INLINEALWAYS s_compute_nonreflecting_subsonic_outflow_L -#else - !$acc routine seq -#endif + $:GPU_ROUTINE(function_name='s_compute_nonreflecting_subsonic_outflow_L', & + & parallelism='[seq]', cray_inline=True) + real(wp), dimension(3), intent(in) :: lambda real(wp), dimension(sys_size), intent(inout) :: L real(wp), intent(in) :: rho, c @@ -172,11 +166,9 @@ contains !> Force-free subsonic outflow CBC (Thompson 1990, pg. 454) pure subroutine s_compute_force_free_subsonic_outflow_L(lambda, L, rho, c, mf, dalpha_rho_ds, dpres_ds, dvel_ds, dadv_ds) -#ifdef _CRAYFTN - !DIR$ INLINEALWAYS s_compute_force_free_subsonic_outflow_L -#else - !$acc routine seq -#endif + $:GPU_ROUTINE(function_name='s_compute_force_free_subsonic_outflow_L', & + & parallelism='[seq]', cray_inline=True) + real(wp), dimension(3), intent(in) :: lambda real(wp), dimension(sys_size), intent(inout) :: L real(wp), intent(in) :: rho, c @@ -194,11 +186,9 @@ contains !> Constant pressure subsonic outflow CBC (Thompson 1990, pg. 455) pure subroutine s_compute_constant_pressure_subsonic_outflow_L(lambda, L, rho, c, mf, dalpha_rho_ds, dpres_ds, dvel_ds, dadv_ds) -#ifdef _CRAYFTN - !DIR$ INLINEALWAYS s_compute_constant_pressure_subsonic_outflow_L -#else - !$acc routine seq -#endif + $:GPU_ROUTINE(function_name='s_compute_constant_pressure_subsonic_outflow_L', & + & parallelism='[seq]', cray_inline=True) + real(wp), dimension(3), intent(in) :: lambda real(wp), dimension(sys_size), intent(inout) :: L real(wp), intent(in) :: rho, c @@ -216,11 +206,9 @@ contains !> Supersonic inflow CBC (Thompson 1990, pg. 453) pure subroutine s_compute_supersonic_inflow_L(L) -#ifdef _CRAYFTN - !DIR$ INLINEALWAYS s_compute_supersonic_inflow_L -#else - !$acc routine seq -#endif + $:GPU_ROUTINE(function_name='s_compute_supersonic_inflow_L', & + & parallelism='[seq]', cray_inline=True) + real(wp), dimension(sys_size), intent(inout) :: L L(1:advxe) = 0._wp if (chemistry) L(chemxb:chemxe) = 0._wp @@ -228,11 +216,9 @@ contains !> Supersonic outflow CBC (Thompson 1990, pg. 
453) pure subroutine s_compute_supersonic_outflow_L(lambda, L, rho, c, mf, dalpha_rho_ds, dpres_ds, dvel_ds, dadv_ds, dYs_ds) -#ifdef _CRAYFTN - !DIR$ INLINEALWAYS s_compute_supersonic_outflow_L -#else - !$acc routine seq -#endif + $:GPU_ROUTINE(function_name='s_compute_supersonic_outflow_L', & + & parallelism='[seq]', cray_inline=True) + real(wp), dimension(3), intent(in) :: lambda real(wp), dimension(sys_size), intent(inout) :: L real(wp), intent(in) :: rho, c diff --git a/src/simulation/m_data_output.fpp b/src/simulation/m_data_output.fpp index 552e52995b..f233bb5374 100644 --- a/src/simulation/m_data_output.fpp +++ b/src/simulation/m_data_output.fpp @@ -56,13 +56,14 @@ module m_data_output real(wp), allocatable, dimension(:, :, :) :: ccfl_sf !< CCFL stability criterion real(wp), allocatable, dimension(:, :, :) :: Rc_sf !< Rc stability criterion real(wp), public, allocatable, dimension(:, :) :: c_mass - !$acc declare create(icfl_sf, vcfl_sf, ccfl_sf, Rc_sf, c_mass) + $:GPU_DECLARE(create='[icfl_sf,vcfl_sf,ccfl_sf,Rc_sf,c_mass]') real(wp) :: icfl_max_loc, icfl_max_glb !< ICFL stability extrema on local and global grids real(wp) :: vcfl_max_loc, vcfl_max_glb !< VCFL stability extrema on local and global grids real(wp) :: ccfl_max_loc, ccfl_max_glb !< CCFL stability extrema on local and global grids real(wp) :: Rc_min_loc, Rc_min_glb !< Rc stability extrema on local and global grids - !$acc declare create(icfl_max_loc, icfl_max_glb, vcfl_max_loc, vcfl_max_glb, ccfl_max_loc, ccfl_max_glb, Rc_min_loc, Rc_min_glb) + $:GPU_DECLARE(create='[icfl_max_loc,icfl_max_glb,vcfl_max_loc,vcfl_max_glb]') + $:GPU_DECLARE(create='[ccfl_max_loc,ccfl_max_glb,Rc_min_loc,Rc_min_glb]') !> @name ICFL, VCFL, CCFL and Rc stability criteria extrema over all the time-steps !> @{ @@ -279,7 +280,7 @@ contains integer :: j, k, l ! Computing Stability Criteria at Current Time-step - !$acc parallel loop collapse(3) gang vector default(present) private(vel, alpha, Re) + $:GPU_PARALLEL_LOOP(collapse=3, private='[vel, alpha, Re]') do l = 0, p do k = 0, n do j = 0, m @@ -296,17 +297,16 @@ contains end do end do end do - !$acc end parallel loop ! end: Computing Stability Criteria at Current Time-step ! Determining local stability criteria extrema at current time-step #ifdef _CRAYFTN - !$acc update host(icfl_sf) + $:GPU_UPDATE(host='[icfl_sf]') if (viscous) then - !$acc update host(vcfl_sf, Rc_sf) + $:GPU_UPDATE(host='[vcfl_sf,Rc_sf]') end if icfl_max_loc = maxval(icfl_sf) @@ -316,15 +316,14 @@ contains Rc_min_loc = minval(Rc_sf) end if #else - !$acc kernels - icfl_max_loc = maxval(icfl_sf) - !$acc end kernels - + #:call GPU_PARALLEL() + icfl_max_loc = maxval(icfl_sf) + #:endcall GPU_PARALLEL if (viscous) then - !$acc kernels - vcfl_max_loc = maxval(vcfl_sf) - Rc_min_loc = minval(Rc_sf) - !$acc end kernels + #:call GPU_PARALLEL() + vcfl_max_loc = maxval(vcfl_sf) + Rc_min_loc = minval(Rc_sf) + #:endcall GPU_PARALLEL end if #endif @@ -527,7 +526,7 @@ contains if (prim_vars_wrt .or. (n == 0 .and. p == 0)) then call s_convert_conservative_to_primitive_variables(q_cons_vf, q_T_sf, q_prim_vf, idwint) do i = 1, sys_size - !$acc update host(q_prim_vf(i)%sf(:,:,:)) + $:GPU_UPDATE(host='[q_prim_vf(i)%sf(:,:,:)]') end do ! 
q_prim_vf(bubxb) stores the value of nb needed in riemann solvers, so replace with true primitive value (=1._wp) if (qbmm) then diff --git a/src/simulation/m_fftw.fpp b/src/simulation/m_fftw.fpp index 3c18a8c1fe..87f612a76b 100644 --- a/src/simulation/m_fftw.fpp +++ b/src/simulation/m_fftw.fpp @@ -46,12 +46,12 @@ module m_fftw !! Filtered complex data in Fourier space #if defined(MFC_OpenACC) - !$acc declare create(real_size, cmplx_size, x_size, batch_size, Nfq) + $:GPU_DECLARE(create='[real_size,cmplx_size,x_size,batch_size,Nfq]') real(dp), allocatable, target :: data_real_gpu(:) complex(dp), allocatable, target :: data_cmplx_gpu(:) complex(dp), allocatable, target :: data_fltr_cmplx_gpu(:) -!$acc declare create(data_real_gpu, data_cmplx_gpu, data_fltr_cmplx_gpu) + $:GPU_DECLARE(create='[data_real_gpu,data_cmplx_gpu,data_fltr_cmplx_gpu]') #if defined(__PGI) integer :: fwd_plan_gpu, bwd_plan_gpu @@ -89,8 +89,8 @@ contains gpu_fft_size(1) = real_size; iembed(1) = 0 oembed(1) = 0 - !$acc enter data copyin(real_size, cmplx_size, x_size, sys_size, batch_size, Nfq) - !$acc update device(real_size, cmplx_size, x_size, sys_size, batch_size) + $:GPU_ENTER_DATA(copyin='[real_size,cmplx_size,x_size,sys_size,batch_size,Nfq]') + $:GPU_UPDATE(device='[real_size,cmplx_size,x_size,sys_size,batch_size]') #else ! Allocate input and output DFT data sizes fftw_real_data = fftw_alloc_real(int(real_size, c_size_t)) @@ -139,7 +139,7 @@ contains if (bc_y%beg >= 0) return #if defined(MFC_OpenACC) - !$acc parallel loop collapse(3) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=3) do k = 1, sys_size do j = 0, m do l = 1, cmplx_size @@ -148,7 +148,7 @@ contains end do end do - !$acc parallel loop collapse(3) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=3) do k = 1, sys_size do j = 0, m do l = 0, p @@ -161,139 +161,139 @@ contains p_cmplx => data_cmplx_gpu p_fltr_cmplx => data_fltr_cmplx_gpu -!$acc data attach(p_real, p_cmplx, p_fltr_cmplx) -!$acc host_data use_device(p_real, p_cmplx, p_fltr_cmplx) + #:call GPU_DATA(attach='[p_real, p_cmplx, p_fltr_cmplx]') + #:call GPU_HOST_DATA(use_device='[p_real, p_cmplx, p_fltr_cmplx]') #if defined(__PGI) - ierr = cufftExecD2Z(fwd_plan_gpu, data_real_gpu, data_cmplx_gpu) + ierr = cufftExecD2Z(fwd_plan_gpu, data_real_gpu, data_cmplx_gpu) #else - ierr = hipfftExecD2Z(fwd_plan_gpu, c_loc(p_real), c_loc(p_cmplx)) - call hipCheck(hipDeviceSynchronize()) + ierr = hipfftExecD2Z(fwd_plan_gpu, c_loc(p_real), c_loc(p_cmplx)) + call hipCheck(hipDeviceSynchronize()) #endif - !$acc end host_data - Nfq = 3 - !$acc update device(Nfq) + #:endcall GPU_HOST_DATA + Nfq = 3 + $:GPU_UPDATE(device='[Nfq]') - !$acc parallel loop collapse(3) gang vector default(present) - do k = 1, sys_size - do j = 0, m - do l = 1, Nfq - data_fltr_cmplx_gpu(l + j*cmplx_size + (k - 1)*cmplx_size*x_size) = data_cmplx_gpu(l + j*cmplx_size + (k - 1)*cmplx_size*x_size) + $:GPU_PARALLEL_LOOP(collapse=3) + do k = 1, sys_size + do j = 0, m + do l = 1, Nfq + data_fltr_cmplx_gpu(l + j*cmplx_size + (k - 1)*cmplx_size*x_size) = data_cmplx_gpu(l + j*cmplx_size + (k - 1)*cmplx_size*x_size) + end do end do end do - end do -!$acc host_data use_device(p_real, p_fltr_cmplx) + #:call GPU_HOST_DATA(use_device='[p_real, p_fltr_cmplx]') #if defined(__PGI) - ierr = cufftExecZ2D(bwd_plan_gpu, data_fltr_cmplx_gpu, data_real_gpu) + ierr = cufftExecZ2D(bwd_plan_gpu, data_fltr_cmplx_gpu, data_real_gpu) #else - ierr = hipfftExecZ2D(bwd_plan_gpu, c_loc(p_fltr_cmplx), c_loc(p_real)) - call 
hipCheck(hipDeviceSynchronize()) + ierr = hipfftExecZ2D(bwd_plan_gpu, c_loc(p_fltr_cmplx), c_loc(p_real)) + call hipCheck(hipDeviceSynchronize()) #endif - !$acc end host_data + #:endcall GPU_HOST_DATA - !$acc parallel loop collapse(3) gang vector default(present) - do k = 1, sys_size - do j = 0, m - do l = 0, p - data_real_gpu(l + j*real_size + 1 + (k - 1)*real_size*x_size) = data_real_gpu(l + j*real_size + 1 + (k - 1)*real_size*x_size)/real(real_size, dp) - q_cons_vf(k)%sf(j, 0, l) = data_real_gpu(l + j*real_size + 1 + (k - 1)*real_size*x_size) + $:GPU_PARALLEL_LOOP(collapse=3) + do k = 1, sys_size + do j = 0, m + do l = 0, p + data_real_gpu(l + j*real_size + 1 + (k - 1)*real_size*x_size) = data_real_gpu(l + j*real_size + 1 + (k - 1)*real_size*x_size)/real(real_size, dp) + q_cons_vf(k)%sf(j, 0, l) = data_real_gpu(l + j*real_size + 1 + (k - 1)*real_size*x_size) + end do end do end do - end do - do i = 1, fourier_rings + do i = 1, fourier_rings - !$acc parallel loop collapse(3) gang vector default(present) - do k = 1, sys_size - do j = 0, m - do l = 1, cmplx_size - data_fltr_cmplx_gpu(l + j*cmplx_size + (k - 1)*cmplx_size*x_size) = (0_dp, 0_dp) + $:GPU_PARALLEL_LOOP(collapse=3) + do k = 1, sys_size + do j = 0, m + do l = 1, cmplx_size + data_fltr_cmplx_gpu(l + j*cmplx_size + (k - 1)*cmplx_size*x_size) = (0_dp, 0_dp) + end do end do end do - end do - !$acc parallel loop collapse(3) gang vector default(present) firstprivate(i) - do k = 1, sys_size - do j = 0, m - do l = 0, p - data_real_gpu(l + j*real_size + 1 + (k - 1)*real_size*x_size) = q_cons_vf(k)%sf(j, i, l) + $:GPU_PARALLEL_LOOP(collapse=3, firstprivate='[i]') + do k = 1, sys_size + do j = 0, m + do l = 0, p + data_real_gpu(l + j*real_size + 1 + (k - 1)*real_size*x_size) = q_cons_vf(k)%sf(j, i, l) + end do end do end do - end do -!$acc host_data use_device(p_real, p_cmplx) + #:call GPU_HOST_DATA(use_device='[p_real, p_cmplx]') #if defined(__PGI) - ierr = cufftExecD2Z(fwd_plan_gpu, data_real_gpu, data_cmplx_gpu) + ierr = cufftExecD2Z(fwd_plan_gpu, data_real_gpu, data_cmplx_gpu) #else - ierr = hipfftExecD2Z(fwd_plan_gpu, c_loc(p_real), c_loc(p_cmplx)) - call hipCheck(hipDeviceSynchronize()) + ierr = hipfftExecD2Z(fwd_plan_gpu, c_loc(p_real), c_loc(p_cmplx)) + call hipCheck(hipDeviceSynchronize()) #endif - !$acc end host_data + #:endcall GPU_HOST_DATA - Nfq = min(floor(2_dp*real(i, dp)*pi), cmplx_size) - !$acc update device(Nfq) + Nfq = min(floor(2_dp*real(i, dp)*pi), cmplx_size) + $:GPU_UPDATE(device='[Nfq]') - !$acc parallel loop collapse(3) gang vector default(present) - do k = 1, sys_size - do j = 0, m - do l = 1, Nfq - data_fltr_cmplx_gpu(l + j*cmplx_size + (k - 1)*cmplx_size*x_size) = data_cmplx_gpu(l + j*cmplx_size + (k - 1)*cmplx_size*x_size) + $:GPU_PARALLEL_LOOP(collapse=3) + do k = 1, sys_size + do j = 0, m + do l = 1, Nfq + data_fltr_cmplx_gpu(l + j*cmplx_size + (k - 1)*cmplx_size*x_size) = data_cmplx_gpu(l + j*cmplx_size + (k - 1)*cmplx_size*x_size) + end do end do end do - end do -!$acc host_data use_device(p_real, p_fltr_cmplx) + #:call GPU_HOST_DATA(use_device='[p_real, p_fltr_cmplx]') #if defined(__PGI) - ierr = cufftExecZ2D(bwd_plan_gpu, data_fltr_cmplx_gpu, data_real_gpu) + ierr = cufftExecZ2D(bwd_plan_gpu, data_fltr_cmplx_gpu, data_real_gpu) #else - ierr = hipfftExecZ2D(bwd_plan_gpu, c_loc(p_fltr_cmplx), c_loc(p_real)) - call hipCheck(hipDeviceSynchronize()) + ierr = hipfftExecZ2D(bwd_plan_gpu, c_loc(p_fltr_cmplx), c_loc(p_real)) + call hipCheck(hipDeviceSynchronize()) #endif - !$acc end host_data + #:endcall 
GPU_HOST_DATA - !$acc parallel loop collapse(3) gang vector default(present) firstprivate(i) - do k = 1, sys_size - do j = 0, m - do l = 0, p - data_real_gpu(l + j*real_size + 1 + (k - 1)*real_size*x_size) = data_real_gpu(l + j*real_size + 1 + (k - 1)*real_size*x_size)/real(real_size, dp) - q_cons_vf(k)%sf(j, i, l) = data_real_gpu(l + j*real_size + 1 + (k - 1)*real_size*x_size) + $:GPU_PARALLEL_LOOP(collapse=3, firstprivate='[i]') + do k = 1, sys_size + do j = 0, m + do l = 0, p + data_real_gpu(l + j*real_size + 1 + (k - 1)*real_size*x_size) = data_real_gpu(l + j*real_size + 1 + (k - 1)*real_size*x_size)/real(real_size, dp) + q_cons_vf(k)%sf(j, i, l) = data_real_gpu(l + j*real_size + 1 + (k - 1)*real_size*x_size) + end do end do end do - end do - end do - -#else - Nfq = 3 - do j = 0, m - do k = 1, sys_size - data_fltr_cmplx(:) = (0_dp, 0_dp) - data_real(1:p + 1) = q_cons_vf(k)%sf(j, 0, 0:p) - call fftw_execute_dft_r2c(fwd_plan, data_real, data_cmplx) - data_fltr_cmplx(1:Nfq) = data_cmplx(1:Nfq) - call fftw_execute_dft_c2r(bwd_plan, data_fltr_cmplx, data_real) - data_real(:) = data_real(:)/real(real_size, dp) - q_cons_vf(k)%sf(j, 0, 0:p) = data_real(1:p + 1) end do - end do - ! Apply Fourier filter to additional rings - do i = 1, fourier_rings - Nfq = min(floor(2_dp*real(i, dp)*pi), cmplx_size) +#else + Nfq = 3 do j = 0, m do k = 1, sys_size data_fltr_cmplx(:) = (0_dp, 0_dp) - data_real(1:p + 1) = q_cons_vf(k)%sf(j, i, 0:p) + data_real(1:p + 1) = q_cons_vf(k)%sf(j, 0, 0:p) call fftw_execute_dft_r2c(fwd_plan, data_real, data_cmplx) data_fltr_cmplx(1:Nfq) = data_cmplx(1:Nfq) call fftw_execute_dft_c2r(bwd_plan, data_fltr_cmplx, data_real) data_real(:) = data_real(:)/real(real_size, dp) - q_cons_vf(k)%sf(j, i, 0:p) = data_real(1:p + 1) + q_cons_vf(k)%sf(j, 0, 0:p) = data_real(1:p + 1) + end do + end do + + ! Apply Fourier filter to additional rings + do i = 1, fourier_rings + Nfq = min(floor(2_dp*real(i, dp)*pi), cmplx_size) + do j = 0, m + do k = 1, sys_size + data_fltr_cmplx(:) = (0_dp, 0_dp) + data_real(1:p + 1) = q_cons_vf(k)%sf(j, i, 0:p) + call fftw_execute_dft_r2c(fwd_plan, data_real, data_cmplx) + data_fltr_cmplx(1:Nfq) = data_cmplx(1:Nfq) + call fftw_execute_dft_c2r(bwd_plan, data_fltr_cmplx, data_real) + data_real(:) = data_real(:)/real(real_size, dp) + q_cons_vf(k)%sf(j, i, 0:p) = data_real(1:p + 1) + end do end do end do - end do #endif -!$acc end data + #:endcall GPU_DATA end subroutine s_apply_fourier_filter !> The purpose of this subroutine is to destroy the fftw plan diff --git a/src/simulation/m_global_parameters.fpp b/src/simulation/m_global_parameters.fpp index 9362f013ad..560f71d1a3 100644 --- a/src/simulation/m_global_parameters.fpp +++ b/src/simulation/m_global_parameters.fpp @@ -53,7 +53,7 @@ module m_global_parameters logical :: cyl_coord integer :: grid_geometry !> @} - !$acc declare create(cyl_coord, grid_geometry) + $:GPU_DECLARE(create='[cyl_coord,grid_geometry]') !> @name Cell-boundary (CB) locations in the x-, y- and z-directions, respectively !> @{ @@ -76,7 +76,7 @@ module m_global_parameters real(wp) :: dt !< Size of the time-step - !$acc declare create(x_cb, y_cb, z_cb, x_cc, y_cc, z_cc, dx, dy, dz, dt, m, n, p) + $:GPU_DECLARE(create='[x_cb,y_cb,z_cb,x_cc,y_cc,z_cc,dx,dy,dz,dt,m,n,p]') !> @name Starting time-step iteration, stopping time-step iteration and the number !! 
of time-step iterations between successive solution backups, respectively @@ -90,7 +90,7 @@ module m_global_parameters real(wp) :: t_stop, t_save, cfl_target integer :: n_start !> @} - !$acc declare create(cfl_target) + $:GPU_DECLARE(create='[cfl_target]') logical :: cfl_adap_dt, cfl_const_dt, cfl_dt @@ -158,7 +158,7 @@ module m_global_parameters logical :: bulk_stress !< Bulk stresses logical :: cont_damage !< Continuum damage modeling - !$acc declare create(chemistry) + $:GPU_DECLARE(create='[chemistry]') logical :: bodyForces logical :: bf_x, bf_y, bf_z !< body force toggle in three directions @@ -169,24 +169,27 @@ module m_global_parameters #:endfor #:endfor real(wp), dimension(3) :: accel_bf - !$acc declare create(accel_bf) + $:GPU_DECLARE(create='[accel_bf]') integer :: cpu_start, cpu_end, cpu_rate #:if not MFC_CASE_OPTIMIZATION - !$acc declare create(num_dims, num_vels, weno_polyn, weno_order, weno_num_stencils, num_fluids, wenojs, mapped_weno, wenoz, teno, wenoz_q, mhd, relativity) + $:GPU_DECLARE(create='[num_dims,num_vels,weno_polyn,weno_order]') + $:GPU_DECLARE(create='[weno_num_stencils,num_fluids,wenojs]') + $:GPU_DECLARE(create='[mapped_weno, wenoz,teno,wenoz_q,mhd,relativity]') #:endif - !$acc declare create(mpp_lim, model_eqns, mixture_err, alt_soundspeed, avg_state, mp_weno, weno_eps, teno_CT, hypoelasticity, hyperelasticity, hyper_model, elasticity, low_Mach, viscous, shear_stress, bulk_stress, cont_damage) + $:GPU_DECLARE(create='[mpp_lim,model_eqns,mixture_err,alt_soundspeed]') + $:GPU_DECLARE(create='[avg_state,mp_weno,weno_eps,teno_CT,hypoelasticity]') + $:GPU_DECLARE(create='[hyperelasticity,hyper_model,elasticity,low_Mach]') + $:GPU_DECLARE(create='[viscous,shear_stress,bulk_stress,cont_damage]') logical :: relax !< activate phase change integer :: relax_model !< Relaxation model real(wp) :: palpha_eps !< trigger parameter for the p relaxation procedure, phase change model real(wp) :: ptgalpha_eps !< trigger parameter for the pTg relaxation procedure, phase change model -!#ifndef _CRAYFTN -!$acc declare create(relax, relax_model, palpha_eps,ptgalpha_eps) -!#endif + $:GPU_DECLARE(create='[relax, relax_model, palpha_eps,ptgalpha_eps]') integer :: num_bc_patches logical :: bc_io @@ -194,6 +197,10 @@ module m_global_parameters !> @{ type(int_bounds_info) :: bc_x, bc_y, bc_z !> @} + $:GPU_DECLARE(create='[bc_x%vb1, bc_x%vb2, bc_x%vb3, bc_x%ve1, bc_x%ve2, bc_x%ve3]') + $:GPU_DECLARE(create='[bc_y%vb1, bc_y%vb2, bc_y%vb3, bc_y%ve1, bc_y%ve2, bc_y%ve3]') + $:GPU_DECLARE(create='[bc_z%vb1, bc_z%vb2, bc_z%vb3, bc_z%ve1, bc_z%ve2, bc_z%ve3]') + type(bounds_info) :: x_domain, y_domain, z_domain real(wp) :: x_a, y_a, z_a real(wp) :: x_b, y_b, z_b @@ -246,19 +253,20 @@ module m_global_parameters integer :: c_idx !< Index of color function integer :: damage_idx !< Index of damage state variable (D) for continuum damage model !> @} - - !$acc declare create(bub_idx) + $:GPU_DECLARE(create='[sys_size,E_idx,n_idx,bub_idx,alf_idx,gamma_idx]') + $:GPU_DECLARE(create='[pi_inf_idx,B_idx,stress_idx,xi_idx,b_size]') + $:GPU_DECLARE(create='[tensor_size,species_idx,c_idx]') ! Cell Indices for the (local) interior points (O-m, O-n, 0-p). ! Stands for "InDices With INTerior". type(int_bounds_info) :: idwint(1:3) - !$acc declare create(idwint) + $:GPU_DECLARE(create='[idwint]') ! Cell Indices for the entire (local) domain. In simulation and post_process, ! this includes the buffer region. idwbuff and idwint are the same otherwise. ! Stands for "InDices With BUFFer". 
type(int_bounds_info) :: idwbuff(1:3) - !$acc declare create(idwbuff) + $:GPU_DECLARE(create='[idwbuff]') !> @name The number of fluids, along with their identifying indexes, respectively, !! for which viscous effects, e.g. the shear and/or the volume Reynolds (Re) @@ -268,7 +276,7 @@ module m_global_parameters integer, allocatable, dimension(:, :) :: Re_idx !> @} - !$acc declare create(Re_size, Re_idx) + $:GPU_DECLARE(create='[Re_size,Re_idx]') ! The WENO average (WA) flag regulates whether the calculation of any cell- ! average spatial derivatives is carried out in each cell by utilizing the @@ -279,7 +287,7 @@ module m_global_parameters real(wp) :: wa_flg !> @{ - !$acc declare create(wa_flg) + $:GPU_DECLARE(create='[wa_flg]') !> @name The coordinate direction indexes and flags (flg), respectively, for which !! the configurations will be determined with respect to a working direction @@ -291,14 +299,14 @@ module m_global_parameters integer, dimension(3) :: dir_idx_tau !!used for hypoelasticity=true !> @} - !$acc declare create(dir_idx, dir_flg, dir_idx_tau) + $:GPU_DECLARE(create='[dir_idx,dir_flg,dir_idx_tau]') integer :: buff_size !< !! The number of cells that are necessary to be able to store enough boundary !! conditions data to march the solution in the physical computational domain !! to the next time-step. - !$acc declare create(sys_size, buff_size, E_idx, gamma_idx, pi_inf_idx, alf_idx, n_idx, stress_idx, b_size, tensor_size, xi_idx, species_idx, B_idx, c_idx) + $:GPU_DECLARE(create='[buff_size]') integer :: shear_num !! Number of shear stress components integer, dimension(3) :: shear_indices !< @@ -309,7 +317,7 @@ module m_global_parameters !! Indices of shear stress components to reflect for boundary conditions. !! Size: (1:3, 1:shear_BC_flip_num) for (x/y/z, [indices]) - !$acc declare create(shear_num, shear_indices, shear_BC_flip_num, shear_BC_flip_indices) + $:GPU_DECLARE(create='[shear_num,shear_indices,shear_BC_flip_num,shear_BC_flip_indices]') ! END: Simulation Algorithm Parameters @@ -320,10 +328,6 @@ module m_global_parameters !! in the flow. These include the stiffened gas equation of state parameters, !! the Reynolds numbers and the Weber numbers. - !$acc declare create(bc_x%vb1, bc_x%vb2, bc_x%vb3, bc_x%ve1, bc_x%ve2, bc_x%ve3) - !$acc declare create(bc_y%vb1, bc_y%vb2, bc_y%vb3, bc_y%ve1, bc_y%ve2, bc_y%ve3) - !$acc declare create(bc_z%vb1, bc_z%vb2, bc_z%vb3, bc_z%ve1, bc_z%ve2, bc_z%ve3) - integer :: fd_order !< !! The order of the finite-difference (fd) approximations of the first-order !! derivatives that need to be evaluated when the CoM or flow probe data @@ -333,7 +337,7 @@ module m_global_parameters !! The finite-difference number is given by MAX(1, fd_order/2). Essentially, !! it is a measure of the half-size of the finite-difference stencil for the !! selected order of accuracy. - !$acc declare create(fd_order,fd_number) + $:GPU_DECLARE(create='[fd_order,fd_number]') logical :: probe_wrt logical :: integral_wrt @@ -346,7 +350,7 @@ module m_global_parameters !> @{ real(wp) :: rhoref, pref !> @} - !$acc declare create(rhoref, pref) + $:GPU_DECLARE(create='[rhoref,pref]') !> @name Immersed Boundaries !> @{ @@ -361,7 +365,7 @@ module m_global_parameters !! the maximum allowable number of patches, num_patches_max, may be changed !! in the module m_derived_types.f90. 
- !$acc declare create(ib, num_ibs, patch_ib) + $:GPU_DECLARE(create='[ib,num_ibs,patch_ib]') !> @} !> @name Bubble modeling @@ -376,26 +380,31 @@ module m_global_parameters real(wp) :: Ca !< Cavitation number real(wp) :: Web !< Weber number real(wp) :: Re_inv !< Inverse Reynolds number + $:GPU_DECLARE(create='[R0ref,Ca,Web,Re_inv]') real(wp), dimension(:), allocatable :: weight !< Simpson quadrature weights real(wp), dimension(:), allocatable :: R0 !< Bubble sizes real(wp), dimension(:), allocatable :: V0 !< Bubble velocities - !$acc declare create(weight, R0, V0) + $:GPU_DECLARE(create='[weight,R0,V0]') logical :: bubbles_euler !< Bubbles euler on/off logical :: polytropic !< Polytropic switch logical :: polydisperse !< Polydisperse bubbles + $:GPU_DECLARE(create='[bubbles_euler,polytropic,polydisperse]') + logical :: adv_n !< Solve the number density equation and compute alpha from number density logical :: adap_dt !< Adaptive step size control real(wp) :: adap_dt_tol !< Tolerance to control adaptive step size + $:GPU_DECLARE(create='[adv_n,adap_dt,adap_dt_tol]') integer :: bubble_model !< Gilmore or Keller--Miksis bubble model integer :: thermal !< Thermal behavior. 1 = adiabatic, 2 = isotherm, 3 = transfer + $:GPU_DECLARE(create='[bubble_model,thermal]') real(wp), allocatable, dimension(:, :, :) :: ptil !< Pressure modification - !$acc declare create(ptil) real(wp) :: poly_sigma !< log normal sigma for polydisperse PDF + $:GPU_DECLARE(create='[ptil, poly_sigma]') logical :: qbmm !< Quadrature moment method integer, parameter :: nmom = 6 !< Number of carried moments per R0 location @@ -404,38 +413,39 @@ module m_global_parameters integer :: R0_type real(wp) :: pi_fac !< Factor for artificial pi_inf + $:GPU_DECLARE(create='[qbmm, nmomsp,nmomtot,R0_type,pi_fac]') #:if not MFC_CASE_OPTIMIZATION - !$acc declare create(nb) + $:GPU_DECLARE(create='[nb]') #:endif - !$acc declare create(R0ref, Ca, Web, Re_inv, bubbles_euler, polytropic, polydisperse, qbmm, nmomsp, nmomtot, R0_type, bubble_model, thermal, poly_sigma, adv_n, adap_dt, adap_dt_tol, pi_fac) - type(scalar_field), allocatable, dimension(:) :: mom_sp type(scalar_field), allocatable, dimension(:, :, :) :: mom_3d - !$acc declare create(mom_sp, mom_3d) + $:GPU_DECLARE(create='[mom_sp,mom_3d]') !> @} type(chemistry_parameters) :: chem_params - !$acc declare create(chem_params) + $:GPU_DECLARE(create='[chem_params]') !> @name Physical bubble parameters (see Ando 2010, Preston 2007) !> @{ real(wp) :: R_n, R_v, phi_vn, phi_nv, Pe_c, Tw, pv, M_n, M_v, k_vl, k_nl, cp_n, cp_v - !$acc declare create(R_n, R_v, phi_vn, phi_nv, Pe_c, Tw, pv, M_n, M_v, k_vl, k_nl, cp_n, cp_v) + $:GPU_DECLARE(create='[R_n,R_v,phi_vn,phi_nv,Pe_c,Tw]') + $:GPU_DECLARE(create='[pv,M_n, M_v,k_vl,k_nl,cp_n,cp_v]') real(wp), dimension(:), allocatable :: k_n, k_v, pb0, mass_n0, mass_v0, Pe_T real(wp), dimension(:), allocatable :: Re_trans_T, Re_trans_c, Im_trans_T, Im_trans_c, omegaN - !$acc declare create( k_n, k_v, pb0, mass_n0, mass_v0, Pe_T, Re_trans_T, Re_trans_c, Im_trans_T, Im_trans_c, omegaN) + $:GPU_DECLARE(create='[k_n,k_v,pb0,mass_n0,mass_v0,Pe_T]') + $:GPU_DECLARE(create='[Re_trans_T,Re_trans_c,Im_trans_T,Im_trans_c,omegaN]') real(wp) :: mul0, ss, gamma_v, mu_v real(wp) :: gamma_m, gamma_n, mu_n real(wp) :: gam !> @} - !$acc declare create(mul0, ss, gamma_v, mu_v, gamma_m, gamma_n, mu_n, gam) + $:GPU_DECLARE(create='[mul0,ss,gamma_v,mu_v,gamma_m,gamma_n,mu_n,gam]') !> @name Acoustic acoustic_source parameters !> @{ @@ -443,14 +453,14 @@ module 
m_global_parameters type(acoustic_parameters), dimension(num_probes_max) :: acoustic !< Acoustic source parameters integer :: num_source !< Number of acoustic sources !> @} - !$acc declare create(acoustic_source, acoustic, num_source) + $:GPU_DECLARE(create='[acoustic_source,acoustic,num_source]') !> @name Surface tension parameters !> @{ real(wp) :: sigma logical :: surface_tension - !$acc declare create(sigma, surface_tension) + $:GPU_DECLARE(create='[sigma,surface_tension]') !> @} integer :: momxb, momxe @@ -461,11 +471,13 @@ module m_global_parameters integer :: strxb, strxe integer :: chemxb, chemxe integer :: xibeg, xiend - !$acc declare create(momxb, momxe, advxb, advxe, contxb, contxe, intxb, intxe, bubxb, bubxe, strxb, strxe, chemxb, chemxe) - !$acc declare create(xibeg,xiend) + $:GPU_DECLARE(create='[momxb,momxe,advxb,advxe,contxb,contxe]') + $:GPU_DECLARE(create='[intxb,intxe, bubxb,bubxe]') + $:GPU_DECLARE(create='[strxb,strxe,chemxb,chemxe]') + $:GPU_DECLARE(create='[xibeg,xiend]') real(wp), allocatable, dimension(:) :: gammas, gs_min, pi_infs, ps_inf, cvs, qvs, qvps - !$acc declare create(gammas, gs_min, pi_infs, ps_inf, cvs, qvs, qvps) + $:GPU_DECLARE(create='[gammas,gs_min,pi_infs,ps_inf,cvs,qvs,qvps]') real(wp) :: mytime !< Current simulation time real(wp) :: finaltime !< Final simulation time @@ -476,25 +488,25 @@ module m_global_parameters type(pres_field), allocatable, dimension(:) :: mv_ts - !$acc declare create(pb_ts, mv_ts) + $:GPU_DECLARE(create='[pb_ts,mv_ts]') !> @name lagrangian subgrid bubble parameters !> @{! logical :: bubbles_lagrange !< Lagrangian subgrid bubble model switch type(bubbles_lagrange_parameters) :: lag_params !< Lagrange bubbles' parameters - !$acc declare create(bubbles_lagrange, lag_params) + $:GPU_DECLARE(create='[bubbles_lagrange,lag_params]') !> @} real(wp) :: Bx0 !< Constant magnetic field in the x-direction (1D) logical :: powell !< Powell‐correction for div B = 0 - !$acc declare create(Bx0, powell) + $:GPU_DECLARE(create='[Bx0,powell]') !> @name Continuum damage model parameters !> @{! real(wp) :: tau_star !< Stress threshold for continuum damage modeling real(wp) :: cont_damage_s !< Exponent s for continuum damage modeling real(wp) :: alpha_bar !< Damage rate factor for continuum damage modeling - !$acc declare create(tau_star, cont_damage_s, alpha_bar) + $:GPU_DECLARE(create='[tau_star,cont_damage_s,alpha_bar]') !> @} contains @@ -785,10 +797,10 @@ contains else weno_num_stencils = weno_polyn end if - !$acc update device(weno_polyn) - !$acc update device(weno_num_stencils) - !$acc update device(nb) - !$acc update device(num_dims, num_vels, num_fluids) + $:GPU_UPDATE(device='[weno_polyn]') + $:GPU_UPDATE(device='[weno_num_stencils]') + $:GPU_UPDATE(device='[nb]') + $:GPU_UPDATE(device='[num_dims,num_vels,num_fluids]') #:endif ! Initializing the number of fluids for which viscous effects will @@ -1024,7 +1036,7 @@ contains if (Re_size(1) > 0._wp) shear_stress = .true. if (Re_size(2) > 0._wp) bulk_stress = .true. - !$acc update device(Re_size, viscous, shear_stress, bulk_stress) + $:GPU_UPDATE(device='[Re_size,viscous,shear_stress,bulk_stress]') ! Bookkeeping the indexes of any viscous fluids and any pairs of ! fluids whose interface will support effects of surface tension @@ -1080,7 +1092,7 @@ contains ! y-dir: flip tau_xy and tau_yz ! 
z-dir: flip tau_xz and tau_yz end if - !$acc update device(shear_num, shear_indices, shear_BC_flip_num, shear_BC_flip_indices) + $:GPU_UPDATE(device='[shear_num,shear_indices,shear_BC_flip_num,shear_BC_flip_indices]') end if if (hyperelasticity) then @@ -1147,7 +1159,7 @@ contains ! cell-boundary values or otherwise, the unaltered left and right, ! WENO-reconstructed, cell-boundary values wa_flg = 0._wp; if (weno_avg) wa_flg = 1._wp - !$acc update device(wa_flg) + $:GPU_UPDATE(device='[wa_flg]') ! Resort to default WENO-JS if no other WENO scheme is selected #:if not MFC_CASE_OPTIMIZATION @@ -1157,7 +1169,7 @@ contains if (ib) allocate (MPI_IO_IB_DATA%var%sf(0:m, 0:n, 0:p)) Np = 0 - !$acc update device(Re_size) + $:GPU_UPDATE(device='[Re_size]') if (elasticity) then fd_number = max(1, fd_order/2) @@ -1175,7 +1187,7 @@ contains idwint, idwbuff, viscous, & bubbles_lagrange, m, n, p, & num_dims) - !$acc update device(idwint, idwbuff) + $:GPU_UPDATE(device='[idwint, idwbuff]') ! Configuring Coordinate Direction Indexes if (bubbles_euler) then @@ -1185,7 +1197,7 @@ contains & idwbuff(3)%beg:idwbuff(3)%end)) end if - !$acc update device(fd_order,fd_number) + $:GPU_UPDATE(device='[fd_order, fd_number]') if (cyl_coord .neqv. .true.) then ! Cartesian grid grid_geometry = 1 @@ -1212,30 +1224,44 @@ contains chemxb = species_idx%beg chemxe = species_idx%end - !$acc update device(momxb, momxe, advxb, advxe, contxb, contxe, bubxb, bubxe, intxb, intxe, sys_size, buff_size, E_idx, alf_idx, n_idx, adv_n, adap_dt, pi_fac, strxb, strxe, chemxb, chemxe, c_idx) - !$acc update device(b_size, xibeg, xiend, tensor_size) + $:GPU_UPDATE(device='[momxb,momxe,advxb,advxe,contxb,contxe, & + & bubxb,bubxe,intxb,intxe,sys_size,buff_size,E_idx, & + & alf_idx,n_idx,adv_n,adap_dt,pi_fac,strxb,strxe, & + & chemxb,chemxe,c_idx]') + $:GPU_UPDATE(device='[b_size,xibeg,xiend,tensor_size]') - !$acc update device(species_idx) - !$acc update device(cfl_target, m, n, p) + $:GPU_UPDATE(device='[species_idx]') + $:GPU_UPDATE(device='[cfl_target,m,n,p]') - !$acc update device(alt_soundspeed, acoustic_source, num_source) - !$acc update device(dt, sys_size, buff_size, pref, rhoref, gamma_idx, pi_inf_idx, E_idx, alf_idx, stress_idx, mpp_lim, bubbles_euler, hypoelasticity, alt_soundspeed, avg_state, num_fluids, model_eqns, num_dims, num_vels, mixture_err, grid_geometry, cyl_coord, mp_weno, weno_eps, teno_CT, hyperelasticity, hyper_model, elasticity, xi_idx, B_idx, low_Mach) + $:GPU_UPDATE(device='[alt_soundspeed,acoustic_source,num_source]') + $:GPU_UPDATE(device='[dt,sys_size,buff_size,pref,rhoref, & + & gamma_idx,pi_inf_idx,E_idx,alf_idx,stress_idx, & + & mpp_lim,bubbles_euler,hypoelasticity,alt_soundspeed, & + & avg_state,num_fluids,model_eqns,num_dims,num_vels, & + & mixture_err,grid_geometry,cyl_coord,mp_weno,weno_eps, & + & teno_CT,hyperelasticity,hyper_model,elasticity,xi_idx, & + & B_idx,low_Mach]') - !$acc update device(Bx0, powell) + $:GPU_UPDATE(device='[Bx0, powell]') - !$acc update device(cont_damage, tau_star, cont_damage_s, alpha_bar) + $:GPU_UPDATE(device='[cont_damage,tau_star,cont_damage_s,alpha_bar]') #:if not MFC_CASE_OPTIMIZATION - !$acc update device(wenojs, mapped_weno, wenoz, teno) - !$acc update device(wenoz_q) - !$acc update device(mhd, relativity) + $:GPU_UPDATE(device='[wenojs,mapped_weno,wenoz,teno]') + $:GPU_UPDATE(device='[wenoz_q]') + $:GPU_UPDATE(device='[mhd, relativity]') #:endif - !$acc enter data copyin(nb, R0ref, Ca, Web, Re_inv, weight, R0, V0, bubbles_euler, polytropic, polydisperse, qbmm, R0_type, 
ptil, bubble_model, thermal, poly_sigma) - !$acc enter data copyin(R_n, R_v, phi_vn, phi_nv, Pe_c, Tw, pv, M_n, M_v, k_n, k_v, pb0, mass_n0, mass_v0, Pe_T, Re_trans_T, Re_trans_c, Im_trans_T, Im_trans_c, omegaN, mul0, ss, gamma_v, mu_v, gamma_m, gamma_n, mu_n, gam) - !$acc enter data copyin(dir_idx, dir_flg, dir_idx_tau) + $:GPU_ENTER_DATA(copyin='[nb,R0ref,Ca,Web,Re_inv,weight,R0, & + & V0,bubbles_euler,polytropic,polydisperse,qbmm,R0_type, & + & ptil,bubble_model,thermal,poly_sigma]') + $:GPU_ENTER_DATA(copyin='[R_n,R_v,phi_vn,phi_nv,Pe_c,Tw,pv, & + & M_n,M_v,k_n,k_v,pb0,mass_n0,mass_v0,Pe_T, & + & Re_trans_T,Re_trans_c,Im_trans_T,Im_trans_c,omegaN, & + & mul0,ss,gamma_v,mu_v,gamma_m,gamma_n,mu_n,gam]') + $:GPU_ENTER_DATA(copyin='[dir_idx,dir_flg,dir_idx_tau]') - !$acc enter data copyin(relax, relax_model, palpha_eps,ptgalpha_eps) + $:GPU_ENTER_DATA(copyin='[relax,relax_model,palpha_eps,ptgalpha_eps]') ! Allocating grid variables for the x-, y- and z-directions @:ALLOCATE(x_cb(-1 - buff_size:m + buff_size)) diff --git a/src/simulation/m_hyperelastic.fpp b/src/simulation/m_hyperelastic.fpp index 0aed395e8e..628605a652 100644 --- a/src/simulation/m_hyperelastic.fpp +++ b/src/simulation/m_hyperelastic.fpp @@ -26,14 +26,14 @@ module m_hyperelastic !! The btensor at the cell-interior Gaussian quadrature points. !! These tensor is needed to be calculated once and make the code DRY. type(vector_field) :: btensor !< - !$acc declare create(btensor) + $:GPU_DECLARE(create='[btensor]') real(wp), allocatable, dimension(:, :) :: fd_coeff_x real(wp), allocatable, dimension(:, :) :: fd_coeff_y real(wp), allocatable, dimension(:, :) :: fd_coeff_z - !$acc declare create(fd_coeff_x,fd_coeff_y,fd_coeff_z) + $:GPU_DECLARE(create='[fd_coeff_x,fd_coeff_y, fd_coeff_z]') real(wp), allocatable, dimension(:) :: Gs - !$acc declare create(Gs) + $:GPU_DECLARE(create='[Gs]') contains @@ -55,11 +55,11 @@ contains @:ACC_SETUP_VFs(btensor) @:ALLOCATE(Gs(1:num_fluids)) - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, num_fluids Gs(i) = fluid_pp(i)%G end do - !$acc update device(Gs) + $:GPU_UPDATE(device='[Gs]') @:ALLOCATE(fd_coeff_x(-fd_number:fd_number, 0:m)) if (n > 0) then @@ -72,16 +72,16 @@ contains ! 
Computing centered finite difference coefficients call s_compute_finite_difference_coefficients(m, x_cc, fd_coeff_x, buff_size, & fd_number, fd_order) - !$acc update device(fd_coeff_x) + $:GPU_UPDATE(device='[fd_coeff_x]') if (n > 0) then call s_compute_finite_difference_coefficients(n, y_cc, fd_coeff_y, buff_size, & fd_number, fd_order) - !$acc update device(fd_coeff_y) + $:GPU_UPDATE(device='[fd_coeff_y]') end if if (p > 0) then call s_compute_finite_difference_coefficients(p, z_cc, fd_coeff_z, buff_size, & fd_number, fd_order) - !$acc update device(fd_coeff_z) + $:GPU_UPDATE(device='[fd_coeff_z]') end if end subroutine s_initialize_hyperelastic_module @@ -106,12 +106,12 @@ contains real(wp) :: G integer :: j, k, l, i, r - !$acc parallel loop collapse(3) gang vector default(present) private(alpha_K, alpha_rho_K, & - !$acc rho, gamma, pi_inf, qv, G, Re, tensora, tensorb) + $:GPU_PARALLEL_LOOP(collapse=3, private='[alpha_K, alpha_rho_K, rho, & + & gamma, pi_inf, qv, G, Re, tensora, tensorb]') do l = 0, p do k = 0, n do j = 0, m - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, num_fluids alpha_rho_k(i) = q_cons_vf(i)%sf(j, k, l) alpha_k(i) = q_cons_vf(advxb + i - 1)%sf(j, k, l) @@ -124,7 +124,7 @@ contains !if ( G <= verysmall ) G_K = 0._wp if (G > verysmall) then - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, tensor_size tensora(i) = 0._wp end do @@ -133,7 +133,7 @@ contains ! number for the tensor 1-3: dxix_dx, dxiy_dx, dxiz_dx ! 4-6 : dxix_dy, dxiy_dy, dxiz_dy ! 7-9 : dxix_dz, dxiy_dz, dxiz_dz - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do r = -fd_number, fd_number ! derivatives in the x-direction tensora(1) = tensora(1) + q_prim_vf(xibeg)%sf(j + r, k, l)*fd_coeff_x(r, j) @@ -167,7 +167,7 @@ contains if (tensorb(tensor_size) > verysmall) then ! STEP 2c: computing the inverse of grad_xi tensor = F ! tensorb is the adjoint, tensora becomes F - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, tensor_size - 1 tensora(i) = tensorb(i)/tensorb(tensor_size) end do @@ -198,7 +198,7 @@ contains q_prim_vf(E_idx)%sf(j, k, l) = q_prim_vf(E_idx)%sf(j, k, l) - & G*q_prim_vf(xiend + 1)%sf(j, k, l)/gamma ! STEP 5c: updating the Cauchy stress conservative scalar field - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, b_size - 1 q_cons_vf(strxb + i - 1)%sf(j, k, l) = & rho*q_prim_vf(strxb + i - 1)%sf(j, k, l) @@ -208,7 +208,6 @@ contains end do end do end do - !$acc end parallel loop end subroutine s_hyperelastic_rmt_stress_update !> The following subroutine handles the calculation of the btensor. @@ -220,7 +219,7 @@ contains !! calculate the FFtranspose to obtain the btensor, btensor is nxn tensor !! btensor is symmetric, save the data space pure subroutine s_neoHookean_cauchy_solver(btensor, q_prim_vf, G, j, k, l) - !$acc routine seq + $:GPU_ROUTINE(parallelism='[seq]') type(scalar_field), dimension(sys_size), intent(inout) :: q_prim_vf type(scalar_field), dimension(b_size), intent(inout) :: btensor real(wp), intent(in) :: G @@ -239,7 +238,7 @@ contains #:endfor ! dividing by the jacobian for neo-Hookean model ! setting the tensor to the stresses for riemann solver - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, b_size - 1 q_prim_vf(strxb + i - 1)%sf(j, k, l) = & G*btensor(i)%sf(j, k, l)/btensor(b_size)%sf(j, k, l) @@ -259,7 +258,7 @@ contains !! calculate the FFtranspose to obtain the btensor, btensor is nxn tensor !! 
btensor is symmetric, save the data space pure subroutine s_Mooney_Rivlin_cauchy_solver(btensor, q_prim_vf, G, j, k, l) - !$acc routine seq + $:GPU_ROUTINE(parallelism='[seq]') type(scalar_field), dimension(sys_size), intent(inout) :: q_prim_vf type(scalar_field), dimension(b_size), intent(inout) :: btensor real(wp), intent(in) :: G @@ -280,7 +279,7 @@ contains ! dividing by the jacobian for neo-Hookean model ! setting the tensor to the stresses for riemann solver - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, b_size - 1 q_prim_vf(strxb + i - 1)%sf(j, k, l) = & G*btensor(i)%sf(j, k, l)/btensor(b_size)%sf(j, k, l) diff --git a/src/simulation/m_hypoelastic.fpp b/src/simulation/m_hypoelastic.fpp index 059b5746d5..3f736b0b0b 100644 --- a/src/simulation/m_hypoelastic.fpp +++ b/src/simulation/m_hypoelastic.fpp @@ -20,20 +20,20 @@ module m_hypoelastic s_compute_damage_state real(wp), allocatable, dimension(:) :: Gs - !$acc declare create(Gs) + $:GPU_DECLARE(create='[Gs]') real(wp), allocatable, dimension(:, :, :) :: du_dx, du_dy, du_dz real(wp), allocatable, dimension(:, :, :) :: dv_dx, dv_dy, dv_dz real(wp), allocatable, dimension(:, :, :) :: dw_dx, dw_dy, dw_dz - !$acc declare create(du_dx,du_dy,du_dz,dv_dx,dv_dy,dv_dz,dw_dx,dw_dy,dw_dz) + $:GPU_DECLARE(create='[du_dx,du_dy,du_dz,dv_dx,dv_dy,dv_dz,dw_dx,dw_dy,dw_dz]') real(wp), allocatable, dimension(:, :, :) :: rho_K_field, G_K_field - !$acc declare create(rho_K_field, G_K_field) + $:GPU_DECLARE(create='[rho_K_field,G_K_field]') real(wp), allocatable, dimension(:, :) :: fd_coeff_x_h real(wp), allocatable, dimension(:, :) :: fd_coeff_y_h real(wp), allocatable, dimension(:, :) :: fd_coeff_z_h - !$acc declare create(fd_coeff_x_h,fd_coeff_y_h,fd_coeff_z_h) + $:GPU_DECLARE(create='[fd_coeff_x_h,fd_coeff_y_h,fd_coeff_z_h]') contains @@ -55,7 +55,7 @@ contains do i = 1, num_fluids Gs(i) = fluid_pp(i)%G end do - !$acc update device(Gs) + $:GPU_UPDATE(device='[Gs]') @:ALLOCATE(fd_coeff_x_h(-fd_number:fd_number, 0:m)) if (n > 0) then @@ -68,16 +68,16 @@ contains ! Computing centered finite difference coefficients call s_compute_finite_difference_coefficients(m, x_cc, fd_coeff_x_h, buff_size, & fd_number, fd_order) - !$acc update device(fd_coeff_x_h) + $:GPU_UPDATE(device='[fd_coeff_x_h]') if (n > 0) then call s_compute_finite_difference_coefficients(n, y_cc, fd_coeff_y_h, buff_size, & fd_number, fd_order) - !$acc update device(fd_coeff_y_h) + $:GPU_UPDATE(device='[fd_coeff_y_h]') end if if (p > 0) then call s_compute_finite_difference_coefficients(p, z_cc, fd_coeff_z_h, buff_size, & fd_number, fd_order) - !$acc update device(fd_coeff_z_h) + $:GPU_UPDATE(device='[fd_coeff_z_h]') end if end subroutine s_initialize_hypoelastic_module @@ -104,7 +104,7 @@ contains ! calculate velocity gradients + rho_K and G_K ! TODO: re-organize these loops one by one for GPU efficiency if possible? 
- !$acc parallel loop collapse(3) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=3) do q = 0, p do l = 0, n do k = 0, m @@ -112,13 +112,12 @@ contains end do end do end do - !$acc end parallel loop - !$acc parallel loop collapse(3) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=3) do q = 0, p do l = 0, n do k = 0, m - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do r = -fd_number, fd_number du_dx(k, l, q) = du_dx(k, l, q) & + q_prim_vf(momxb)%sf(k + r, l, q)*fd_coeff_x_h(r, k) @@ -127,10 +126,9 @@ contains end do end do end do - !$acc end parallel loop if (ndirs > 1) then - !$acc parallel loop collapse(3) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=3) do q = 0, p do l = 0, n do k = 0, m @@ -138,13 +136,12 @@ contains end do end do end do - !$acc end parallel loop - !$acc parallel loop collapse(3) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=3) do q = 0, p do l = 0, n do k = 0, m - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do r = -fd_number, fd_number du_dy(k, l, q) = du_dy(k, l, q) & + q_prim_vf(momxb)%sf(k, l + r, q)*fd_coeff_y_h(r, l) @@ -156,12 +153,11 @@ contains end do end do end do - !$acc end parallel loop ! 3D if (ndirs == 3) then - !$acc parallel loop collapse(3) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=3) do q = 0, p do l = 0, n do k = 0, m @@ -170,13 +166,12 @@ contains end do end do end do - !$acc end parallel loop - !$acc parallel loop collapse(3) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=3) do q = 0, p do l = 0, n do k = 0, m - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do r = -fd_number, fd_number du_dz(k, l, q) = du_dz(k, l, q) & + q_prim_vf(momxb)%sf(k, l, q + r)*fd_coeff_z_h(r, q) @@ -192,11 +187,10 @@ contains end do end do end do - !$acc end parallel loop end if end if - !$acc parallel loop collapse(3) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=3) do q = 0, p do l = 0, n do k = 0, m @@ -220,7 +214,7 @@ contains end do ! apply rhs source term to elastic stress equation - !$acc parallel loop collapse(3) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=3) do q = 0, p do l = 0, n do k = 0, m @@ -234,7 +228,7 @@ contains end do elseif (idir == 2) then - !$acc parallel loop collapse(3) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=3) do q = 0, p do l = 0, n do k = 0, m @@ -269,7 +263,7 @@ contains end do elseif (idir == 3) then - !$acc parallel loop collapse(3) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=3) do q = 0, p do l = 0, n do k = 0, m @@ -337,7 +331,7 @@ contains if (cyl_coord .and. idir == 2) then - !$acc parallel loop collapse(3) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=3) do q = 0, p do l = 0, n do k = 0, m @@ -399,13 +393,13 @@ contains if (n == 0) then l = 0; q = 0 - !$acc parallel loop gang vector default(present) + $:GPU_PARALLEL_LOOP() do k = 0, m rhs_vf(damage_idx)%sf(k, l, q) = (alpha_bar*max(abs(q_cons_vf(stress_idx%beg)%sf(k, l, q)) - tau_star, 0._wp))**cont_damage_s end do elseif (p == 0) then q = 0 - !$acc parallel loop collapse(2) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=2) do l = 0, n do k = 0, m ! 
Maximum principal stress @@ -419,7 +413,7 @@ contains end do end do else - !$acc parallel loop collapse(3) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=3) do q = 0, p do l = 0, n do k = 0, m diff --git a/src/simulation/m_ibm.fpp b/src/simulation/m_ibm.fpp index 3c9b0db535..f9f12161b4 100644 --- a/src/simulation/m_ibm.fpp +++ b/src/simulation/m_ibm.fpp @@ -37,15 +37,15 @@ module m_ibm type(integer_field), public :: ib_markers type(levelset_field), public :: levelset type(levelset_norm_field), public :: levelset_norm - !$acc declare create(ib_markers, levelset, levelset_norm) + $:GPU_DECLARE(create='[ib_markers,levelset,levelset_norm]') type(ghost_point), dimension(:), allocatable :: ghost_points type(ghost_point), dimension(:), allocatable :: inner_points - !$acc declare create(ghost_points, inner_points) + $:GPU_DECLARE(create='[ghost_points,inner_points]') integer :: num_gps !< Number of ghost points integer :: num_inner_gps !< Number of ghost points - !$acc declare create(gp_layers, num_gps, num_inner_gps) + $:GPU_DECLARE(create='[gp_layers,num_gps,num_inner_gps]') contains @@ -72,7 +72,7 @@ contains @:ACC_SETUP_SFs(levelset) @:ACC_SETUP_SFs(levelset_norm) - !$acc enter data copyin(num_gps, num_inner_gps) + $:GPU_ENTER_DATA(copyin='[num_gps,num_inner_gps]') end subroutine s_initialize_ibm_module @@ -82,31 +82,31 @@ contains integer :: i, j, k - !$acc update device(ib_markers%sf) - !$acc update device(levelset%sf) - !$acc update device(levelset_norm%sf) + $:GPU_UPDATE(device='[ib_markers%sf]') + $:GPU_UPDATE(device='[levelset%sf]') + $:GPU_UPDATE(device='[levelset_norm%sf]') ! Get neighboring IB variables from other processors call s_populate_ib_buffers() - !$acc update host(ib_markers%sf) + $:GPU_UPDATE(host='[ib_markers%sf]') call s_find_num_ghost_points(num_gps, num_inner_gps) - !$acc update device(num_gps, num_inner_gps) + $:GPU_UPDATE(device='[num_gps, num_inner_gps]') @:ALLOCATE(ghost_points(1:num_gps)) @:ALLOCATE(inner_points(1:num_inner_gps)) - !$acc enter data copyin(ghost_points, inner_points) + $:GPU_ENTER_DATA(copyin='[ghost_points,inner_points]') call s_find_ghost_points(ghost_points, inner_points) - !$acc update device(ghost_points, inner_points) + $:GPU_UPDATE(device='[ghost_points, inner_points]') call s_compute_image_points(ghost_points, levelset, levelset_norm) - !$acc update device(ghost_points) + $:GPU_UPDATE(device='[ghost_points]') call s_compute_interpolation_coeffs(ghost_points) - !$acc update device(ghost_points) + $:GPU_UPDATE(device='[ghost_points]') end subroutine s_ibm_setup @@ -166,7 +166,11 @@ contains type(ghost_point) :: gp type(ghost_point) :: innerp - !$acc parallel loop gang vector private(physical_loc, dyn_pres, alpha_rho_IP, alpha_IP, pres_IP, vel_IP, vel_g, vel_norm_IP, r_IP, v_IP, pb_IP, mv_IP, nmom_IP, presb_IP, massv_IP, rho, gamma, pi_inf, Re_K, G_K, Gs, gp, innerp, norm, buf, j, k, l, q) + $:GPU_PARALLEL_LOOP(private='[physical_loc,dyn_pres,alpha_rho_IP, & + & alpha_IP,pres_IP,vel_IP,vel_g,vel_norm_IP,r_IP, & + & v_IP,pb_IP,mv_IP,nmom_IP,presb_IP,massv_IP,rho, & + & gamma,pi_inf,Re_K,G_K,Gs,gp,innerp,norm,buf, & + & j,k,l,q]') do i = 1, num_gps gp = ghost_points(i) @@ -203,7 +207,7 @@ contains dyn_pres = 0._wp ! Set q_prim_vf params at GP so that mixture vars calculated properly - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do q = 1, num_fluids q_prim_vf(q)%sf(j, k, l) = alpha_rho_IP(q) q_prim_vf(advxb + q - 1)%sf(j, k, l) = alpha_IP(q) @@ -239,7 +243,7 @@ contains end if ! 
Set momentum - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do q = momxb, momxe q_cons_vf(q)%sf(j, k, l) = rho*vel_g(q - momxb + 1) dyn_pres = dyn_pres + q_cons_vf(q)%sf(j, k, l)* & @@ -247,7 +251,7 @@ contains end do ! Set continuity and adv vars - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do q = 1, num_fluids q_cons_vf(q)%sf(j, k, l) = alpha_rho_IP(q) q_cons_vf(advxb + q - 1)%sf(j, k, l) = alpha_IP(q) @@ -301,7 +305,7 @@ contains end if if (model_eqns == 3) then - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do q = intxb, intxe q_cons_vf(q)%sf(j, k, l) = alpha_IP(q - intxb + 1)*(gammas(q - intxb + 1)*pres_IP & + pi_infs(q - intxb + 1)) @@ -310,7 +314,9 @@ contains end do !Correct the state of the inner points in IBs - !$acc parallel loop gang vector private(physical_loc, dyn_pres, alpha_rho_IP, alpha_IP, vel_g, rho, gamma, pi_inf, Re_K, innerp, j, k, l, q) + $:GPU_PARALLEL_LOOP(private='[physical_loc,dyn_pres,alpha_rho_IP, & + & alpha_IP,vel_g,rho,gamma,pi_inf,Re_K,innerp, & + & j,k,l,q]') do i = 1, num_inner_gps innerp = inner_points(i) @@ -318,7 +324,7 @@ contains k = innerp%loc(2) l = innerp%loc(3) - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do q = momxb, momxe q_cons_vf(q)%sf(j, k, l) = 0._wp end do @@ -732,7 +738,7 @@ contains !> Function that uses the interpolation coefficients and the current state !! at the cell centers in order to estimate the state at the image point pure subroutine s_interpolate_image_point(q_prim_vf, gp, alpha_rho_IP, alpha_IP, pres_IP, vel_IP, c_IP, r_IP, v_IP, pb_IP, mv_IP, nmom_IP, pb, mv, presb_IP, massv_IP) - !$acc routine seq + $:GPU_ROUTINE(parallelism='[seq]') type(scalar_field), & dimension(sys_size), & intent(IN) :: q_prim_vf !< Primitive Variables @@ -785,11 +791,11 @@ contains end if end if - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = i1, i2 - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do j = j1, j2 - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do k = k1, k2 coeff = gp%interp_coeffs(i - i1 + 1, j - j1 + 1, k - k1 + 1) @@ -797,13 +803,13 @@ contains pres_IP = pres_IP + coeff* & q_prim_vf(E_idx)%sf(i, j, k) - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do q = momxb, momxe vel_IP(q + 1 - momxb) = vel_IP(q + 1 - momxb) + coeff* & q_prim_vf(q)%sf(i, j, k) end do - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do l = contxb, contxe alpha_rho_IP(l) = alpha_rho_IP(l) + coeff* & q_prim_vf(l)%sf(i, j, k) @@ -816,7 +822,7 @@ contains end if if (bubbles_euler .and. .not. qbmm) then - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do l = 1, nb if (polytropic) then r_IP(l) = r_IP(l) + coeff*q_prim_vf(bubxb + (l - 1)*2)%sf(i, j, k) diff --git a/src/simulation/m_mhd.fpp b/src/simulation/m_mhd.fpp index f5730b513f..8112b3af7e 100644 --- a/src/simulation/m_mhd.fpp +++ b/src/simulation/m_mhd.fpp @@ -24,12 +24,12 @@ module m_mhd real(wp), allocatable, dimension(:, :, :) :: du_dx, du_dy, du_dz real(wp), allocatable, dimension(:, :, :) :: dv_dx, dv_dy, dv_dz real(wp), allocatable, dimension(:, :, :) :: dw_dx, dw_dy, dw_dz - !$acc declare create(du_dx,du_dy,du_dz,dv_dx,dv_dy,dv_dz,dw_dx,dw_dy,dw_dz) + $:GPU_DECLARE(create='[du_dx,du_dy,du_dz,dv_dx,dv_dy,dv_dz,dw_dx,dw_dy,dw_dz]') real(wp), allocatable, dimension(:, :) :: fd_coeff_x_h real(wp), allocatable, dimension(:, :) :: fd_coeff_y_h real(wp), allocatable, dimension(:, :) :: fd_coeff_z_h - !$acc declare create(fd_coeff_x_h,fd_coeff_y_h,fd_coeff_z_h) + $:GPU_DECLARE(create='[fd_coeff_x_h,fd_coeff_y_h,fd_coeff_z_h]') contains @@ -52,12 +52,12 @@ contains ! 
Computing centered finite difference coefficients call s_compute_finite_difference_coefficients(m, x_cc, fd_coeff_x_h, buff_size, fd_number, fd_order) - !$acc update device(fd_coeff_x_h) + $:GPU_UPDATE(device='[fd_coeff_x_h]') call s_compute_finite_difference_coefficients(n, y_cc, fd_coeff_y_h, buff_size, fd_number, fd_order) - !$acc update device(fd_coeff_y_h) + $:GPU_UPDATE(device='[fd_coeff_y_h]') if (p > 0) then call s_compute_finite_difference_coefficients(p, z_cc, fd_coeff_z_h, buff_size, fd_number, fd_order) - !$acc update device(fd_coeff_z_h) + $:GPU_UPDATE(device='[fd_coeff_z_h]') end if end subroutine s_initialize_mhd_powell_module @@ -76,23 +76,22 @@ contains real(wp), dimension(3) :: v, B real(wp) :: divB, vdotB - !$acc parallel loop collapse(3) gang vector default(present) & - !$acc private(v, B) + $:GPU_PARALLEL_LOOP(collapse=3, private='[v, B]') do q = 0, p do l = 0, n do k = 0, m divB = 0._wp - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do r = -fd_number, fd_number divB = divB + q_prim_vf(B_idx%beg)%sf(k + r, l, q)*fd_coeff_x_h(r, k) end do - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do r = -fd_number, fd_number divB = divB + q_prim_vf(B_idx%beg + 1)%sf(k, l + r, q)*fd_coeff_y_h(r, l) end do if (p > 0) then - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do r = -fd_number, fd_number divB = divB + q_prim_vf(B_idx%beg + 2)%sf(k, l, q + r)*fd_coeff_z_h(r, q) end do @@ -130,7 +129,6 @@ contains end do end do end do - !$acc end parallel loop end subroutine s_compute_mhd_powell_rhs diff --git a/src/simulation/m_mpi_proxy.fpp b/src/simulation/m_mpi_proxy.fpp index 93d864c5e8..e500a00898 100644 --- a/src/simulation/m_mpi_proxy.fpp +++ b/src/simulation/m_mpi_proxy.fpp @@ -48,7 +48,7 @@ module m_mpi_proxy !> @} integer :: i_halo_size - !$acc declare create(i_halo_size) + $:GPU_DECLARE(create='[i_halo_size]') contains @@ -71,7 +71,7 @@ contains i_halo_size = -1 + gp_layers end if - !$acc update device(i_halo_size) + $:GPU_UPDATE(device='[i_halo_size]') @:ALLOCATE(ib_buff_send(0:i_halo_size), ib_buff_recv(0:i_halo_size)) end if #endif @@ -297,7 +297,7 @@ contains #:for mpi_dir in [1, 2, 3] if (mpi_dir == ${mpi_dir}$) then #:if mpi_dir == 1 - !$acc parallel loop collapse(3) gang vector default(present) private(r) + $:GPU_PARALLEL_LOOP(collapse=3,private='[r]') do l = 0, p do k = 0, n do j = 0, gp_layers - 1 @@ -307,7 +307,7 @@ contains end do end do #:elif mpi_dir == 2 - !$acc parallel loop collapse(3) gang vector default(present) private(r) + $:GPU_PARALLEL_LOOP(collapse=3,private='[r]') do l = 0, p do k = 0, gp_layers - 1 do j = -gp_layers, m + gp_layers @@ -318,7 +318,7 @@ contains end do end do #:else - !$acc parallel loop collapse(3) gang vector default(present) private(r) + $:GPU_PARALLEL_LOOP(collapse=3,private='[r]') do l = 0, gp_layers - 1 do k = -gp_layers, n + gp_layers do j = -gp_layers, m + gp_layers @@ -345,7 +345,7 @@ contains #:for mpi_dir in [1, 2, 3] if (mpi_dir == ${mpi_dir}$) then #:if mpi_dir == 1 - !$acc parallel loop collapse(3) gang vector default(present) private(r) + $:GPU_PARALLEL_LOOP(collapse=3,private='[r]') do l = 0, p do k = 0, n do j = -gp_layers, -1 @@ -355,7 +355,7 @@ contains end do end do #:elif mpi_dir == 2 - !$acc parallel loop collapse(3) gang vector default(present) private(r) + $:GPU_PARALLEL_LOOP(collapse=3,private='[r]') do l = 0, p do k = -gp_layers, -1 do j = -gp_layers, m + gp_layers @@ -367,7 +367,7 @@ contains end do #:else ! 
Unpacking buffer from bc_z%beg - !$acc parallel loop collapse(3) gang vector default(present) private(r) + $:GPU_PARALLEL_LOOP(collapse=3,private='[r]') do l = -gp_layers, -1 do k = -gp_layers, n + gp_layers do j = -gp_layers, m + gp_layers diff --git a/src/simulation/m_pressure_relaxation.fpp b/src/simulation/m_pressure_relaxation.fpp index ced668bef2..624cfa5390 100644 --- a/src/simulation/m_pressure_relaxation.fpp +++ b/src/simulation/m_pressure_relaxation.fpp @@ -21,10 +21,10 @@ module m_pressure_relaxation s_finalize_pressure_relaxation_module real(wp), allocatable, dimension(:) :: gamma_min, pres_inf - !$acc declare create(gamma_min, pres_inf) + $:GPU_DECLARE(create='[gamma_min, pres_inf]') real(wp), allocatable, dimension(:, :) :: Res - !$acc declare create(Res) + $:GPU_DECLARE(create='[Res]') contains @@ -39,7 +39,7 @@ contains gamma_min(i) = 1._wp/fluid_pp(i)%gamma + 1._wp pres_inf(i) = fluid_pp(i)%pi_inf/(1._wp + fluid_pp(i)%gamma) end do - !$acc update device(gamma_min, pres_inf) + $:GPU_UPDATE(device='[gamma_min, pres_inf]') if (viscous) then @:ALLOCATE(Res(1:2, 1:maxval(Re_size))) @@ -48,7 +48,7 @@ contains Res(i, j) = fluid_pp(Re_idx(i, j))%Re(i) end do end do - !$acc update device(Res, Re_idx, Re_size) + $:GPU_UPDATE(device='[Res, Re_idx, Re_size]') end if end subroutine s_initialize_pressure_relaxation_module @@ -70,7 +70,7 @@ contains type(scalar_field), dimension(sys_size), intent(inout) :: q_cons_vf integer :: j, k, l - !$acc parallel loop collapse(3) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=3) do l = 0, p do k = 0, n do j = 0, m @@ -83,7 +83,7 @@ contains !> Process pressure relaxation for a single cell pure subroutine s_relax_cell_pressure(q_cons_vf, j, k, l) - !$acc routine seq + $:GPU_ROUTINE(parallelism='[seq]') type(scalar_field), dimension(sys_size), intent(inout) :: q_cons_vf integer, intent(in) :: j, k, l @@ -103,14 +103,14 @@ contains !> Check if pressure relaxation is needed for this cell pure logical function s_needs_pressure_relaxation(q_cons_vf, j, k, l) - !$acc routine seq + $:GPU_ROUTINE(parallelism='[seq]') type(scalar_field), dimension(sys_size), intent(in) :: q_cons_vf integer, intent(in) :: j, k, l integer :: i s_needs_pressure_relaxation = .true. - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, num_fluids if (q_cons_vf(i + advxb - 1)%sf(j, k, l) > (1._wp - sgm_eps)) then s_needs_pressure_relaxation = .false. @@ -121,7 +121,7 @@ contains !> Correct volume fractions to physical bounds pure subroutine s_correct_volume_fractions(q_cons_vf, j, k, l) - !$acc routine seq + $:GPU_ROUTINE(parallelism='[seq]') type(scalar_field), dimension(sys_size), intent(inout) :: q_cons_vf integer, intent(in) :: j, k, l @@ -129,7 +129,7 @@ contains integer :: i sum_alpha = 0._wp - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, num_fluids if ((q_cons_vf(i + contxb - 1)%sf(j, k, l) < 0._wp) .or. & (q_cons_vf(i + advxb - 1)%sf(j, k, l) < 0._wp)) then @@ -142,7 +142,7 @@ contains sum_alpha = sum_alpha + q_cons_vf(i + advxb - 1)%sf(j, k, l) end do - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, num_fluids q_cons_vf(i + advxb - 1)%sf(j, k, l) = q_cons_vf(i + advxb - 1)%sf(j, k, l)/sum_alpha end do @@ -151,7 +151,7 @@ contains !> Main pressure equilibration using Newton-Raphson pure subroutine s_equilibrate_pressure(q_cons_vf, j, k, l) - !$acc routine seq + $:GPU_ROUTINE(parallelism='[seq]') type(scalar_field), dimension(sys_size), intent(inout) :: q_cons_vf integer, intent(in) :: j, k, l @@ -164,7 +164,7 @@ contains ! 
Initialize pressures pres_relax = 0._wp - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, num_fluids if (q_cons_vf(i + advxb - 1)%sf(j, k, l) > sgm_eps) then pres_K_init(i) = (q_cons_vf(i + intxb - 1)%sf(j, k, l)/ & @@ -180,7 +180,7 @@ contains ! Newton-Raphson iteration f_pres = 1.e-9_wp df_pres = 1.e9_wp - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do iter = 0, MAX_ITER - 1 if (abs(f_pres) > TOLERANCE) then pres_relax = pres_relax - f_pres/df_pres @@ -194,7 +194,7 @@ contains ! Newton-Raphson step f_pres = -1._wp df_pres = 0._wp - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, num_fluids if (q_cons_vf(i + advxb - 1)%sf(j, k, l) > sgm_eps) then rho_K_s(i) = q_cons_vf(i + contxb - 1)%sf(j, k, l)/ & @@ -210,7 +210,7 @@ contains end do ! Update volume fractions - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, num_fluids if (q_cons_vf(i + advxb - 1)%sf(j, k, l) > sgm_eps) & q_cons_vf(i + advxb - 1)%sf(j, k, l) = q_cons_vf(i + contxb - 1)%sf(j, k, l)/rho_K_s(i) @@ -220,7 +220,7 @@ contains !> Correct internal energies using equilibrated pressure pure subroutine s_correct_internal_energies(q_cons_vf, j, k, l) - !$acc routine seq + $:GPU_ROUTINE(parallelism='[seq]') type(scalar_field), dimension(sys_size), intent(inout) :: q_cons_vf integer, intent(in) :: j, k, l @@ -230,7 +230,7 @@ contains real(wp), dimension(2) :: Re integer :: i, q - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, num_fluids alpha_rho(i) = q_cons_vf(i)%sf(j, k, l) alpha(i) = q_cons_vf(E_idx + i)%sf(j, k, l) @@ -243,14 +243,14 @@ contains if (bubbles_euler) then if (mpp_lim .and. (model_eqns == 2) .and. (num_fluids > 2)) then - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, num_fluids rho = rho + alpha_rho(i) gamma = gamma + alpha(i)*gammas(i) pi_inf = pi_inf + alpha(i)*pi_infs(i) end do else if ((model_eqns == 2) .and. (num_fluids > 2)) then - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, num_fluids - 1 rho = rho + alpha_rho(i) gamma = gamma + alpha(i)*gammas(i) @@ -264,7 +264,7 @@ contains else sum_alpha = 0._wp if (mpp_lim) then - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, num_fluids alpha_rho(i) = max(0._wp, alpha_rho(i)) alpha(i) = min(max(0._wp, alpha(i)), 1._wp) @@ -273,7 +273,7 @@ contains alpha = alpha/max(sum_alpha, sgm_eps) end if - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, num_fluids rho = rho + alpha_rho(i) gamma = gamma + alpha(i)*gammas(i) @@ -281,11 +281,11 @@ contains end do if (viscous) then - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, 2 Re(i) = dflt_real if (Re_size(i) > 0) Re(i) = 0._wp - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do q = 1, Re_size(i) Re(i) = alpha(Re_idx(i, q))/Res(i, q) + Re(i) end do @@ -296,7 +296,7 @@ contains ! 
Compute dynamic pressure and update internal energies dyn_pres = 0._wp - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = momxb, momxe dyn_pres = dyn_pres + 5.e-1_wp*q_cons_vf(i)%sf(j, k, l)* & q_cons_vf(i)%sf(j, k, l)/max(rho, sgm_eps) @@ -304,7 +304,7 @@ contains pres_relax = (q_cons_vf(E_idx)%sf(j, k, l) - dyn_pres - pi_inf)/gamma - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, num_fluids q_cons_vf(i + intxb - 1)%sf(j, k, l) = & q_cons_vf(i + advxb - 1)%sf(j, k, l)*(gammas(i)*pres_relax + pi_infs(i)) diff --git a/src/simulation/m_qbmm.fpp b/src/simulation/m_qbmm.fpp index daf021c35c..027c47a567 100644 --- a/src/simulation/m_qbmm.fpp +++ b/src/simulation/m_qbmm.fpp @@ -25,21 +25,21 @@ module m_qbmm private; public :: s_initialize_qbmm_module, s_mom_inv, s_coeff, s_compute_qbmm_rhs real(wp), allocatable, dimension(:, :, :, :, :) :: momrhs - !$acc declare create(momrhs) + $:GPU_DECLARE(create='[momrhs]') #:if MFC_CASE_OPTIMIZATION integer, parameter :: nterms = ${nterms}$ #:else integer :: nterms - !$acc declare create(nterms) + $:GPU_DECLARE(create='[nterms]') #:endif type(int_bounds_info) :: is1_qbmm, is2_qbmm, is3_qbmm - !$acc declare create(is1_qbmm, is2_qbmm, is3_qbmm) + $:GPU_DECLARE(create='[is1_qbmm,is2_qbmm,is3_qbmm]') integer, allocatable, dimension(:) :: bubrs integer, allocatable, dimension(:, :) :: bubmoms - !$acc declare create(bubrs, bubmoms) + $:GPU_DECLARE(create='[bubrs,bubmoms]') contains @@ -57,8 +57,8 @@ contains nterms = 7 end if - !$acc enter data copyin(nterms) - !$acc update device(nterms) + $:GPU_ENTER_DATA(copyin='[nterms]') + $:GPU_UPDATE(device='[nterms]') #:endif @@ -392,7 +392,7 @@ contains end do end if - !$acc update device(momrhs) + $:GPU_UPDATE(device='[momrhs]') @:ALLOCATE(bubrs(1:nb)) @:ALLOCATE(bubmoms(1:nb, 1:nmom)) @@ -400,14 +400,14 @@ contains do i = 1, nb bubrs(i) = bub_idx%rs(i) end do - !$acc update device(bubrs) + $:GPU_UPDATE(device='[bubrs]') do j = 1, nmom do i = 1, nb bubmoms(i, j) = bub_idx%moms(i, j) end do end do - !$acc update device(bubmoms) + $:GPU_UPDATE(device='[bubmoms]') end subroutine s_initialize_qbmm_module @@ -433,7 +433,7 @@ contains end select if (.not. polytropic) then - !$acc parallel loop collapse(5) gang vector default(present) private(nb_q, nR, nR2, R, R2, nb_dot, nR_dot, nR2_dot, var, AX) + $:GPU_PARALLEL_LOOP(collapse=5,private='[nb_q,nR,nR2,R,R2,nb_dot,nR_dot,nR2_dot,var,AX]') do i = 1, nb do q = 1, nnode do l = 0, p @@ -538,13 +538,13 @@ contains ! 
The following block is not repeated and is left as is if (idir == 1) then - !$acc parallel loop collapse(3) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=3) do l = 0, p do q = 0, n do i = 0, m rhs_vf(alf_idx)%sf(i, q, l) = rhs_vf(alf_idx)%sf(i, q, l) + mom_sp(2)%sf(i, q, l) j = bubxb - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do k = 1, nb rhs_vf(j)%sf(i, q, l) = rhs_vf(j)%sf(i, q, l) + mom_3d(0, 0, k)%sf(i, q, l) rhs_vf(j + 1)%sf(i, q, l) = rhs_vf(j + 1)%sf(i, q, l) + mom_3d(1, 0, k)%sf(i, q, l) @@ -563,11 +563,9 @@ contains !Coefficient array for non-polytropic model (pb and mv values are accounted in wght_pb and wght_mv) pure subroutine s_coeff_nonpoly(pres, rho, c, coeffs) -#ifdef _CRAYFTN - !DIR$ INLINEALWAYS s_coeff_nonpoly -#else - !$acc routine seq -#endif + $:GPU_ROUTINE(function_name='s_coeff_nonpoly',parallelism='[seq]', & + & cray_inline=True) + real(wp), intent(in) :: pres, rho, c real(wp), dimension(nterms, 0:2, 0:2), intent(out) :: coeffs @@ -636,11 +634,8 @@ contains !Coefficient array for polytropic model (pb for each R0 bin accounted for in wght_pb) pure subroutine s_coeff(pres, rho, c, coeffs) -#ifdef _CRAYFTN - !DIR$ INLINEALWAYS s_coeff -#else - !$acc routine seq -#endif + $:GPU_ROUTINE(function_name='s_coeff',parallelism='[seq]', & + & cray_inline=True) real(wp), intent(in) :: pres, rho, c real(wp), dimension(nterms, 0:2, 0:2), intent(out) :: coeffs @@ -715,9 +710,12 @@ contains integer :: id1, id2, id3, i1, i2, j, q, r is1_qbmm = ix; is2_qbmm = iy; is3_qbmm = iz - !$acc update device(is1_qbmm, is2_qbmm, is3_qbmm) + $:GPU_UPDATE(device='[is1_qbmm,is2_qbmm,is3_qbmm]') - !$acc parallel loop collapse(3) gang vector default(present) private(moms, msum, wght, abscX, abscY, wght_pb, wght_mv, wght_ht, coeff, ht, r, q, n_tait, B_tait, pres, rho, nbub, c, alf, momsum, drdt, drdt2, chi_vw, x_vw, rho_mw, k_mw, T_bar, grad_T) + $:GPU_PARALLEL_LOOP(collapse=3, private='[moms, msum, wght, abscX, & + & abscY, wght_pb, wght_mv, wght_ht, coeff, ht, r, q, & + & n_tait, B_tait, pres, rho, nbub, c, alf, momsum, & + & drdt, drdt2, chi_vw, x_vw, rho_mw, k_mw, T_bar, grad_T]') do id3 = is3_qbmm%beg, is3_qbmm%end do id2 = is2_qbmm%beg, is2_qbmm%end do id1 = is1_qbmm%beg, is1_qbmm%end @@ -737,9 +735,10 @@ contains if (alf > small_alf) then nbub = q_cons_vf(bubxb)%sf(id1, id2, id3) - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do q = 1, nb ! Gather moments for this bubble bin + $:GPU_LOOP(parallelism='[seq]') do r = 2, nmom moms(r) = q_prim_vf(bubmoms(q, r))%sf(id1, id2, id3) end do @@ -747,12 +746,12 @@ contains call s_chyqmom(moms, wght(:, q), abscX(:, q), abscY(:, q)) if (polytropic) then - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do j = 1, nnode wght_pb(j, q) = wght(j, q)*(pb0(q) - pv) end do else - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do j = 1, nnode chi_vw = 1._wp/(1._wp + R_v/R_n*(pb(id1, id2, id3, j, q)/pv - 1._wp)) x_vw = M_n*chi_vw/(M_v + (M_n - M_v)*chi_vw) @@ -771,13 +770,13 @@ contains ! Compute change in moments due to bubble dynamics r = 1 - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i2 = 0, 2 - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i1 = 0, 2 if ((i1 + i2) <= 2) then momsum = 0._wp - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do j = 1, nterms select case (bubble_model) case (3) @@ -807,7 +806,7 @@ contains ! Compute change in pb and mv for non-polytropic model if (.not. 
polytropic) then - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do j = 1, nnode drdt = msum(2) drdt2 = merge(-1._wp, 1._wp, j == 1 .or. j == 2)/(2._wp*sqrt(merge(moms(4) - moms(2)**2._wp, verysmall, moms(4) - moms(2)**2._wp > 0._wp))) @@ -835,11 +834,11 @@ contains end if end if else - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do q = 1, nb - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i1 = 0, 2 - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i2 = 0, 2 moms3d(i1, i2, q)%sf(id1, id2, id3) = 0._wp end do @@ -857,11 +856,8 @@ contains contains ! Helper to select the correct coefficient routine subroutine s_coeff_selector(pres, rho, c, coeff, polytropic) -#ifdef _CRAYFTN - !DIR$ INLINEALWAYS s_coeff_selector -#else - !$acc routine seq -#endif + $:GPU_ROUTINE(function_name='s_coeff_selector',parallelism='[seq]', & + & cray_inline=True) real(wp), intent(in) :: pres, rho, c real(wp), dimension(nterms, 0:2, 0:2), intent(out) :: coeff logical, intent(in) :: polytropic @@ -873,11 +869,9 @@ contains end subroutine s_coeff_selector pure subroutine s_chyqmom(momin, wght, abscX, abscY) -#ifdef _CRAYFTN - !DIR$ INLINEALWAYS s_chyqmom -#else - !$acc routine seq -#endif + $:GPU_ROUTINE(function_name='s_chyqmom',parallelism='[seq]', & + & cray_inline=True) + real(wp), dimension(nmom), intent(in) :: momin real(wp), dimension(nnode), intent(inout) :: wght, abscX, abscY @@ -933,11 +927,9 @@ contains end subroutine s_chyqmom pure subroutine s_hyqmom(frho, fup, fmom) -#ifdef _CRAYFTN - !DIR$ INLINEALWAYS s_hyqmom -#else - !$acc routine seq -#endif + $:GPU_ROUTINE(function_name='s_hyqmom',parallelism='[seq]', & + & cray_inline=True) + real(wp), dimension(2), intent(inout) :: frho, fup real(wp), dimension(3), intent(in) :: fmom @@ -955,7 +947,7 @@ contains end subroutine s_hyqmom pure function f_quad(abscX, abscY, wght_in, q, r, s) - !$acc routine seq + $:GPU_ROUTINE(parallelism='[seq]') real(wp), dimension(nnode, nb), intent(in) :: abscX, abscY, wght_in real(wp), intent(in) :: q, r, s @@ -971,7 +963,7 @@ contains end function f_quad pure function f_quad2D(abscX, abscY, wght_in, pow) - !$acc routine seq + $:GPU_ROUTINE(parallelism='[seq]') real(wp), dimension(nnode), intent(in) :: abscX, abscY, wght_in real(wp), dimension(3), intent(in) :: pow diff --git a/src/simulation/m_rhs.fpp b/src/simulation/m_rhs.fpp index 8aceb2dfb4..5c16a5ba9f 100644 --- a/src/simulation/m_rhs.fpp +++ b/src/simulation/m_rhs.fpp @@ -75,13 +75,13 @@ module m_rhs !! conservative variables, which are located in q_cons_vf, at cell-interior !! Gaussian quadrature points (QP). type(vector_field) :: q_cons_qp !< - !$acc declare create(q_cons_qp) + $:GPU_DECLARE(create='[q_cons_qp]') !! The primitive variables at cell-interior Gaussian quadrature points. These !! are calculated from the conservative variables and gradient magnitude (GM) !! of the volume fractions, q_cons_qp and gm_alpha_qp, respectively. type(vector_field) :: q_prim_qp !< - !$acc declare create(q_prim_qp) + $:GPU_DECLARE(create='[q_prim_qp]') !> @name The first-order spatial derivatives of the primitive variables at cell- !! interior Gaussian quadrature points. These are WENO-reconstructed from @@ -90,7 +90,7 @@ module m_rhs !! of the primitive variables, located in qK_prim_n, where K = L or R. 
!> @{ type(vector_field), allocatable, dimension(:) :: dq_prim_dx_qp, dq_prim_dy_qp, dq_prim_dz_qp - !$acc declare create(dq_prim_dx_qp, dq_prim_dy_qp, dq_prim_dz_qp) + $:GPU_DECLARE(create='[dq_prim_dx_qp,dq_prim_dy_qp,dq_prim_dz_qp]') !> @} !> @name The left and right WENO-reconstructed cell-boundary values of the cell- @@ -100,26 +100,26 @@ module m_rhs !> @{ type(vector_field), allocatable, dimension(:) :: dqL_prim_dx_n, dqL_prim_dy_n, dqL_prim_dz_n type(vector_field), allocatable, dimension(:) :: dqR_prim_dx_n, dqR_prim_dy_n, dqR_prim_dz_n - !$acc declare create(dqL_prim_dx_n, dqL_prim_dy_n, dqL_prim_dz_n) - !$acc declare create(dqR_prim_dx_n, dqR_prim_dy_n, dqR_prim_dz_n) + $:GPU_DECLARE(create='[dqL_prim_dx_n,dqL_prim_dy_n,dqL_prim_dz_n]') + $:GPU_DECLARE(create='[dqR_prim_dx_n,dqR_prim_dy_n,dqR_prim_dz_n]') !> @} type(scalar_field), allocatable, dimension(:) :: tau_Re_vf - !$acc declare create(tau_Re_vf) + $:GPU_DECLARE(create='[tau_Re_vf]') type(vector_field) :: gm_alpha_qp !< !! The gradient magnitude of the volume fractions at cell-interior Gaussian !! quadrature points. gm_alpha_qp is calculated from individual first-order !! spatial derivatives located in dq_prim_ds_qp. - !$acc declare create(gm_alpha_qp) + $:GPU_DECLARE(create='[gm_alpha_qp]') !> @name The left and right WENO-reconstructed cell-boundary values of the cell- !! average gradient magnitude of volume fractions, located in gm_alpha_qp. !> @{ type(vector_field), allocatable, dimension(:) :: gm_alphaL_n type(vector_field), allocatable, dimension(:) :: gm_alphaR_n - !$acc declare create(gm_alphaL_n, gm_alphaR_n) + $:GPU_DECLARE(create='[gm_alphaL_n,gm_alphaR_n]') !> @} !> @name The cell-boundary values of the fluxes (src - source, gsrc - geometrical @@ -129,38 +129,38 @@ module m_rhs type(vector_field), allocatable, dimension(:) :: flux_n type(vector_field), allocatable, dimension(:) :: flux_src_n type(vector_field), allocatable, dimension(:) :: flux_gsrc_n - !$acc declare create(flux_n, flux_src_n, flux_gsrc_n) + $:GPU_DECLARE(create='[flux_n,flux_src_n,flux_gsrc_n]') !> @} type(vector_field), allocatable, dimension(:) :: qL_prim, qR_prim - !$acc declare create(qL_prim, qR_prim) + $:GPU_DECLARE(create='[qL_prim,qR_prim]') type(int_bounds_info) :: iv !< Vector field indical bounds - !$acc declare create(iv) + $:GPU_DECLARE(create='[iv]') !> @name Indical bounds in the x-, y- and z-directions !> @{ type(int_bounds_info) :: irx, iry, irz - !$acc declare create(irx, iry, irz) + $:GPU_DECLARE(create='[irx,iry,irz]') type(int_bounds_info) :: is1, is2, is3 - !$acc declare create(is1, is2, is3) + $:GPU_DECLARE(create='[is1,is2,is3]') !> @name Saved fluxes for testing !> @{ type(scalar_field) :: alf_sum !> @} - !$acc declare create(alf_sum) + $:GPU_DECLARE(create='[alf_sum]') real(wp), allocatable, dimension(:, :, :) :: blkmod1, blkmod2, alpha1, alpha2, Kterm real(wp), allocatable, dimension(:, :, :, :) :: qL_rsx_vf, qL_rsy_vf, qL_rsz_vf, qR_rsx_vf, qR_rsy_vf, qR_rsz_vf real(wp), allocatable, dimension(:, :, :, :) :: dqL_rsx_vf, dqL_rsy_vf, dqL_rsz_vf, dqR_rsx_vf, dqR_rsy_vf, dqR_rsz_vf - !$acc declare create(blkmod1, blkmod2, alpha1, alpha2, Kterm) - !$acc declare create(qL_rsx_vf, qL_rsy_vf, qL_rsz_vf, qR_rsx_vf, qR_rsy_vf, qR_rsz_vf) - !$acc declare create(dqL_rsx_vf, dqL_rsy_vf, dqL_rsz_vf, dqR_rsx_vf, dqR_rsy_vf, dqR_rsz_vf) + $:GPU_DECLARE(create='[blkmod1,blkmod2,alpha1,alpha2,Kterm]') + $:GPU_DECLARE(create='[qL_rsx_vf,qL_rsy_vf,qL_rsz_vf,qR_rsx_vf,qR_rsy_vf,qR_rsz_vf]') + 
$:GPU_DECLARE(create='[dqL_rsx_vf,dqL_rsy_vf,dqL_rsz_vf,dqR_rsx_vf,dqR_rsy_vf,dqR_rsz_vf]') real(wp), allocatable, dimension(:, :, :) :: nbub !< Bubble number density - !$acc declare create(nbub) + $:GPU_DECLARE(create='[nbub]') contains @@ -173,8 +173,8 @@ contains integer :: num_eqns_after_adv - !$acc enter data copyin(idwbuff, idwbuff) - !$acc update device(idwbuff, idwbuff) + $:GPU_ENTER_DATA(copyin='[idwbuff]') + $:GPU_UPDATE(device='[idwbuff]') @:ALLOCATE(q_cons_qp%vf(1:sys_size)) @:ALLOCATE(q_prim_qp%vf(1:sys_size)) @@ -201,29 +201,29 @@ contains @:ALLOCATE(q_prim_qp%vf(l)%sf(idwbuff(1)%beg:idwbuff(1)%end, idwbuff(2)%beg:idwbuff(2)%end, idwbuff(3)%beg:idwbuff(3)%end)) else q_prim_qp%vf(l)%sf => q_cons_qp%vf(l)%sf - !$acc enter data copyin(q_prim_qp%vf(l)%sf) - !$acc enter data attach(q_prim_qp%vf(l)%sf) + $:GPU_ENTER_DATA(copyin='[q_prim_qp%vf(l)%sf]') + $:GPU_ENTER_DATA(attach='[q_prim_qp%vf(l)%sf]') end if end do do l = adv_idx%beg, adv_idx%end q_prim_qp%vf(l)%sf => q_cons_qp%vf(l)%sf - !$acc enter data copyin(q_prim_qp%vf(l)%sf) - !$acc enter data attach(q_prim_qp%vf(l)%sf) + $:GPU_ENTER_DATA(copyin='[q_prim_qp%vf(l)%sf]') + $:GPU_ENTER_DATA(attach='[q_prim_qp%vf(l)%sf]') end do if (surface_tension) then q_prim_qp%vf(c_idx)%sf => & q_cons_qp%vf(c_idx)%sf - !$acc enter data copyin(q_prim_qp%vf(c_idx)%sf) - !$acc enter data attach(q_prim_qp%vf(c_idx)%sf) + $:GPU_ENTER_DATA(copyin='[q_prim_qp%vf(c_idx)%sf]') + $:GPU_ENTER_DATA(attach='[q_prim_qp%vf(c_idx)%sf]') end if if (cont_damage) then q_prim_qp%vf(damage_idx)%sf => & q_cons_qp%vf(damage_idx)%sf - !$acc enter data copyin(q_prim_qp%vf(damage_idx)%sf) - !$acc enter data attach(q_prim_qp%vf(damage_idx)%sf) + $:GPU_ENTER_DATA(copyin='[q_prim_qp%vf(damage_idx)%sf]') + $:GPU_ENTER_DATA(attach='[q_prim_qp%vf(damage_idx)%sf]') end if if (viscous) then @@ -548,14 +548,14 @@ contains if (riemann_solver /= 1 .and. riemann_solver /= 4) then do l = adv_idx%beg + 1, adv_idx%end flux_src_n(i)%vf(l)%sf => flux_src_n(i)%vf(adv_idx%beg)%sf - !$acc enter data attach(flux_src_n(i)%vf(l)%sf) + $:GPU_ENTER_DATA(attach='[flux_src_n(i)%vf(l)%sf]') end do end if else do l = 1, sys_size flux_n(i)%vf(l)%sf => flux_n(1)%vf(l)%sf flux_src_n(i)%vf(l)%sf => flux_src_n(1)%vf(l)%sf - !$acc enter data attach(flux_n(i)%vf(l)%sf,flux_src_n(i)%vf(l)%sf) + $:GPU_ENTER_DATA(attach='[flux_n(i)%vf(l)%sf,flux_src_n(i)%vf(l)%sf]') end do end if end do @@ -568,7 +568,7 @@ contains call s_initialize_pressure_relaxation_module - !$acc parallel loop collapse(4) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=4) do id = 1, num_dims do i = 1, sys_size do l = idwbuff(3)%beg, idwbuff(3)%end @@ -607,7 +607,7 @@ contains call cpu_time(t_start) ! Association/Population of Working Variables - !$acc parallel loop collapse(4) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=4) do i = 1, sys_size do l = idwbuff(3)%beg, idwbuff(3)%end do k = idwbuff(2)%beg, idwbuff(2)%end @@ -621,16 +621,16 @@ contains ! Converting Conservative to Primitive Variables if (mpp_lim .and. 
bubbles_euler) then - !$acc parallel loop collapse(3) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=3) do l = idwbuff(3)%beg, idwbuff(3)%end do k = idwbuff(2)%beg, idwbuff(2)%end do j = idwbuff(1)%beg, idwbuff(1)%end alf_sum%sf(j, k, l) = 0._wp - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = advxb, advxe - 1 alf_sum%sf(j, k, l) = alf_sum%sf(j, k, l) + q_cons_qp%vf(i)%sf(j, k, l) end do - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = advxb, advxe - 1 q_cons_qp%vf(i)%sf(j, k, l) = q_cons_qp%vf(i)%sf(j, k, l)*(1._wp - q_cons_qp%vf(alf_idx)%sf(j, k, l)) & /alf_sum%sf(j, k, l) @@ -839,7 +839,7 @@ contains ! END: Dimensional Splitting Loop if (ib) then - !$acc parallel loop collapse(3) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=3) do l = 0, p do k = 0, n do j = 0, m @@ -903,7 +903,7 @@ contains ! END: Additional physics and source terms if (run_time_info .or. probe_wrt .or. ib .or. bubbles_lagrange) then - !$acc parallel loop collapse(4) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=4) do i = 1, sys_size do l = idwbuff(3)%beg, idwbuff(3)%end do k = idwbuff(2)%beg, idwbuff(2)%end @@ -943,7 +943,7 @@ contains real(wp) :: advected_qty_val, pressure_val, velocity_val if (alt_soundspeed) then - !$acc parallel loop collapse(3) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=3) do q_loop = 0, p do l_loop = 0, n do k_loop = 0, m @@ -977,7 +977,7 @@ contains call s_cbc(q_prim_vf%vf, flux_n(idir)%vf, flux_src_n_vf%vf, idir, 1, irx, iry, irz) end if - !$acc parallel loop collapse(4) gang vector default(present) private(inv_ds, flux_face1, flux_face2) + $:GPU_PARALLEL_LOOP(collapse=4,private='[inv_ds,flux_face1,flux_face2]') do j = 1, sys_size do q_loop = 0, p do l_loop = 0, n @@ -992,8 +992,8 @@ contains end do if (model_eqns == 3) then - !$acc parallel loop collapse(4) gang vector default(present) & - !$acc private(inv_ds, advected_qty_val, pressure_val, flux_face1, flux_face2) + $:GPU_PARALLEL_LOOP(collapse=4,private='[inv_ds,advected_qty_val, & + & pressure_val,flux_face1,flux_face2]') do q_loop = 0, p do l_loop = 0, n do k_loop = 0, m @@ -1022,7 +1022,7 @@ contains call s_cbc(q_prim_vf%vf, flux_n(idir)%vf, flux_src_n_vf%vf, idir, 1, irx, iry, irz) end if - !$acc parallel loop collapse(4) gang vector default(present) private(inv_ds, flux_face1, flux_face2) + $:GPU_PARALLEL_LOOP(collapse=4,private='[inv_ds,flux_face1,flux_face2]') do j = 1, sys_size do l = 0, p do k = 0, n @@ -1037,8 +1037,8 @@ contains end do if (model_eqns == 3) then - !$acc parallel loop collapse(4) gang vector default(present) & - !$acc private(inv_ds, advected_qty_val, pressure_val, flux_face1, flux_face2) + $:GPU_PARALLEL_LOOP(collapse=4,private='[inv_ds,advected_qty_val, & + & pressure_val,flux_face1,flux_face2]') do l = 0, p do k = 0, n do q = 0, m @@ -1063,7 +1063,7 @@ contains end if if (cyl_coord) then - !$acc parallel loop collapse(4) gang vector default(present) private(flux_face1, flux_face2) + $:GPU_PARALLEL_LOOP(collapse=4,private='[flux_face1,flux_face2]') do j = 1, sys_size do l = 0, p do k = 0, n @@ -1089,8 +1089,8 @@ contains end if if (grid_geometry == 3) then ! 
Cylindrical Coordinates - !$acc parallel loop collapse(4) gang vector default(present) & - !$acc private(inv_ds, velocity_val, flux_face1, flux_face2) + $:GPU_PARALLEL_LOOP(collapse=4,private='[inv_ds,velocity_val, & + & flux_face1,flux_face2]') do j = 1, sys_size do k = 0, p do q = 0, n @@ -1105,7 +1105,7 @@ contains end do end do end do - !$acc parallel loop collapse(4) gang vector default(present) private(flux_face1, flux_face2) + $:GPU_PARALLEL_LOOP(collapse=4,private='[flux_face1,flux_face2]') do j = 1, sys_size do k = 0, p do q = 0, n @@ -1119,7 +1119,7 @@ contains end do end do else ! Cartesian Coordinates - !$acc parallel loop collapse(4) gang vector default(present) private(inv_ds, flux_face1, flux_face2) + $:GPU_PARALLEL_LOOP(collapse=4,private='[inv_ds,flux_face1,flux_face2]') do j = 1, sys_size do k = 0, p do q = 0, n @@ -1135,8 +1135,8 @@ contains end if if (model_eqns == 3) then - !$acc parallel loop collapse(4) gang vector default(present) & - !$acc private(inv_ds, advected_qty_val, pressure_val, flux_face1, flux_face2) + $:GPU_PARALLEL_LOOP(collapse=4,private='[inv_ds,advected_qty_val, & + & pressure_val,flux_face1,flux_face2]') do k = 0, p do q = 0, n do l = 0, m @@ -1180,8 +1180,8 @@ contains case (1) ! x-direction use_standard_riemann = (riemann_solver == 1 .or. riemann_solver == 4) if (use_standard_riemann) then - !$acc parallel loop collapse(4) gang vector default(present) & - !$acc private(local_inv_ds, local_term_coeff, local_flux1, local_flux2) + $:GPU_PARALLEL_LOOP(collapse=4,private='[local_inv_ds, & + & local_term_coeff,local_flux1,local_flux2]') do j_adv = advxb, advxe do q_idx = 0, p ! z_extent do l_idx = 0, n ! y_extent @@ -1199,8 +1199,9 @@ contains else ! Other Riemann solvers if (alt_soundspeed) then if (bubbles_euler .neqv. .true.) then - !$acc parallel loop collapse(3) gang vector default(present) & - !$acc private(local_inv_ds, local_q_cons_val, local_k_term_val, local_term_coeff, local_flux1, local_flux2) + $:GPU_PARALLEL_LOOP(collapse=3, private='[local_inv_ds, & + & local_q_cons_val, local_k_term_val, & + & local_term_coeff, local_flux1, local_flux2]') do q_idx = 0, p; do l_idx = 0, n; do k_idx = 0, m local_inv_ds = 1._wp/dx(k_idx) local_q_cons_val = q_cons_vf_arg%vf(advxe)%sf(k_idx, l_idx, q_idx) @@ -1212,8 +1213,10 @@ contains local_inv_ds*local_term_coeff*(local_flux1 - local_flux2) end do; end do; end do - !$acc parallel loop collapse(3) gang vector default(present) & - !$acc private(local_inv_ds, local_q_cons_val, local_k_term_val, local_term_coeff, local_flux1, local_flux2) + $:GPU_PARALLEL_LOOP(collapse=3, private='[local_inv_ds,& + & local_q_cons_val, local_k_term_val, & + & local_term_coeff, local_flux1, & + & local_flux2]') do q_idx = 0, p; do l_idx = 0, n; do k_idx = 0, m local_inv_ds = 1._wp/dx(k_idx) local_q_cons_val = q_cons_vf_arg%vf(advxb)%sf(k_idx, l_idx, q_idx) @@ -1226,8 +1229,8 @@ contains end do; end do; end do end if else ! NOT alt_soundspeed - !$acc parallel loop collapse(4) gang vector default(present) & - !$acc private(local_inv_ds, local_term_coeff, local_flux1, local_flux2) + $:GPU_PARALLEL_LOOP(collapse=4,private='[local_inv_ds, & + & local_term_coeff,local_flux1,local_flux2]') do j_adv = advxb, advxe do q_idx = 0, p; do l_idx = 0, n; do k_idx = 0, m local_inv_ds = 1._wp/dx(k_idx) @@ -1244,8 +1247,8 @@ contains case (2) ! y-direction: loops q_idx (x), k_idx (y), l_idx (z); sf(q_idx, k_idx, l_idx); dy(k_idx); Kterm(q_idx,k_idx,l_idx) use_standard_riemann = (riemann_solver == 1 .or. 
riemann_solver == 4) if (use_standard_riemann) then - !$acc parallel loop collapse(4) gang vector default(present) & - !$acc private(local_inv_ds, local_term_coeff, local_flux1, local_flux2) + $:GPU_PARALLEL_LOOP(collapse=4,private='[local_inv_ds, & + & local_term_coeff,local_flux1,local_flux2]') do j_adv = advxb, advxe do l_idx = 0, p ! z_extent do k_idx = 0, n ! y_extent @@ -1263,8 +1266,10 @@ contains else ! Other Riemann solvers if (alt_soundspeed) then if (bubbles_euler .neqv. .true.) then - !$acc parallel loop collapse(3) gang vector default(present) & - !$acc private(local_inv_ds, local_q_cons_val, local_k_term_val, local_term_coeff, local_flux1, local_flux2) + $:GPU_PARALLEL_LOOP(collapse=3, private='[local_inv_ds, & + & local_q_cons_val, local_k_term_val, & + & local_term_coeff, local_flux1, & + & local_flux2]') do l_idx = 0, p; do k_idx = 0, n; do q_idx = 0, m local_inv_ds = 1._wp/dy(k_idx) local_q_cons_val = q_cons_vf_arg%vf(advxe)%sf(q_idx, k_idx, l_idx) @@ -1280,8 +1285,10 @@ contains end if end do; end do; end do - !$acc parallel loop collapse(3) gang vector default(present) & - !$acc private(local_inv_ds, local_q_cons_val, local_k_term_val, local_term_coeff, local_flux1, local_flux2) + $:GPU_PARALLEL_LOOP(collapse=3, private='[local_inv_ds, & + & local_q_cons_val, local_k_term_val, & + & local_term_coeff, local_flux1, & + & local_flux2]') do l_idx = 0, p; do k_idx = 0, n; do q_idx = 0, m local_inv_ds = 1._wp/dy(k_idx) local_q_cons_val = q_cons_vf_arg%vf(advxb)%sf(q_idx, k_idx, l_idx) @@ -1298,8 +1305,8 @@ contains end do; end do; end do end if else ! NOT alt_soundspeed - !$acc parallel loop collapse(4) gang vector default(present) & - !$acc private(local_inv_ds, local_term_coeff, local_flux1, local_flux2) + $:GPU_PARALLEL_LOOP(collapse=4,private='[local_inv_ds, & + & local_term_coeff,local_flux1,local_flux2]') do j_adv = advxb, advxe do l_idx = 0, p; do k_idx = 0, n; do q_idx = 0, m local_inv_ds = 1._wp/dy(k_idx) @@ -1321,8 +1328,8 @@ contains end if if (use_standard_riemann) then - !$acc parallel loop collapse(4) gang vector default(present) & - !$acc private(local_inv_ds, local_term_coeff, local_flux1, local_flux2) + $:GPU_PARALLEL_LOOP(collapse=4,private='[local_inv_ds, & + & local_term_coeff,local_flux1,local_flux2]') do j_adv = advxb, advxe do k_idx = 0, p ! z_extent do q_idx = 0, n ! y_extent @@ -1340,8 +1347,10 @@ contains else ! Other Riemann solvers if (alt_soundspeed) then if (bubbles_euler .neqv. .true.) 
then - !$acc parallel loop collapse(3) gang vector default(present) & - !$acc private(local_inv_ds, local_q_cons_val, local_k_term_val, local_term_coeff, local_flux1, local_flux2) + $:GPU_PARALLEL_LOOP(collapse=3, private='[local_inv_ds, & + & local_q_cons_val, local_k_term_val, & + & local_term_coeff, local_flux1, & + & local_flux2]') do k_idx = 0, p; do q_idx = 0, n; do l_idx = 0, m local_inv_ds = 1._wp/dz(k_idx) local_q_cons_val = q_cons_vf_arg%vf(advxe)%sf(l_idx, q_idx, k_idx) @@ -1353,8 +1362,10 @@ contains local_inv_ds*local_term_coeff*(local_flux1 - local_flux2) end do; end do; end do - !$acc parallel loop collapse(3) gang vector default(present) & - !$acc private(local_inv_ds, local_q_cons_val, local_k_term_val, local_term_coeff, local_flux1, local_flux2) + $:GPU_PARALLEL_LOOP(collapse=3, private='[local_inv_ds, & + & local_q_cons_val, local_k_term_val, & + & local_term_coeff, local_flux1, & + & local_flux2]') do k_idx = 0, p; do q_idx = 0, n; do l_idx = 0, m local_inv_ds = 1._wp/dz(k_idx) local_q_cons_val = q_cons_vf_arg%vf(advxb)%sf(l_idx, q_idx, k_idx) @@ -1367,8 +1378,8 @@ contains end do; end do; end do end if else ! NOT alt_soundspeed - !$acc parallel loop collapse(4) gang vector default(present) & - !$acc private(local_inv_ds, local_term_coeff, local_flux1, local_flux2) + $:GPU_PARALLEL_LOOP(collapse=4,private='[local_inv_ds, & + & local_term_coeff,local_flux1,local_flux2]') do j_adv = advxb, advxe do k_idx = 0, p; do q_idx = 0, n; do l_idx = 0, m local_inv_ds = 1._wp/dz(k_idx) @@ -1400,7 +1411,7 @@ contains if (idir == 1) then ! x-direction if (surface_tension) then - !$acc parallel loop collapse(3) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=3) do l = 0, p do k = 0, n do j = 0, m @@ -1414,11 +1425,11 @@ contains end do end if - !$acc parallel loop collapse(3) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=3) do l = 0, p do k = 0, n do j = 0, m - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = momxb, E_idx rhs_vf(i)%sf(j, k, l) = & rhs_vf(i)%sf(j, k, l) + 1._wp/dx(j)* & @@ -1432,7 +1443,7 @@ contains elseif (idir == 2) then ! y-direction if (surface_tension) then - !$acc parallel loop collapse(3) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=3) do l = 0, p do k = 0, n do j = 0, m @@ -1464,10 +1475,10 @@ contains idwbuff(1), idwbuff(2), idwbuff(3)) end if - !$acc parallel loop collapse(2) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=2) do l = 0, p do j = 0, m - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = momxb, E_idx rhs_vf(i)%sf(j, 0, l) = & rhs_vf(i)%sf(j, 0, l) + 1._wp/(y_cc(1) - y_cc(-1))* & @@ -1479,11 +1490,11 @@ contains end if - !$acc parallel loop collapse(3) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=3) do l = 0, p do k = 1, n do j = 0, m - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = momxb, E_idx rhs_vf(i)%sf(j, k, l) = & rhs_vf(i)%sf(j, k, l) + 1._wp/dy(k)* & @@ -1495,11 +1506,11 @@ contains end do else - !$acc parallel loop collapse(3) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=3) do l = 0, p do k = 0, n do j = 0, m - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = momxb, E_idx rhs_vf(i)%sf(j, k, l) = & rhs_vf(i)%sf(j, k, l) + 1._wp/dy(k)* & @@ -1516,11 +1527,11 @@ contains if (cyl_coord) then if ((bc_y%beg == BC_REFLECTIVE) .or. 
(bc_y%beg == BC_AXIS)) then - !$acc parallel loop collapse(3) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=3) do l = 0, p do k = 1, n do j = 0, m - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = momxb, E_idx rhs_vf(i)%sf(j, k, l) = & rhs_vf(i)%sf(j, k, l) - 5.e-1_wp/y_cc(k)* & @@ -1532,10 +1543,10 @@ contains end do if (viscous) then - !$acc parallel loop collapse(2) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=2) do l = 0, p do j = 0, m - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = momxb, E_idx rhs_vf(i)%sf(j, 0, l) = & rhs_vf(i)%sf(j, 0, l) - 1._wp/y_cc(0)* & @@ -1546,11 +1557,11 @@ contains end if else - !$acc parallel loop collapse(3) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=3) do l = 0, p do k = 0, n do j = 0, m - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = momxb, E_idx rhs_vf(i)%sf(j, k, l) = & rhs_vf(i)%sf(j, k, l) - 5.e-1_wp/y_cc(k)* & @@ -1567,7 +1578,7 @@ contains elseif (idir == 3) then ! z-direction if (surface_tension) then - !$acc parallel loop collapse(3) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=3) do l = 0, p do k = 0, n do j = 0, m @@ -1581,11 +1592,11 @@ contains end do end if - !$acc parallel loop collapse(3) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=3) do l = 0, p do k = 0, n do j = 0, m - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = momxb, E_idx rhs_vf(i)%sf(j, k, l) = & rhs_vf(i)%sf(j, k, l) + 1._wp/dz(l)* & @@ -1597,7 +1608,7 @@ contains end do if (grid_geometry == 3) then - !$acc parallel loop collapse(3) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=3) do l = 0, p do k = 0, n do j = 0, m @@ -1717,10 +1728,10 @@ contains end if - !$acc update device(is1, is2, is3, iv) + $:GPU_UPDATE(device='[is1,is2,is3,iv]') if (recon_dir == 1) then - !$acc parallel loop collapse(4) default(present) + $:GPU_PARALLEL_LOOP(collapse=4) do i = iv%beg, iv%end do l = is3%beg, is3%end do k = is2%beg, is2%end @@ -1731,9 +1742,8 @@ contains end do end do end do - !$acc end parallel loop else if (recon_dir == 2) then - !$acc parallel loop collapse(4) default(present) + $:GPU_PARALLEL_LOOP(collapse=4) do i = iv%beg, iv%end do l = is3%beg, is3%end do k = is2%beg, is2%end @@ -1744,9 +1754,8 @@ contains end do end do end do - !$acc end parallel loop else if (recon_dir == 3) then - !$acc parallel loop collapse(4) default(present) + $:GPU_PARALLEL_LOOP(collapse=4) do i = iv%beg, iv%end do l = is3%beg, is3%end do k = is2%beg, is2%end @@ -1757,7 +1766,6 @@ contains end do end do end do - !$acc end parallel loop end if end subroutine s_reconstruct_cell_boundary_values_first_order @@ -1775,13 +1783,13 @@ contains @:DEALLOCATE(q_cons_qp%vf(j)%sf) @:DEALLOCATE(q_prim_qp%vf(j)%sf) else - !$acc exit data detach(q_prim_qp%vf(j)%sf) + $:GPU_EXIT_DATA(detach='[q_prim_qp%vf(j)%sf]') nullify (q_prim_qp%vf(j)%sf) end if end do do j = adv_idx%beg, adv_idx%end - !$acc exit data detach(q_prim_qp%vf(j)%sf) + $:GPU_EXIT_DATA(detach='[q_prim_qp%vf(j)%sf]') nullify (q_prim_qp%vf(j)%sf) end do @@ -1814,7 +1822,7 @@ contains end if if (mpp_lim .and. 
bubbles_euler) then - !$acc exit data delete(alf_sum%sf) + $:GPU_EXIT_DATA(delete='[alf_sum%sf]') deallocate (alf_sum%sf) end if diff --git a/src/simulation/m_riemann_solvers.fpp b/src/simulation/m_riemann_solvers.fpp index 8b9d0cf043..a189d7e3c3 100644 --- a/src/simulation/m_riemann_solvers.fpp +++ b/src/simulation/m_riemann_solvers.fpp @@ -67,8 +67,7 @@ module m_riemann_solvers real(wp), allocatable, dimension(:, :, :, :) :: flux_rsx_vf, flux_src_rsx_vf real(wp), allocatable, dimension(:, :, :, :) :: flux_rsy_vf, flux_src_rsy_vf real(wp), allocatable, dimension(:, :, :, :) :: flux_rsz_vf, flux_src_rsz_vf - !$acc declare create( flux_rsx_vf, flux_src_rsx_vf, flux_rsy_vf, & - !$acc flux_src_rsy_vf, flux_rsz_vf, flux_src_rsz_vf ) + $:GPU_DECLARE(create='[flux_rsx_vf,flux_src_rsx_vf,flux_rsy_vf,flux_src_rsy_vf,flux_rsz_vf,flux_src_rsz_vf]') !> @} !> The cell-boundary values of the geometrical source flux that are computed @@ -79,7 +78,7 @@ module m_riemann_solvers real(wp), allocatable, dimension(:, :, :, :) :: flux_gsrc_rsx_vf !< real(wp), allocatable, dimension(:, :, :, :) :: flux_gsrc_rsy_vf !< real(wp), allocatable, dimension(:, :, :, :) :: flux_gsrc_rsz_vf !< - !$acc declare create( flux_gsrc_rsx_vf, flux_gsrc_rsy_vf, flux_gsrc_rsz_vf ) + $:GPU_DECLARE(create='[flux_gsrc_rsx_vf,flux_gsrc_rsy_vf,flux_gsrc_rsz_vf]') !> @} ! The cell-boundary values of the velocity. vel_src_rs_vf is determined as @@ -88,17 +87,17 @@ module m_riemann_solvers real(wp), allocatable, dimension(:, :, :, :) :: vel_src_rsx_vf real(wp), allocatable, dimension(:, :, :, :) :: vel_src_rsy_vf real(wp), allocatable, dimension(:, :, :, :) :: vel_src_rsz_vf - !$acc declare create(vel_src_rsx_vf, vel_src_rsy_vf, vel_src_rsz_vf) + $:GPU_DECLARE(create='[vel_src_rsx_vf,vel_src_rsy_vf,vel_src_rsz_vf]') real(wp), allocatable, dimension(:, :, :, :) :: mom_sp_rsx_vf real(wp), allocatable, dimension(:, :, :, :) :: mom_sp_rsy_vf real(wp), allocatable, dimension(:, :, :, :) :: mom_sp_rsz_vf - !$acc declare create(mom_sp_rsx_vf, mom_sp_rsy_vf, mom_sp_rsz_vf) + $:GPU_DECLARE(create='[mom_sp_rsx_vf,mom_sp_rsy_vf,mom_sp_rsz_vf]') real(wp), allocatable, dimension(:, :, :, :) :: Re_avg_rsx_vf real(wp), allocatable, dimension(:, :, :, :) :: Re_avg_rsy_vf real(wp), allocatable, dimension(:, :, :, :) :: Re_avg_rsz_vf - !$acc declare create(Re_avg_rsx_vf, Re_avg_rsy_vf, Re_avg_rsz_vf) + $:GPU_DECLARE(create='[Re_avg_rsx_vf,Re_avg_rsy_vf,Re_avg_rsz_vf]') !> @name Indical bounds in the s1-, s2- and s3-directions !> @{ @@ -106,13 +105,13 @@ module m_riemann_solvers type(int_bounds_info) :: isx, isy, isz !> @} - !$acc declare create(is1, is2, is3, isx, isy, isz) + $:GPU_DECLARE(create='[is1,is2,is3,isx,isy,isz]') real(wp), allocatable, dimension(:) :: Gs - !$acc declare create(Gs) + $:GPU_DECLARE(create='[Gs]') real(wp), allocatable, dimension(:, :) :: Res - !$acc declare create(Res) + $:GPU_DECLARE(create='[Res]') contains @@ -358,19 +357,18 @@ contains #:for NORM_DIR, XYZ in [(1, 'x'), (2, 'y'), (3, 'z')] if (norm_dir == ${NORM_DIR}$) then - !$acc parallel loop collapse(3) gang vector default(present) & - !$acc private(alpha_rho_L, alpha_rho_R, vel_L, vel_R, alpha_L, & - !$acc alpha_R, tau_e_L, tau_e_R, G_L, G_R, Re_L, Re_R, & - !$acc rho_avg, h_avg, gamma_avg, s_L, s_R, s_S, Ys_L, Ys_R, & - !$acc xi_field_L, xi_field_R, & - !$acc Cp_iL, Cp_iR, Xs_L, Xs_R, Gamma_iL, Gamma_iR, & - !$acc Yi_avg, Phi_avg, h_iL, h_iR, h_avg_2, & - !$acc c_fast, pres_mag, B, Ga, vdotB, B2, b4, cm, & - !$acc pcorr, zcoef, vel_L_tmp, vel_R_tmp) + 
$:GPU_PARALLEL_LOOP(collapse=3, private='[alpha_rho_L, alpha_rho_R, & + & vel_L, vel_R, alpha_L, alpha_R, tau_e_L, tau_e_R, & + & G_L, G_R, Re_L, Re_R, rho_avg, h_avg, gamma_avg, & + & s_L, s_R, s_S, Ys_L, Ys_R, xi_field_L, xi_field_R, & + & Cp_iL, Cp_iR, Xs_L, Xs_R, Gamma_iL, Gamma_iR, & + & Yi_avg, Phi_avg, h_iL, h_iR, h_avg_2, c_fast, & + & pres_mag, B, Ga, vdotB, B2, b4, cm, pcorr, & + & zcoef, vel_L_tmp, vel_R_tmp]') do l = is3%beg, is3%end do k = is2%beg, is2%end do j = is1%beg, is1%end - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, contxe alpha_rho_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, i) alpha_rho_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, i) @@ -378,7 +376,7 @@ contains vel_L_rms = 0._wp; vel_R_rms = 0._wp - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, num_vels vel_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, contxe + i) vel_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, contxe + i) @@ -386,7 +384,7 @@ contains vel_R_rms = vel_R_rms + vel_R(i)**2._wp end do - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, num_fluids alpha_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i) alpha_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + i) @@ -430,7 +428,7 @@ contains pres_mag%R = 0._wp if (mpp_lim) then - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, num_fluids alpha_rho_L(i) = max(0._wp, alpha_rho_L(i)) alpha_L(i) = min(max(0._wp, alpha_L(i)), 1._wp) @@ -444,7 +442,7 @@ contains alpha_R = alpha_R/max(alpha_R_sum, sgm_eps) end if - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, num_fluids rho_L = rho_L + alpha_rho_L(i) gamma_L = gamma_L + alpha_L(i)*gammas(i) @@ -458,7 +456,7 @@ contains end do if (viscous) then - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, 2 Re_L(i) = dflt_real Re_R(i) = dflt_real @@ -466,7 +464,7 @@ contains if (Re_size(i) > 0) Re_L(i) = 0._wp if (Re_size(i) > 0) Re_R(i) = 0._wp - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do q = 1, Re_size(i) Re_L(i) = alpha_L(Re_idx(i, q))/Res(i, q) & + Re_L(i) @@ -480,7 +478,7 @@ contains end if if (chemistry) then - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = chemxb, chemxe Ys_L(i - chemxb + 1) = qL_prim_rs${XYZ}$_vf(j, k, l, i) Ys_R(i - chemxb + 1) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, i) @@ -568,7 +566,7 @@ contains if (hypoelasticity) then G_L = 0._wp; G_R = 0._wp - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, num_fluids G_L = G_L + alpha_L(i)*Gs(i) G_R = G_R + alpha_R(i)*Gs(i) @@ -601,7 +599,7 @@ contains ! G_L = 0._wp ! G_R = 0._wp ! - ! !$acc loop seq + ! $:GPU_LOOP(parallelism='[seq]') ! do i = 1, num_fluids ! G_L = G_L + alpha_L(i)*Gs(i) ! G_R = G_R + alpha_R(i)*Gs(i) @@ -610,17 +608,17 @@ contains ! if ((G_L > 1.e-3_wp) .and. (G_R > 1.e-3_wp)) then ! E_L = E_L + G_L*qL_prim_rs${XYZ}$_vf(j, k, l, xiend + 1) ! E_R = E_R + G_R*qR_prim_rs${XYZ}$_vf(j + 1, k, l, xiend + 1) - ! !$acc loop seq + ! $:GPU_LOOP(parallelism='[seq]') ! do i = 1, b_size-1 ! tau_e_L(i) = G_L*qL_prim_rs${XYZ}$_vf(j, k, l, strxb - 1 + i) ! tau_e_R(i) = G_R*qR_prim_rs${XYZ}$_vf(j + 1, k, l, strxb - 1 + i) ! end do - ! !$acc loop seq + ! $:GPU_LOOP(parallelism='[seq]') ! do i = 1, b_size-1 ! tau_e_L(i) = 0._wp ! tau_e_R(i) = 0._wp ! end do - ! !$acc loop seq + ! $:GPU_LOOP(parallelism='[seq]') ! do i = 1, num_dims ! xi_field_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, xibeg - 1 + i) ! 
xi_field_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, xibeg - 1 + i) @@ -648,7 +646,7 @@ contains end if if (viscous) then - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, 2 Re_avg_rs${XYZ}$_vf(j, k, l, i) = 2._wp/(1._wp/Re_L(i) + 1._wp/Re_R(i)) end do @@ -727,7 +725,7 @@ contains ! Mass if (.not. relativity) then - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, contxe flux_rs${XYZ}$_vf(j, k, l, i) = & (s_M*alpha_rho_R(i)*vel_R(norm_dir) & @@ -737,7 +735,7 @@ contains /(s_M - s_P) end do elseif (relativity) then - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, contxe flux_rs${XYZ}$_vf(j, k, l, i) = & (s_M*Ga%R*alpha_rho_R(i)*vel_R(norm_dir) & @@ -750,7 +748,7 @@ contains ! Momentum if (mhd .and. (.not. relativity)) then - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, 3 ! Flux of rho*v_i in the ${XYZ}$ direction ! = rho * v_i * v_${XYZ}$ - B_i * B_${XYZ}$ + delta_(${XYZ}$,i) * p_tot @@ -765,7 +763,7 @@ contains /(s_M - s_P) end do elseif (mhd .and. relativity) then - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, 3 ! Flux of m_i in the ${XYZ}$ direction ! = m_i * v_${XYZ}$ - b_i/Gamma * B_${XYZ}$ + delta_(${XYZ}$,i) * p_tot @@ -780,7 +778,7 @@ contains /(s_M - s_P) end do elseif (bubbles_euler) then - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, num_vels flux_rs${XYZ}$_vf(j, k, l, contxe + dir_idx(i)) = & (s_M*(rho_R*vel_R(dir_idx(1)) & @@ -795,7 +793,7 @@ contains + (s_M/s_L)*(s_P/s_R)*pcorr*(vel_R(dir_idx(i)) - vel_L(dir_idx(i))) end do else if (hypoelasticity) then - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, num_vels flux_rs${XYZ}$_vf(j, k, l, contxe + dir_idx(i)) = & (s_M*(rho_R*vel_R(dir_idx(1)) & @@ -811,7 +809,7 @@ contains /(s_M - s_P) end do else - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, num_vels flux_rs${XYZ}$_vf(j, k, l, contxe + dir_idx(i)) = & (s_M*(rho_R*vel_R(dir_idx(1)) & @@ -852,7 +850,7 @@ contains + (s_M/s_L)*(s_P/s_R)*pcorr*(vel_R_rms - vel_L_rms)/2._wp else if (hypoelasticity) then flux_tau_L = 0._wp; flux_tau_R = 0._wp - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, num_dims flux_tau_L = flux_tau_L + tau_e_L(dir_idx_tau(i))*vel_L(dir_idx(i)) flux_tau_R = flux_tau_R + tau_e_R(dir_idx_tau(i))*vel_R(dir_idx(i)) @@ -885,7 +883,7 @@ contains end if ! Advection - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = advxb, advxe flux_rs${XYZ}$_vf(j, k, l, i) = & (qL_prim_rs${XYZ}$_vf(j, k, l, i) & @@ -910,7 +908,7 @@ contains !end if ! Div(U)? - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, num_vels vel_src_rs${XYZ}$_vf(j, k, l, dir_idx(i)) = & (xi_M*(rho_L*vel_L(dir_idx(i))* & @@ -931,7 +929,7 @@ contains end if if (chemistry) then - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = chemxb, chemxe Y_L = qL_prim_rs${XYZ}$_vf(j, k, l, i) Y_R = qR_prim_rs${XYZ}$_vf(j + 1, k, l, i) @@ -948,7 +946,7 @@ contains if (n == 0) then ! 1D: d/dx flux only & Bx = Bx0 = const. ! B_y flux = v_x * B_y - v_y * Bx0 ! B_z flux = v_x * B_z - v_z * Bx0 - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 0, 1 flux_rsx_vf(j, k, l, B_idx%beg + i) = (s_M*(vel_R(1)*B%R(2 + i) - vel_R(2 + i)*Bx0) & - s_P*(vel_L(1)*B%L(2 + i) - vel_L(2 + i)*Bx0) & @@ -958,7 +956,7 @@ contains ! B_x d/d${XYZ}$ flux = (1 - delta(x,${XYZ}$)) * (v_${XYZ}$ * B_x - v_x * B_${XYZ}$) ! B_y d/d${XYZ}$ flux = (1 - delta(y,${XYZ}$)) * (v_${XYZ}$ * B_y - v_y * B_${XYZ}$) ! 
B_z d/d${XYZ}$ flux = (1 - delta(z,${XYZ}$)) * (v_${XYZ}$ * B_z - v_z * B_${XYZ}$) - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 0, 2 flux_rs${XYZ}$_vf(j, k, l, B_idx%beg + i) = (1 - dir_flg(i + 1))*( & s_M*(vel_R(dir_idx(1))*B%R(i + 1) - vel_R(i + 1)*B%R(norm_dir)) - & @@ -972,7 +970,7 @@ contains #:if (NORM_DIR == 2) if (cyl_coord) then !Substituting the advective flux into the inviscid geometrical source flux - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, E_idx flux_gsrc_rs${XYZ}$_vf(j, k, l, i) = flux_rs${XYZ}$_vf(j, k, l, i) end do @@ -981,7 +979,7 @@ contains flux_rs${XYZ}$_vf(j, k, l, contxe + 2) & - (s_M*pres_R - s_P*pres_L)/(s_M - s_P) ! Geometrical source of the void fraction(s) is zero - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = advxb, advxe flux_gsrc_rs${XYZ}$_vf(j, k, l, i) = flux_rs${XYZ}$_vf(j, k, l, i) end do @@ -994,7 +992,7 @@ contains (s_M*tau_e_R(4) - s_P*tau_e_L(4)) & /(s_M - s_P) - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = strxb, strxe flux_gsrc_rs${XYZ}$_vf(j, k, l, i) = flux_rs${XYZ}$_vf(j, k, l, i) end do @@ -1190,13 +1188,14 @@ contains ! 6-EQUATION MODEL WITH HLLC if (model_eqns == 3) then !ME3 - - !$acc parallel loop collapse(3) gang vector default(present) & - !$acc private(vel_L, vel_R, vel_K_Star, Re_L, Re_R, rho_avg, h_avg, gamma_avg, & - !$acc s_L, s_R, s_S, vel_avg_rms, alpha_L, alpha_R, Ys_L, Ys_R, Xs_L, Xs_R, & - !$acc Gamma_iL, Gamma_iR, Cp_iL, Cp_iR, Yi_avg, Phi_avg, h_iL, h_iR, h_avg_2, & - !$acc tau_e_L, tau_e_R, G_L, G_R, flux_ene_e, xi_field_L, xi_field_R, pcorr, & - !$acc zcoef, vel_L_tmp, vel_R_tmp) + $:GPU_PARALLEL_LOOP(collapse=3, private='[vel_L, vel_R, & + & vel_K_Star, Re_L, Re_R, rho_avg, h_avg, & + & gamma_avg, s_L, s_R, s_S, vel_avg_rms, & + & alpha_L, alpha_R, Ys_L, Ys_R, Xs_L, Xs_R, & + & Gamma_iL, Gamma_iR, Cp_iL, Cp_iR, Yi_avg, & + & Phi_avg, h_iL, h_iR, h_avg_2, tau_e_L, & + & tau_e_R, G_L, G_R, flux_ene_e, xi_field_L, & + & xi_field_R, pcorr, zcoef, vel_L_tmp, vel_R_tmp]') do l = is3%beg, is3%end do k = is2%beg, is2%end do j = is1%beg, is1%end @@ -1205,7 +1204,7 @@ contains vel_L_rms = 0._wp; vel_R_rms = 0._wp - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, num_dims vel_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, contxe + i) vel_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, contxe + i) @@ -1230,32 +1229,32 @@ contains alpha_R_sum = 0._wp if (mpp_lim) then - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, num_fluids qL_prim_rs${XYZ}$_vf(j, k, l, i) = max(0._wp, qL_prim_rs${XYZ}$_vf(j, k, l, i)) qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i) = min(max(0._wp, qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i)), 1._wp) alpha_L_sum = alpha_L_sum + qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i) end do - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, num_fluids qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i) = qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i)/max(alpha_L_sum, sgm_eps) end do - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, num_fluids qR_prim_rs${XYZ}$_vf(j + 1, k, l, i) = max(0._wp, qR_prim_rs${XYZ}$_vf(j + 1, k, l, i)) qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + i) = min(max(0._wp, qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + i)), 1._wp) alpha_R_sum = alpha_R_sum + qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + i) end do - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, num_fluids qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + i)/max(alpha_R_sum, sgm_eps) end do end if - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, 
num_fluids rho_L = rho_L + qL_prim_rs${XYZ}$_vf(j, k, l, i) gamma_L = gamma_L + qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i)*gammas(i) @@ -1272,13 +1271,13 @@ contains end do if (viscous) then - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, 2 Re_L(i) = dflt_real if (Re_size(i) > 0) Re_L(i) = 0._wp - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do q = 1, Re_size(i) Re_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + Re_idx(i, q))/Res(i, q) & + Re_L(i) @@ -1288,13 +1287,13 @@ contains end do - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, 2 Re_R(i) = dflt_real if (Re_size(i) > 0) Re_R(i) = 0._wp - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do q = 1, Re_size(i) Re_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + Re_idx(i, q))/Res(i, q) & + Re_R(i) @@ -1310,18 +1309,18 @@ contains ! ENERGY ADJUSTMENTS FOR HYPOELASTIC ENERGY if (hypoelasticity) then - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, strxe - strxb + 1 tau_e_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, strxb - 1 + i) tau_e_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, strxb - 1 + i) end do G_L = 0._wp; G_R = 0._wp - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, num_fluids G_L = G_L + alpha_L(i)*Gs(i) G_R = G_R + alpha_R(i)*Gs(i) end do - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, strxe - strxb + 1 ! Elastic contribution to energy if G large enough if ((G_L > verysmall) .and. (G_R > verysmall)) then @@ -1338,13 +1337,13 @@ contains ! ENERGY ADJUSTMENTS FOR HYPERELASTIC ENERGY if (hyperelasticity) then - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, num_dims xi_field_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, xibeg - 1 + i) xi_field_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, xibeg - 1 + i) end do G_L = 0._wp; G_R = 0._wp; - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, num_fluids ! Mixture left and right shear modulus G_L = G_L + alpha_L(i)*Gs(i) @@ -1355,7 +1354,7 @@ contains E_L = E_L + G_L*qL_prim_rs${XYZ}$_vf(j, k, l, xiend + 1) E_R = E_R + G_R*qR_prim_rs${XYZ}$_vf(j + 1, k, l, xiend + 1) end if - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, b_size - 1 tau_e_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, strxb - 1 + i) tau_e_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, strxb - 1 + i) @@ -1379,7 +1378,7 @@ contains vel_avg_rms, 0._wp, c_avg) if (viscous) then - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, 2 Re_avg_rs${XYZ}$_vf(j, k, l, i) = 2._wp/(1._wp/Re_L(i) + 1._wp/Re_R(i)) end do @@ -1474,7 +1473,7 @@ contains ! COMPUTING FLUXES ! MASS FLUX. - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, contxe flux_rs${XYZ}$_vf(j, k, l, i) = & xi_M*qL_prim_rs${XYZ}$_vf(j, k, l, i)*(vel_L(idx1) + s_M*(xi_L - 1._wp)) + & @@ -1483,7 +1482,7 @@ contains ! MOMENTUM FLUX. ! f = \rho u u - \sigma, q = \rho u, q_star = \xi * \rho*(s_star, v, w) - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, num_dims idxi = dir_idx(i) flux_rs${XYZ}$_vf(j, k, l, contxe + idxi) = rho_Star*vel_K_Star* & @@ -1499,7 +1498,7 @@ contains ! ELASTICITY. Elastic shear stress additions for the momentum and energy flux if (elasticity) then flux_ene_e = 0._wp; - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, num_dims idxi = dir_idx(i) ! MOMENTUM ELASTIC FLUX. @@ -1517,7 +1516,7 @@ contains end if ! VOLUME FRACTION FLUX. - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = advxb, advxe flux_rs${XYZ}$_vf(j, k, l, i) = & xi_M*qL_prim_rs${XYZ}$_vf(j, k, l, i)*s_S + & @@ -1525,7 +1524,7 @@ contains end do ! SOURCE TERM FOR VOLUME FRACTION ADVECTION FLUX. 
- !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, num_dims idxi = dir_idx(i) vel_src_rs${XYZ}$_vf(j, k, l, idxi) = & @@ -1535,7 +1534,7 @@ contains ! INTERNAL ENERGIES ADVECTION FLUX. ! K-th pressure and velocity in preparation for the internal energy flux - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, num_fluids p_K_Star = xi_M*(xi_MP*((pres_L + pi_infs(i)/(1._wp + gammas(i)))* & xi_L**(1._wp/gammas(i) + 1._wp) - pi_infs(i)/(1._wp + gammas(i)) - pres_L) + pres_L) + & @@ -1554,7 +1553,7 @@ contains ! HYPOELASTIC STRESS EVOLUTION FLUX. if (hypoelasticity) then - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, strxe - strxb + 1 flux_rs${XYZ}$_vf(j, k, l, strxb - 1 + i) = & xi_M*(s_S/(s_L - s_S))*(s_L*rho_L*tau_e_L(i) - rho_L*vel_L(idx1)*tau_e_L(i)) + & @@ -1564,7 +1563,7 @@ contains ! REFERENCE MAP FLUX. if (hyperelasticity) then - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, num_dims flux_rs${XYZ}$_vf(j, k, l, xibeg - 1 + i) = & xi_M*(s_S/(s_L - s_S))*(s_L*rho_L*xi_field_L(i) & @@ -1585,11 +1584,11 @@ contains #:if (NORM_DIR == 2) if (cyl_coord) then !Substituting the advective flux into the inviscid geometrical source flux - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, E_idx flux_gsrc_rs${XYZ}$_vf(j, k, l, i) = flux_rs${XYZ}$_vf(j, k, l, i) end do - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = intxb, intxe flux_gsrc_rs${XYZ}$_vf(j, k, l, i) = flux_rs${XYZ}$_vf(j, k, l, i) end do @@ -1597,7 +1596,7 @@ contains flux_gsrc_rs${XYZ}$_vf(j, k, l, momxb - 1 + dir_idx(1)) = & flux_gsrc_rs${XYZ}$_vf(j, k, l, momxb - 1 + dir_idx(1)) - p_Star ! Geometrical source of the void fraction(s) is zero - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = advxb, advxe flux_gsrc_rs${XYZ}$_vf(j, k, l, i) = 0._wp end do @@ -1605,7 +1604,7 @@ contains #:endif #:if (NORM_DIR == 3) if (grid_geometry == 3) then - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, sys_size flux_gsrc_rs${XYZ}$_vf(j, k, l, i) = 0._wp end do @@ -1622,32 +1621,34 @@ contains elseif (model_eqns == 4) then !ME4 - !$acc parallel loop collapse(3) gang vector default(present) private(alpha_rho_L, alpha_rho_R, vel_L, vel_R, alpha_L, alpha_R, & - !$acc rho_avg, h_avg, gamma_avg, s_L, s_R, s_S, vel_avg_rms, nbub_L, nbub_R, ptilde_L, ptilde_R) + $:GPU_PARALLEL_LOOP(collapse=3, private='[alpha_rho_L, & + & alpha_rho_R, vel_L, vel_R, alpha_L, alpha_R, & + & rho_avg, h_avg, gamma_avg, s_L, s_R, s_S, & + & vel_avg_rms, nbub_L, nbub_R, ptilde_L, ptilde_R]') do l = is3%beg, is3%end do k = is2%beg, is2%end do j = is1%beg, is1%end - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, contxe alpha_rho_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, i) alpha_rho_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, i) end do - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, num_dims vel_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, contxe + i) vel_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, contxe + i) end do vel_L_rms = 0._wp; vel_R_rms = 0._wp - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, num_dims vel_L_rms = vel_L_rms + vel_L(i)**2._wp vel_R_rms = vel_R_rms + vel_R(i)**2._wp end do - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, num_fluids alpha_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i) alpha_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + i) @@ -1660,7 +1661,7 @@ contains gamma_L = 0._wp pi_inf_L = 0._wp qv_L = 0._wp - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, num_fluids rho_L = rho_L + alpha_rho_L(i) gamma_L = gamma_L + 
alpha_L(i)*gammas(i) @@ -1672,7 +1673,7 @@ contains gamma_R = 0._wp pi_inf_R = 0._wp qv_R = 0._wp - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, num_fluids rho_R = rho_R + alpha_rho_R(i) gamma_R = gamma_R + alpha_R(i)*gammas(i) @@ -1747,7 +1748,7 @@ contains xi_M = (5.e-1_wp + sign(5.e-1_wp, s_S)) xi_P = (5.e-1_wp - sign(5.e-1_wp, s_S)) - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, contxe flux_rs${XYZ}$_vf(j, k, l, i) = & xi_M*alpha_rho_L(i) & @@ -1758,7 +1759,7 @@ contains ! Momentum flux. ! f = \rho u u + p I, q = \rho u, q_star = \xi * \rho*(s_star, v, w) - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, num_dims flux_rs${XYZ}$_vf(j, k, l, contxe + dir_idx(i)) = & xi_M*(rho_L*(vel_L(dir_idx(1))* & @@ -1777,7 +1778,7 @@ contains if (bubbles_euler) then ! Put p_tilde in - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, num_dims flux_rs${XYZ}$_vf(j, k, l, contxe + dir_idx(i)) = & flux_rs${XYZ}$_vf(j, k, l, contxe + dir_idx(i)) + & @@ -1788,7 +1789,7 @@ contains flux_rs${XYZ}$_vf(j, k, l, E_idx) = 0._wp - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = alf_idx, alf_idx !only advect the void fraction flux_rs${XYZ}$_vf(j, k, l, i) = & xi_M*qL_prim_rs${XYZ}$_vf(j, k, l, i) & @@ -1798,7 +1799,7 @@ contains end do ! Source for volume fraction advection equation - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, num_dims vel_src_rs${XYZ}$_vf(j, k, l, dir_idx(i)) = 0._wp @@ -1809,7 +1810,7 @@ contains ! Add advection flux for bubble variables if (bubbles_euler) then - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = bubxb, bubxe flux_rs${XYZ}$_vf(j, k, l, i) = & xi_M*nbub_L*qL_prim_rs${XYZ}$_vf(j, k, l, i) & @@ -1824,7 +1825,7 @@ contains #:if (NORM_DIR == 2) if (cyl_coord) then ! Substituting the advective flux into the inviscid geometrical source flux - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, E_idx flux_gsrc_rs${XYZ}$_vf(j, k, l, i) = flux_rs${XYZ}$_vf(j, k, l, i) end do @@ -1841,7 +1842,7 @@ contains (1._wp - dir_flg(dir_idx(1)))* & vel_R(dir_idx(1))) - vel_R(dir_idx(1))))) ! Geometrical source of the void fraction(s) is zero - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = advxb, advxe flux_gsrc_rs${XYZ}$_vf(j, k, l, i) = 0._wp end do @@ -1849,7 +1850,7 @@ contains #:endif #:if (NORM_DIR == 3) if (grid_geometry == 3) then - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, sys_size flux_gsrc_rs${XYZ}$_vf(j, k, l, i) = 0._wp end do @@ -1870,16 +1871,19 @@ contains end do end do end do - !$acc end parallel loop elseif (model_eqns == 2 .and. 
bubbles_euler) then - !$acc parallel loop collapse(3) gang vector default(present) private(R0_L, R0_R, V0_L, V0_R, P0_L, P0_R, pbw_L, pbw_R, vel_L, vel_R, & - !$acc rho_avg, alpha_L, alpha_R, h_avg, gamma_avg, s_L, s_R, s_S, nbub_L, nbub_R, ptilde_L, ptilde_R, vel_avg_rms, Re_L, Re_R, pcorr, zcoef, vel_L_tmp, vel_R_tmp) + $:GPU_PARALLEL_LOOP(collapse=3, private='[R0_L, R0_R, V0_L, & + & V0_R, P0_L, P0_R, pbw_L, pbw_R, vel_L, & + & vel_R, rho_avg, alpha_L, alpha_R, h_avg, & + & gamma_avg, s_L, s_R, s_S, nbub_L, nbub_R, & + & ptilde_L, ptilde_R, vel_avg_rms, Re_L, Re_R, & + & pcorr, zcoef, vel_L_tmp, vel_R_tmp]') do l = is3%beg, is3%end do k = is2%beg, is2%end do j = is1%beg, is1%end - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, num_fluids alpha_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i) alpha_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + i) @@ -1887,7 +1891,7 @@ contains vel_L_rms = 0._wp; vel_R_rms = 0._wp - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, num_dims vel_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, contxe + i) vel_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, contxe + i) @@ -1905,7 +1909,7 @@ contains ! Retain this in the refactor if (mpp_lim .and. (num_fluids > 2)) then - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, num_fluids rho_L = rho_L + qL_prim_rs${XYZ}$_vf(j, k, l, i) gamma_L = gamma_L + qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i)*gammas(i) @@ -1913,7 +1917,7 @@ contains qv_L = qv_L + qL_prim_rs${XYZ}$_vf(j, k, l, i)*qvs(i) end do else if (num_fluids > 2) then - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, num_fluids - 1 rho_L = rho_L + qL_prim_rs${XYZ}$_vf(j, k, l, i) gamma_L = gamma_L + qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i)*gammas(i) @@ -1933,7 +1937,7 @@ contains qv_R = 0._wp if (mpp_lim .and. (num_fluids > 2)) then - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, num_fluids rho_R = rho_R + qR_prim_rs${XYZ}$_vf(j + 1, k, l, i) gamma_R = gamma_R + qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + i)*gammas(i) @@ -1941,7 +1945,7 @@ contains qv_R = qv_R + qR_prim_rs${XYZ}$_vf(j + 1, k, l, i)*qvs(i) end do else if (num_fluids > 2) then - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, num_fluids - 1 rho_R = rho_R + qR_prim_rs${XYZ}$_vf(j + 1, k, l, i) gamma_R = gamma_R + qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + i)*gammas(i) @@ -1957,13 +1961,13 @@ contains if (viscous) then if (num_fluids == 1) then ! 
Need to consider case with num_fluids >= 2 - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, 2 Re_L(i) = dflt_real if (Re_size(i) > 0) Re_L(i) = 0._wp - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do q = 1, Re_size(i) Re_L(i) = (1._wp - qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + Re_idx(i, q)))/Res(i, q) & + Re_L(i) @@ -1973,13 +1977,13 @@ contains end do - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, 2 Re_R(i) = dflt_real if (Re_size(i) > 0) Re_R(i) = 0._wp - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do q = 1, Re_size(i) Re_R(i) = (1._wp - qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + Re_idx(i, q)))/Res(i, q) & + Re_R(i) @@ -1998,7 +2002,7 @@ contains H_R = (E_R + pres_R)/rho_R if (avg_state == 2) then - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, nb R0_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, rs(i)) R0_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, rs(i)) @@ -2018,7 +2022,7 @@ contains else nbub_L_denom = 0._wp nbub_R_denom = 0._wp - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, nb nbub_L_denom = nbub_L_denom + (R0_L(i)**3._wp)*weight(i) nbub_R_denom = nbub_R_denom + (R0_R(i)**3._wp)*weight(i) @@ -2032,7 +2036,7 @@ contains nbub_R = qR_prim_rs${XYZ}$_vf(j + 1, k, l, bubxb) end if - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, nb if (.not. qbmm) then if (polytropic) then @@ -2065,7 +2069,7 @@ contains R3V2Lbar = 0._wp R3V2Rbar = 0._wp - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, nb PbwR3Lbar = PbwR3Lbar + pbw_L(i)*(R0_L(i)**3._wp)*weight(i) PbwR3Rbar = PbwR3Rbar + pbw_R(i)*(R0_R(i)**3._wp)*weight(i) @@ -2100,7 +2104,7 @@ contains gamma_avg = 5.e-1_wp*(gamma_L + gamma_R) vel_avg_rms = 0._wp - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, num_dims vel_avg_rms = vel_avg_rms + (5.e-1_wp*(vel_L(i) + vel_R(i)))**2._wp end do @@ -2119,7 +2123,7 @@ contains vel_avg_rms, 0._wp, c_avg) if (viscous) then - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, 2 Re_avg_rs${XYZ}$_vf(j, k, l, i) = 2._wp/(1._wp/Re_L(i) + 1._wp/Re_R(i)) end do @@ -2183,7 +2187,7 @@ contains pcorr = 0._wp end if - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, contxe flux_rs${XYZ}$_vf(j, k, l, i) = & xi_M*qL_prim_rs${XYZ}$_vf(j, k, l, i) & @@ -2202,7 +2206,7 @@ contains ! Include p_tilde - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, num_dims flux_rs${XYZ}$_vf(j, k, l, contxe + dir_idx(i)) = & xi_M*(rho_L*(vel_L(dir_idx(1))* & @@ -2234,7 +2238,7 @@ contains + (s_M/s_L)*(s_P/s_R)*pcorr*s_S ! Volume fraction flux - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = advxb, advxe flux_rs${XYZ}$_vf(j, k, l, i) = & xi_M*qL_prim_rs${XYZ}$_vf(j, k, l, i) & @@ -2244,7 +2248,7 @@ contains end do ! Source for volume fraction advection equation - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, num_dims vel_src_rs${XYZ}$_vf(j, k, l, dir_idx(i)) = & xi_M*(vel_L(dir_idx(i)) + & @@ -2260,7 +2264,7 @@ contains flux_src_rs${XYZ}$_vf(j, k, l, advxb) = vel_src_rs${XYZ}$_vf(j, k, l, dir_idx(1)) ! Add advection flux for bubble variables - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = bubxb, bubxe flux_rs${XYZ}$_vf(j, k, l, i) = & xi_M*nbub_L*qL_prim_rs${XYZ}$_vf(j, k, l, i) & @@ -2289,7 +2293,7 @@ contains #:if (NORM_DIR == 2) if (cyl_coord) then ! 
Substituting the advective flux into the inviscid geometrical source flux - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, E_idx flux_gsrc_rs${XYZ}$_vf(j, k, l, i) = flux_rs${XYZ}$_vf(j, k, l, i) end do @@ -2306,7 +2310,7 @@ contains (1._wp - dir_flg(dir_idx(1)))* & vel_R(dir_idx(1))) - vel_R(dir_idx(1))))) ! Geometrical source of the void fraction(s) is zero - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = advxb, advxe flux_gsrc_rs${XYZ}$_vf(j, k, l, i) = 0._wp end do @@ -2314,7 +2318,7 @@ contains #:endif #:if (NORM_DIR == 3) if (grid_geometry == 3) then - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, sys_size flux_gsrc_rs${XYZ}$_vf(j, k, l, i) = 0._wp end do @@ -2337,28 +2341,31 @@ contains end do end do end do - !$acc end parallel loop else ! 5-EQUATION MODEL WITH HLLC - !$acc parallel loop collapse(3) gang vector default(present) private(vel_L, vel_R, Re_L, Re_R, & - !$acc rho_avg, h_avg, gamma_avg, alpha_L, alpha_R, s_L, s_R, s_S, vel_avg_rms, pcorr, zcoef, & - !$acc vel_L_tmp, vel_R_tmp, Ys_L, Ys_R, Xs_L, Xs_R, Gamma_iL, Gamma_iR, Cp_iL, Cp_iR, & - !$acc tau_e_L, tau_e_R, xi_field_L, xi_field_R, & - !$acc Yi_avg, Phi_avg, h_iL, h_iR, h_avg_2) copyin(is1,is2,is3) + $:GPU_PARALLEL_LOOP(collapse=3, private='[vel_L, vel_R, & + & Re_L, Re_R, rho_avg, h_avg, gamma_avg, & + & alpha_L, alpha_R, s_L, s_R, s_S, & + & vel_avg_rms, pcorr, zcoef, vel_L_tmp, & + & vel_R_tmp, Ys_L, Ys_R, Xs_L, Xs_R, & + & Gamma_iL, Gamma_iR, Cp_iL, Cp_iR, tau_e_L, & + & tau_e_R, xi_field_L, xi_field_R, Yi_avg, & + & Phi_avg, h_iL, h_iR, h_avg_2]', & + & copyin='[is1, is2, is3]') do l = is3%beg, is3%end do k = is2%beg, is2%end do j = is1%beg, is1%end !idx1 = 1; if (dir_idx(1) == 2) idx1 = 2; if (dir_idx(1) == 3) idx1 = 3 - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, num_fluids alpha_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i) alpha_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + i) end do vel_L_rms = 0._wp; vel_R_rms = 0._wp - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, num_dims vel_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, contxe + i) vel_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, contxe + i) @@ -2385,32 +2392,32 @@ contains ! Change this by splitting it into the cases ! 
present in the bubbles_euler if (mpp_lim) then - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, num_fluids qL_prim_rs${XYZ}$_vf(j, k, l, i) = max(0._wp, qL_prim_rs${XYZ}$_vf(j, k, l, i)) qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i) = min(max(0._wp, qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i)), 1._wp) alpha_L_sum = alpha_L_sum + qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i) end do - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, num_fluids qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i) = qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i)/max(alpha_L_sum, sgm_eps) end do - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, num_fluids qR_prim_rs${XYZ}$_vf(j + 1, k, l, i) = max(0._wp, qR_prim_rs${XYZ}$_vf(j + 1, k, l, i)) qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + i) = min(max(0._wp, qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + i)), 1._wp) alpha_R_sum = alpha_R_sum + qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + i) end do - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, num_fluids qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + i)/max(alpha_R_sum, sgm_eps) end do end if - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, num_fluids rho_L = rho_L + qL_prim_rs${XYZ}$_vf(j, k, l, i) gamma_L = gamma_L + qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i)*gammas(i) @@ -2424,13 +2431,13 @@ contains end do if (viscous) then - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, 2 Re_L(i) = dflt_real if (Re_size(i) > 0) Re_L(i) = 0._wp - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do q = 1, Re_size(i) Re_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + Re_idx(i, q))/Res(i, q) & + Re_L(i) @@ -2440,13 +2447,13 @@ contains end do - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, 2 Re_R(i) = dflt_real if (Re_size(i) > 0) Re_R(i) = 0._wp - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do q = 1, Re_size(i) Re_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + Re_idx(i, q))/Res(i, q) & + Re_R(i) @@ -2458,7 +2465,7 @@ contains if (chemistry) then c_sum_Yi_Phi = 0.0_wp - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = chemxb, chemxe Ys_L(i - chemxb + 1) = qL_prim_rs${XYZ}$_vf(j, k, l, i) Ys_R(i - chemxb + 1) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, i) @@ -2517,19 +2524,19 @@ contains ! ENERGY ADJUSTMENTS FOR HYPOELASTIC ENERGY if (hypoelasticity) then - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, strxe - strxb + 1 tau_e_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, strxb - 1 + i) tau_e_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, strxb - 1 + i) end do G_L = 0._wp G_R = 0._wp - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, num_fluids G_L = G_L + alpha_L(i)*Gs(i) G_R = G_R + alpha_R(i)*Gs(i) end do - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, strxe - strxb + 1 ! Elastic contribution to energy if G large enough if ((G_L > verysmall) .and. (G_R > verysmall)) then @@ -2546,14 +2553,14 @@ contains ! ENERGY ADJUSTMENTS FOR HYPERELASTIC ENERGY if (hyperelasticity) then - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, num_dims xi_field_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, xibeg - 1 + i) xi_field_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, xibeg - 1 + i) end do G_L = 0._wp G_R = 0._wp - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, num_fluids ! 
Mixture left and right shear modulus G_L = G_L + alpha_L(i)*Gs(i) @@ -2564,7 +2571,7 @@ contains E_L = E_L + G_L*qL_prim_rs${XYZ}$_vf(j, k, l, xiend + 1) E_R = E_R + G_R*qR_prim_rs${XYZ}$_vf(j + 1, k, l, xiend + 1) end if - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, b_size - 1 tau_e_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, strxb - 1 + i) tau_e_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, strxb - 1 + i) @@ -2588,7 +2595,7 @@ contains vel_avg_rms, c_sum_Yi_Phi, c_avg) if (viscous) then - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, 2 Re_avg_rs${XYZ}$_vf(j, k, l, i) = 2._wp/(1._wp/Re_L(i) + 1._wp/Re_R(i)) end do @@ -2664,7 +2671,7 @@ contains ! COMPUTING THE HLLC FLUXES ! MASS FLUX. - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, contxe flux_rs${XYZ}$_vf(j, k, l, i) = & xi_M*qL_prim_rs${XYZ}$_vf(j, k, l, i) & @@ -2675,7 +2682,7 @@ contains ! MOMENTUM FLUX. ! f = \rho u u - \sigma, q = \rho u, q_star = \xi * \rho*(s_star, v, w) - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, num_dims idxi = dir_idx(i) flux_rs${XYZ}$_vf(j, k, l, contxe + idxi) = & @@ -2710,7 +2717,7 @@ contains ! ELASTICITY. Elastic shear stress additions for the momentum and energy flux if (elasticity) then flux_ene_e = 0._wp - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, num_dims idxi = dir_idx(i) ! MOMENTUM ELASTIC FLUX. @@ -2729,7 +2736,7 @@ contains ! HYPOELASTIC STRESS EVOLUTION FLUX. if (hypoelasticity) then - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, strxe - strxb + 1 flux_rs${XYZ}$_vf(j, k, l, strxb - 1 + i) = & xi_M*(s_S/(s_L - s_S))*(s_L*rho_L*tau_e_L(i) - rho_L*vel_L(idx1)*tau_e_L(i)) + & @@ -2738,7 +2745,7 @@ contains end if ! VOLUME FRACTION FLUX. - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = advxb, advxe flux_rs${XYZ}$_vf(j, k, l, i) = & xi_M*qL_prim_rs${XYZ}$_vf(j, k, l, i) & @@ -2748,7 +2755,7 @@ contains end do ! VOLUME FRACTION SOURCE FLUX. - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, num_dims idxi = dir_idx(i) vel_src_rs${XYZ}$_vf(j, k, l, idxi) = & @@ -2771,7 +2778,7 @@ contains ! REFERENCE MAP FLUX. if (hyperelasticity) then - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, num_dims flux_rs${XYZ}$_vf(j, k, l, xibeg - 1 + i) = & xi_M*(s_S/(s_L - s_S))*(s_L*rho_L*xi_field_L(i) & @@ -2784,7 +2791,7 @@ contains flux_src_rs${XYZ}$_vf(j, k, l, advxb) = vel_src_rs${XYZ}$_vf(j, k, l, idx1) if (chemistry) then - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = chemxb, chemxe Y_L = qL_prim_rs${XYZ}$_vf(j, k, l, i) Y_R = qR_prim_rs${XYZ}$_vf(j + 1, k, l, i) @@ -2799,7 +2806,7 @@ contains #:if (NORM_DIR == 2) if (cyl_coord) then !Substituting the advective flux into the inviscid geometrical source flux - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, E_idx flux_gsrc_rs${XYZ}$_vf(j, k, l, i) = flux_rs${XYZ}$_vf(j, k, l, i) end do @@ -2816,7 +2823,7 @@ contains (1._wp - dir_flg(idx1))* & vel_R(idx1)) - vel_R(idx1)))) ! 
Geometrical source of the void fraction(s) is zero - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = advxb, advxe flux_gsrc_rs${XYZ}$_vf(j, k, l, i) = 0._wp end do @@ -2824,7 +2831,7 @@ contains #:endif #:if (NORM_DIR == 3) if (grid_geometry == 3) then - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, sys_size flux_gsrc_rs${XYZ}$_vf(j, k, l, i) = 0._wp end do @@ -2848,7 +2855,6 @@ contains end do end do end do - !$acc end parallel loop end if end if #:endfor @@ -2961,10 +2967,12 @@ contains #:for NORM_DIR, XYZ in [(1, 'x'), (2, 'y'), (3, 'z')] if (norm_dir == ${NORM_DIR}$) then - !$acc parallel loop collapse(3) gang vector default(present) & - !$acc private(alpha_rho_L, alpha_rho_R, vel, alpha_L, alpha_R, & - !$acc rho, pres, E, H_no_mag, gamma, pi_inf, qv, vel_rms, B, c, c_fast, pres_mag, & - !$acc U_L, U_R, U_starL, U_starR, U_doubleL, U_doubleR, F_L, F_R, F_starL, F_starR, F_hlld) + $:GPU_PARALLEL_LOOP(collapse=3, private='[alpha_rho_L, & + & alpha_rho_R, vel, alpha_L, alpha_R, rho, pres, & + & E, H_no_mag, gamma, pi_inf, qv, vel_rms, B, & + & c, c_fast, pres_mag, U_L, U_R, U_starL, & + & U_starR, U_doubleL, U_doubleR, F_L, F_R, & + & F_starL, F_starR, F_hlld]') do l = is3%beg, is3%end do k = is2%beg, is2%end do j = is1%beg, is1%end @@ -3010,7 +3018,7 @@ contains ! Sum properties of all fluid components rho%L = 0._wp; gamma%L = 0._wp; pi_inf%L = 0._wp; qv%L = 0._wp rho%R = 0._wp; gamma%R = 0._wp; pi_inf%R = 0._wp; qv%R = 0._wp - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, num_fluids rho%L = rho%L + alpha_rho_L(i) gamma%L = gamma%L + alpha_L(i)*gammas(i) @@ -3127,7 +3135,7 @@ contains ! Energy flux_rs${XYZ}$_vf(j, k, l, E_idx) = F_hlld(7) ! Partial fraction - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = advxb, advxe flux_rs${XYZ}$_vf(j, k, l, i) = 0._wp ! TODO multi-component (zero for now) end do @@ -3136,7 +3144,6 @@ contains end do end do end do - !$acc end parallel loop end if #:endfor @@ -3159,7 +3166,7 @@ contains do i = 1, num_fluids Gs(i) = fluid_pp(i)%G end do - !$acc update device(Gs) + $:GPU_UPDATE(device='[Gs]') if (viscous) then @:ALLOCATE(Res(1:2, 1:maxval(Re_size))) @@ -3171,10 +3178,10 @@ contains Res(i, j) = fluid_pp(Re_idx(i, j))%Re(i) end do end do - !$acc update device(Res, Re_idx, Re_size) + $:GPU_UPDATE(device='[Res,Re_idx,Re_size]') end if - !$acc enter data copyin(is1, is2, is3, isx, isy, isz) + $:GPU_ENTER_DATA(copyin='[is1,is2,is3,isx,isy,isz]') is1%beg = -1; is2%beg = 0; is3%beg = 0 is1%end = m; is2%end = n; is3%end = p @@ -3317,7 +3324,7 @@ contains dir_idx = (/3, 1, 2/); dir_flg = (/0._wp, 0._wp, 1._wp/) end if - !$acc update device(is1, is2, is3) + $:GPU_UPDATE(device='[is1,is2,is3]') if (elasticity) then if (norm_dir == 1) then @@ -3330,14 +3337,16 @@ contains end if isx = ix; isy = iy; isz = iz - !$acc update device(isx, isy, isz) ! for stuff in the same module - !$acc update device(dir_idx, dir_flg, dir_idx_tau) ! for stuff in different modules + ! for stuff in the same module + $:GPU_UPDATE(device='[isx,isy,isz]') + ! for stuff in different modules + $:GPU_UPDATE(device='[dir_idx,dir_flg,dir_idx_tau]') ! Population of Buffers in x-direction if (norm_dir == 1) then if (bc_x%beg == BC_RIEMANN_EXTRAP) then ! Riemann state extrap. 
BC at beginning - !$acc parallel loop collapse(3) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=3) do i = 1, sys_size do l = is3%beg, is3%end do k = is2%beg, is2%end @@ -3348,7 +3357,7 @@ contains end do if (viscous) then - !$acc parallel loop collapse(3) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=3) do i = momxb, momxe do l = isz%beg, isz%end do k = isy%beg, isy%end @@ -3360,7 +3369,7 @@ contains end do if (n > 0) then - !$acc parallel loop collapse(3) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=3) do i = momxb, momxe do l = isz%beg, isz%end do k = isy%beg, isy%end @@ -3372,7 +3381,7 @@ contains end do if (p > 0) then - !$acc parallel loop collapse(3) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=3) do i = momxb, momxe do l = isz%beg, isz%end do k = isy%beg, isy%end @@ -3392,7 +3401,7 @@ contains if (bc_x%end == BC_RIEMANN_EXTRAP) then ! Riemann state extrap. BC at end - !$acc parallel loop collapse(3) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=3) do i = 1, sys_size do l = is3%beg, is3%end do k = is2%beg, is2%end @@ -3404,7 +3413,7 @@ contains if (viscous) then - !$acc parallel loop collapse(3) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=3) do i = momxb, momxe do l = isz%beg, isz%end do k = isy%beg, isy%end @@ -3416,7 +3425,7 @@ contains end do if (n > 0) then - !$acc parallel loop collapse(3) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=3) do i = momxb, momxe do l = isz%beg, isz%end do k = isy%beg, isy%end @@ -3428,7 +3437,7 @@ contains end do if (p > 0) then - !$acc parallel loop collapse(3) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=3) do i = momxb, momxe do l = isz%beg, isz%end do k = isy%beg, isy%end @@ -3451,7 +3460,7 @@ contains elseif (norm_dir == 2) then if (bc_y%beg == BC_RIEMANN_EXTRAP) then ! Riemann state extrap. BC at beginning - !$acc parallel loop collapse(3) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=3) do i = 1, sys_size do l = is3%beg, is3%end do k = is2%beg, is2%end @@ -3463,7 +3472,7 @@ contains if (viscous) then - !$acc parallel loop collapse(3) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=3) do i = momxb, momxe do l = isz%beg, isz%end do j = isx%beg, isx%end @@ -3473,7 +3482,7 @@ contains end do end do - !$acc parallel loop collapse(3) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=3) do i = momxb, momxe do l = isz%beg, isz%end do j = isx%beg, isx%end @@ -3484,7 +3493,7 @@ contains end do if (p > 0) then - !$acc parallel loop collapse(3) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=3) do i = momxb, momxe do l = isz%beg, isz%end do j = isx%beg, isx%end @@ -3501,7 +3510,7 @@ contains if (bc_y%end == BC_RIEMANN_EXTRAP) then ! Riemann state extrap. 
BC at end - !$acc parallel loop collapse(3) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=3) do i = 1, sys_size do l = is3%beg, is3%end do k = is2%beg, is2%end @@ -3513,7 +3522,7 @@ contains if (viscous) then - !$acc parallel loop collapse(3) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=3) do i = momxb, momxe do l = isz%beg, isz%end do j = isx%beg, isx%end @@ -3523,7 +3532,7 @@ contains end do end do - !$acc parallel loop collapse(3) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=3) do i = momxb, momxe do l = isz%beg, isz%end do j = isx%beg, isx%end @@ -3534,7 +3543,7 @@ contains end do if (p > 0) then - !$acc parallel loop collapse(3) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=3) do i = momxb, momxe do l = isz%beg, isz%end do j = isx%beg, isx%end @@ -3554,7 +3563,7 @@ contains else if (bc_z%beg == BC_RIEMANN_EXTRAP) then ! Riemann state extrap. BC at beginning - !$acc parallel loop collapse(3) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=3) do i = 1, sys_size do l = is3%beg, is3%end do k = is2%beg, is2%end @@ -3565,7 +3574,7 @@ contains end do if (viscous) then - !$acc parallel loop collapse(3) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=3) do i = momxb, momxe do k = isy%beg, isy%end do j = isx%beg, isx%end @@ -3574,7 +3583,7 @@ contains end do end do end do - !$acc parallel loop collapse(3) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=3) do i = momxb, momxe do k = isy%beg, isy%end do j = isx%beg, isx%end @@ -3583,7 +3592,7 @@ contains end do end do end do - !$acc parallel loop collapse(3) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=3) do i = momxb, momxe do k = isy%beg, isy%end do j = isx%beg, isx%end @@ -3598,7 +3607,7 @@ contains if (bc_z%end == BC_RIEMANN_EXTRAP) then ! Riemann state extrap. BC at end - !$acc parallel loop collapse(3) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=3) do i = 1, sys_size do l = is3%beg, is3%end do k = is2%beg, is2%end @@ -3609,7 +3618,7 @@ contains end do if (viscous) then - !$acc parallel loop collapse(3) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=3) do i = momxb, momxe do k = isy%beg, isy%end do j = isx%beg, isx%end @@ -3619,7 +3628,7 @@ contains end do end do - !$acc parallel loop collapse(3) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=3) do i = momxb, momxe do k = isy%beg, isy%end do j = isx%beg, isx%end @@ -3629,7 +3638,7 @@ contains end do end do - !$acc parallel loop collapse(3) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=3) do i = momxb, momxe do k = isy%beg, isy%end do j = isx%beg, isx%end @@ -3681,7 +3690,7 @@ contains if (viscous .or. (surface_tension)) then - !$acc parallel loop collapse(4) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=4) do i = momxb, E_idx do l = is3%beg, is3%end do k = is2%beg, is2%end @@ -3695,7 +3704,7 @@ contains if (qbmm) then - !$acc parallel loop collapse(4) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=4) do i = 1, 4 do l = is3%beg, is3%end do k = is2%beg, is2%end @@ -3711,7 +3720,7 @@ contains elseif (norm_dir == 2) then if (viscous .or. 
(surface_tension)) then - !$acc parallel loop collapse(4) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=4) do i = momxb, E_idx do l = is3%beg, is3%end do j = is1%beg, is1%end @@ -3724,7 +3733,7 @@ contains end if if (qbmm) then - !$acc parallel loop collapse(4) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=4) do i = 1, 4 do l = is3%beg, is3%end do k = is2%beg, is2%end @@ -3740,7 +3749,7 @@ contains else if (viscous .or. (surface_tension)) then - !$acc parallel loop collapse(4) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=4) do i = momxb, E_idx do j = is1%beg, is1%end do k = is2%beg, is2%end @@ -3753,7 +3762,7 @@ contains end if if (qbmm) then - !$acc parallel loop collapse(4) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=4) do i = 1, 4 do l = is3%beg, is3%end do k = is2%beg, is2%end @@ -3819,10 +3828,10 @@ contains integer :: i_vel !!< Loop iterator for velocity components. integer :: idx_rp(3) !!< Indices $(j,k,l)$ of 'right' point for averaging. - !$acc parallel loop collapse(3) gang vector default(present) & - !$acc private(idx_rp, avg_v_int, avg_dvdx_int, avg_dvdy_int, avg_dvdz_int, & - !$acc Re_s, Re_b, vel_src_int, r_eff, divergence_cyl, & - !$acc stress_vector_shear, stress_normal_bulk, div_v_term_const) + $:GPU_PARALLEL_LOOP(collapse=3, private='[idx_rp, avg_v_int, & + & avg_dvdx_int, avg_dvdy_int, avg_dvdz_int, Re_s, Re_b, & + & vel_src_int, r_eff, divergence_cyl, stress_vector_shear, & + & stress_normal_bulk, div_v_term_const]') do l = iz%beg, iz%end do k = iy%beg, iy%end do j = ix%beg, ix%end @@ -3833,7 +3842,7 @@ contains ! Average velocities and their derivatives at the interface ! For cylindrical: x-dir ~ axial (z_cyl), y-dir ~ radial (r_cyl), z-dir ~ azimuthal (theta_cyl) - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i_vel = 1, num_dims avg_v_int(i_vel) = 0.5_wp*(velL_vf(i_vel)%sf(j, k, l) + velR_vf(i_vel)%sf(idx_rp(1), idx_rp(2), idx_rp(3))) @@ -3911,7 +3920,7 @@ contains end if end select - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i_vel = 1, num_dims flux_src_vf(momxb + i_vel - 1)%sf(j, k, l) = flux_src_vf(momxb + i_vel - 1)%sf(j, k, l) - stress_vector_shear(i_vel) flux_src_vf(E_idx)%sf(j, k, l) = flux_src_vf(E_idx)%sf(j, k, l) - vel_src_int(i_vel)*stress_vector_shear(i_vel) @@ -3928,7 +3937,6 @@ contains end do end do end do - !$acc end parallel loop end subroutine s_compute_cylindrical_viscous_source_flux @@ -3982,10 +3990,9 @@ contains real(wp) :: divergence_v !< Velocity divergence at interface. - !$acc parallel loop collapse(3) gang vector default(present) & - !$acc private(idx_right_phys, vel_grad_avg, & - !$acc current_tau_shear, current_tau_bulk, vel_src_at_interface, & - !$acc Re_shear, Re_bulk, divergence_v, i_dim, vel_comp_idx) + $:GPU_PARALLEL_LOOP(collapse=3, private='[idx_right_phys, vel_grad_avg, & + & current_tau_shear, current_tau_bulk, vel_src_at_interface, & + & Re_shear, Re_bulk, divergence_v, i_dim, vel_comp_idx]') do l_loop = isz%beg, isz%end do k_loop = isy%beg, isy%end do j_loop = isx%beg, isx%end @@ -4066,7 +4073,6 @@ contains end do end do end do - !$acc end parallel loop end subroutine s_compute_cartesian_viscous_source_flux @@ -4077,7 +4083,7 @@ contains !! @param[in] divergence_v Velocity divergence (du/dx + dv/dy + dw/dz). !! @param[out] tau_shear_out Calculated shear stress tensor (stress on i-face, j-direction). 
pure subroutine s_calculate_shear_stress_tensor(vel_grad_avg, Re_shear, divergence_v, tau_shear_out) - !$acc routine seq + $:GPU_ROUTINE(parallelism='[seq]') implicit none @@ -4111,7 +4117,7 @@ contains !! @param[in] divergence_v Velocity divergence (du/dx + dv/dy + dw/dz). !! @param[out] tau_bulk_out Calculated bulk stress tensor (stress on i-face, i-direction). pure subroutine s_calculate_bulk_stress_tensor(Re_bulk, divergence_v, tau_bulk_out) - !$acc routine seq + $:GPU_ROUTINE(parallelism='[seq]') implicit none @@ -4151,7 +4157,7 @@ contains ! Reshaping Outputted Data in y-direction if (norm_dir == 2) then - !$acc parallel loop collapse(4) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=4) do i = 1, sys_size do l = is3%beg, is3%end do j = is1%beg, is1%end @@ -4164,7 +4170,7 @@ contains end do if (cyl_coord) then - !$acc parallel loop collapse(4) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=4) do i = 1, sys_size do l = is3%beg, is3%end do j = is1%beg, is1%end @@ -4177,7 +4183,7 @@ contains end do end if - !$acc parallel loop collapse(3) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=3) do l = is3%beg, is3%end do j = is1%beg, is1%end do k = is2%beg, is2%end @@ -4188,7 +4194,7 @@ contains end do if (riemann_solver == 1 .or. riemann_solver == 4) then - !$acc parallel loop collapse(4) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=4) do i = advxb + 1, advxe do l = is3%beg, is3%end do j = is1%beg, is1%end @@ -4203,7 +4209,7 @@ contains end if ! Reshaping Outputted Data in z-direction elseif (norm_dir == 3) then - !$acc parallel loop collapse(4) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=4) do i = 1, sys_size do j = is1%beg, is1%end do k = is2%beg, is2%end @@ -4216,7 +4222,7 @@ contains end do end do if (grid_geometry == 3) then - !$acc parallel loop collapse(4) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=4) do i = 1, sys_size do j = is1%beg, is1%end do k = is2%beg, is2%end @@ -4230,7 +4236,7 @@ contains end do end if - !$acc parallel loop collapse(3) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=3) do j = is1%beg, is1%end do k = is2%beg, is2%end do l = is3%beg, is3%end @@ -4241,7 +4247,7 @@ contains end do if (riemann_solver == 1 .or. riemann_solver == 4) then - !$acc parallel loop collapse(4) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=4) do i = advxb + 1, advxe do j = is1%beg, is1%end do k = is2%beg, is2%end @@ -4255,7 +4261,7 @@ contains end if elseif (norm_dir == 1) then - !$acc parallel loop collapse(4) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=4) do i = 1, sys_size do l = is3%beg, is3%end do k = is2%beg, is2%end @@ -4267,7 +4273,7 @@ contains end do end do - !$acc parallel loop collapse(3) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=3) do l = is3%beg, is3%end do k = is2%beg, is2%end do j = is1%beg, is1%end @@ -4278,7 +4284,7 @@ contains end do if (riemann_solver == 1 .or. 
riemann_solver == 4) then - !$acc parallel loop collapse(4) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=4) do i = advxb + 1, advxe do l = is3%beg, is3%end do k = is2%beg, is2%end diff --git a/src/simulation/m_sim_helpers.f90 b/src/simulation/m_sim_helpers.fpp similarity index 95% rename from src/simulation/m_sim_helpers.f90 rename to src/simulation/m_sim_helpers.fpp index 0ab9c5d1ba..cf8cf80bd7 100644 --- a/src/simulation/m_sim_helpers.f90 +++ b/src/simulation/m_sim_helpers.fpp @@ -1,3 +1,5 @@ +#:include 'macros.fpp' + module m_sim_helpers use m_derived_types !< Definitions of the derived types @@ -19,7 +21,7 @@ module m_sim_helpers !! @param l z coordinate index !! @return fltr_dtheta Modified dtheta value for cylindrical coordinates pure function f_compute_filtered_dtheta(k, l) result(fltr_dtheta) - !$acc routine seq + $:GPU_ROUTINE(parallelism='[seq]') integer, intent(in) :: k, l real(wp) :: fltr_dtheta integer :: Nfq @@ -46,7 +48,7 @@ end function f_compute_filtered_dtheta !! @param l z coordinate index !! @return cfl_terms computed CFL terms for 2D/3D cases pure function f_compute_multidim_cfl_terms(vel, c, j, k, l) result(cfl_terms) - !$acc routine seq + $:GPU_ROUTINE(parallelism='[seq]') real(wp), dimension(num_vels), intent(in) :: vel real(wp), intent(in) :: c integer, intent(in) :: j, k, l @@ -88,11 +90,8 @@ end function f_compute_multidim_cfl_terms !! @param k y index !! @param l z index pure subroutine s_compute_enthalpy(q_prim_vf, pres, rho, gamma, pi_inf, Re, H, alpha, vel, vel_sum, j, k, l) -#ifdef _CRAYFTN - !DIR$ INLINEALWAYS s_compute_enthalpy -#else - !$acc routine seq -#endif + $:GPU_ROUTINE(function_name='s_compute_enthalpy',parallelism='[seq]', & + & cray_inline=True) type(scalar_field), intent(in), dimension(sys_size) :: q_prim_vf real(wp), intent(inout), dimension(num_fluids) :: alpha @@ -106,7 +105,7 @@ pure subroutine s_compute_enthalpy(q_prim_vf, pres, rho, gamma, pi_inf, Re, H, a integer :: i - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, num_fluids alpha_rho(i) = q_prim_vf(i)%sf(j, k, l) alpha(i) = q_prim_vf(E_idx + i)%sf(j, k, l) @@ -121,13 +120,13 @@ pure subroutine s_compute_enthalpy(q_prim_vf, pres, rho, gamma, pi_inf, Re, H, a call s_convert_species_to_mixture_variables_acc(rho, gamma, pi_inf, qv, alpha, alpha_rho, Re) end if - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, num_vels vel(i) = q_prim_vf(contxe + i)%sf(j, k, l) end do vel_sum = 0._wp - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, num_vels vel_sum = vel_sum + vel(i)**2._wp end do @@ -156,7 +155,7 @@ end subroutine s_compute_enthalpy !! @param vcfl_sf (optional) cell-centered viscous CFL number !! @param Rc_sf (optional) cell centered Rc pure subroutine s_compute_stability_from_dt(vel, c, rho, Re_l, j, k, l, icfl_sf, vcfl_sf, Rc_sf) - !$acc routine seq + $:GPU_ROUTINE(parallelism='[seq]') real(wp), intent(in), dimension(num_vels) :: vel real(wp), intent(in) :: c, rho real(wp), dimension(0:m, 0:n, 0:p), intent(inout) :: icfl_sf @@ -219,7 +218,7 @@ end subroutine s_compute_stability_from_dt !! @param k y coordinate !! 
@param l z coordinate pure subroutine s_compute_dt_from_cfl(vel, c, max_dt, rho, Re_l, j, k, l) - !$acc routine seq + $:GPU_ROUTINE(parallelism='[seq]') real(wp), dimension(num_vels), intent(in) :: vel real(wp), intent(in) :: c, rho real(wp), dimension(0:m, 0:n, 0:p), intent(inout) :: max_dt diff --git a/src/simulation/m_start_up.fpp b/src/simulation/m_start_up.fpp index 02b7345530..57106cfac9 100644 --- a/src/simulation/m_start_up.fpp +++ b/src/simulation/m_start_up.fpp @@ -3,6 +3,7 @@ !! @brief Contains module m_start_up #:include 'case.fpp' +#:include 'macros.fpp' !> @brief The purpose of the module is primarily to read in the files that !! contain the inputs, the initial condition data and the grid data @@ -1047,12 +1048,12 @@ contains if (cfl_dt) then if ((mytime + dt) >= t_stop) then dt = t_stop - mytime - !$acc update device(dt) + $:GPU_UPDATE(device='[dt]') end if else if ((mytime + dt) >= finaltime) then dt = finaltime - mytime - !$acc update device(dt) + $:GPU_UPDATE(device='[dt]') end if end if @@ -1076,7 +1077,7 @@ contains if (probe_wrt) then do i = 1, sys_size - !$acc update host(q_cons_ts(1)%vf(i)%sf) + $:GPU_UPDATE(host='[q_cons_ts(1)%vf(i)%sf]') end do end if @@ -1179,7 +1180,7 @@ contains call cpu_time(start) call nvtxStartRange("SAVE-DATA") do i = 1, sys_size - !$acc update host(q_cons_ts(1)%vf(i)%sf) + $:GPU_UPDATE(host='[q_cons_ts(1)%vf(i)%sf]') do l = 0, p do k = 0, n do j = 0, m @@ -1193,8 +1194,8 @@ contains end do if (qbmm .and. .not. polytropic) then - !$acc update host(pb_ts(1)%sf) - !$acc update host(mv_ts(1)%sf) + $:GPU_UPDATE(host='[pb_ts(1)%sf]') + $:GPU_UPDATE(host='[mv_ts(1)%sf]') end if if (cfl_dt) then @@ -1204,16 +1205,16 @@ contains end if if (bubbles_lagrange) then - !$acc update host(intfc_rad) + $:GPU_UPDATE(host='[intfc_rad]') do i = 1, nBubs if (ieee_is_nan(intfc_rad(i, 1)) .or. intfc_rad(i, 1) <= 0._wp) then call s_mpi_abort("Bubble radius is negative or NaN, please reduce dt.") end if end do - !$acc update host(q_beta%vf(1)%sf) + $:GPU_UPDATE(host='[q_beta%vf(1)%sf]') call s_write_data_files(q_cons_ts(1)%vf, q_T_sf, q_prim_vf, save_count, q_beta%vf(1)) - !$acc update host(Rmax_stats, Rmin_stats, gas_p, gas_mv, intfc_vel) + $:GPU_UPDATE(host='[Rmax_stats,Rmin_stats,gas_p,gas_mv,intfc_vel]') call s_write_restart_lag_bubbles(save_count) !parallel if (lag_params%write_bubbles_stats) call s_write_lag_bubble_stats() else @@ -1411,38 +1412,44 @@ contains integer :: i !Update GPU DATA do i = 1, sys_size - !$acc update device(q_cons_ts(1)%vf(i)%sf) + $:GPU_UPDATE(device='[q_cons_ts(1)%vf(i)%sf]') end do if (qbmm .and. .not. 
polytropic) then - !$acc update device(pb_ts(1)%sf, mv_ts(1)%sf) + $:GPU_UPDATE(device='[pb_ts(1)%sf,mv_ts(1)%sf]') end if if (chemistry) then - !$acc update device(q_T_sf%sf) + $:GPU_UPDATE(device='[q_T_sf%sf]') end if - !$acc update device(nb, R0ref, Ca, Web, Re_inv, weight, R0, V0, bubbles_euler, polytropic, polydisperse, qbmm, R0_type, ptil, bubble_model, thermal, poly_sigma, adv_n, adap_dt, adap_dt_tol, n_idx, pi_fac, low_Mach) - !$acc update device(R_n, R_v, phi_vn, phi_nv, Pe_c, Tw, pv, M_n, M_v, k_n, k_v, pb0, mass_n0, mass_v0, Pe_T, Re_trans_T, Re_trans_c, Im_trans_T, Im_trans_c, omegaN , mul0, ss, gamma_v, mu_v, gamma_m, gamma_n, mu_n, gam) + $:GPU_UPDATE(device='[nb,R0ref,Ca,Web,Re_inv,weight,R0,V0, & + & bubbles_euler,polytropic,polydisperse,qbmm,R0_type, & + & ptil,bubble_model,thermal,poly_sigma,adv_n,adap_dt, & + & adap_dt_tol,n_idx,pi_fac,low_Mach]') + $:GPU_UPDATE(device='[R_n,R_v,phi_vn,phi_nv,Pe_c,Tw,pv,M_n, & + & M_v,k_n,k_v,pb0,mass_n0,mass_v0,Pe_T,Re_trans_T, & + & Re_trans_c,Im_trans_T,Im_trans_c,omegaN,mul0,ss, & + & gamma_v,mu_v,gamma_m,gamma_n,mu_n,gam]') - !$acc update device(acoustic_source, num_source) - !$acc update device(sigma, surface_tension) + $:GPU_UPDATE(device='[acoustic_source, num_source]') + $:GPU_UPDATE(device='[sigma, surface_tension]') - !$acc update device(dx, dy, dz, x_cb, x_cc, y_cb, y_cc, z_cb, z_cc) + $:GPU_UPDATE(device='[dx,dy,dz,x_cb,x_cc,y_cb,y_cc,z_cb,z_cc]') - !$acc update device(bc_x%vb1, bc_x%vb2, bc_x%vb3, bc_x%ve1, bc_x%ve2, bc_x%ve3) - !$acc update device(bc_y%vb1, bc_y%vb2, bc_y%vb3, bc_y%ve1, bc_y%ve2, bc_y%ve3) - !$acc update device(bc_z%vb1, bc_z%vb2, bc_z%vb3, bc_z%ve1, bc_z%ve2, bc_z%ve3) + $:GPU_UPDATE(device='[bc_x%vb1,bc_x%vb2,bc_x%vb3,bc_x%ve1,bc_x%ve2,bc_x%ve3]') + $:GPU_UPDATE(device='[bc_y%vb1,bc_y%vb2,bc_y%vb3,bc_y%ve1,bc_y%ve2,bc_y%ve3]') + $:GPU_UPDATE(device='[bc_z%vb1,bc_z%vb2,bc_z%vb3,bc_z%ve1,bc_z%ve2,bc_z%ve3]') - !$acc update device(bc_x%grcbc_in, bc_x%grcbc_out, bc_x%grcbc_vel_out) - !$acc update device(bc_y%grcbc_in, bc_y%grcbc_out, bc_y%grcbc_vel_out) - !$acc update device(bc_z%grcbc_in, bc_z%grcbc_out, bc_z%grcbc_vel_out) + $:GPU_UPDATE(device='[bc_x%grcbc_in,bc_x%grcbc_out,bc_x%grcbc_vel_out]') + $:GPU_UPDATE(device='[bc_y%grcbc_in,bc_y%grcbc_out,bc_y%grcbc_vel_out]') + $:GPU_UPDATE(device='[bc_z%grcbc_in,bc_z%grcbc_out,bc_z%grcbc_vel_out]') - !$acc update device(relax, relax_model) + $:GPU_UPDATE(device='[relax, relax_model]') if (relax) then - !$acc update device(palpha_eps, ptgalpha_eps) + $:GPU_UPDATE(device='[palpha_eps, ptgalpha_eps]') end if if (ib) then - !$acc update device(ib_markers%sf) + $:GPU_UPDATE(device='[ib_markers%sf]') end if end subroutine s_initialize_gpu_vars diff --git a/src/simulation/m_surface_tension.fpp b/src/simulation/m_surface_tension.fpp index 5cf87531aa..30b6fcec2b 100644 --- a/src/simulation/m_surface_tension.fpp +++ b/src/simulation/m_surface_tension.fpp @@ -29,16 +29,16 @@ module m_surface_tension !> @{ type(scalar_field), allocatable, dimension(:) :: c_divs !> @) - !$acc declare create(c_divs) + $:GPU_DECLARE(create='[c_divs]') !> @name cell boundary reconstructed gradient components and magnitude !> @{ real(wp), allocatable, dimension(:, :, :, :) :: gL_x, gR_x, gL_y, gR_y, gL_z, gR_z !> @} - !$acc declare create(gL_x, gR_x, gL_y, gR_y, gL_z, gR_z) + $:GPU_DECLARE(create='[gL_x,gR_x,gL_y,gR_y,gL_z,gR_z]') type(int_bounds_info) :: is1, is2, is3, iv - !$acc declare create(is1, is2, is3, iv) + $:GPU_DECLARE(create='[is1,is2,is3,iv]') contains @@ -85,8 +85,9 @@ contains 
integer :: j, k, l, i if (id == 1) then - !$acc parallel loop collapse(3) gang vector default(present) private(Omega, & - !$acc w1L, w2L, w3L, w1R, w2R, w3R, w1, w2, w3, normWL, normWR, normW) + $:GPU_PARALLEL_LOOP(collapse=3, private='[Omega, w1L, w2L, w3L, & + & w1R, w2R, w3R, w1, w2, w3, normWL, & + & normWR, normW]') do l = isz%beg, isz%end do k = isy%beg, isy%end do j = isx%beg, isx%end @@ -131,8 +132,9 @@ contains elseif (id == 2) then - !$acc parallel loop collapse(3) gang vector default(present) private(Omega, & - !$acc w1L, w2L, w3L, w1R, w2R, w3R, w1, w2, w3, normWL, normWR, normW) + $:GPU_PARALLEL_LOOP(collapse=3, private='[Omega, w1L, w2L, w3L, & + & w1R, w2R, w3R, w1, w2, w3, normWL, normWR, & + & normW]') do l = isz%beg, isz%end do k = isy%beg, isy%end do j = isx%beg, isx%end @@ -177,8 +179,9 @@ contains elseif (id == 3) then - !$acc parallel loop collapse(3) gang vector default(present) private(Omega, & - !$acc w1L, w2L, w3L, w1R, w2R, w3R, w1, w2, w3, normWL, normWR, normW) + $:GPU_PARALLEL_LOOP(collapse=3, private='[Omega, w1L, w2L, w3L, & + & w1R, w2R, w3R, w1, w2, w3, normWL, normWR, & + & normW]') do l = isz%beg, isz%end do k = isy%beg, isy%end do j = isx%beg, isx%end @@ -240,7 +243,7 @@ contains isx%end = m; isy%end = n; isz%end = p ! compute gradient components - !$acc parallel loop collapse(3) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=3) do l = 0, p do k = 0, n do j = 0, m @@ -250,7 +253,7 @@ contains end do end do - !$acc parallel loop collapse(3) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=3) do l = 0, p do k = 0, n do j = 0, m @@ -261,7 +264,7 @@ contains end do if (p > 0) then - !$acc parallel loop collapse(3) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=3) do l = 0, p do k = 0, n do j = 0, m @@ -272,7 +275,7 @@ contains end do end if - !$acc parallel loop collapse(3) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=3) do l = 0, p do k = 0, n do j = 0, m @@ -332,10 +335,10 @@ contains end if - !$acc update device(is1, is2, is3, iv) + $:GPU_UPDATE(device='[is1,is2,is3,iv]') if (recon_dir == 1) then - !$acc parallel loop collapse(4) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=4) do i = iv%beg, iv%end do l = is3%beg, is3%end do k = is2%beg, is2%end @@ -346,9 +349,8 @@ contains end do end do end do - !$acc end parallel loop else if (recon_dir == 2) then - !$acc parallel loop collapse(4) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=4) do i = iv%beg, iv%end do l = is3%beg, is3%end do k = is2%beg, is2%end @@ -359,9 +361,8 @@ contains end do end do end do - !$acc end parallel loop else if (recon_dir == 3) then - !$acc parallel loop collapse(4) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=4) do i = iv%beg, iv%end do l = is3%beg, is3%end do k = is2%beg, is2%end @@ -372,7 +373,6 @@ contains end do end do end do - !$acc end parallel loop end if end subroutine s_reconstruct_cell_boundary_values_capillary diff --git a/src/simulation/m_time_steppers.fpp b/src/simulation/m_time_steppers.fpp index b1c338b5c9..381455be2b 100644 --- a/src/simulation/m_time_steppers.fpp +++ b/src/simulation/m_time_steppers.fpp @@ -75,7 +75,7 @@ module m_time_steppers integer, private :: num_ts !< !! 
Number of time stages in the time-stepping scheme - !$acc declare create(q_cons_ts, q_prim_vf, q_T_sf, rhs_vf, q_prim_ts, rhs_mv, rhs_pb, max_dt) + $:GPU_DECLARE(create='[q_cons_ts,q_prim_vf,q_T_sf,rhs_vf,q_prim_ts,rhs_mv,rhs_pb,max_dt]') contains @@ -377,7 +377,7 @@ contains if (bubbles_lagrange .and. .not. adap_dt) call s_update_lagrange_tdv_rk(stage=1) - !$acc parallel loop collapse(4) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=4) do i = 1, sys_size do l = 0, p do k = 0, n @@ -392,7 +392,7 @@ contains !Evolve pb and mv for non-polytropic qbmm if (qbmm .and. (.not. polytropic)) then - !$acc parallel loop collapse(5) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=5) do i = 1, nb do l = 0, p do k = 0, n @@ -409,7 +409,7 @@ contains end if if (qbmm .and. (.not. polytropic)) then - !$acc parallel loop collapse(5) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=5) do i = 1, nb do l = 0, p do k = 0, n @@ -479,7 +479,7 @@ contains if (bubbles_lagrange .and. .not. adap_dt) call s_update_lagrange_tdv_rk(stage=1) - !$acc parallel loop collapse(4) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=4) do i = 1, sys_size do l = 0, p do k = 0, n @@ -494,7 +494,7 @@ contains !Evolve pb and mv for non-polytropic qbmm if (qbmm .and. (.not. polytropic)) then - !$acc parallel loop collapse(5) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=5) do i = 1, nb do l = 0, p do k = 0, n @@ -511,7 +511,7 @@ contains end if if (qbmm .and. (.not. polytropic)) then - !$acc parallel loop collapse(5) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=5) do i = 1, nb do l = 0, p do k = 0, n @@ -551,7 +551,7 @@ contains if (bubbles_lagrange .and. .not. adap_dt) call s_update_lagrange_tdv_rk(stage=2) - !$acc parallel loop collapse(4) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=4) do i = 1, sys_size do l = 0, p do k = 0, n @@ -566,7 +566,7 @@ contains end do if (qbmm .and. (.not. polytropic)) then - !$acc parallel loop collapse(5) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=5) do i = 1, nb do l = 0, p do k = 0, n @@ -584,7 +584,7 @@ contains end if if (qbmm .and. (.not. polytropic)) then - !$acc parallel loop collapse(5) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=5) do i = 1, nb do l = 0, p do k = 0, n @@ -661,7 +661,7 @@ contains if (bubbles_lagrange .and. .not. adap_dt) call s_update_lagrange_tdv_rk(stage=1) - !$acc parallel loop collapse(4) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=4) do i = 1, sys_size do l = 0, p do k = 0, n @@ -676,7 +676,7 @@ contains !Evolve pb and mv for non-polytropic qbmm if (qbmm .and. (.not. polytropic)) then - !$acc parallel loop collapse(5) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=5) do i = 1, nb do l = 0, p do k = 0, n @@ -693,7 +693,7 @@ contains end if if (qbmm .and. (.not. polytropic)) then - !$acc parallel loop collapse(5) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=5) do i = 1, nb do l = 0, p do k = 0, n @@ -733,7 +733,7 @@ contains if (bubbles_lagrange .and. .not. adap_dt) call s_update_lagrange_tdv_rk(stage=2) - !$acc parallel loop collapse(4) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=4) do i = 1, sys_size do l = 0, p do k = 0, n @@ -748,7 +748,7 @@ contains end do if (qbmm .and. (.not. polytropic)) then - !$acc parallel loop collapse(5) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=5) do i = 1, nb do l = 0, p do k = 0, n @@ -766,7 +766,7 @@ contains end if if (qbmm .and. (.not. 
polytropic)) then - !$acc parallel loop collapse(5) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=5) do i = 1, nb do l = 0, p do k = 0, n @@ -806,7 +806,7 @@ contains if (bubbles_lagrange .and. .not. adap_dt) call s_update_lagrange_tdv_rk(stage=3) - !$acc parallel loop collapse(4) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=4) do i = 1, sys_size do l = 0, p do k = 0, n @@ -821,7 +821,7 @@ contains end do if (qbmm .and. (.not. polytropic)) then - !$acc parallel loop collapse(5) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=5) do i = 1, nb do l = 0, p do k = 0, n @@ -839,7 +839,7 @@ contains end if if (qbmm .and. (.not. polytropic)) then - !$acc parallel loop collapse(5) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=5) do i = 1, nb do l = 0, p do k = 0, n @@ -946,7 +946,7 @@ contains if (stage == 3) then if (lag_params%write_bubbles_stats) call s_calculate_lag_bubble_stats() if (lag_params%write_bubbles) then - !$acc update host(gas_p, gas_mv, intfc_rad, intfc_vel) + $:GPU_UPDATE(host='[gas_p,gas_mv,intfc_rad,intfc_vel]') call s_write_lag_particles(mytime) end if call s_write_void_evol(mytime) @@ -979,7 +979,7 @@ contains q_prim_vf, & idwint) - !$acc parallel loop collapse(3) gang vector default(present) private(vel, alpha, Re) + $:GPU_PARALLEL_LOOP(collapse=3, private='[vel, alpha, Re]') do l = 0, p do k = 0, n do j = 0, m @@ -993,9 +993,9 @@ contains end do end do - !$acc kernels - dt_local = minval(max_dt) - !$acc end kernels + #:call GPU_PARALLEL() + dt_local = minval(max_dt) + #:endcall GPU_PARALLEL if (num_procs == 1) then dt = dt_local @@ -1003,7 +1003,7 @@ contains call s_mpi_allreduce_min(dt_local, dt) end if - !$acc update device(dt) + $:GPU_UPDATE(device='[dt]') end subroutine s_compute_dt @@ -1022,7 +1022,7 @@ contains call nvtxStartRange("RHS-BODYFORCES") call s_compute_body_forces_rhs(q_prim_vf, q_cons_vf, rhs_vf) - !$acc parallel loop collapse(4) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=4) do i = momxb, E_idx do l = 0, p do k = 0, n @@ -1048,7 +1048,7 @@ contains integer :: i !< Generic loop iterator do i = 1, sys_size - !$acc update host(q_prim_vf(i)%sf) + $:GPU_UPDATE(host='[q_prim_vf(i)%sf]') end do if (t_step == t_step_start) then diff --git a/src/simulation/m_viscous.fpp b/src/simulation/m_viscous.fpp index 24b7dbb8df..ef301f5856 100644 --- a/src/simulation/m_viscous.fpp +++ b/src/simulation/m_viscous.fpp @@ -24,10 +24,10 @@ module m_viscous type(int_bounds_info) :: iv type(int_bounds_info) :: is1_viscous, is2_viscous, is3_viscous - !$acc declare create(is1_viscous, is2_viscous, is3_viscous, iv) + $:GPU_DECLARE(create='[is1_viscous,is2_viscous,is3_viscous,iv]') real(wp), allocatable, dimension(:, :) :: Res_viscous - !$acc declare create(Res_viscous) + $:GPU_DECLARE(create='[Res_viscous]') contains @@ -42,8 +42,8 @@ contains Res_viscous(i, j) = fluid_pp(Re_idx(i, j))%Re(i) end do end do - !$acc update device(Res_viscous, Re_idx, Re_size) - !$acc enter data copyin(is1_viscous, is2_viscous, is3_viscous, iv) + $:GPU_UPDATE(device='[Res_viscous,Re_idx,Re_size]') + $:GPU_ENTER_DATA(copyin='[is1_viscous,is2_viscous,is3_viscous,iv]') end subroutine s_initialize_viscous_module @@ -75,13 +75,13 @@ contains is1_viscous = ix; is2_viscous = iy; is3_viscous = iz - !$acc update device(is1_viscous, is2_viscous, is3_viscous) + $:GPU_UPDATE(device='[is1_viscous,is2_viscous,is3_viscous]') - !$acc parallel loop collapse(3) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=3) do l = is3_viscous%beg, 
is3_viscous%end do k = is2_viscous%beg, is2_viscous%end do j = is1_viscous%beg, is1_viscous%end - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = momxb, E_idx tau_Re_vf(i)%sf(j, k, l) = 0._wp end do @@ -89,12 +89,13 @@ contains end do end do if (shear_stress) then ! Shear stresses - !$acc parallel loop collapse(3) gang vector default(present) private(alpha_visc, alpha_rho_visc, Re_visc, tau_Re ) + $:GPU_PARALLEL_LOOP(collapse=3, private='[alpha_visc, & + & alpha_rho_visc, Re_visc, tau_Re]') do l = is3_viscous%beg, is3_viscous%end do k = -1, 1 do j = is1_viscous%beg, is1_viscous%end - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, num_fluids alpha_rho_visc(i) = q_prim_vf(i)%sf(j, k, l) if (bubbles_euler .and. num_fluids == 1) then @@ -110,14 +111,14 @@ contains pi_inf_visc = 0._wp if (mpp_lim .and. (model_eqns == 2) .and. (num_fluids > 2)) then - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, num_fluids rho_visc = rho_visc + alpha_rho_visc(i) gamma_visc = gamma_visc + alpha_visc(i)*gammas(i) pi_inf_visc = pi_inf_visc + alpha_visc(i)*pi_infs(i) end do else if ((model_eqns == 2) .and. (num_fluids > 2)) then - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, num_fluids - 1 rho_visc = rho_visc + alpha_rho_visc(i) gamma_visc = gamma_visc + alpha_visc(i)*gammas(i) @@ -136,7 +137,7 @@ contains alpha_visc_sum = 0._wp if (mpp_lim) then - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, num_fluids alpha_rho_visc(i) = max(0._wp, alpha_rho_visc(i)) alpha_visc(i) = min(max(0._wp, alpha_visc(i)), 1._wp) @@ -147,7 +148,7 @@ contains end if - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, num_fluids rho_visc = rho_visc + alpha_rho_visc(i) gamma_visc = gamma_visc + alpha_visc(i)*gammas(i) @@ -155,12 +156,12 @@ contains end do if (viscous) then - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, 2 Re_visc(i) = dflt_real if (Re_size(i) > 0) Re_visc(i) = 0._wp - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do q = 1, Re_size(i) Re_visc(i) = alpha_visc(Re_idx(i, q))/Res_viscous(i, q) & + Re_visc(i) @@ -180,7 +181,7 @@ contains - 2._wp*grad_x_vf(1)%sf(j, k, l) & - 2._wp*q_prim_vf(momxb + 1)%sf(j, k, l)/y_cc(k))/ & (3._wp*Re_visc(1)) - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, 2 tau_Re_vf(contxe + i)%sf(j, k, l) = & tau_Re_vf(contxe + i)%sf(j, k, l) - & @@ -196,12 +197,13 @@ contains end if if (bulk_stress) then ! Bulk stresses - !$acc parallel loop collapse(3) gang vector default(present) private(alpha_visc, alpha_rho_visc, Re_visc, tau_Re ) + $:GPU_PARALLEL_LOOP(collapse=3, private='[alpha_visc, & + & alpha_rho_visc, Re_visc, tau_Re]') do l = is3_viscous%beg, is3_viscous%end do k = -1, 1 do j = is1_viscous%beg, is1_viscous%end - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, num_fluids alpha_rho_visc(i) = q_prim_vf(i)%sf(j, k, l) if (bubbles_euler .and. num_fluids == 1) then @@ -217,14 +219,14 @@ contains pi_inf_visc = 0._wp if (mpp_lim .and. (model_eqns == 2) .and. (num_fluids > 2)) then - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, num_fluids rho_visc = rho_visc + alpha_rho_visc(i) gamma_visc = gamma_visc + alpha_visc(i)*gammas(i) pi_inf_visc = pi_inf_visc + alpha_visc(i)*pi_infs(i) end do else if ((model_eqns == 2) .and. 
(num_fluids > 2)) then - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, num_fluids - 1 rho_visc = rho_visc + alpha_rho_visc(i) gamma_visc = gamma_visc + alpha_visc(i)*gammas(i) @@ -243,7 +245,7 @@ contains alpha_visc_sum = 0._wp if (mpp_lim) then - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, num_fluids alpha_rho_visc(i) = max(0._wp, alpha_rho_visc(i)) alpha_visc(i) = min(max(0._wp, alpha_visc(i)), 1._wp) @@ -254,7 +256,7 @@ contains end if - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, num_fluids rho_visc = rho_visc + alpha_rho_visc(i) gamma_visc = gamma_visc + alpha_visc(i)*gammas(i) @@ -262,12 +264,12 @@ contains end do if (viscous) then - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, 2 Re_visc(i) = dflt_real if (Re_size(i) > 0) Re_visc(i) = 0._wp - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do q = 1, Re_size(i) Re_visc(i) = alpha_visc(Re_idx(i, q))/Res_viscous(i, q) & + Re_visc(i) @@ -300,12 +302,13 @@ contains if (p == 0) return if (shear_stress) then ! Shear stresses - !$acc parallel loop collapse(3) gang vector default(present) private(alpha_visc, alpha_rho_visc, Re_visc, tau_Re ) + $:GPU_PARALLEL_LOOP(collapse=3, private='[alpha_visc, & + & alpha_rho_visc, Re_visc, tau_Re]') do l = is3_viscous%beg, is3_viscous%end do k = -1, 1 do j = is1_viscous%beg, is1_viscous%end - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, num_fluids alpha_rho_visc(i) = q_prim_vf(i)%sf(j, k, l) if (bubbles_euler .and. num_fluids == 1) then @@ -321,14 +324,14 @@ contains pi_inf_visc = 0._wp if (mpp_lim .and. (model_eqns == 2) .and. (num_fluids > 2)) then - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, num_fluids rho_visc = rho_visc + alpha_rho_visc(i) gamma_visc = gamma_visc + alpha_visc(i)*gammas(i) pi_inf_visc = pi_inf_visc + alpha_visc(i)*pi_infs(i) end do else if ((model_eqns == 2) .and. (num_fluids > 2)) then - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, num_fluids - 1 rho_visc = rho_visc + alpha_rho_visc(i) gamma_visc = gamma_visc + alpha_visc(i)*gammas(i) @@ -347,7 +350,7 @@ contains alpha_visc_sum = 0._wp if (mpp_lim) then - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, num_fluids alpha_rho_visc(i) = max(0._wp, alpha_rho_visc(i)) alpha_visc(i) = min(max(0._wp, alpha_visc(i)), 1._wp) @@ -358,7 +361,7 @@ contains end if - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, num_fluids rho_visc = rho_visc + alpha_rho_visc(i) gamma_visc = gamma_visc + alpha_visc(i)*gammas(i) @@ -366,12 +369,12 @@ contains end do if (viscous) then - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, 2 Re_visc(i) = dflt_real if (Re_size(i) > 0) Re_visc(i) = 0._wp - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do q = 1, Re_size(i) Re_visc(i) = alpha_visc(Re_idx(i, q))/Res_viscous(i, q) & + Re_visc(i) @@ -391,7 +394,7 @@ contains y_cc(k) + grad_y_vf(3)%sf(j, k, l))/ & Re_visc(1) - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 2, 3 tau_Re_vf(contxe + i)%sf(j, k, l) = & tau_Re_vf(contxe + i)%sf(j, k, l) - & @@ -408,12 +411,13 @@ contains end if if (bulk_stress) then ! 
Bulk stresses - !$acc parallel loop collapse(3) gang vector default(present) private(alpha_visc, alpha_rho_visc, Re_visc, tau_Re ) + $:GPU_PARALLEL_LOOP(collapse=3, private='[alpha_visc, & + & alpha_rho_visc, Re_visc, tau_Re]') do l = is3_viscous%beg, is3_viscous%end do k = -1, 1 do j = is1_viscous%beg, is1_viscous%end - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, num_fluids alpha_rho_visc(i) = q_prim_vf(i)%sf(j, k, l) if (bubbles_euler .and. num_fluids == 1) then @@ -429,14 +433,14 @@ contains pi_inf_visc = 0._wp if (mpp_lim .and. (model_eqns == 2) .and. (num_fluids > 2)) then - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, num_fluids rho_visc = rho_visc + alpha_rho_visc(i) gamma_visc = gamma_visc + alpha_visc(i)*gammas(i) pi_inf_visc = pi_inf_visc + alpha_visc(i)*pi_infs(i) end do else if ((model_eqns == 2) .and. (num_fluids > 2)) then - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, num_fluids - 1 rho_visc = rho_visc + alpha_rho_visc(i) gamma_visc = gamma_visc + alpha_visc(i)*gammas(i) @@ -455,7 +459,7 @@ contains alpha_visc_sum = 0._wp if (mpp_lim) then - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, num_fluids alpha_rho_visc(i) = max(0._wp, alpha_rho_visc(i)) alpha_visc(i) = min(max(0._wp, alpha_visc(i)), 1._wp) @@ -466,7 +470,7 @@ contains end if - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, num_fluids rho_visc = rho_visc + alpha_rho_visc(i) gamma_visc = gamma_visc + alpha_visc(i)*gammas(i) @@ -474,12 +478,12 @@ contains end do if (viscous) then - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, 2 Re_visc(i) = dflt_real if (Re_size(i) > 0) Re_visc(i) = 0._wp - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do q = 1, Re_size(i) Re_visc(i) = alpha_visc(Re_idx(i, q))/Res_viscous(i, q) & + Re_visc(i) @@ -545,7 +549,7 @@ contains iv%beg = mom_idx%beg; iv%end = mom_idx%end - !$acc update device(iv) + $:GPU_UPDATE(device='[iv]') call s_reconstruct_cell_boundary_values_visc( & q_prim_qp%vf(iv%beg:iv%end), & @@ -583,17 +587,17 @@ contains else ! 
Compute velocity gradient at cell centers using finite differences iv%beg = mom_idx%beg; iv%end = mom_idx%end - !$acc update device(iv) + $:GPU_UPDATE(device='[iv]') is1_viscous = ix; is2_viscous = iy; is3_viscous = iz - !$acc update device(is1_viscous, is2_viscous, is3_viscous) + $:GPU_UPDATE(device='[is1_viscous,is2_viscous,is3_viscous]') - !$acc parallel loop collapse(3) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=3) do l = is3_viscous%beg, is3_viscous%end do k = iy%beg, iy%end do j = is1_viscous%beg + 1, is1_viscous%end - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = iv%beg, iv%end dqL_prim_dx_n(1)%vf(i)%sf(j, k, l) = & (q_prim_qp%vf(i)%sf(j, k, l) - & @@ -604,11 +608,11 @@ contains end do end do - !$acc parallel loop collapse(3) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=3) do l = is3_viscous%beg, is3_viscous%end do k = is2_viscous%beg, is2_viscous%end do j = is1_viscous%beg, is1_viscous%end - 1 - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = iv%beg, iv%end dqR_prim_dx_n(1)%vf(i)%sf(j, k, l) = & (q_prim_qp%vf(i)%sf(j + 1, k, l) - & @@ -621,11 +625,11 @@ contains if (n > 0) then - !$acc parallel loop collapse(3) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=3) do l = is3_viscous%beg, is3_viscous%end do j = is2_viscous%beg + 1, is2_viscous%end do k = is1_viscous%beg, is1_viscous%end - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = iv%beg, iv%end dqL_prim_dy_n(2)%vf(i)%sf(k, j, l) = & (q_prim_qp%vf(i)%sf(k, j, l) - & @@ -636,11 +640,11 @@ contains end do end do - !$acc parallel loop collapse(3) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=3) do l = is3_viscous%beg, is3_viscous%end do j = is2_viscous%beg, is2_viscous%end - 1 do k = is1_viscous%beg, is1_viscous%end - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = iv%beg, iv%end dqR_prim_dy_n(2)%vf(i)%sf(k, j, l) = & (q_prim_qp%vf(i)%sf(k, j + 1, l) - & @@ -651,11 +655,11 @@ contains end do end do - !$acc parallel loop collapse(3) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=3) do l = is3_viscous%beg, is3_viscous%end do j = is2_viscous%beg + 1, is2_viscous%end do k = is1_viscous%beg + 1, is1_viscous%end - 1 - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = iv%beg, iv%end dqL_prim_dx_n(2)%vf(i)%sf(k, j, l) = & (dqL_prim_dx_n(1)%vf(i)%sf(k, j, l) + & @@ -670,11 +674,11 @@ contains end do end do - !$acc parallel loop collapse(3) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=3) do l = is3_viscous%beg, is3_viscous%end do j = is2_viscous%beg, is2_viscous%end - 1 do k = is1_viscous%beg + 1, is1_viscous%end - 1 - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = iv%beg, iv%end dqR_prim_dx_n(2)%vf(i)%sf(k, j, l) = & (dqL_prim_dx_n(1)%vf(i)%sf(k, j + 1, l) + & @@ -690,11 +694,11 @@ contains end do end do - !$acc parallel loop collapse(3) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=3) do l = is3_viscous%beg, is3_viscous%end do k = is2_viscous%beg + 1, is2_viscous%end - 1 do j = is1_viscous%beg + 1, is1_viscous%end - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = iv%beg, iv%end dqL_prim_dy_n(1)%vf(i)%sf(j, k, l) = & (dqL_prim_dy_n(2)%vf(i)%sf(j, k, l) + & @@ -710,11 +714,11 @@ contains end do end do - !$acc parallel loop collapse(3) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=3) do l = is3_viscous%beg, is3_viscous%end do k = is2_viscous%beg + 1, is2_viscous%end - 1 do j = is1_viscous%beg, is1_viscous%end - 1 - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 
iv%beg, iv%end dqR_prim_dy_n(1)%vf(i)%sf(j, k, l) = & (dqL_prim_dy_n(2)%vf(i)%sf(j + 1, k, l) + & @@ -732,11 +736,11 @@ contains if (p > 0) then - !$acc parallel loop collapse(3) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=3) do j = is3_viscous%beg + 1, is3_viscous%end do l = is2_viscous%beg, is2_viscous%end do k = is1_viscous%beg, is1_viscous%end - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = iv%beg, iv%end dqL_prim_dz_n(3)%vf(i)%sf(k, l, j) = & @@ -748,11 +752,11 @@ contains end do end do - !$acc parallel loop collapse(3) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=3) do j = is3_viscous%beg, is3_viscous%end - 1 do l = is2_viscous%beg, is2_viscous%end do k = is1_viscous%beg, is1_viscous%end - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = iv%beg, iv%end dqR_prim_dz_n(3)%vf(i)%sf(k, l, j) = & @@ -764,11 +768,11 @@ contains end do end do - !$acc parallel loop collapse(3) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=3) do l = is3_viscous%beg + 1, is3_viscous%end - 1 do k = is2_viscous%beg, is2_viscous%end do j = is1_viscous%beg + 1, is1_viscous%end - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = iv%beg, iv%end dqL_prim_dz_n(1)%vf(i)%sf(j, k, l) = & @@ -785,11 +789,11 @@ contains end do end do - !$acc parallel loop collapse(3) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=3) do l = is3_viscous%beg + 1, is3_viscous%end - 1 do k = is2_viscous%beg, is2_viscous%end do j = is1_viscous%beg, is1_viscous%end - 1 - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = iv%beg, iv%end dqR_prim_dz_n(1)%vf(i)%sf(j, k, l) = & @@ -806,11 +810,11 @@ contains end do end do - !$acc parallel loop collapse(3) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=3) do l = is3_viscous%beg + 1, is3_viscous%end - 1 do j = is2_viscous%beg + 1, is2_viscous%end do k = is1_viscous%beg, is1_viscous%end - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = iv%beg, iv%end dqL_prim_dz_n(2)%vf(i)%sf(k, j, l) = & @@ -827,11 +831,11 @@ contains end do end do - !$acc parallel loop collapse(3) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=3) do l = is3_viscous%beg + 1, is3_viscous%end - 1 do j = is2_viscous%beg, is2_viscous%end - 1 do k = is1_viscous%beg, is1_viscous%end - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = iv%beg, iv%end dqR_prim_dz_n(2)%vf(i)%sf(k, j, l) = & @@ -848,11 +852,11 @@ contains end do end do - !$acc parallel loop collapse(3) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=3) do j = is3_viscous%beg + 1, is3_viscous%end do l = is2_viscous%beg + 1, is2_viscous%end - 1 do k = is1_viscous%beg, is1_viscous%end - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = iv%beg, iv%end dqL_prim_dy_n(3)%vf(i)%sf(k, l, j) = & @@ -869,11 +873,11 @@ contains end do end do - !$acc parallel loop collapse(3) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=3) do j = is3_viscous%beg, is3_viscous%end - 1 do l = is2_viscous%beg + 1, is2_viscous%end - 1 do k = is1_viscous%beg, is1_viscous%end - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = iv%beg, iv%end dqR_prim_dy_n(3)%vf(i)%sf(k, l, j) = & @@ -889,11 +893,11 @@ contains end do end do end do - !$acc parallel loop collapse(3) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=3) do j = is3_viscous%beg + 1, is3_viscous%end do l = is2_viscous%beg, is2_viscous%end do k = is1_viscous%beg + 1, is1_viscous%end - 1 - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = iv%beg, iv%end dqL_prim_dx_n(3)%vf(i)%sf(k, 
l, j) = & @@ -909,11 +913,11 @@ contains end do end do end do - !$acc parallel loop collapse(3) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=3) do j = is3_viscous%beg, is3_viscous%end - 1 do l = is2_viscous%beg, is2_viscous%end do k = is1_viscous%beg + 1, is1_viscous%end - 1 - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = iv%beg, iv%end dqR_prim_dx_n(3)%vf(i)%sf(k, l, j) = & (dqL_prim_dx_n(1)%vf(i)%sf(k, l, j + 1) + & @@ -995,7 +999,7 @@ contains end if - !$acc update device(is1_viscous, is2_viscous, is3_viscous, iv) + $:GPU_UPDATE(device='[is1_viscous, is2_viscous, is3_viscous, iv]') if (n > 0) then if (p > 0) then @@ -1019,7 +1023,7 @@ contains if (viscous) then if (weno_Re_flux) then if (norm_dir == 2) then - !$acc parallel loop collapse(4) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=4) do i = iv%beg, iv%end do l = is3_viscous%beg, is3_viscous%end do j = is1_viscous%beg, is1_viscous%end @@ -1031,7 +1035,7 @@ contains end do end do elseif (norm_dir == 3) then - !$acc parallel loop collapse(4) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=4) do i = iv%beg, iv%end do j = is1_viscous%beg, is1_viscous%end do k = is2_viscous%beg, is2_viscous%end @@ -1043,7 +1047,7 @@ contains end do end do elseif (norm_dir == 1) then - !$acc parallel loop collapse(4) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=4) do i = iv%beg, iv%end do l = is3_viscous%beg, is3_viscous%end do k = is2_viscous%beg, is2_viscous%end @@ -1092,7 +1096,7 @@ contains end if - !$acc update device(is1_viscous, is2_viscous, is3_viscous, iv) + $:GPU_UPDATE(device='[is1_viscous, is2_viscous, is3_viscous, iv]') if (n > 0) then if (p > 0) then @@ -1118,7 +1122,7 @@ contains if (viscous) then if (weno_Re_flux) then if (norm_dir == 2) then - !$acc parallel loop collapse(4) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=4) do i = iv%beg, iv%end do l = is3_viscous%beg, is3_viscous%end do j = is1_viscous%beg, is1_viscous%end @@ -1130,7 +1134,7 @@ contains end do end do elseif (norm_dir == 3) then - !$acc parallel loop collapse(4) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=4) do i = iv%beg, iv%end do j = is1_viscous%beg, is1_viscous%end do k = is2_viscous%beg, is2_viscous%end @@ -1142,7 +1146,7 @@ contains end do end do elseif (norm_dir == 1) then - !$acc parallel loop collapse(4) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=4) do i = iv%beg, iv%end do l = is3_viscous%beg, is3_viscous%end do k = is2_viscous%beg, is2_viscous%end @@ -1195,7 +1199,7 @@ contains is3_viscous = iz iv = iv_in - !$acc update device(is1_viscous, is2_viscous, is3_viscous, iv) + $:GPU_UPDATE(device='[is1_viscous, is2_viscous, is3_viscous, iv]') ! First-Order Spatial Derivatives in x-direction if (norm_dir == 1) then @@ -1206,11 +1210,11 @@ contains ! cell-boundaries, to calculate the cell-averaged first-order ! spatial derivatives inside the cell. - !$acc parallel loop collapse(3) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=3) do l = is3_viscous%beg, is3_viscous%end do k = is2_viscous%beg, is2_viscous%end do j = is1_viscous%beg + 1, is1_viscous%end - 1 - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = iv%beg, iv%end dv_ds_vf(i)%sf(j, k, l) = & 1._wp/((1._wp + wa_flg)*dL(j)) & @@ -1234,11 +1238,11 @@ contains ! cell-boundaries, to calculate the cell-averaged first-order ! spatial derivatives inside the cell. 
- !$acc parallel loop collapse(3) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=3) do l = is3_viscous%beg, is3_viscous%end do k = is2_viscous%beg + 1, is2_viscous%end - 1 do j = is1_viscous%beg, is1_viscous%end - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = iv%beg, iv%end dv_ds_vf(i)%sf(j, k, l) = & 1._wp/((1._wp + wa_flg)*dL(k)) & @@ -1262,11 +1266,11 @@ contains ! cell-boundaries, to calculate the cell-averaged first-order ! spatial derivatives inside the cell. - !$acc parallel loop collapse(3) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=3) do l = is3_viscous%beg + 1, is3_viscous%end - 1 do k = is2_viscous%beg, is2_viscous%end do j = is1_viscous%beg, is1_viscous%end - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = iv%beg, iv%end dv_ds_vf(i)%sf(j, k, l) = & 1._wp/((1._wp + wa_flg)*dL(l)) & @@ -1315,9 +1319,9 @@ contains is1_viscous = ix; is2_viscous = iy; is3_viscous = iz - !$acc update device(is1_viscous, is2_viscous, is3_viscous) + $:GPU_UPDATE(device='[is1_viscous,is2_viscous,is3_viscous]') - !$acc parallel loop collapse(3) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=3) do l = is3_viscous%beg, is3_viscous%end do k = is2_viscous%beg, is2_viscous%end do j = is1_viscous%beg, is1_viscous%end @@ -1329,7 +1333,7 @@ contains end do if (n > 0) then - !$acc parallel loop collapse(3) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=3) do l = is3_viscous%beg, is3_viscous%end do k = is2_viscous%beg, is2_viscous%end do j = is1_viscous%beg, is1_viscous%end @@ -1342,7 +1346,7 @@ contains end if if (p > 0) then - !$acc parallel loop collapse(3) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=3) do l = is3_viscous%beg, is3_viscous%end do k = is2_viscous%beg, is2_viscous%end do j = is1_viscous%beg, is1_viscous%end @@ -1354,7 +1358,7 @@ contains end do end if - !$acc parallel loop collapse(2) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=2) do l = idwbuff(3)%beg, idwbuff(3)%end do k = idwbuff(2)%beg, idwbuff(2)%end grad_x%sf(idwbuff(1)%beg, k, l) = & @@ -1366,7 +1370,7 @@ contains end do end do if (n > 0) then - !$acc parallel loop collapse(2) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=2) do l = idwbuff(3)%beg, idwbuff(3)%end do j = idwbuff(1)%beg, idwbuff(1)%end grad_y%sf(j, idwbuff(2)%beg, l) = & @@ -1378,7 +1382,7 @@ contains end do end do if (p > 0) then - !$acc parallel loop collapse(2) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=2) do k = idwbuff(2)%beg, idwbuff(2)%end do j = idwbuff(1)%beg, idwbuff(1)%end grad_z%sf(j, k, idwbuff(3)%beg) = & @@ -1393,7 +1397,7 @@ contains end if if (bc_x%beg <= BC_GHOST_EXTRAP) then - !$acc parallel loop collapse(2) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=2) do l = idwbuff(3)%beg, idwbuff(3)%end do k = idwbuff(2)%beg, idwbuff(2)%end grad_x%sf(0, k, l) = (-3._wp*var%sf(0, k, l) + 4._wp*var%sf(1, k, l) - var%sf(2, k, l))/ & @@ -1402,7 +1406,7 @@ contains end do end if if (bc_x%end <= BC_GHOST_EXTRAP) then - !$acc parallel loop collapse(2) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=2) do l = idwbuff(3)%beg, idwbuff(3)%end do k = idwbuff(2)%beg, idwbuff(2)%end grad_x%sf(m, k, l) = (3._wp*var%sf(m, k, l) - 4._wp*var%sf(m - 1, k, l) + var%sf(m - 2, k, l))/ & @@ -1412,7 +1416,7 @@ contains end if if (n > 0) then if (bc_y%beg <= BC_GHOST_EXTRAP .and. 
bc_y%beg /= BC_NULL) then - !$acc parallel loop collapse(2) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=2) do l = idwbuff(3)%beg, idwbuff(3)%end do j = idwbuff(1)%beg, idwbuff(1)%end grad_y%sf(j, 0, l) = (-3._wp*var%sf(j, 0, l) + 4._wp*var%sf(j, 1, l) - var%sf(j, 2, l))/ & @@ -1421,7 +1425,7 @@ contains end do end if if (bc_y%end <= BC_GHOST_EXTRAP) then - !$acc parallel loop collapse(2) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=2) do l = idwbuff(3)%beg, idwbuff(3)%end do j = idwbuff(1)%beg, idwbuff(1)%end grad_y%sf(j, n, l) = (3._wp*var%sf(j, n, l) - 4._wp*var%sf(j, n - 1, l) + var%sf(j, n - 2, l))/ & @@ -1431,7 +1435,7 @@ contains end if if (p > 0) then if (bc_z%beg <= BC_GHOST_EXTRAP) then - !$acc parallel loop collapse(2) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=2) do k = idwbuff(2)%beg, idwbuff(2)%end do j = idwbuff(1)%beg, idwbuff(1)%end grad_z%sf(j, k, 0) = & @@ -1441,7 +1445,7 @@ contains end do end if if (bc_z%end <= BC_GHOST_EXTRAP) then - !$acc parallel loop collapse(2) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=2) do k = idwbuff(2)%beg, idwbuff(2)%end do j = idwbuff(1)%beg, idwbuff(1)%end grad_z%sf(j, k, p) = & diff --git a/src/simulation/m_weno.fpp b/src/simulation/m_weno.fpp index fb13b45aba..f03c7c8151 100644 --- a/src/simulation/m_weno.fpp +++ b/src/simulation/m_weno.fpp @@ -42,6 +42,7 @@ module m_weno !> @{ real(wp), allocatable, dimension(:, :, :, :) :: v_rs_ws_x, v_rs_ws_y, v_rs_ws_z !> @} + $:GPU_DECLARE(create='[v_rs_ws_x,v_rs_ws_y,v_rs_ws_z]') ! WENO Coefficients @@ -58,6 +59,8 @@ module m_weno real(wp), target, allocatable, dimension(:, :, :) :: poly_coef_cbR_y real(wp), target, allocatable, dimension(:, :, :) :: poly_coef_cbR_z !> @} + $:GPU_DECLARE(create='[poly_coef_cbL_x,poly_coef_cbL_y,poly_coef_cbL_z]') + $:GPU_DECLARE(create='[poly_coef_cbR_x,poly_coef_cbR_y,poly_coef_cbR_z]') !> @name The ideal weights at the left and the right cell-boundaries and at the !! left and the right quadrature points, in x-, y- and z-directions. Note @@ -72,6 +75,7 @@ module m_weno real(wp), target, allocatable, dimension(:, :) :: d_cbR_y real(wp), target, allocatable, dimension(:, :) :: d_cbR_z !> @} + $:GPU_DECLARE(create='[d_cbL_x,d_cbL_y,d_cbL_z,d_cbR_x,d_cbR_y,d_cbR_z]') !> @name Smoothness indicator coefficients in the x-, y-, and z-directions. Note !! that the first array dimension identifies the smoothness indicator, the @@ -82,25 +86,20 @@ module m_weno real(wp), target, allocatable, dimension(:, :, :) :: beta_coef_y real(wp), target, allocatable, dimension(:, :, :) :: beta_coef_z !> @} + $:GPU_DECLARE(create='[beta_coef_x,beta_coef_y,beta_coef_z]') ! END: WENO Coefficients integer :: v_size !< Number of WENO-reconstructed cell-average variables - !$acc declare create(v_size) + $:GPU_DECLARE(create='[v_size]') !> @name Indical bounds in the s1-, s2- and s3-directions !> @{ type(int_bounds_info) :: is1_weno, is2_weno, is3_weno - !$acc declare create(is1_weno, is2_weno, is3_weno) + $:GPU_DECLARE(create='[is1_weno,is2_weno,is3_weno]') ! 
!> @} - !$acc declare create( & - !$acc v_rs_ws_x, v_rs_ws_y, v_rs_ws_z, & - !$acc poly_coef_cbL_x,poly_coef_cbL_y,poly_coef_cbL_z, & - !$acc poly_coef_cbR_x,poly_coef_cbR_y,poly_coef_cbR_z,d_cbL_x, & - !$acc d_cbL_y,d_cbL_z,d_cbR_x,d_cbR_y,d_cbR_z,beta_coef_x,beta_coef_y,beta_coef_z) - contains !> The computation of parameters, the allocation of memory, @@ -622,11 +621,11 @@ contains #:endfor if (weno_dir == 1) then - !$acc update device(poly_coef_cbL_x, poly_coef_cbR_x, d_cbL_x, d_cbR_x, beta_coef_x) + $:GPU_UPDATE(device='[poly_coef_cbL_x,poly_coef_cbR_x,d_cbL_x,d_cbR_x,beta_coef_x]') elseif (weno_dir == 2) then - !$acc update device(poly_coef_cbL_y, poly_coef_cbR_y, d_cbL_y, d_cbR_y, beta_coef_y) + $:GPU_UPDATE(device='[poly_coef_cbL_y,poly_coef_cbR_y,d_cbL_y,d_cbR_y,beta_coef_y]') else - !$acc update device(poly_coef_cbL_z, poly_coef_cbR_z, d_cbL_z, d_cbR_z, beta_coef_z) + $:GPU_UPDATE(device='[poly_coef_cbL_z,poly_coef_cbR_z,d_cbL_z,d_cbR_z,beta_coef_z]') end if ! Nullifying WENO coefficients and cell-boundary locations pointers @@ -660,7 +659,7 @@ contains is2_weno = is2_weno_d is3_weno = is3_weno_d - !$acc update device(is1_weno, is2_weno, is3_weno) + $:GPU_UPDATE(device='[is1_weno,is2_weno,is3_weno]') if (weno_order /= 1) then call s_initialize_weno(v_vf, & @@ -669,7 +668,7 @@ contains if (weno_order == 1) then if (weno_dir == 1) then - !$acc parallel loop collapse(4) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=4) do i = 1, ubound(v_vf, 1) do l = is3_weno%beg, is3_weno%end do k = is2_weno%beg, is2_weno%end @@ -680,9 +679,8 @@ contains end do end do end do - !$acc end parallel loop else if (weno_dir == 2) then - !$acc parallel loop collapse(4) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=4) do i = 1, ubound(v_vf, 1) do l = is3_weno%beg, is3_weno%end do k = is2_weno%beg, is2_weno%end @@ -693,9 +691,8 @@ contains end do end do end do - !$acc end parallel loop else if (weno_dir == 3) then - !$acc parallel loop collapse(4) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=4) do i = 1, ubound(v_vf, 1) do l = is3_weno%beg, is3_weno%end do k = is2_weno%beg, is2_weno%end @@ -706,12 +703,11 @@ contains end do end do end do - !$acc end parallel loop end if elseif (weno_order == 3) then #:for WENO_DIR, XYZ in [(1, 'x'), (2, 'y'), (3, 'z')] if (weno_dir == ${WENO_DIR}$) then - !$acc parallel loop collapse(4) gang vector default(present) private(beta,dvd,poly,omega,alpha,tau) + $:GPU_PARALLEL_LOOP(collapse=4,private='[beta,dvd,poly,omega,alpha,tau]') do l = is3_weno%beg, is3_weno%end do k = is2_weno%beg, is2_weno%end do j = is1_weno%beg, is1_weno%end @@ -784,17 +780,16 @@ contains end do end do end do - !$acc end parallel loop end if #:endfor elseif (weno_order == 5) then #:for WENO_DIR, XYZ in [(1, 'x'), (2, 'y'), (3, 'z')] if (weno_dir == ${WENO_DIR}$) then - !$acc parallel loop vector gang collapse(3) default(present) private(dvd, poly, beta, alpha, omega, tau, delta) + $:GPU_PARALLEL_LOOP(collapse=3,private='[dvd,poly,beta,alpha,omega,tau,delta]') do l = is3_weno%beg, is3_weno%end do k = is2_weno%beg, is2_weno%end do j = is1_weno%beg, is1_weno%end - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, v_size ! 
reconstruct from left side @@ -899,7 +894,6 @@ contains end do end do end do - !$acc end parallel loop if (mp_weno) then call s_preserve_monotonicity(v_rs_ws_${XYZ}$, vL_rs_vf_${XYZ}$, & @@ -910,11 +904,11 @@ contains elseif (weno_order == 7) then #:for WENO_DIR, XYZ in [(1, 'x'), (2, 'y'), (3, 'z')] if (weno_dir == ${WENO_DIR}$) then - !$acc parallel loop vector gang collapse(3) default(present) private(poly, beta, alpha, omega, tau, delta, dvd, v) + $:GPU_PARALLEL_LOOP(collapse=3,private='[poly,beta,alpha,omega,tau,delta,dvd,v]') do l = is3_weno%beg, is3_weno%end do k = is2_weno%beg, is2_weno%end do j = is1_weno%beg, is1_weno%end - !$acc loop seq + $:GPU_LOOP(parallelism='[seq]') do i = 1, v_size if (teno) v = v_rs_ws_${XYZ}$ (j - 3:j + 3, k, l, i) ! temporary field value array for clarity @@ -1095,7 +1089,6 @@ contains end do end do end do - !$acc end parallel loop end if #:endfor @@ -1130,10 +1123,10 @@ contains ! as to reshape the inputted data in the coordinate direction of ! the WENO reconstruction v_size = ubound(v_vf, 1) - !$acc update device(v_size) + $:GPU_UPDATE(device='[v_size]') if (weno_dir == 1) then - !$acc parallel loop collapse(4) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=4) do j = 1, v_size do q = is3_weno%beg, is3_weno%end do l = is2_weno%beg, is2_weno%end @@ -1143,7 +1136,6 @@ contains end do end do end do - !$acc end parallel loop end if ! Reshaping/Projecting onto Characteristic Fields in y-direction @@ -1156,22 +1148,22 @@ contains block use CuTensorEx - !$acc host_data use_device(v_rs_ws_x, v_rs_ws_y) - v_rs_ws_y = reshape(v_rs_ws_x, shape=[n + 1 + 2*buff_size, m + 2*buff_size + 1, p + 1, sys_size], order=[2, 1, 3, 4]) - !$acc end host_data + #:call GPU_HOST_DATA(use_device='[v_rs_ws_x, v_rs_ws_y]') + v_rs_ws_y = reshape(v_rs_ws_x, shape=[n + 1 + 2*buff_size, m + 2*buff_size + 1, p + 1, sys_size], order=[2, 1, 3, 4]) + #:endcall GPU_HOST_DATA end block else block use CuTensorEx - !$acc host_data use_device(v_rs_ws_x, v_rs_ws_y) - v_rs_ws_y = reshape(v_rs_ws_x, shape=[n + 1 + 2*buff_size, m + 2*buff_size + 1, p + 1 + 2*buff_size, sys_size], order=[2, 1, 3, 4]) - !$acc end host_data + #:call GPU_HOST_DATA(use_device='[v_rs_ws_x, v_rs_ws_y]') + v_rs_ws_y = reshape(v_rs_ws_x, shape=[n + 1 + 2*buff_size, m + 2*buff_size + 1, p + 1 + 2*buff_size, sys_size], order=[2, 1, 3, 4]) + #:endcall GPU_HOST_DATA end block end if else #endif - !$acc parallel loop collapse(4) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=4) do j = 1, v_size do q = is3_weno%beg, is3_weno%end do l = is2_weno%beg, is2_weno%end @@ -1181,7 +1173,6 @@ contains end do end do end do -!$acc end parallel loop #if MFC_cuTENSOR end if #endif @@ -1195,13 +1186,13 @@ contains block use CuTensorEx - !$acc host_data use_device(v_rs_ws_x, v_rs_ws_z) - v_rs_ws_z = reshape(v_rs_ws_x, shape=[p + 1 + 2*buff_size, n + 2*buff_size + 1, m + 2*buff_size + 1, sys_size], order=[3, 2, 1, 4]) - !$acc end host_data + #:call GPU_HOST_DATA(use_device='[v_rs_ws_x, v_rs_ws_z]') + v_rs_ws_z = reshape(v_rs_ws_x, shape=[p + 1 + 2*buff_size, n + 2*buff_size + 1, m + 2*buff_size + 1, sys_size], order=[3, 2, 1, 4]) + #:endcall end block else #endif - !$acc parallel loop collapse(4) gang vector default(present) + $:GPU_PARALLEL_LOOP(collapse=4) do j = 1, v_size do q = is3_weno%beg, is3_weno%end do l = is2_weno%beg, is2_weno%end @@ -1211,7 +1202,6 @@ contains end do end do end do -!$acc end parallel loop #if MFC_cuTENSOR end if #endif @@ -1264,7 +1254,7 @@ contains real(wp), parameter :: alpha_mp = 2._wp 
 real(wp), parameter :: beta_mp = 4._wp/3._wp
-        !$acc parallel loop gang vector collapse (4) default(present) private(d)
+        $:GPU_PARALLEL_LOOP(collapse=4,private='[d]')
         do l = is3_weno%beg, is3_weno%end
             do k = is2_weno%beg, is2_weno%end
                 do j = is1_weno%beg, is1_weno%end
@@ -1389,7 +1379,6 @@ contains
                 end do
             end do
         end do
-        !$acc end parallel loop

     end subroutine s_preserve_monotonicity
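Note on the hunks above: these simulation-source changes are a mechanical substitution with no intended change in behavior. Each bare `!$acc` directive becomes the corresponding FYPP GPU macro, directive clauses become key-value macro arguments, and the standalone `!$acc end parallel loop` lines are dropped rather than translated. The sketch below is illustrative only: `q_demo` and `tmp` are placeholder names rather than identifiers from this PR, and the usual MFC module context (`macros.fpp`, `wp`, `m`, `n`, `num_fluids`) is assumed.

```fortran
#:include 'macros.fpp'   ! provides the GPU_* macros used throughout these hunks

! -- module scope ---------------------------------------------------------
! was: !$acc declare create(q_demo)
$:GPU_DECLARE(create='[q_demo]')

! -- inside a kernel ------------------------------------------------------
! was: !$acc parallel loop collapse(2) gang vector default(present) private(tmp)
$:GPU_PARALLEL_LOOP(collapse=2, private='[tmp]')
do k = 0, n
    do j = 0, m
        tmp = 0._wp
        ! was: !$acc loop seq
        $:GPU_LOOP(parallelism='[seq]')
        do i = 1, num_fluids
            tmp = tmp + q_demo(i)%sf(j, k, 0)
        end do
    end do
end do
! was: !$acc end parallel loop   (standalone end directives are removed in this PR)

! was: !$acc update host(q_demo)
$:GPU_UPDATE(host='[q_demo]')

! -- device routines ------------------------------------------------------
! was: !$acc routine seq
$:GPU_ROUTINE(parallelism='[seq]')
```

The few non-loop cases in this section follow the same idea: the `#ifdef _CRAYFTN` inlining block in `m_sim_helpers.fpp` collapses into a single `$:GPU_ROUTINE(function_name=..., parallelism='[seq]', cray_inline=True)` call, while the `!$acc kernels` region in `m_time_steppers.fpp` and the `!$acc host_data use_device(...)` regions in `m_weno.fpp` become `#:call GPU_PARALLEL()` and `#:call GPU_HOST_DATA(...)` blocks closed with `#:endcall`, since they wrap a span of statements rather than annotate a single loop.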