## Exercise: Compute-IO Overlap

Usage of `cub::DeviceTransform` for your reference:

```c++
cub::DeviceTransform::Transform(input_iterator, output_iterator, num_items, op);
```

In the code below, replace `thrust::tabulate` with `cub::DeviceTransform` and use `cudaDeviceSynchronize` appropriately:

![Compute-IO-Overlap](Images/overlap.png "Compute/IO Overlap")

In [None]:
import os

if os.getenv("COLAB_RELEASE_TAG"): # If running in Google Colab:
  !mkdir -p Sources
  !wget https://raw.githubusercontent.com/NVIDIA/accelerated-computing-hub/refs/heads/main/gpu-cpp-tutorial/notebooks/02.02-Asynchrony/Sources/ach.h -nv -O Sources/ach.h
!wget https://raw.githubusercontent.com/NVIDIA/accelerated-computing-hub/refs/heads/main/gpu-cpp-tutorial/notebooks/02.02-Asynchrony/Sources/nvtx3.hpp -nv -O Sources/nvtx3.hpp

<details>
    <summary>Original code in case you need to refer to it</summary>

```c++
%%writefile Sources/compute-io-overlap.cpp
#include "ach.h"

// Runs one simulation step: for every element of `out`, stores the value of
// ach::compute(id, <2D view of in>) at index `id`.
//   width, height - extents used to view `in` as a 2D array
//   in            - device buffer read by the step
//   out           - device buffer overwritten by the step
void simulate(int width,
              int height,
              const thrust::device_vector<float> &in,
                    thrust::device_vector<float> &out)
{
  // View the raw device pointer of `in` through an mdspan with extents (height, width).
  cuda::std::mdspan temp_in(thrust::raw_pointer_cast(in.data()), height, width);
  // tabulate calls the lambda with each element's index and stores the result there.
  thrust::tabulate(out.begin(), out.end(), [=] __device__(int id) {
    return ach::compute(id, temp_in);
  });
}

// Driver: repeats `write_steps` rounds of
//   copy device state to host -> run `compute_steps` simulation steps -> store the host copy,
// and prints how long each round and each store took.
int main()
{
  int height = 2048;
  int width  = 8192;

  // d_prev holds the current state, d_next receives the next one; h_prev is the host-side copy.
  thrust::device_vector<float> d_prev = ach::init(height, width);
  thrust::device_vector<float> d_next(height * width);
  thrust::host_vector<float> h_prev(height * width);

  const int compute_steps = 500;
  const int write_steps = 3;
  for (int write_step = 0; write_step < write_steps; write_step++)
  {
    auto step_begin = std::chrono::high_resolution_clock::now();
    // Snapshot the current device state into the host buffer.
    thrust::copy(d_prev.begin(), d_prev.end(), h_prev.begin());

    for (int compute_step = 0; compute_step < compute_steps; compute_step++)
    {
      simulate(width, height, d_prev, d_next);
      // The freshly computed state becomes the input of the next step.
      d_prev.swap(d_next);
    }

    // Time the store of the host snapshot on its own.
    auto write_begin = std::chrono::high_resolution_clock::now();
    ach::store(write_step, height, width, h_prev);
    auto write_end = std::chrono::high_resolution_clock::now();
    auto write_seconds = std::chrono::duration<double>(write_end - write_begin).count();

    // The whole round: copy + simulation loop + store.
    auto step_end = std::chrono::high_resolution_clock::now();
    auto step_seconds = std::chrono::duration<double>(step_end - step_begin).count();
    std::printf("compute + write %d in %g s\n", write_step, step_seconds);
    std::printf("          write %d in %g s\n", write_step, write_seconds);
  }
}
```
    
</details>

In [None]:
%%writefile Sources/compute-io-overlap.cpp
#include "ach.h"

// Runs one simulation step: for every element of `out`, stores the value of
// ach::compute(id, <2D view of in>) at index `id`.
//   width, height - extents used to view `in` as a 2D array
//   in            - device buffer read by the step
//   out           - device buffer overwritten by the step
void simulate(int width,
              int height,
              const thrust::device_vector<float> &in,
                    thrust::device_vector<float> &out)
{
  // View the raw device pointer of `in` through an mdspan with extents (height, width).
  cuda::std::mdspan temp_in(thrust::raw_pointer_cast(in.data()), height, width);
  // TODO: replace with an asynchronous call
  thrust::tabulate(out.begin(), out.end(), [=] __device__(int id) {
    return ach::compute(id, temp_in);
  });
}

// Driver for the exercise: alternates between simulating on the device and
// storing a host snapshot, printing the time of each round and of each store.
int main()
{
  int height = 2048;
  int width  = 8192;

  // d_prev holds the current state, d_next receives the next one; h_prev is the host-side copy.
  thrust::device_vector<float> d_prev = ach::init(height, width);
  thrust::device_vector<float> d_next(height * width);
  thrust::host_vector<float> h_prev(height * width);

  const int compute_steps = 500;
  const int write_steps = 3;

  /* Executing write_steps iterations of
   *  1. Device to Host copy of prev
   *  2. Simulate next on the device
   *  3. Write host prev to disk
   * The goal is to overlap 2. and 3. by using asynchrony
  */
  for (int write_step = 0; write_step < write_steps; write_step++)
  {
    auto step_begin = std::chrono::high_resolution_clock::now();

    // 1. Copy the device prev buffer left by the previous round into the host buffer
    thrust::copy(d_prev.begin(), d_prev.end(), h_prev.begin());

    // 2. Execute compute_steps simulation iterations
    // We want this step to now run asynchronously on the GPU
    for (int compute_step = 0; compute_step < compute_steps; compute_step++)
    {
      simulate(width, height, d_prev, d_next);
      // The freshly computed state becomes the input of the next iteration
      d_prev.swap(d_next);
    }

    // TODO: Don't forget to synchronize where it makes sense

    auto write_begin = std::chrono::high_resolution_clock::now();

    // 3. Store host prev to disk
    ach::store(write_step, height, width, h_prev);

    auto write_end = std::chrono::high_resolution_clock::now();
    auto write_seconds = std::chrono::duration<double>(write_end - write_begin).count();

    // step_seconds covers the whole round (copy + simulation loop + store);
    // write_seconds covers the store alone.
    auto step_end = std::chrono::high_resolution_clock::now();
    auto step_seconds = std::chrono::duration<double>(step_end - step_begin).count();
    std::printf("compute + write %d in %g s\n", write_step, step_seconds);
    std::printf("          write %d in %g s\n", write_step, write_seconds);
  }
}

In [None]:
# --extended-lambda: needed for the `__device__` lambda used in simulate
# -x cu: compile the .cpp file as CUDA source; -arch=native: target the GPU(s) present on this machine
!nvcc --extended-lambda -o /tmp/a.out Sources/compute-io-overlap.cpp -x cu -arch=native # build executable
!/tmp/a.out # run executable

If you’re unsure how to proceed, consider expanding this section for guidance. Use the hint only after giving the problem a genuine attempt.

<details>
  <summary>Hints</summary>
  
  - `cub::DeviceTransform::Transform` accepts the following parameters (in order):
    - input iterator  (Think about what we learned earlier using counting iterators)
    - output iterator
    - number of elements
    - unary function
  - You should synchronize the device at a point that lets in-flight transformations overlap with writing data to the file system. It should be somewhere in the `main` function.
</details>

Open this section only after you’ve made a serious attempt at solving the problem. Once you’ve completed your solution, compare it with the reference provided here to evaluate your approach and identify any potential improvements.

<details>
  <summary>Solution</summary>

  Key points:

  - Synchronizing the device after the write step allows us to overlap computation with I/O

  Solution:
  ```c++
  // Same simulation step, now issued through cub::DeviceTransform::Transform.
  // The counting iterator starting at 0 supplies the element indices that the
  // lambda receives as `id`; width * height is the number of items to produce.
  void simulate(int width, int height, const thrust::device_vector<float> &in,
                thrust::device_vector<float> &out) {
    // View the raw device pointer of `in` through an mdspan with extents (height, width).
    cuda::std::mdspan temp_in(thrust::raw_pointer_cast(in.data()), height, width);
    cub::DeviceTransform::Transform(
        thrust::make_counting_iterator(0), out.begin(), width * height,
        [=] __host__ __device__(int id) { return ach::compute(id, temp_in); });
  }

  // ... 

  for (int write_step = 0; write_step < write_steps; write_step++) {
    // Snapshot the current device state into the host buffer.
    thrust::copy(d_prev.begin(), d_prev.end(), h_prev.begin());

    // Queue the simulation steps; the host does not wait for them here.
    for (int compute_step = 0; compute_step < compute_steps; compute_step++) {
      simulate(width, height, d_prev, d_next);
      d_prev.swap(d_next);
    }

    // Write the host snapshot while the queued transformations are still in flight.
    ach::store(write_step, height, width, h_prev);
    // Synchronize only after the write so computation and I/O overlap.
    cudaDeviceSynchronize();
  }
  ```

  You can find the full solution [here](Solutions/compute-io-overlap.cu).
</details>

---
Congratulations!  You successfully used CUB to overlap computation with I/O.  Move on to the [next exercise](02.02.03-Exercise-Nsight.ipynb).