<img src="Images/nvidia_header.png" style="margin-left: -30px; width: 300px; float: left;">

## Exercise: Async Copy and Streams

Usage of streams:

```c++
cudaStream_t stream;

// create a stream
cudaStreamCreate(&stream); 

// make CPU wait for all operations in the stream to complete
cudaStreamSynchronize(stream); 

// destroy the stream
cudaStreamDestroy(stream);
```

Usage of `cub::DeviceTransform`:

```c++
cub::DeviceTransform::Transform(input_iterator, output_iterator, num_items, op, stream);
```

Usage of `cudaMemcpyAsync`:

```c++
cudaMemcpyAsync(dst, src, num_bytes, cudaMemcpyDeviceToHost, stream);
```

For this exercise, we'll attempt to make transfers between the host and device asynchronous.
To do this, you are expected to:

- replace `thrust::copy` with `cudaMemcpyAsync`
- put compute and copy operations in separate CUDA streams
- synchronize the streams to follow the pattern from the diagram below

![Compute-Copy-Overlap](Images/async-copy.png "Compute/Copy Overlap")

In [None]:
#@title Google Colab Setup
# Create the directory that the `%%writefile` cell below writes into.
!mkdir -p Sources
# Download the helper header included by the exercise source (-nv: less verbose, -O: output path).
!wget https://raw.githubusercontent.com/NVIDIA/accelerated-computing-hub/refs/heads/main/gpu-cpp-tutorial/notebooks/02.03-Streams/Sources/ach.h -nv -O Sources/ach.h
# Register an NVIDIA repository signing key with apt. Output (including errors) is discarded.
# NOTE(review): this URL hard-codes ubuntu1804, while the repository line below derives the
# release from /etc/lsb-release - confirm the key is still the right one for the target image.
!sudo apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/7fa2af80.pub > /dev/null 2>&1 
# Add the NVIDIA devtools apt repository matching this machine's Ubuntu release and architecture.
!sudo add-apt-repository -y "deb https://developer.download.nvidia.com/devtools/repos/ubuntu$(source /etc/lsb-release; echo "$DISTRIB_RELEASE" | tr -d .)/$(dpkg --print-architecture)/ /" > /dev/null 2>&1 
# Install Nsight Systems, which provides the `nsys` profiler used in the build/run cell.
# All output is silenced, so a failed install only shows up later as a missing `nsys` command.
!sudo apt install -y nsight-systems > /dev/null 2>&1 

In [None]:
%%writefile Sources/async-copy.cpp
#include "ach.h"

// Runs one simulation step: for every linear index in [0, width * height) the
// value returned by `ach::compute` is written to the matching element of `out`.
//
// width, height  extents of the field; `in` is viewed as a height x width mdspan
// in             state read by the step (not modified)
// out            receives the new state; one element is written per index
// stream         CUDA stream the transform is submitted to (defaults to stream 0)
//
// The status returned by CUB is not inspected.
void simulate(int width, int height, const thrust::device_vector<float> &in,
              thrust::device_vector<float> &out,
              cudaStream_t stream = 0)
{
  // Non-owning 2D view over the input storage; captured by value in the functor below.
  const float *in_ptr = thrust::raw_pointer_cast(in.data());
  cuda::std::mdspan cells_in(in_ptr, height, width);

  auto first_id = thrust::make_counting_iterator(0);
  const int num_cells = width * height;
  auto update_cell = [=] __host__ __device__(int id) { return ach::compute(id, cells_in); };

  cub::DeviceTransform::Transform(first_id, out.begin(), num_cells, update_cell, stream);
}

// Exercise scaffold: advances the simulation `write_steps * compute_steps` times and
// hands a host copy of the state to `ach::store` once per write step.
//
// In this starting version no explicit stream is used anywhere: both `thrust::copy`
// calls take no stream, and `simulate` is called without its `stream` argument, so it
// uses the default of 0. The numbered TODOs mark where the copy and compute work are
// meant to be moved onto separate streams.
int main()
{
  int height = 2048;
  int width = 8192;

  // d_prev   : current state, initialised by ach::init (see ach.h for the initial values)
  // d_next   : destination of each simulate() call; swapped with d_prev after every step
  // d_buffer : device-side snapshot of d_prev taken at the start of each write step
  // h_prev   : host copy of that snapshot, passed to ach::store
  thrust::device_vector<float> d_prev = ach::init(height, width);
  thrust::device_vector<float> d_next(height * width);
  thrust::device_vector<float> d_buffer(height * width);
  thrust::host_vector<float> h_prev(height * width);

  const int compute_steps = 750;
  const int write_steps = 3;

  // TODO: 1. Create compute and copy streams

  for (int write_step = 0; write_step < write_steps; write_step++)
  {
    // Snapshot the state into d_buffer. The compute loop below only touches d_prev and
    // d_next, so the transfer to the host can read d_buffer while those are overwritten.
    thrust::copy(d_prev.begin(), d_prev.end(), d_buffer.begin());

    // TODO: 2. Replace `thrust::copy` with `cudaMemcpyAsync` on copy stream.
    // Use `thrust::raw_pointer_cast(vec.data())` to get raw pointers from Thrust containers.
    thrust::copy(d_buffer.begin(), d_buffer.end(), h_prev.begin());

    for (int compute_step = 0; compute_step < compute_steps; compute_step++)
    {
      // TODO: 3. Put `simulate` on compute stream
      simulate(width, height, d_prev, d_next);
      // After the swap, d_prev refers to the freshly computed state and d_next to the
      // previous one, which the next step overwrites.
      d_prev.swap(d_next);
    }

    // TODO: 4. Make sure to synchronize copy stream before reading `h_prev`
    // Presumably writes the snapshot for this write step out; see ach.h for what is stored.
    ach::store(write_step, height, width, h_prev);

    // TODO: 5. Make sure to synchronize compute stream before next iteration
    // Blocks the host until all previously issued device work has finished, so the next
    // iteration snapshots a fully computed d_prev.
    cudaDeviceSynchronize();
  }
}

In [None]:
!nvcc --extended-lambda -o /tmp/a.out Sources/async-copy.cpp -x cu -arch=native # build executable
!nsys profile --force-overwrite true -o copy /tmp/a.out # run and profile executable

If you’re unsure how to proceed, consider expanding this section for guidance. Use the hint only after giving the problem a genuine attempt.

<details>
  <summary>Hints</summary>
  
  - CUB accepts `stream` as its last argument: `cub::DeviceTransform::Transform(input, output, num_items, op, stream)`
  - `cudaMemcpyAsync` accepts `stream` as its last argument: `cudaMemcpyAsync(dst, src, size, cudaMemcpyDeviceToHost, stream)`
  - You can use the following operations on a CUDA stream:
    - `cudaStreamCreate(&stream)` to create a stream
    - `cudaStreamDestroy(stream)` to destroy a stream
    - `cudaStreamSynchronize(stream)` to make the CPU wait for `stream` to finish all operations
</details>

Open this section only after you’ve made a serious attempt at solving the problem. Once you’ve completed your solution, compare it with the reference provided here to evaluate your approach and identify any potential improvements.

<details>
  <summary>Solution</summary>

  Key points:

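  - Copy `d_prev` into `d_buffer` first, so the compute stream can keep overwriting `d_prev`/`d_next` while the snapshot is transferred to the host
  - Synchronize the copy stream before storing the data, because `h_prev` is only valid once the asynchronous copy has finished
  - Synchronize the compute stream before the next iteration, because the next snapshot reads `d_prev`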

  Solution:
  ```c++
  // One stream for the simulation steps, one for the device-to-host transfer.
  cudaStream_t compute_stream;
  cudaStreamCreate(&compute_stream);

  cudaStream_t copy_stream;
  cudaStreamCreate(&copy_stream);

  for (int write_step = 0; write_step < write_steps; write_step++) 
  {
    // Snapshot the current state into d_buffer. The compute loop below only touches
    // d_prev and d_next, so d_buffer stays unchanged while it is transferred.
    cudaMemcpy(thrust::raw_pointer_cast(d_buffer.data()),
               thrust::raw_pointer_cast(d_prev.data()),
               height * width * sizeof(float), cudaMemcpyDeviceToDevice);
    // Queue the transfer of the snapshot to the host on the copy stream.
    cudaMemcpyAsync(thrust::raw_pointer_cast(h_prev.data()),
                    thrust::raw_pointer_cast(d_buffer.data()),
                    height * width * sizeof(float), cudaMemcpyDeviceToHost,
                    copy_stream);

    // Queue all simulation steps on the compute stream; the swap only exchanges which
    // vector is the input and which is the output of the next step.
    for (int compute_step = 0; compute_step < compute_steps; compute_step++) {
      simulate(width, height, d_prev, d_next, compute_stream);
      d_prev.swap(d_next);
    }

    // h_prev must not be read until the copy queued above has completed.
    cudaStreamSynchronize(copy_stream);
    ach::store(write_step, height, width, h_prev);

    // The next iteration snapshots d_prev, so every queued simulate() must be finished.
    cudaStreamSynchronize(compute_stream);
  }

  // Release both streams once the loop is done.
  cudaStreamDestroy(compute_stream);
  cudaStreamDestroy(copy_stream);
  ```

  You can find the full solution [here](Solutions/async-copy.cu).
</details>

---
Great job!  Proceed to the [next section](../02.04-Pinned-Memory/02.04.01-Pinned.ipynb).

<img src="Images/nvidia_header.png" style="margin-left: -30px; width: 300px; float: left;">