## Exercise: `mdspan`

Usage of `cuda::std::mdspan` API for your reference:
```cpp
// Dimensions of the 2D view: 2 rows by 3 columns.
int height = 2;
int width = 3;
// Flat backing storage holding height * width = 6 elements.
cuda::std::array<int, 6> sd {0, 1, 2, 3, 4, 5};
// View `sd` as a height x width 2D array: pointer first, then the extents.
cuda::std::mdspan md(sd.data(), height, width);

// Elements are addressed as md(row, column).
std::printf("md(0, 0) = %d\n", md(0, 0)); // 0
std::printf("md(1, 2) = %d\n", md(1, 2)); // 5

// size() is the total element count; extent(i) is the length of dimension i.
std::printf("size   = %zu\n", md.size());    // 6
std::printf("height = %zu\n", md.extent(0)); // 2
std::printf("width  = %zu\n", md.extent(1)); // 3
```

Complete the exercise below by using `cuda::std::mdspan` to access the grid elements instead of indexing the raw pointer with manual offset arithmetic.

<details>
<summary>Original heat-2D.cpp code in case you need to refer back to it.</summary>

```c++
%%writefile Sources/heat-2D.cpp
#include "ach.h"

// Splits a flat, row-by-row element index `id` into its (row, column)
// coordinates for a grid that is `width` elements wide.
__host__ __device__
cuda::std::pair<int, int> row_col(int id, int width) {
    const int row    = id / width;
    const int column = id % width;
    return cuda::std::make_pair(row, column);
}

// Fills `out` with one update of the heat-2D grid read from `in`.
// Both vectors hold one float per cell of a `height` x `width` grid, stored
// row by row (cell (row, column) lives at index row * width + column).
void simulate(int height, int width,
              const thrust::universal_vector<float> &in,
                    thrust::universal_vector<float> &out)
{
  // Raw pointer to the input data so the lambda can capture it by value.
  const float *in_ptr = thrust::raw_pointer_cast(in.data());

  // tabulate invokes the lambda with every element index `id` of `out` and
  // stores the returned value at that index.
  thrust::tabulate(
    thrust::device, out.begin(), out.end(), 
    [in_ptr, height, width] __host__ __device__(int id) {
      auto [row, column] = row_col(id, width);

      // Only interior cells are updated: they have all four neighbours.
      if (row > 0 && column > 0 && row < height - 1 && column < width - 1) {
        // Second differences: left/right neighbours (x) and up/down neighbours (y).
        float d2tdx2 = in_ptr[(row) * width + column - 1] - 2 * in_ptr[row * width + column] + in_ptr[(row) * width + column + 1];
        float d2tdy2 = in_ptr[(row - 1) * width + column] - 2 * in_ptr[row * width + column] + in_ptr[(row + 1) * width + column];

        // New value = current value plus 0.2f times the sum of both differences.
        return in_ptr[row * width + column] + 0.2f * (d2tdx2 + d2tdy2);
      } else {
        // Cells on the outer edge of the grid keep their current value.
        return in_ptr[row * width + column];
      }
    });
}
```

</details>

In [None]:
import os

if os.getenv("COLAB_RELEASE_TAG"): # If running in Google Colab:
  # Download the helper files this notebook uses (ach.h, ach.py, __init__.py)
  # into a local Sources/ directory; -nv keeps wget's output short and -O sets
  # the destination path.
  !mkdir -p Sources
  !wget https://raw.githubusercontent.com/NVIDIA/accelerated-computing-hub/refs/heads/main/cuda-cpp-tutorial/notebooks/01.04-Vocabulary-Types/Sources/ach.h -nv -O Sources/ach.h
  !wget https://raw.githubusercontent.com/NVIDIA/accelerated-computing-hub/refs/heads/main/cuda-cpp-tutorial/notebooks/01.04-Vocabulary-Types/Sources/ach.py -nv -O Sources/ach.py
  !wget https://raw.githubusercontent.com/NVIDIA/accelerated-computing-hub/refs/heads/main/cuda-cpp-tutorial/notebooks/01.04-Vocabulary-Types/Sources/__init__.py -nv -O Sources/__init__.py

In [None]:
%%writefile Sources/heat-2D.cpp
#include "ach.h"

// Splits a flat, row-by-row element index `id` into its (row, column)
// coordinates for a grid that is `width` elements wide.
__host__ __device__
cuda::std::pair<int, int> row_col(int id, int width) {
    const int row    = id / width;
    const int column = id % width;
    return cuda::std::make_pair(row, column);
}

// Fills `out` with one update of the heat-2D grid read from `in`.
// Both vectors hold one float per cell of a `height` x `width` grid, stored
// row by row (cell (row, column) lives at index row * width + column).
void simulate(int height, int width,
              const thrust::universal_vector<float> &in,
                    thrust::universal_vector<float> &out)
{
  // TODO: Modify the following code to use `cuda::std::mdspan`
  // Raw pointer to the input data so the lambda can capture it by value.
  const float *in_ptr = thrust::raw_pointer_cast(in.data());

  // tabulate invokes the lambda with every element index `id` of `out` and
  // stores the returned value at that index.
  thrust::tabulate(
    thrust::device, out.begin(), out.end(),
    [in_ptr, height, width] __host__ __device__(int id) {
      auto [row, column] = row_col(id, width);

      // Only interior cells are updated: they have all four neighbours.
      if (row > 0 && column > 0 && row < height - 1 && column < width - 1) {
        // Second differences: left/right neighbours (x) and up/down neighbours (y).
        float d2tdx2 = in_ptr[(row) * width + column - 1] - 2 * in_ptr[row * width + column] + in_ptr[(row) * width + column + 1];
        float d2tdy2 = in_ptr[(row - 1) * width + column] - 2 * in_ptr[row * width + column] + in_ptr[(row + 1) * width + column];

        // New value = current value plus 0.2f times the sum of both differences.
        return in_ptr[row * width + column] + 0.2f * (d2tdx2 + d2tdy2);
      } else {
        // Cells on the outer edge of the grid keep their current value.
        return in_ptr[row * width + column];
      }
    });
}

In [None]:
# Compile heat-2D.cpp as CUDA source (-x cu, since the file has a .cpp extension)
# for the GPU in this machine (-arch=native); --extended-lambda is required for
# the `__host__ __device__` lambda passed to thrust::tabulate.
!nvcc -x cu -arch=native --extended-lambda -o /tmp/a.out Sources/heat-2D.cpp

In [None]:
import Sources.ach
# Pass the exercise source file to the run() helper from Sources/ach.py.
Sources.ach.run("Sources/heat-2D.cpp")

If you’re unsure how to proceed, consider expanding this section for guidance. Use the hint only after giving the problem a genuine attempt.

<details>
  <summary>Hints</summary>
  
  - `cuda::std::mdspan` constructor takes a pointer followed by the height and width of the 2D array
  - Two-dimensional `cuda::std::mdspan` provides `operator()(int row, int column)` to access elements
</details>

Open this section only after you’ve made a serious attempt at solving the problem. Once you’ve completed your solution, compare it with the reference provided here to evaluate your approach and identify any potential improvements.

<details>
  <summary>Solution</summary>

  Key points:

  - You can use `md.extent(0)` and `md.extent(1)` to get the height and width of the 2D array

  Solution:
  ```cpp
  // Wrap the raw input pointer in a 2D view with `height` rows and `width` columns.
  cuda::std::mdspan temp_in(thrust::raw_pointer_cast(in.data()), height, width);

  // The view carries its own extents, so it is the only thing the lambda captures.
  thrust::tabulate(thrust::device, out.begin(), out.end(), [temp_in] __host__ __device__(int id) {
    // extent(1) is the width: recover the 2D coordinates from the flat index.
    int column = id % temp_in.extent(1);
    int row    = id / temp_in.extent(1);

    // Interior cells only: extent(0) is the height, extent(1) is the width.
    if (row > 0 && column > 0 && row < temp_in.extent(0) - 1 && column < temp_in.extent(1) - 1)
    {
      // temp_in(row, column) replaces the manual `row * width + column` offset.
      float d2tdx2 = temp_in(row, column - 1) - 2 * temp_in(row, column) + temp_in(row, column + 1);
      float d2tdy2 = temp_in(row - 1, column) - 2 * temp_in(row, column) + temp_in(row + 1, column);

      return temp_in(row, column) + 0.2f * (d2tdx2 + d2tdy2);
    }
    else
    {
      // Cells on the outer edge of the grid keep their current value.
      return temp_in(row, column);
    }
  });
  ```

  You can find the full solution [here](Solutions/heat-2D.cpp).
</details>


---

Congratulations!  Now that you know how to use vocabulary types, proceed to [the next section](../01.05-Serial-vs-Parallel/01.05.01-Serial-vs-Parallel.ipynb).