
## Exercise: Fix Histogram

The code below has a data race in it.
Multiple threads concurrently increment the same element of the histogram array.
Use `cuda::std::atomic_ref` to fix this bug. 

The interface of `cuda::std::atomic_ref` is equivalent to that of `std::atomic_ref`:

```c++
// Minimal example: every thread that runs this kernel atomically adds 1 to count[0].
__global__ void kernel(int *count)
{
  // Wrap data in atomic_ref
  // (the ref operates on count[0] itself; atomic operations apply to the underlying int)
  cuda::std::atomic_ref<int> ref(count[0]);

  // Atomically increment the underlying value
  ref.fetch_add(1);
}
```

In [None]:
import os

# Nothing is downloaded outside Colab; presumably the Sources/ files already exist locally — confirm.
if os.getenv("COLAB_RELEASE_TAG"): # If running in Google Colab:
  # Create the Sources directory, then fetch the three helper files into it.
  !mkdir -p Sources
  !wget https://raw.githubusercontent.com/NVIDIA/accelerated-computing-hub/refs/heads/main/tutorials/cuda-cpp/notebooks/03.03-Atomics/Sources/ach.cuh -nv -O Sources/ach.cuh
  !wget https://raw.githubusercontent.com/NVIDIA/accelerated-computing-hub/refs/heads/main/tutorials/cuda-cpp/notebooks/03.03-Atomics/Sources/__init__.py -nv -O Sources/__init__.py
  !wget https://raw.githubusercontent.com/NVIDIA/accelerated-computing-hub/refs/heads/main/tutorials/cuda-cpp/notebooks/03.03-Atomics/Sources/ach.py -nv -O Sources/ach.py

<details>
<summary>Original code in case you need to refer to it.</summary>

```c++
%%writefile Sources/histogram.cpp
#include "ach.cuh"

constexpr float bin_width = 10;

// Histogram kernel (original, racy version): each thread takes one temperature
// and increments the histogram bin that temperature falls into.
__global__ void histogram_kernel(cuda::std::span<float> temperatures, 
                                 cuda::std::span<int> histogram)
{
  // One thread per input element
  int cell = blockIdx.x * blockDim.x + threadIdx.x;
  // Threads whose index is past the end of the input do nothing
  if (cell < temperatures.size()) {
    // Bin index is temperature / bin_width, truncated by the cast to int
    // NOTE(review): bin is not range-checked against histogram.size() — assumes inputs stay in range; confirm
    int bin = static_cast<int>(temperatures[cell] / bin_width);

    // fix data race in incrementing histogram bins by using `cuda::std::atomic_ref`
    // The read, add and write below are separate steps, so two threads that hit
    // the same bin can both read the same old value and one increment is lost.
    int old_count = histogram[bin];
    int new_count = old_count + 1;
    histogram[bin] = new_count;
  }
}

// Host-side launcher: enqueues histogram_kernel on `stream` with one thread per temperature.
// The launch is not followed by any synchronization or error check here.
void histogram(cuda::std::span<float> temperatures, 
               cuda::std::span<int> histogram,
               cudaStream_t stream)
{
  int block_size = 256;
  // Round up so the last, possibly partial, block still covers the tail of the input
  int grid_size = cuda::ceil_div(temperatures.size(), block_size);
  histogram_kernel<<<grid_size, block_size, 0, stream>>>(
    temperatures, histogram);
}
```
    
</details>

In [None]:
%%writefile Sources/histogram.cpp
#include "ach.cuh"

constexpr float bin_width = 10;

// Histogram kernel (exercise version): each thread takes one temperature and
// increments the histogram bin that temperature falls into.
__global__ void histogram_kernel(cuda::std::span<float> temperatures,
                                 cuda::std::span<int> histogram)
{
  // One thread per input element
  int cell = blockIdx.x * blockDim.x + threadIdx.x;
  // Threads whose index is past the end of the input do nothing
  if (cell < temperatures.size()) {
    // Bin index is temperature / bin_width, truncated by the cast to int
    // NOTE(review): bin is not range-checked against histogram.size() — assumes inputs stay in range; confirm
    int bin = static_cast<int>(temperatures[cell] / bin_width);

    // TODO: fix data race in incrementing histogram bins by using `cuda::std::atomic_ref`
    // The read, add and write below are separate steps, so two threads that hit
    // the same bin can both read the same old value and one increment is lost.
    int old_count = histogram[bin];
    int new_count = old_count + 1;
    histogram[bin] = new_count;
  }
}

// Host-side launcher for histogram_kernel.
// Enqueues the kernel on `stream` using 256-thread blocks, with enough blocks
// to give every element of `temperatures` its own thread.
void histogram(cuda::std::span<float> temperatures,
               cuda::std::span<int> histogram,
               cudaStream_t stream)
{
  const int threads_per_block = 256;

  // Rounded-up division: a partially filled final block covers the input tail
  const int num_blocks = cuda::ceil_div(temperatures.size(), threads_per_block);

  histogram_kernel<<<num_blocks, threads_per_block, 0, stream>>>(temperatures, histogram);
}

In [None]:
# Pass the file written by the cell above to the course helper.
# NOTE(review): presumably this compiles and runs it and reports the result — see Sources/ach.py to confirm.
import Sources.ach
Sources.ach.run("Sources/histogram.cpp")

If you’re unsure how to proceed, consider expanding this section for guidance. Use the hint only after giving the problem a genuine attempt.

<details>
  <summary>Hints</summary>
  
  - `cuda::std::atomic_ref` wraps a reference and applies atomic operations to the underlying object
  - You can increment a variable atomically using `ref.fetch_add(1)`
</details>

Open this section only after you’ve made a serious attempt at solving the problem. Once you’ve completed your solution, compare it with the reference provided here to evaluate your approach and identify any potential improvements.

<details>
  <summary>Solution</summary>

  Key points:

  - Wrap the selected bin in `cuda::std::atomic_ref<int>` for atomic operations
  - Use `fetch_add` to increment the bin value atomically

  Solution:
  ```c++
  // Histogram kernel (race-free): each thread takes one temperature and
  // atomically increments the histogram bin that temperature falls into.
  __global__ void histogram_kernel(cuda::std::span<float> temperatures,
                                   cuda::std::span<int> histogram)
  {
    int cell = blockIdx.x * blockDim.x + threadIdx.x;

    // Keep the bounds check from the exercise: the grid is rounded up to whole
    // blocks, so threads past the end of the input must not touch temperatures[cell].
    if (cell < temperatures.size()) {
      // Use the file's bin_width constant rather than repeating the literal
      int bin = static_cast<int>(temperatures[cell] / bin_width);

      // Wrap the selected bin so the increment is a single atomic read-modify-write
      cuda::std::atomic_ref<int> ref(histogram[bin]);
      ref.fetch_add(1);
    }
  }
  ```

  You can find the full solution [here](Solutions/histogram.cpp).
</details>

---
Congratulations! Move on to the [next section](../03.04-Synchronization/03.04.01-Sync.ipynb).