In [1]:
import torch
from mmcv_ops.bbox_overlaps import bbox_overlaps

### 1. bbox_overlaps
We first read the source code of `bbox_overlaps`. In the original implementation, if your PyTorch build is parrots and the device is the CPU, the IoU is computed in pure Python. I removed that branch here because this walkthrough focuses on the C++ and CUDA implementations.
```python
def bbox_overlaps(bboxes1: torch.Tensor,
                  bboxes2: torch.Tensor,
                  mode: str = 'iou',
                  aligned: bool = False,
                  offset: int = 0) -> torch.Tensor:
    """Calculate overlap between two set of bboxes.

    If ``aligned`` is ``False``, then calculate the ious between each bbox
    of bboxes1 and bboxes2, otherwise the ious between each aligned pair of
    bboxes1 and bboxes2.

    Args:
        bboxes1 (torch.Tensor): shape (m, 4) in <x1, y1, x2, y2> format or
            empty.
        bboxes2 (torch.Tensor): shape (n, 4) in <x1, y1, x2, y2> format or
            empty. If aligned is ``True``, then m and n must be equal.
        mode (str): "iou" (intersection over union) or "iof" (intersection
            over foreground, i.e. the overlap divided by the area of the
            box taken from ``bboxes1``). Defaults to "iou".
        aligned (bool): If ``True``, compare ``bboxes1[i]`` only with
            ``bboxes2[i]``; otherwise compare every pair. Defaults to False.
        offset (int): Either 0 or 1. It is added to every width and height
            before areas are computed, i.e. a side is ``x2 - x1 + offset``.
            Defaults to 0.

    Returns:
        torch.Tensor: Return the ious betweens boxes. If ``aligned`` is
        ``False``, the shape of ious is (m, n) else (m,). The result is
        allocated with ``bboxes1.new_zeros``, so it shares the dtype and
        device of ``bboxes1``.

    Raises:
        AssertionError: If ``mode`` is not "iou"/"iof", if a non-empty input
            does not have 4 values in its last dimension, if ``offset`` is
            not 0 or 1, or if ``aligned`` is set while m != n.

    Example:
        >>> bboxes1 = torch.FloatTensor([
        >>>     [0, 0, 10, 10],
        >>>     [10, 10, 20, 20],
        >>>     [32, 32, 38, 42],
        >>> ])
        >>> bboxes2 = torch.FloatTensor([
        >>>     [0, 0, 10, 20],
        >>>     [0, 10, 10, 19],
        >>>     [10, 10, 20, 20],
        >>> ])
        >>> bbox_overlaps(bboxes1, bboxes2)
        tensor([[0.5000, 0.0000, 0.0000],
                [0.0000, 0.0000, 1.0000],
                [0.0000, 0.0000, 0.0000]])

    Example:
        >>> empty = torch.FloatTensor([])
        >>> nonempty = torch.FloatTensor([
        >>>     [0, 0, 10, 9],
        >>> ])
        >>> assert tuple(bbox_overlaps(empty, nonempty).shape) == (0, 1)
        >>> assert tuple(bbox_overlaps(nonempty, empty).shape) == (1, 0)
        >>> assert tuple(bbox_overlaps(empty, empty).shape) == (0, 0)
    """

    # The kernels take the mode as an integer flag rather than a string.
    mode_dict = {'iou': 0, 'iof': 1}
    assert mode in mode_dict
    mode_flag = mode_dict[mode]
    # Either the boxes are empty or the length of boxes' last dimension is 4
    assert (bboxes1.size(-1) == 4 or bboxes1.size(0) == 0)
    assert (bboxes2.size(-1) == 4 or bboxes2.size(0) == 0)
    assert offset == 1 or offset == 0

    rows = bboxes1.size(0)
    cols = bboxes2.size(0)

    # The kernels write into a pre-allocated output: one value per pair.
    if aligned:
        assert rows == cols
        ious = bboxes1.new_zeros(rows)
    else:
        ious = bboxes1.new_zeros((rows, cols))

    # Nothing to compare: return the (empty) result without calling a kernel.
    if rows * cols == 0:
        return ious

    # The CUDA kernel reads the boxes through raw pointers and assumes each
    # box is four consecutive values starting at ``row * 4``. A strided view
    # (for example ``dets[:, :4]`` taken from an (n, 5) tensor) would be read
    # at the wrong addresses, so make the memory layout dense first.
    # ``contiguous()`` returns the tensor itself when it is already dense.
    bboxes1 = bboxes1.contiguous()
    bboxes2 = bboxes2.contiguous()

    # Pick the kernel that matches the device holding ``bboxes1``.
    if bboxes1.device == torch.device('cpu'):
        bbox_overlaps_kernel = ext_module.bbox_overlaps_cpu
    else:
        bbox_overlaps_kernel = ext_module.bbox_overlaps_cuda

    bbox_overlaps_kernel(
        bboxes1, bboxes2, ious, mode=mode_flag, aligned=aligned, offset=offset)

    return ious
```

### 2. bbox_overlaps_cpu
First, we look at the CPU entry point, which simply forwards its arguments to the CPU kernel.
```cpp
// CPU entry point for bbox_overlaps.
//
// Forwards every argument, unchanged and in the same order, to
// bbox_overlaps_cpu_kernel; it has no logic of its own and returns nothing.
//   bboxes1, bboxes2 : the two sets of boxes to compare
//   ious             : output tensor passed on to the kernel
//   mode             : integer mode flag passed on to the kernel
//   aligned          : pair boxes by index instead of comparing every pair
//   offset           : value added to box widths/heights inside the kernel
void bbox_overlaps_cpu(const Tensor bboxes1, const Tensor bboxes2, Tensor ious,
                   const int mode, const bool aligned, const int offset) {
    bbox_overlaps_cpu_kernel(bboxes1, bboxes2, ious, mode, aligned, offset);
}
```

### 3. bbox_overlaps_cpu_kernel
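`bbox_overlaps_cpu_kernel` supports two overlap measures: IoU (intersection over union) and IoF (intersection over foreground, i.e. the overlap divided by the area of the box from `boxes1`).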
```cpp
#include "pytorch_cpp_helper.hpp"

using torch::indexing::None;
using torch::indexing::Slice;

void bbox_overlaps_cpu_kernel(const Tensor boxes1, const Tensor boxes2,
                              Tensor ious, const int mode_flag,
                              const bool aligned, const int offset) {
  Tensor temp_ious;
  if (aligned) {
    // Slice(None) equals to : in python
    // Slice({None, 2}) equals to :2 in python
    // index({Slice(None), Slice({None, 2})}) equals to [:, :2] in python
    // lt is the left top coordinate of the bbox overlaps
    Tensor lt = torch::max(boxes1.index({Slice(None), Slice({None, 2})}),
                           boxes2.index({Slice(None), Slice({None, 2})}));
    // rb is the right bottom coordinate of the bbox overlaps
    Tensor rb = torch::min(boxes1.index({Slice(None), Slice(2)}),
                           boxes2.index({Slice(None), Slice(2)}));
    // get width and height
    Tensor wh = (rb - lt + offset).clamp(0.f, INT_MAX * 1.f);
    // get the area of the overlap
    Tensor overlap = wh.index({Slice(None), 0}) * wh.index({Slice(None), 1});
    // get the area of boxes1
    // boxes1 is the gt boxes
    Tensor area1 = (boxes1.index({Slice(None), 2}) -
                    boxes1.index({Slice(None), 0}) + offset) *
                   (boxes1.index({Slice(None), 3}) -
                    boxes1.index({Slice(None), 1}) + offset);
    if (mode_flag == 0) {
      // get the area of the boxes2
      // boxes2 is the prediction boxes
      Tensor area2 = (boxes2.index({Slice(None), 2}) -
                      boxes2.index({Slice(None), 0}) + offset) *
                     (boxes2.index({Slice(None), 3}) -
                      boxes2.index({Slice(None), 1}) + offset);
      temp_ious = overlap / (area1 + area2 - overlap);
    } else {
      temp_ious = overlap / area1;
    }
  } else {
    // index({Slice(None), None, Slice({None, 2})}) equals to [:, None, :2]
    // suppose that the dimension of boxes1 is (N, 4)
    // the dimension of boxes1.index({Slice(None), None, Slice({None, 2})}) is (N, 1, 2)
    // the dimension of boxes2.index({Slice(None), Slice({None, 2})}) is (M, 2)
    // After performing torch::max, the dimension of lt is (N, M, 2) based o the broadcasting rule.
    Tensor lt = torch::max(boxes1.index({Slice(None), None, Slice({None, 2})}),
                           boxes2.index({Slice(None), Slice({None, 2})}));
    // same as lt
    Tensor rb = torch::min(boxes1.index({Slice(None), None, Slice(2)}),
                           boxes2.index({Slice(None), Slice(2)}));
    // note that the dimension of wh is (N, M, 2)
    // because we want to calculate the IoU of each pair
    Tensor wh = (rb - lt + offset).clamp(0.f, INT_MAX * 1.f);
    Tensor overlap = wh.index({"...", 0}) * wh.index({"...", 1});
    Tensor area1 = (boxes1.index({Slice(None), 2}) -
                    boxes1.index({Slice(None), 0}) + offset) *
                   (boxes1.index({Slice(None), 3}) -
                    boxes1.index({Slice(None), 1}) + offset);
    if (mode_flag == 0) {
      Tensor area2 = (boxes2.index({Slice(None), 2}) -
                      boxes2.index({Slice(None), 0}) + offset) *
                     (boxes2.index({Slice(None), 3}) -
                      boxes2.index({Slice(None), 1}) + offset);
      temp_ious =
          overlap / (area1.index({Slice(None), None}) + area2 - overlap);
    } else {
      temp_ious = overlap / area1.index({Slice(None), None});
    }
  }
  ious.copy_(temp_ious);
}

```

In [2]:
# Two small sets of boxes in <x1, y1, x2, y2> format, reused by the CPU and
# CUDA calls below. The values are small enough to verify the IoUs by hand.
bboxes1 = torch.tensor(
    [[0, 0, 10, 10], [2, 2, 8, 8]],
    dtype=torch.float32,
)
bboxes2 = torch.tensor(
    [[0, 0, 5, 5], [1, 1, 8, 8]],
    dtype=torch.float32,
)

In [3]:
# Pairwise overlap with the defaults (mode='iou', aligned=False, offset=0):
# entry [i, j] of the result is the IoU of bboxes1[i] and bboxes2[j].
# Hand check for [0, 0]: overlap 5*5 = 25, union 100 + 25 - 25 = 100 -> 0.25.
bbox_overlaps(bboxes1, bboxes2)

tensor([[0.2500, 0.4900],
        [0.1731, 0.7347]])

### 4. bbox_overlaps_cuda
Next, we look at the CUDA version of `bbox_overlaps`.
```cpp
void bbox_overlaps_cuda(const Tensor bboxes1, const Tensor bboxes2,
                                    Tensor ious, const int mode,
                                    const bool aligned, const int offset) {
  int output_size = ious.numel();
  int num_bbox1 = bboxes1.size(0);
  int num_bbox2 = bboxes2.size(0);

  at::cuda::CUDAGuard device_guard(bboxes1.device());
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      bboxes1.scalar_type(), "bbox_overlaps_cuda_kernel", ([&] {
        bbox_overlaps_cuda_kernel<scalar_t>
            <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
                bboxes1.data_ptr<scalar_t>(), bboxes2.data_ptr<scalar_t>(),
                ious.data_ptr<scalar_t>(), num_bbox1, num_bbox2, mode, aligned,
                offset);
      }));
  AT_CUDA_CHECK(cudaGetLastError());
}
```

### 5. bbox_overlaps_cuda_kernel
```cpp
// Overlap ratio of one pair: box number `b1` of bbox1 against box number `b2`
// of bbox2. Shared by both branches of bbox_overlaps_cuda_kernel.
//   mode == 0 : IoU, intersection / (area1 + area2 - intersection)
//   otherwise : IoF, intersection / area1
// The denominator is clamped from below by `offset`.
// NOTE(review): fmaxf / fminf are the single-precision functions, so for
// T = double the operands are narrowed to float here -- confirm that this
// precision is acceptable.
template <typename T>
__device__ __forceinline__ T bbox_pair_overlap(const T* bbox1, const T* bbox2,
                                               const int b1, const int b2,
                                               const int mode,
                                               const int offset) {
  // A box is four consecutive values, so box k starts at element k * 4.
  T b1_x1, b1_y1, b1_x2, b1_y2;
  load_bbox<T>(bbox1, b1 << 2, b1_x1, b1_y1, b1_x2, b1_y2);
  T b2_x1, b2_y1, b2_x2, b2_y2;
  load_bbox<T>(bbox2, b2 << 2, b2_x1, b2_y1, b2_x2, b2_y2);

  const T b1_area = (b1_x2 - b1_x1 + offset) * (b1_y2 - b1_y1 + offset);
  const T b2_area = (b2_x2 - b2_x1 + offset) * (b2_y2 - b2_y1 + offset);

  // Edges of the intersection rectangle.
  const T left = fmaxf(b1_x1, b2_x1);
  const T right = fminf(b1_x2, b2_x2);
  const T top = fmaxf(b1_y1, b2_y1);
  const T bottom = fminf(b1_y2, b2_y2);

  // Boxes that do not touch give a negative extent; clamp it to zero.
  const T inter_w = fmaxf(right - left + offset, 0.f);
  const T inter_h = fmaxf(bottom - top + offset, 0.f);
  const T inter_area = inter_w * inter_h;

  const T denom = fmaxf(
      mode == 0 ? b1_area + b2_area - inter_area : b1_area, T(offset));
  return inter_area / denom;
}

// Computes one overlap value per output element and stores it in `ious`.
//   aligned  : ious[i] compares bbox1[i] with bbox2[i], i in [0, num_bbox1)
//   otherwise: ious is treated as a row-major (num_bbox1, num_bbox2) matrix,
//              so the flat index maps to row index / num_bbox2 and column
//              index % num_bbox2.
// CUDA_1D_KERNEL_LOOP is defined elsewhere; presumably it walks `index` over
// [0, n) across the launched threads -- TODO confirm.
// NOTE(review): num_bbox1 * num_bbox2 is an int product; check that callers
// keep it within int range.
template <typename T>
__global__ void bbox_overlaps_cuda_kernel(const T* bbox1, const T* bbox2,
                                          T* ious, const int num_bbox1,
                                          const int num_bbox2, const int mode,
                                          const bool aligned,
                                          const int offset) {
  if (aligned) {
    CUDA_1D_KERNEL_LOOP(index, num_bbox1) {
      ious[index] =
          bbox_pair_overlap<T>(bbox1, bbox2, index, index, mode, offset);
    }
  } else {
    CUDA_1D_KERNEL_LOOP(index, num_bbox1 * num_bbox2) {
      // Equivalent to a 2D loop over (row, col) of the output matrix.
      const int row = index / num_bbox2;
      const int col = index % num_bbox2;
      ious[index] = bbox_pair_overlap<T>(bbox1, bbox2, row, col, mode, offset);
    }
  }
}
```

### 6. Data Loading
```cpp
// Reads the four coordinates of one box, starting at element `base` of
// `bbox`, into x1, y1, x2, y2 (in that order). Generic version: four separate
// element reads.
template <typename T>
__device__ __forceinline__ void load_bbox(const T* bbox, const int base, T& x1,
                                          T& y1, T& x2, T& y2) {
  const T* coords = bbox + base;
  x1 = coords[0];
  y1 = coords[1];
  x2 = coords[2];
  y2 = coords[3];
}

// float specialisation: reinterprets the four floats at bbox + base as one
// float4 and unpacks its x/y/z/w members.
// NOTE(review): reading through a float4 pointer presumably requires
// bbox + base to satisfy float4's alignment -- confirm that every caller
// passes a suitably aligned buffer and a base that is a multiple of 4.
template <>
__device__ __forceinline__ void load_bbox<float>(const float* bbox,
                                                 const int base, float& x1,
                                                 float& y1, float& x2,
                                                 float& y2) {
  const float4 packed = *reinterpret_cast<const float4*>(bbox + base);
  x1 = packed.x;
  y1 = packed.y;
  x2 = packed.z;
  y2 = packed.w;
}
```

In [4]:
# Same boxes moved to the GPU with .cuda(), so the wrapper dispatches to the
# CUDA kernel; the values should match the CPU result.
bbox_overlaps(bboxes1.cuda(), bboxes2.cuda())

tensor([[0.2500, 0.4900],
        [0.1731, 0.7347]], device='cuda:0')