# Performance Comparison nunique

In [1]:
import torch
from torch.utils.cpp_extension import load_inline

from htc.cpp import nunique

In [2]:
def with_python(inp: torch.Tensor, dim: int):
    inp_last = inp.unsqueeze(-1).transpose(dim, -1).squeeze(dim)
    out = torch.empty(inp_last.shape[:-1], dtype=torch.int64)

    for i in range(inp_last.size(0)):
        for j in range(inp_last.size(1)):
            n_unique = len(inp_last[i, j].unique(return_counts=True)[1])
            out[i, j] = n_unique

    return out

In [3]:
@torch.jit.script
def with_python_jit(inp: torch.Tensor, dim: int):
    inp_last = inp.unsqueeze(-1).transpose(dim, -1).squeeze(dim)
    out = torch.empty(inp_last.shape[:-1], dtype=torch.int64)

    for i in range(inp_last.size(0)):
        for j in range(inp_last.size(1)):
            n_unique = len(torch.unique(inp_last[i, j], return_counts=True)[1])
            # n_unique = len(inp_last[i, j].unique(return_counts=True)[1])
            out[i, j] = n_unique

    return out

In [4]:
source = """
using namespace torch::indexing;

torch::Tensor with_accesors(torch::Tensor in, int64_t dim) {
    // Move reduce dim to the last dimension
    auto in_last = in.unsqueeze(-1).transpose(dim, -1).squeeze(dim);
    auto size_in = in_last.sizes();

    c10::IntArrayRef out_size(size_in.begin(), size_in.begin() + (in.ndimension() - 2) + 1); // [:-1]
    auto out = torch::empty(out_size, torch::kInt64);

    auto in_last_a = in_last.accessor<int64_t, 3>();
    auto out_a = out.accessor<int64_t, 2>();

    for (int i = 0; i < in_last_a.size(0); ++i) {
        for (int j = 0; j < in_last_a.size(1); ++j) {
            auto values = in_last.index({i, j, Slice(None)});
            auto n_unique = std::get<2>(torch::_unique2(values, true, false, true)).numel();
            out_a[i][j] = n_unique;
        }
    }

    return out;
}

torch::Tensor with_accesors_set(torch::Tensor in, int64_t dim) {
    // Move reduce dim to the last dimension
    auto in_last = in.unsqueeze(-1).transpose(dim, -1).squeeze(dim);
    auto size_in = in_last.sizes();

    c10::IntArrayRef out_size(size_in.begin(), size_in.begin() + (in.ndimension() - 2) + 1); // [:-1]
    auto out = torch::empty(out_size, torch::kInt64);

    auto in_last_a = in_last.accessor<int64_t, 3>();
    auto out_a = out.accessor<int64_t, 2>();

    for (int i = 0; i < in_last_a.size(0); ++i) {
        for (int j = 0; j < in_last_a.size(1); ++j) {
            std::unordered_set<int64_t> values;
            for (int k = 0; k < in_last_a.size(2); ++k) {
                values.insert(in_last_a[i][j][k]);
            }
            out_a[i][j] = values.size();
        }
    }

    return out;
}
"""
module = load_inline(
    name="inline_extension",
    cpp_sources=[source],
    functions=["with_accesors", "with_accesors_set"],
    extra_cflags=[
        "-O3",
        "-std=c++2a",
    ],
)

## Test Implementations

In [5]:
x_small = torch.tensor([
    [[1, 2], [3, 2]],
    [[5, 2], [2, 2]],
])
y = torch.tensor([[2, 1], [2, 1]])

assert torch.all(y == with_python(x_small, 0))
assert torch.all(y == with_python_jit(x_small, 0))
assert torch.all(y == module.with_accesors(x_small, 0))
assert torch.all(y == module.with_accesors_set(x_small, 0))
assert torch.all(y == nunique(x_small, 0))

## Performance

In [6]:
torch.manual_seed(0)
x_large = torch.randint(0, 10, (5, 480, 640))

In [7]:
%timeit with_python(x_large, 0)

4.43 s ± 78.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [8]:
%timeit with_python_jit(x_large, 0)

2.59 s ± 65.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [9]:
%timeit module.with_accesors(x_large, 0)

1.91 s ± 9.75 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [10]:
%timeit module.with_accesors_set(x_large, 0)

50.7 ms ± 153 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [11]:
%timeit nunique(x_large, 0)

5.33 ms ± 692 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
