diff --git a/docs_input/executor_compatibility.rst b/docs_input/executor_compatibility.rst index e19d2bb54..2da78590e 100644 --- a/docs_input/executor_compatibility.rst +++ b/docs_input/executor_compatibility.rst @@ -172,7 +172,7 @@ fused JIT expression; non-JIT CUDA execution through cudaExecutor remains availa "reduce", "|no|", "|yes|", "|no|", "Generic custom reduction currently uses CUDA reduction support." "remap", "|yes|", "|yes|", "|yes|", "View/reindex expression." "repmat", "|yes|", "|yes|", "|yes|", "View/expression composition." - "resample_poly", "|no|", "|yes|", "|no|", "CUDA-only resampling transform." + "resample_poly", "|yes|", "|yes|", "|no|", "Polyphase resampling transform for host and CUDA executors." "reshape", "|yes|", "|yes|", "|yes|", "View expression." "reverse", "|yes|", "|yes|", "|yes|", "View/reindex expression." "round", "|yes|", "|yes|", "|yes|", "Element-wise expression." diff --git a/include/matx/core/operator_utils.h b/include/matx/core/operator_utils.h index e09d8228f..2c6bd11ae 100644 --- a/include/matx/core/operator_utils.h +++ b/include/matx/core/operator_utils.h @@ -96,7 +96,9 @@ namespace matx { namespace detail { // Used inside of transforms to allocate temporary output template - __MATX_HOST__ __MATX_INLINE__ void AllocateTempTensor(TensorType &tensor, Executor &&ex, ShapeType &&shape, typename TensorType::value_type **ptr) { + __MATX_HOST__ __MATX_INLINE__ void AllocateTempTensor(TensorType &tensor, Executor &&ex, + ShapeType &&shape, typename TensorType::value_type **ptr, + matxMemorySpace_t host_memory_space = MATX_HOST_MEMORY) { const auto ttl_size = cuda::std::accumulate(shape.begin(), shape.end(), static_cast(1), cuda::std::multiplies()) * sizeof(typename TensorType::value_type); @@ -106,7 +108,7 @@ namespace matx { make_tensor(tensor, *ptr, shape); } else { - matxAlloc((void**)ptr, ttl_size, MATX_HOST_MEMORY); + matxAlloc((void**)ptr, ttl_size, host_memory_space); make_tensor(tensor, *ptr, shape); } } diff --git a/include/matx/operators/resample_poly.h b/include/matx/operators/resample_poly.h index 86601a825..934120834 100644 --- a/include/matx/operators/resample_poly.h +++ b/include/matx/operators/resample_poly.h @@ -112,9 +112,11 @@ namespace detail { template void Exec(Out &&out, Executor &&ex) const { - static_assert(is_cuda_executor_v, "resample_poly() only supports the CUDA executor currently"); + static_assert(is_cuda_executor_v || is_host_executor_v, + "resample_poly() only supports CUDA and host executors"); - resample_poly_impl(cuda::std::get<0>(out), a_, f_, up_, down_, ex.getStream()); + resample_poly_impl(cuda::std::get<0>(out), a_, f_, up_, down_, + std::forward(ex)); } template @@ -137,7 +139,13 @@ namespace detail { InnerPreRun(std::forward(shape), std::forward(ex)); - detail::AllocateTempTensor(tmp_out_, std::forward(ex), out_dims_, &ptr); + if constexpr (is_host_executor_v) { + detail::AllocateTempTensor(tmp_out_, std::forward(ex), out_dims_, &ptr, + MATX_HOST_MALLOC_MEMORY); + } + else { + detail::AllocateTempTensor(tmp_out_, std::forward(ex), out_dims_, &ptr); + } prerun_done_ = true; Exec(cuda::std::make_tuple(tmp_out_), std::forward(ex)); diff --git a/include/matx/transforms/resample_poly.h b/include/matx/transforms/resample_poly.h index cdd254e35..1c85ffd5a 100644 --- a/include/matx/transforms/resample_poly.h +++ b/include/matx/transforms/resample_poly.h @@ -33,18 +33,34 @@ #pragma once #include +#include #include +#include #include +#include +#include + #include "matx/core/error.h" #include "matx/core/nvtx.h" #include "matx/core/tensor.h" +#include "matx/executors/host.h" #include "matx/operators/clone.h" #include "matx/kernels/resample_poly.cuh" namespace matx { namespace detail { +template +__MATX_INLINE__ decltype(auto) ApplyOpWithIdx(Op &&op, + const cuda::std::array &idx) +{ + return cuda::std::apply( + [&op](auto... indices) -> decltype(auto) { + return op(indices...); + }, idx); +} + template inline void matxResamplePoly1DInternal(OutType &o, const InType &i, const FilterType &filter, index_t up, index_t down, @@ -193,6 +209,77 @@ inline void matxResamplePoly1DInternal(OutType &o, const InType &i, #endif } +template +inline void matxResamplePoly1DInternal(OutType &o, const InType &i, + const FilterType &filter, index_t up, index_t down, + [[maybe_unused]] const HostExecutor &exec) +{ + using filter_t = typename FilterType::value_type; + using filter_inner_t = typename inner_op_type_t::type; + using output_t = typename OutType::value_type; + + constexpr int RANK = InType::Rank(); + const index_t output_len = o.Size(RANK - 1); + const index_t input_len = i.Size(RANK - 1); + index_t filter_len = filter.Size(FilterType::Rank() - 1); + const bool is_even_filter = (filter_len % 2) == 0; + + if (is_even_filter) { + filter_len++; + } + + const index_t filter_len_half = filter_len / 2; + const index_t filter_central_tap = (filter_len - 1) / 2; + const index_t max_input_ind = input_len - 1; + const index_t batch_count = TotalSize(o) / output_len; + const filter_t scale = static_cast(static_cast(up)); + + auto run_batch = [&](index_t batch_idx) { + auto input_idx = BlockToIdx(o, batch_idx, 1); + auto output_idx = input_idx; + + for (index_t out_ind = 0; out_ind < output_len; out_ind++) { + const index_t up_ind = out_ind * down; + const index_t up_start = (up_ind > filter_len_half) ? + up_ind - filter_len_half : 0; + const index_t up_end = cuda::std::min(max_input_ind * up, + up_ind + filter_len_half); + const index_t x_start = (up_start + up - 1) / up; + const index_t x_end = up_end / up; + index_t h_ind = filter_central_tap + (up_ind - up * x_start); + + output_t accum {}; + for (index_t in_ind = x_start; in_ind <= x_end; in_ind++) { + if (!is_even_filter || h_ind > 0) { + input_idx[RANK - 1] = in_ind; + const auto in_val = ApplyOpWithIdx(i, input_idx); + const index_t filter_ind = is_even_filter ? h_ind - 1 : h_ind; + accum += in_val * filter(filter_ind); + } + h_ind -= up; + } + + output_idx[RANK - 1] = out_ind; + ApplyOpWithIdx(o, output_idx) = accum * scale; + } + }; + +#ifdef MATX_EN_OMP + if (exec.GetNumThreads() > 1) { + #pragma omp parallel for num_threads(exec.GetNumThreads()) + for (index_t batch_idx = 0; batch_idx < batch_count; batch_idx++) { + run_batch(batch_idx); + } + } + else +#endif + { + for (index_t batch_idx = 0; batch_idx < batch_count; batch_idx++) { + run_batch(batch_idx); + } + } +} + } // end namespace detail @@ -207,11 +294,12 @@ inline void matxResamplePoly1DInternal(OutType &o, const InType &i, * @param f Filter operator * @param up Factor by which to upsample * @param down Factor by which to downsample - * @param stream CUDA stream on which to run the kernel(s) + * @param exec Executor on which to run the resampler */ -template +template + requires is_executor inline void resample_poly_impl(OutType &out, const InType &in, const FilterType &f, - index_t up, index_t down, cudaStream_t stream = 0) { + index_t up, index_t down, Executor &&exec) { MATX_NVTX_START("", matx::MATX_NVTX_LOG_API) constexpr int RANK = InType::Rank(); @@ -243,11 +331,26 @@ inline void resample_poly_impl(OutType &out, const InType &in, const FilterType // first interpretation and return a copy of the input tensor. This matches // the behavior of scipy. if (up == 1 && down == 1) { - (out = in).run(stream); + (out = in).run(exec); return; } - matxResamplePoly1DInternal(out, in, f, up, down, stream); + if constexpr (is_cuda_executor_v) { + matxResamplePoly1DInternal(out, in, f, up, down, exec.getStream()); + } + else if constexpr (is_host_executor_v) { + matxResamplePoly1DInternal(out, in, f, up, down, exec); + } + else { + static_assert(is_cuda_executor_v || is_host_executor_v, + "resample_poly_impl() only supports CUDA and host executors"); + } +} + +template +inline void resample_poly_impl(OutType &out, const InType &in, const FilterType &f, + index_t up, index_t down, cudaStream_t stream = 0) { + resample_poly_impl(out, in, f, up, down, cudaExecutor(stream)); } } // end namespace matx diff --git a/test/00_transform/ResamplePoly.cu b/test/00_transform/ResamplePoly.cu index c44bbae14..fe1d22e33 100644 --- a/test/00_transform/ResamplePoly.cu +++ b/test/00_transform/ResamplePoly.cu @@ -36,8 +36,108 @@ #include "utilities.h" #include "gtest/gtest.h" +#include +#include +#include + using namespace matx; +namespace { + +template +T MakeResampleTestValue(double real, double imag = 0.0) +{ + if constexpr (is_complex_v) { + using inner_t = typename inner_op_type_t::type; + return T{static_cast(real), static_cast(imag)}; + } + else { + return static_cast(real); + } +} + +template +double ResampleAbsDiff(const T &a, const T &b) +{ + if constexpr (is_complex_v) { + return static_cast(cuda::std::abs(a - b)); + } + else { + return std::abs(static_cast(a - b)); + } +} + +template +std::vector ReferenceResamplePoly(const std::vector &in, + const std::vector &filter, + index_t up, index_t down) +{ + const index_t g = std::gcd(up, down); + up /= g; + down /= g; + + const index_t out_len = + static_cast((in.size() * up + down - 1) / down); + std::vector out(static_cast(out_len), T{}); + + if (up == 1 && down == 1) { + return in; + } + + const bool is_even_filter = (filter.size() % 2) == 0; + const index_t logical_filter_len = + static_cast(filter.size()) + (is_even_filter ? 1 : 0); + const index_t filter_center = (logical_filter_len - 1) / 2; + + for (index_t out_ind = 0; out_ind < out_len; out_ind++) { + T accum {}; + const index_t up_ind = out_ind * down; + for (index_t in_ind = 0; in_ind < static_cast(in.size()); in_ind++) { + const index_t h_ind = filter_center + up_ind - in_ind * up; + if (h_ind < 0 || h_ind >= logical_filter_len) { + continue; + } + + if (is_even_filter && h_ind == 0) { + continue; + } + + const index_t filter_ind = is_even_filter ? h_ind - 1 : h_ind; + accum += in[static_cast(in_ind)] * + filter[static_cast(filter_ind)]; + } + + out[static_cast(out_ind)] = + accum * MakeResampleTestValue(static_cast(up)); + } + + return out; +} + +template +void FillResampleInputs(std::vector &in, std::vector &filter) +{ + for (size_t i = 0; i < in.size(); i++) { + const double real = (static_cast(i * 5 % 17) - 8) / 4.0; + const double imag = (static_cast(i * 3 % 11) - 5) / 7.0; + in[i] = MakeResampleTestValue(real, imag); + } + + for (size_t i = 0; i < filter.size(); i++) { + const double real = (static_cast(i * 7 % 13) - 6) / 9.0; + const double imag = (static_cast(i * 2 % 9) - 4) / 8.0; + filter[i] = MakeResampleTestValue(real, imag); + } +} + +template +void AssertNearResampleValue(const T &actual, const T &expected, double thresh) +{ + ASSERT_NEAR(ResampleAbsDiff(actual, expected), 0.0, thresh); +} + +} // namespace + template class ResamplePolyTest : public ::testing::Test { using GTestType = cuda::std::tuple_element_t<0, T>; @@ -79,6 +179,126 @@ class ResamplePolyTestFloatTypes TYPED_TEST_SUITE(ResamplePolyTestNonHalfFloatTypes, MatXFloatNonHalfTypesCUDAExec); TYPED_TEST_SUITE(ResamplePolyTestFloatTypes, MatXFloatTypesCUDAExec); +template +class ResamplePolyHostTest : public ::testing::Test {}; + +using ResamplePolyHostTypes = ::testing::Types>; +TYPED_TEST_SUITE(ResamplePolyHostTest, ResamplePolyHostTypes); + +TYPED_TEST(ResamplePolyHostTest, SmallSignalsMatchReference) +{ + MATX_ENTER_HANDLER(); + using TestType = TypeParam; + + SingleThreadedHostExecutor exec{}; + struct { + index_t a_len; + index_t f_len; + index_t up; + index_t down; + } test_cases[] = { + {7, 5, 3, 2}, + {8, 4, 2, 5}, + {6, 1, 4, 1}, + {9, 3, 1, 3}, + {7, 4, 6, 4} + }; + + for (const auto &tc : test_cases) { + std::vector input(static_cast(tc.a_len)); + std::vector filter(static_cast(tc.f_len)); + FillResampleInputs(input, filter); + const auto expected = ReferenceResamplePoly(input, filter, tc.up, tc.down); + + auto a = make_tensor({tc.a_len}, MATX_HOST_MALLOC_MEMORY); + auto f = make_tensor({tc.f_len}, MATX_HOST_MALLOC_MEMORY); + auto b = make_tensor({static_cast(expected.size())}, + MATX_HOST_MALLOC_MEMORY); + + for (index_t i = 0; i < tc.a_len; i++) { + a(i) = input[static_cast(i)]; + } + for (index_t i = 0; i < tc.f_len; i++) { + f(i) = filter[static_cast(i)]; + } + + (b = resample_poly(shift<0>(shift<0>(a, 1), -1), + shift<0>(shift<0>(f, 1), -1), + tc.up, tc.down)).run(exec); + + for (index_t i = 0; i < b.Size(0); i++) { + AssertNearResampleValue(b(i), expected[static_cast(i)], 1.0e-5); + } + + if (tc.a_len == 7 && tc.f_len == 5) { + (b = resample_poly(shift<0>(shift<0>(a, 1), -1), + shift<0>(shift<0>(f, 1), -1), + tc.up, tc.down) + MakeResampleTestValue(0.0)).run(exec); + + for (index_t i = 0; i < b.Size(0); i++) { + AssertNearResampleValue(b(i), expected[static_cast(i)], 1.0e-5); + } + } + } + + MATX_EXIT_HANDLER(); +} + +TYPED_TEST(ResamplePolyHostTest, BatchedSignalsMatchReference) +{ + MATX_ENTER_HANDLER(); + using TestType = TypeParam; + + SingleThreadedHostExecutor exec{}; + constexpr index_t batches = 3; + constexpr index_t a_len = 8; + constexpr index_t f_len = 4; + constexpr index_t up = 3; + constexpr index_t down = 2; + + std::vector filter(static_cast(f_len)); + std::vector seed(static_cast(a_len)); + FillResampleInputs(seed, filter); + const auto expected_shape = ReferenceResamplePoly(seed, filter, up, down); + const index_t b_len = static_cast(expected_shape.size()); + + auto a = make_tensor({batches, a_len}, MATX_HOST_MALLOC_MEMORY); + auto f = make_tensor({f_len}, MATX_HOST_MALLOC_MEMORY); + auto b = make_tensor({batches, b_len}, MATX_HOST_MALLOC_MEMORY); + + for (index_t i = 0; i < f_len; i++) { + f(i) = filter[static_cast(i)]; + } + + std::vector> expected; + for (index_t batch = 0; batch < batches; batch++) { + std::vector input(static_cast(a_len)); + for (index_t i = 0; i < a_len; i++) { + const double real = static_cast(batch + 1) * 0.25 + + (static_cast(i * 5 % 17) - 8) / 5.0; + const double imag = static_cast(batch - 1) * 0.5 + + (static_cast(i * 3 % 11) - 5) / 6.0; + input[static_cast(i)] = MakeResampleTestValue(real, imag); + a(batch, i) = input[static_cast(i)]; + } + + expected.push_back(ReferenceResamplePoly(input, filter, up, down)); + } + + (b = resample_poly(shift<1>(shift<1>(a, 2), -2), + shift<0>(shift<0>(f, 1), -1), + up, down)).run(exec); + + for (index_t batch = 0; batch < batches; batch++) { + for (index_t i = 0; i < b_len; i++) { + AssertNearResampleValue(b(batch, i), + expected[static_cast(batch)][static_cast(i)], 1.0e-5); + } + } + + MATX_EXIT_HANDLER(); +} + // SimpleOddLength tests use random input and filter values and // odd-length filters. TYPED_TEST(ResamplePolyTestNonHalfFloatTypes, SimpleOddLength)