Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion docs_input/executor_compatibility.rst
Original file line number Diff line number Diff line change
Expand Up @@ -172,7 +172,7 @@ fused JIT expression; non-JIT CUDA execution through cudaExecutor remains availa
"reduce", "|no|", "|yes|", "|no|", "Generic custom reduction currently uses CUDA reduction support."
"remap", "|yes|", "|yes|", "|yes|", "View/reindex expression."
"repmat", "|yes|", "|yes|", "|yes|", "View/expression composition."
"resample_poly", "|no|", "|yes|", "|no|", "CUDA-only resampling transform."
"resample_poly", "|yes|", "|yes|", "|no|", "Polyphase resampling transform for host and CUDA executors."
"reshape", "|yes|", "|yes|", "|yes|", "View expression."
"reverse", "|yes|", "|yes|", "|yes|", "View/reindex expression."
"round", "|yes|", "|yes|", "|yes|", "Element-wise expression."
Expand Down
6 changes: 4 additions & 2 deletions include/matx/core/operator_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,9 @@ namespace matx {
namespace detail {
// Used inside of transforms to allocate temporary output
template <typename TensorType, typename Executor, typename ShapeType>
__MATX_HOST__ __MATX_INLINE__ void AllocateTempTensor(TensorType &tensor, Executor &&ex, ShapeType &&shape, typename TensorType::value_type **ptr) {
__MATX_HOST__ __MATX_INLINE__ void AllocateTempTensor(TensorType &tensor, Executor &&ex,
ShapeType &&shape, typename TensorType::value_type **ptr,
matxMemorySpace_t host_memory_space = MATX_HOST_MEMORY) {

const auto ttl_size = cuda::std::accumulate(shape.begin(), shape.end(), static_cast<index_t>(1),
cuda::std::multiplies<index_t>()) * sizeof(typename TensorType::value_type);
Expand All @@ -106,7 +108,7 @@ namespace matx {
make_tensor(tensor, *ptr, shape);
}
else {
matxAlloc((void**)ptr, ttl_size, MATX_HOST_MEMORY);
matxAlloc((void**)ptr, ttl_size, host_memory_space);
make_tensor(tensor, *ptr, shape);
}
Comment thread
cliffburdick marked this conversation as resolved.
}
Expand Down
14 changes: 11 additions & 3 deletions include/matx/operators/resample_poly.h
Original file line number Diff line number Diff line change
Expand Up @@ -112,9 +112,11 @@ namespace detail {

template <typename Out, typename Executor>
void Exec(Out &&out, Executor &&ex) const {
static_assert(is_cuda_executor_v<Executor>, "resample_poly() only supports the CUDA executor currently");
static_assert(is_cuda_executor_v<Executor> || is_host_executor_v<Executor>,
"resample_poly() only supports CUDA and host executors");

resample_poly_impl(cuda::std::get<0>(out), a_, f_, up_, down_, ex.getStream());
resample_poly_impl(cuda::std::get<0>(out), a_, f_, up_, down_,
std::forward<Executor>(ex));
}

template <typename ShapeType, typename Executor>
Expand All @@ -137,7 +139,13 @@ namespace detail {

InnerPreRun(std::forward<ShapeType>(shape), std::forward<Executor>(ex));

detail::AllocateTempTensor(tmp_out_, std::forward<Executor>(ex), out_dims_, &ptr);
if constexpr (is_host_executor_v<Executor>) {
detail::AllocateTempTensor(tmp_out_, std::forward<Executor>(ex), out_dims_, &ptr,
MATX_HOST_MALLOC_MEMORY);
}
else {
detail::AllocateTempTensor(tmp_out_, std::forward<Executor>(ex), out_dims_, &ptr);
}

prerun_done_ = true;
Exec(cuda::std::make_tuple(tmp_out_), std::forward<Executor>(ex));
Expand Down
113 changes: 108 additions & 5 deletions include/matx/transforms/resample_poly.h
Original file line number Diff line number Diff line change
Expand Up @@ -33,18 +33,34 @@
#pragma once

#include <cstdint>
#include <cstddef>
#include <cstdio>
#include <numeric>
#include <type_traits>

#include <cuda/std/array>
#include <cuda/std/tuple>

#include "matx/core/error.h"
#include "matx/core/nvtx.h"
#include "matx/core/tensor.h"
#include "matx/executors/host.h"
#include "matx/operators/clone.h"
#include "matx/kernels/resample_poly.cuh"

namespace matx {
namespace detail {

template <typename Op, size_t RANK>
__MATX_INLINE__ decltype(auto) ApplyOpWithIdx(Op &&op,
const cuda::std::array<index_t, RANK> &idx)
{
return cuda::std::apply(
[&op](auto... indices) -> decltype(auto) {
return op(indices...);
}, idx);
}

template <typename OutType, typename InType, typename FilterType>
inline void matxResamplePoly1DInternal(OutType &o, const InType &i,
const FilterType &filter, index_t up, index_t down,
Expand Down Expand Up @@ -193,6 +209,77 @@ inline void matxResamplePoly1DInternal(OutType &o, const InType &i,
#endif
}

template <typename OutType, typename InType, typename FilterType, ThreadsMode MODE>
inline void matxResamplePoly1DInternal(OutType &o, const InType &i,
const FilterType &filter, index_t up, index_t down,
[[maybe_unused]] const HostExecutor<MODE> &exec)
{
using filter_t = typename FilterType::value_type;
using filter_inner_t = typename inner_op_type_t<filter_t>::type;
using output_t = typename OutType::value_type;

constexpr int RANK = InType::Rank();
const index_t output_len = o.Size(RANK - 1);
const index_t input_len = i.Size(RANK - 1);
index_t filter_len = filter.Size(FilterType::Rank() - 1);
const bool is_even_filter = (filter_len % 2) == 0;

if (is_even_filter) {
filter_len++;
}

const index_t filter_len_half = filter_len / 2;
const index_t filter_central_tap = (filter_len - 1) / 2;
const index_t max_input_ind = input_len - 1;
const index_t batch_count = TotalSize(o) / output_len;
const filter_t scale = static_cast<filter_t>(static_cast<filter_inner_t>(up));

auto run_batch = [&](index_t batch_idx) {
auto input_idx = BlockToIdx(o, batch_idx, 1);
auto output_idx = input_idx;

for (index_t out_ind = 0; out_ind < output_len; out_ind++) {
const index_t up_ind = out_ind * down;
const index_t up_start = (up_ind > filter_len_half) ?
up_ind - filter_len_half : 0;
const index_t up_end = cuda::std::min(max_input_ind * up,
up_ind + filter_len_half);
const index_t x_start = (up_start + up - 1) / up;
const index_t x_end = up_end / up;
index_t h_ind = filter_central_tap + (up_ind - up * x_start);

output_t accum {};
for (index_t in_ind = x_start; in_ind <= x_end; in_ind++) {
if (!is_even_filter || h_ind > 0) {
input_idx[RANK - 1] = in_ind;
const auto in_val = ApplyOpWithIdx(i, input_idx);
const index_t filter_ind = is_even_filter ? h_ind - 1 : h_ind;
accum += in_val * filter(filter_ind);
}
h_ind -= up;
}

output_idx[RANK - 1] = out_ind;
ApplyOpWithIdx(o, output_idx) = accum * scale;
}
};

#ifdef MATX_EN_OMP
if (exec.GetNumThreads() > 1) {
#pragma omp parallel for num_threads(exec.GetNumThreads())
for (index_t batch_idx = 0; batch_idx < batch_count; batch_idx++) {
run_batch(batch_idx);
}
}
else
#endif
{
for (index_t batch_idx = 0; batch_idx < batch_count; batch_idx++) {
run_batch(batch_idx);
}
}
}

} // end namespace detail


Expand All @@ -207,11 +294,12 @@ inline void matxResamplePoly1DInternal(OutType &o, const InType &i,
* @param f Filter operator
* @param up Factor by which to upsample
* @param down Factor by which to downsample
* @param stream CUDA stream on which to run the kernel(s)
* @param exec Executor on which to run the resampler
*/
template <typename OutType, typename InType, typename FilterType>
template <typename OutType, typename InType, typename FilterType, typename Executor>
requires is_executor<Executor>
inline void resample_poly_impl(OutType &out, const InType &in, const FilterType &f,
index_t up, index_t down, cudaStream_t stream = 0) {
index_t up, index_t down, Executor &&exec) {
MATX_NVTX_START("", matx::MATX_NVTX_LOG_API)

constexpr int RANK = InType::Rank();
Expand Down Expand Up @@ -243,11 +331,26 @@ inline void resample_poly_impl(OutType &out, const InType &in, const FilterType
// first interpretation and return a copy of the input tensor. This matches
// the behavior of scipy.
if (up == 1 && down == 1) {
(out = in).run(stream);
(out = in).run(exec);
return;
}

matxResamplePoly1DInternal(out, in, f, up, down, stream);
if constexpr (is_cuda_executor_v<Executor>) {
matxResamplePoly1DInternal(out, in, f, up, down, exec.getStream());
}
else if constexpr (is_host_executor_v<Executor>) {
matxResamplePoly1DInternal(out, in, f, up, down, exec);
}
else {
static_assert(is_cuda_executor_v<Executor> || is_host_executor_v<Executor>,
"resample_poly_impl() only supports CUDA and host executors");
}
}

template <typename OutType, typename InType, typename FilterType>
inline void resample_poly_impl(OutType &out, const InType &in, const FilterType &f,
index_t up, index_t down, cudaStream_t stream = 0) {
resample_poly_impl(out, in, f, up, down, cudaExecutor(stream));
}

} // end namespace matx
Loading