NVIDIA · cliffburdick · May 27, 2026 · May 26, 2026
diff --git a/docs_input/executor_compatibility.rst b/docs_input/executor_compatibility.rst
@@ -172,7 +172,7 @@ fused JIT expression; non-JIT CUDA execution through cudaExecutor remains availa
    "reduce", "|no|", "|yes|", "|no|", "Generic custom reduction currently uses CUDA reduction support."
    "remap", "|yes|", "|yes|", "|yes|", "View/reindex expression."
    "repmat", "|yes|", "|yes|", "|yes|", "View/expression composition."
-   "resample_poly", "|no|", "|yes|", "|no|", "CUDA-only resampling transform."
+   "resample_poly", "|yes|", "|yes|", "|no|", "Polyphase resampling transform for host and CUDA executors."
    "reshape", "|yes|", "|yes|", "|yes|", "View expression."
    "reverse", "|yes|", "|yes|", "|yes|", "View/reindex expression."
    "round", "|yes|", "|yes|", "|yes|", "Element-wise expression."

diff --git a/include/matx/core/operator_utils.h b/include/matx/core/operator_utils.h
@@ -96,7 +96,9 @@ namespace matx {
   namespace detail {
     // Used inside of transforms to allocate temporary output
     template <typename TensorType, typename Executor, typename ShapeType>
-    __MATX_HOST__ __MATX_INLINE__ void AllocateTempTensor(TensorType &tensor, Executor &&ex, ShapeType &&shape, typename TensorType::value_type **ptr) {
+    __MATX_HOST__ __MATX_INLINE__ void AllocateTempTensor(TensorType &tensor, Executor &&ex,
+        ShapeType &&shape, typename TensorType::value_type **ptr,
+        matxMemorySpace_t host_memory_space = MATX_HOST_MEMORY) {
 
       const auto ttl_size = cuda::std::accumulate(shape.begin(), shape.end(), static_cast<index_t>(1),
                                   cuda::std::multiplies<index_t>()) * sizeof(typename TensorType::value_type);      
@@ -106,7 +108,7 @@ namespace matx {
         make_tensor(tensor, *ptr, shape);
       }
       else {
-        matxAlloc((void**)ptr, ttl_size, MATX_HOST_MEMORY);
+        matxAlloc((void**)ptr, ttl_size, host_memory_space);
         make_tensor(tensor, *ptr, shape);
       }
     }

diff --git a/include/matx/operators/resample_poly.h b/include/matx/operators/resample_poly.h
@@ -112,9 +112,11 @@ namespace detail {
 
       template <typename Out, typename Executor>
       void Exec(Out &&out, Executor &&ex) const {
-        static_assert(is_cuda_executor_v<Executor>, "resample_poly() only supports the CUDA executor currently");
+        static_assert(is_cuda_executor_v<Executor> || is_host_executor_v<Executor>,
+          "resample_poly() only supports CUDA and host executors");
 
-        resample_poly_impl(cuda::std::get<0>(out), a_, f_, up_, down_, ex.getStream());
+        resample_poly_impl(cuda::std::get<0>(out), a_, f_, up_, down_,
+          std::forward<Executor>(ex));
       }
 
       template <typename ShapeType, typename Executor>
@@ -137,7 +139,13 @@ namespace detail {
 
         InnerPreRun(std::forward<ShapeType>(shape), std::forward<Executor>(ex));         
 
-        detail::AllocateTempTensor(tmp_out_, std::forward<Executor>(ex), out_dims_, &ptr);
+        if constexpr (is_host_executor_v<Executor>) {
+          detail::AllocateTempTensor(tmp_out_, std::forward<Executor>(ex), out_dims_, &ptr,
+            MATX_HOST_MALLOC_MEMORY);
+        }
+        else {
+          detail::AllocateTempTensor(tmp_out_, std::forward<Executor>(ex), out_dims_, &ptr);
+        }
 
         prerun_done_ = true;
         Exec(cuda::std::make_tuple(tmp_out_), std::forward<Executor>(ex));

diff --git a/include/matx/transforms/resample_poly.h b/include/matx/transforms/resample_poly.h
@@ -33,18 +33,34 @@
 #pragma once
 
 #include <cstdint>
+#include <cstddef>
 #include <cstdio>
+#include <numeric>
 #include <type_traits>
 
+#include <cuda/std/array>
+#include <cuda/std/tuple>
+
 #include "matx/core/error.h"
 #include "matx/core/nvtx.h"
 #include "matx/core/tensor.h"
+#include "matx/executors/host.h"
 #include "matx/operators/clone.h"
 #include "matx/kernels/resample_poly.cuh"
 
 namespace matx {
 namespace detail {
 
+template <typename Op, size_t RANK>
+__MATX_INLINE__ decltype(auto) ApplyOpWithIdx(Op &&op,
+  const cuda::std::array<index_t, RANK> &idx)
+{
+  return cuda::std::apply(
+    [&op](auto... indices) -> decltype(auto) {
+      return op(indices...);
+    }, idx);
+}
+
 template <typename OutType, typename InType, typename FilterType>
 inline void matxResamplePoly1DInternal(OutType &o, const InType &i,
                                      const FilterType &filter, index_t up, index_t down,
@@ -193,6 +209,77 @@ inline void matxResamplePoly1DInternal(OutType &o, const InType &i,
 #endif
 }
 
+template <typename OutType, typename InType, typename FilterType, ThreadsMode MODE>
+inline void matxResamplePoly1DInternal(OutType &o, const InType &i,
+                                     const FilterType &filter, index_t up, index_t down,
+                                     [[maybe_unused]] const HostExecutor<MODE> &exec)
+{
+  using filter_t = typename FilterType::value_type;
+  using filter_inner_t = typename inner_op_type_t<filter_t>::type;
+  using output_t = typename OutType::value_type;
+
+  constexpr int RANK = InType::Rank();
+  const index_t output_len = o.Size(RANK - 1);
+  const index_t input_len = i.Size(RANK - 1);
+  index_t filter_len = filter.Size(FilterType::Rank() - 1);
+  const bool is_even_filter = (filter_len % 2) == 0;
+
+  if (is_even_filter) {
+    filter_len++;
+  }
+
+  const index_t filter_len_half = filter_len / 2;
+  const index_t filter_central_tap = (filter_len - 1) / 2;
+  const index_t max_input_ind = input_len - 1;
+  const index_t batch_count = TotalSize(o) / output_len;
+  const filter_t scale = static_cast<filter_t>(static_cast<filter_inner_t>(up));
+
+  auto run_batch = [&](index_t batch_idx) {
+    auto input_idx = BlockToIdx(o, batch_idx, 1);
+    auto output_idx = input_idx;
+
+    for (index_t out_ind = 0; out_ind < output_len; out_ind++) {
+      const index_t up_ind = out_ind * down;
+      const index_t up_start = (up_ind > filter_len_half) ?
+        up_ind - filter_len_half : 0;
+      const index_t up_end = cuda::std::min(max_input_ind * up,
+        up_ind + filter_len_half);
+      const index_t x_start = (up_start + up - 1) / up;
+      const index_t x_end = up_end / up;
+      index_t h_ind = filter_central_tap + (up_ind - up * x_start);
+
+      output_t accum {};
+      for (index_t in_ind = x_start; in_ind <= x_end; in_ind++) {
+        if (!is_even_filter || h_ind > 0) {
+          input_idx[RANK - 1] = in_ind;
+          const auto in_val = ApplyOpWithIdx(i, input_idx);
+          const index_t filter_ind = is_even_filter ? h_ind - 1 : h_ind;
+          accum += in_val * filter(filter_ind);
+        }
+        h_ind -= up;
+      }
+
+      output_idx[RANK - 1] = out_ind;
+      ApplyOpWithIdx(o, output_idx) = accum * scale;
+    }
+  };
+
+#ifdef MATX_EN_OMP
+  if (exec.GetNumThreads() > 1) {
+    #pragma omp parallel for num_threads(exec.GetNumThreads())
+    for (index_t batch_idx = 0; batch_idx < batch_count; batch_idx++) {
+      run_batch(batch_idx);
+    }
+  }
+  else
+#endif
+  {
+    for (index_t batch_idx = 0; batch_idx < batch_count; batch_idx++) {
+      run_batch(batch_idx);
+    }
+  }
+}
+
 } // end namespace detail
 
 
@@ -207,11 +294,12 @@ inline void matxResamplePoly1DInternal(OutType &o, const InType &i,
  * @param f Filter operator
  * @param up Factor by which to upsample
  * @param down Factor by which to downsample
- * @param stream CUDA stream on which to run the kernel(s)
+ * @param exec Executor on which to run the resampler
  */
-template <typename OutType, typename InType, typename FilterType>
+template <typename OutType, typename InType, typename FilterType, typename Executor>
+  requires is_executor<Executor>
 inline void resample_poly_impl(OutType &out, const InType &in, const FilterType &f,
-                   index_t up, index_t down, cudaStream_t stream = 0) {
+                   index_t up, index_t down, Executor &&exec) {
   MATX_NVTX_START("", matx::MATX_NVTX_LOG_API)
 
   constexpr int RANK = InType::Rank();
@@ -243,11 +331,26 @@ inline void resample_poly_impl(OutType &out, const InType &in, const FilterType
   // first interpretation and return a copy of the input tensor. This matches
   // the behavior of scipy.
   if (up == 1 && down == 1) {
-    (out = in).run(stream);
+    (out = in).run(exec);
     return;
   }
 
-  matxResamplePoly1DInternal(out, in, f, up, down, stream);
+  if constexpr (is_cuda_executor_v<Executor>) {
+    matxResamplePoly1DInternal(out, in, f, up, down, exec.getStream());
+  }
+  else if constexpr (is_host_executor_v<Executor>) {
+    matxResamplePoly1DInternal(out, in, f, up, down, exec);
+  }
+  else {
+    static_assert(is_cuda_executor_v<Executor> || is_host_executor_v<Executor>,
+      "resample_poly_impl() only supports CUDA and host executors");
+  }
+}
+
+template <typename OutType, typename InType, typename FilterType>
+inline void resample_poly_impl(OutType &out, const InType &in, const FilterType &f,
+                   index_t up, index_t down, cudaStream_t stream = 0) {
+  resample_poly_impl(out, in, f, up, down, cudaExecutor(stream));
 }
 
 } // end namespace matx