diff --git a/docs_input/executor_compatibility.rst b/docs_input/executor_compatibility.rst
index e19d2bb54..2da78590e 100644
--- a/docs_input/executor_compatibility.rst
+++ b/docs_input/executor_compatibility.rst
@@ -172,7 +172,7 @@ fused JIT expression; non-JIT CUDA execution through cudaExecutor remains availa
    "reduce", "|no|", "|yes|", "|no|", "Generic custom reduction currently uses CUDA reduction support."
    "remap", "|yes|", "|yes|", "|yes|", "View/reindex expression."
    "repmat", "|yes|", "|yes|", "|yes|", "View/expression composition."
-   "resample_poly", "|no|", "|yes|", "|no|", "CUDA-only resampling transform."
+   "resample_poly", "|yes|", "|yes|", "|no|", "Polyphase resampling transform for host and CUDA executors."
    "reshape", "|yes|", "|yes|", "|yes|", "View expression."
    "reverse", "|yes|", "|yes|", "|yes|", "View/reindex expression."
    "round", "|yes|", "|yes|", "|yes|", "Element-wise expression."
diff --git a/include/matx/core/operator_utils.h b/include/matx/core/operator_utils.h
index e09d8228f..2c6bd11ae 100644
--- a/include/matx/core/operator_utils.h
+++ b/include/matx/core/operator_utils.h
@@ -96,7 +96,9 @@ namespace matx {
   namespace detail {
     // Used inside of transforms to allocate temporary output
     template <typename TensorType, typename Executor, typename ShapeType>
-    __MATX_HOST__ __MATX_INLINE__ void AllocateTempTensor(TensorType &tensor, Executor &&ex, ShapeType &&shape, typename TensorType::value_type **ptr) {
+    __MATX_HOST__ __MATX_INLINE__ void AllocateTempTensor(TensorType &tensor, Executor &&ex,
+        ShapeType &&shape, typename TensorType::value_type **ptr,
+        matxMemorySpace_t host_memory_space = MATX_HOST_MEMORY) {
 
       const auto ttl_size = cuda::std::accumulate(shape.begin(), shape.end(), static_cast<index_t>(1),
                                   cuda::std::multiplies<index_t>()) * sizeof(typename TensorType::value_type);      
@@ -106,7 +108,7 @@ namespace matx {
         make_tensor(tensor, *ptr, shape);
       }
       else {
-        matxAlloc((void**)ptr, ttl_size, MATX_HOST_MEMORY);
+        matxAlloc((void**)ptr, ttl_size, host_memory_space);
         make_tensor(tensor, *ptr, shape);
       }
     }
diff --git a/include/matx/operators/resample_poly.h b/include/matx/operators/resample_poly.h
index 86601a825..934120834 100644
--- a/include/matx/operators/resample_poly.h
+++ b/include/matx/operators/resample_poly.h
@@ -112,9 +112,11 @@ namespace detail {
 
       template <typename Out, typename Executor>
       void Exec(Out &&out, Executor &&ex) const {
-        static_assert(is_cuda_executor_v<Executor>, "resample_poly() only supports the CUDA executor currently");
+        static_assert(is_cuda_executor_v<Executor> || is_host_executor_v<Executor>,
+          "resample_poly() only supports CUDA and host executors");
 
-        resample_poly_impl(cuda::std::get<0>(out), a_, f_, up_, down_, ex.getStream());
+        resample_poly_impl(cuda::std::get<0>(out), a_, f_, up_, down_,
+          std::forward<Executor>(ex));
       }
 
       template <typename ShapeType, typename Executor>
@@ -137,7 +139,13 @@ namespace detail {
 
         InnerPreRun(std::forward<ShapeType>(shape), std::forward<Executor>(ex));         
 
-        detail::AllocateTempTensor(tmp_out_, std::forward<Executor>(ex), out_dims_, &ptr);
+        if constexpr (is_host_executor_v<Executor>) {
+          detail::AllocateTempTensor(tmp_out_, std::forward<Executor>(ex), out_dims_, &ptr,
+            MATX_HOST_MALLOC_MEMORY);
+        }
+        else {
+          detail::AllocateTempTensor(tmp_out_, std::forward<Executor>(ex), out_dims_, &ptr);
+        }
 
         prerun_done_ = true;
         Exec(cuda::std::make_tuple(tmp_out_), std::forward<Executor>(ex));
diff --git a/include/matx/transforms/resample_poly.h b/include/matx/transforms/resample_poly.h
index cdd254e35..1c85ffd5a 100644
--- a/include/matx/transforms/resample_poly.h
+++ b/include/matx/transforms/resample_poly.h
@@ -33,18 +33,34 @@
 #pragma once
 
 #include <cstdint>
+#include <cstddef>
 #include <cstdio>
+#include <numeric>
 #include <type_traits>
 
+#include <cuda/std/array>
+#include <cuda/std/tuple>
+
 #include "matx/core/error.h"
 #include "matx/core/nvtx.h"
 #include "matx/core/tensor.h"
+#include "matx/executors/host.h"
 #include "matx/operators/clone.h"
 #include "matx/kernels/resample_poly.cuh"
 
 namespace matx {
 namespace detail {
 
+template <typename Op, size_t RANK>
+__MATX_INLINE__ decltype(auto) ApplyOpWithIdx(Op &&op,
+  const cuda::std::array<index_t, RANK> &idx)
+{
+  return cuda::std::apply(
+    [&op](auto... indices) -> decltype(auto) {
+      return op(indices...);
+    }, idx);
+}
+
 template <typename OutType, typename InType, typename FilterType>
 inline void matxResamplePoly1DInternal(OutType &o, const InType &i,
                                      const FilterType &filter, index_t up, index_t down,
@@ -193,6 +209,77 @@ inline void matxResamplePoly1DInternal(OutType &o, const InType &i,
 #endif
 }
 
+template <typename OutType, typename InType, typename FilterType, ThreadsMode MODE>
+inline void matxResamplePoly1DInternal(OutType &o, const InType &i,
+                                     const FilterType &filter, index_t up, index_t down,
+                                     [[maybe_unused]] const HostExecutor<MODE> &exec)
+{
+  using filter_t = typename FilterType::value_type;
+  using filter_inner_t = typename inner_op_type_t<filter_t>::type;
+  using output_t = typename OutType::value_type;
+
+  constexpr int RANK = InType::Rank();
+  const index_t output_len = o.Size(RANK - 1);
+  const index_t input_len = i.Size(RANK - 1);
+  index_t filter_len = filter.Size(FilterType::Rank() - 1);
+  const bool is_even_filter = (filter_len % 2) == 0;
+
+  if (is_even_filter) {
+    filter_len++;
+  }
+
+  const index_t filter_len_half = filter_len / 2;
+  const index_t filter_central_tap = (filter_len - 1) / 2;
+  const index_t max_input_ind = input_len - 1;
+  const index_t batch_count = TotalSize(o) / output_len;
+  const filter_t scale = static_cast<filter_t>(static_cast<filter_inner_t>(up));
+
+  auto run_batch = [&](index_t batch_idx) {
+    auto input_idx = BlockToIdx(o, batch_idx, 1);
+    auto output_idx = input_idx;
+
+    for (index_t out_ind = 0; out_ind < output_len; out_ind++) {
+      const index_t up_ind = out_ind * down;
+      const index_t up_start = (up_ind > filter_len_half) ?
+        up_ind - filter_len_half : 0;
+      const index_t up_end = cuda::std::min(max_input_ind * up,
+        up_ind + filter_len_half);
+      const index_t x_start = (up_start + up - 1) / up;
+      const index_t x_end = up_end / up;
+      index_t h_ind = filter_central_tap + (up_ind - up * x_start);
+
+      output_t accum {};
+      for (index_t in_ind = x_start; in_ind <= x_end; in_ind++) {
+        if (!is_even_filter || h_ind > 0) {
+          input_idx[RANK - 1] = in_ind;
+          const auto in_val = ApplyOpWithIdx(i, input_idx);
+          const index_t filter_ind = is_even_filter ? h_ind - 1 : h_ind;
+          accum += in_val * filter(filter_ind);
+        }
+        h_ind -= up;
+      }
+
+      output_idx[RANK - 1] = out_ind;
+      ApplyOpWithIdx(o, output_idx) = accum * scale;
+    }
+  };
+
+#ifdef MATX_EN_OMP
+  if (exec.GetNumThreads() > 1) {
+    #pragma omp parallel for num_threads(exec.GetNumThreads())
+    for (index_t batch_idx = 0; batch_idx < batch_count; batch_idx++) {
+      run_batch(batch_idx);
+    }
+  }
+  else
+#endif
+  {
+    for (index_t batch_idx = 0; batch_idx < batch_count; batch_idx++) {
+      run_batch(batch_idx);
+    }
+  }
+}
+
 } // end namespace detail
 
 
@@ -207,11 +294,12 @@ inline void matxResamplePoly1DInternal(OutType &o, const InType &i,
  * @param f Filter operator
  * @param up Factor by which to upsample
  * @param down Factor by which to downsample
- * @param stream CUDA stream on which to run the kernel(s)
+ * @param exec Executor on which to run the resampler
  */
-template <typename OutType, typename InType, typename FilterType>
+template <typename OutType, typename InType, typename FilterType, typename Executor>
+  requires is_executor<Executor>
 inline void resample_poly_impl(OutType &out, const InType &in, const FilterType &f,
-                   index_t up, index_t down, cudaStream_t stream = 0) {
+                   index_t up, index_t down, Executor &&exec) {
   MATX_NVTX_START("", matx::MATX_NVTX_LOG_API)
 
   constexpr int RANK = InType::Rank();
@@ -243,11 +331,26 @@ inline void resample_poly_impl(OutType &out, const InType &in, const FilterType
   // first interpretation and return a copy of the input tensor. This matches
   // the behavior of scipy.
   if (up == 1 && down == 1) {
-    (out = in).run(stream);
+    (out = in).run(exec);
     return;
   }
 
-  matxResamplePoly1DInternal(out, in, f, up, down, stream);
+  if constexpr (is_cuda_executor_v<Executor>) {
+    matxResamplePoly1DInternal(out, in, f, up, down, exec.getStream());
+  }
+  else if constexpr (is_host_executor_v<Executor>) {
+    matxResamplePoly1DInternal(out, in, f, up, down, exec);
+  }
+  else {
+    static_assert(is_cuda_executor_v<Executor> || is_host_executor_v<Executor>,
+      "resample_poly_impl() only supports CUDA and host executors");
+  }
+}
+
+template <typename OutType, typename InType, typename FilterType>
+inline void resample_poly_impl(OutType &out, const InType &in, const FilterType &f,
+                   index_t up, index_t down, cudaStream_t stream = 0) {
+  resample_poly_impl(out, in, f, up, down, cudaExecutor(stream));
 }
 
 } // end namespace matx
diff --git a/test/00_transform/ResamplePoly.cu b/test/00_transform/ResamplePoly.cu
index c44bbae14..fe1d22e33 100644
--- a/test/00_transform/ResamplePoly.cu
+++ b/test/00_transform/ResamplePoly.cu
@@ -36,8 +36,108 @@
 #include "utilities.h"
 #include "gtest/gtest.h"
 
+#include <cmath>
+#include <numeric>
+#include <vector>
+
 using namespace matx;
 
+namespace {
+
+template <typename T>
+T MakeResampleTestValue(double real, double imag = 0.0)
+{
+  if constexpr (is_complex_v<T>) {
+    using inner_t = typename inner_op_type_t<T>::type;
+    return T{static_cast<inner_t>(real), static_cast<inner_t>(imag)};
+  }
+  else {
+    return static_cast<T>(real);
+  }
+}
+
+template <typename T>
+double ResampleAbsDiff(const T &a, const T &b)
+{
+  if constexpr (is_complex_v<T>) {
+    return static_cast<double>(cuda::std::abs(a - b));
+  }
+  else {
+    return std::abs(static_cast<double>(a - b));
+  }
+}
+
+template <typename T>
+std::vector<T> ReferenceResamplePoly(const std::vector<T> &in,
+                                     const std::vector<T> &filter,
+                                     index_t up, index_t down)
+{
+  const index_t g = std::gcd(up, down);
+  up /= g;
+  down /= g;
+
+  const index_t out_len =
+    static_cast<index_t>((in.size() * up + down - 1) / down);
+  std::vector<T> out(static_cast<size_t>(out_len), T{});
+
+  if (up == 1 && down == 1) {
+    return in;
+  }
+
+  const bool is_even_filter = (filter.size() % 2) == 0;
+  const index_t logical_filter_len =
+    static_cast<index_t>(filter.size()) + (is_even_filter ? 1 : 0);
+  const index_t filter_center = (logical_filter_len - 1) / 2;
+
+  for (index_t out_ind = 0; out_ind < out_len; out_ind++) {
+    T accum {};
+    const index_t up_ind = out_ind * down;
+    for (index_t in_ind = 0; in_ind < static_cast<index_t>(in.size()); in_ind++) {
+      const index_t h_ind = filter_center + up_ind - in_ind * up;
+      if (h_ind < 0 || h_ind >= logical_filter_len) {
+        continue;
+      }
+
+      if (is_even_filter && h_ind == 0) {
+        continue;
+      }
+
+      const index_t filter_ind = is_even_filter ? h_ind - 1 : h_ind;
+      accum += in[static_cast<size_t>(in_ind)] *
+        filter[static_cast<size_t>(filter_ind)];
+    }
+
+    out[static_cast<size_t>(out_ind)] =
+      accum * MakeResampleTestValue<T>(static_cast<double>(up));
+  }
+
+  return out;
+}
+
+template <typename T>
+void FillResampleInputs(std::vector<T> &in, std::vector<T> &filter)
+{
+  for (size_t i = 0; i < in.size(); i++) {
+    const double real = (static_cast<int>(i * 5 % 17) - 8) / 4.0;
+    const double imag = (static_cast<int>(i * 3 % 11) - 5) / 7.0;
+    in[i] = MakeResampleTestValue<T>(real, imag);
+  }
+
+  for (size_t i = 0; i < filter.size(); i++) {
+    const double real = (static_cast<int>(i * 7 % 13) - 6) / 9.0;
+    const double imag = (static_cast<int>(i * 2 % 9) - 4) / 8.0;
+    filter[i] = MakeResampleTestValue<T>(real, imag);
+  }
+}
+
+template <typename T>
+void AssertNearResampleValue(const T &actual, const T &expected, double thresh)
+{
+  ASSERT_NEAR(ResampleAbsDiff(actual, expected), 0.0, thresh);
+}
+
+} // namespace
+
 template <typename T>
 class ResamplePolyTest : public ::testing::Test {
   using GTestType = cuda::std::tuple_element_t<0, T>;
@@ -79,6 +179,126 @@ class ResamplePolyTestFloatTypes
 TYPED_TEST_SUITE(ResamplePolyTestNonHalfFloatTypes, MatXFloatNonHalfTypesCUDAExec);
 TYPED_TEST_SUITE(ResamplePolyTestFloatTypes, MatXFloatTypesCUDAExec);
 
+template <typename T>
+class ResamplePolyHostTest : public ::testing::Test {};
+
+using ResamplePolyHostTypes = ::testing::Types<float, double, cuda::std::complex<float>>;
+TYPED_TEST_SUITE(ResamplePolyHostTest, ResamplePolyHostTypes);
+
+TYPED_TEST(ResamplePolyHostTest, SmallSignalsMatchReference)
+{
+  MATX_ENTER_HANDLER();
+  using TestType = TypeParam;
+
+  SingleThreadedHostExecutor exec{};
+  struct {
+    index_t a_len;
+    index_t f_len;
+    index_t up;
+    index_t down;
+  } test_cases[] = {
+    {7, 5, 3, 2},
+    {8, 4, 2, 5},
+    {6, 1, 4, 1},
+    {9, 3, 1, 3},
+    {7, 4, 6, 4}
+  };
+
+  for (const auto &tc : test_cases) {
+    std::vector<TestType> input(static_cast<size_t>(tc.a_len));
+    std::vector<TestType> filter(static_cast<size_t>(tc.f_len));
+    FillResampleInputs(input, filter);
+    const auto expected = ReferenceResamplePoly(input, filter, tc.up, tc.down);
+
+    auto a = make_tensor<TestType>({tc.a_len}, MATX_HOST_MALLOC_MEMORY);
+    auto f = make_tensor<TestType>({tc.f_len}, MATX_HOST_MALLOC_MEMORY);
+    auto b = make_tensor<TestType>({static_cast<index_t>(expected.size())},
+      MATX_HOST_MALLOC_MEMORY);
+
+    for (index_t i = 0; i < tc.a_len; i++) {
+      a(i) = input[static_cast<size_t>(i)];
+    }
+    for (index_t i = 0; i < tc.f_len; i++) {
+      f(i) = filter[static_cast<size_t>(i)];
+    }
+
+    (b = resample_poly(shift<0>(shift<0>(a, 1), -1),
+                       shift<0>(shift<0>(f, 1), -1),
+                       tc.up, tc.down)).run(exec);
+
+    for (index_t i = 0; i < b.Size(0); i++) {
+      AssertNearResampleValue(b(i), expected[static_cast<size_t>(i)], 1.0e-5);
+    }
+
+    if (tc.a_len == 7 && tc.f_len == 5) {
+      (b = resample_poly(shift<0>(shift<0>(a, 1), -1),
+                         shift<0>(shift<0>(f, 1), -1),
+                         tc.up, tc.down) + MakeResampleTestValue<TestType>(0.0)).run(exec);
+
+      for (index_t i = 0; i < b.Size(0); i++) {
+        AssertNearResampleValue(b(i), expected[static_cast<size_t>(i)], 1.0e-5);
+      }
+    }
+  }
+
+  MATX_EXIT_HANDLER();
+}
+
+TYPED_TEST(ResamplePolyHostTest, BatchedSignalsMatchReference)
+{
+  MATX_ENTER_HANDLER();
+  using TestType = TypeParam;
+
+  SingleThreadedHostExecutor exec{};
+  constexpr index_t batches = 3;
+  constexpr index_t a_len = 8;
+  constexpr index_t f_len = 4;
+  constexpr index_t up = 3;
+  constexpr index_t down = 2;
+
+  std::vector<TestType> filter(static_cast<size_t>(f_len));
+  std::vector<TestType> seed(static_cast<size_t>(a_len));
+  FillResampleInputs(seed, filter);
+  const auto expected_shape = ReferenceResamplePoly(seed, filter, up, down);
+  const index_t b_len = static_cast<index_t>(expected_shape.size());
+
+  auto a = make_tensor<TestType>({batches, a_len}, MATX_HOST_MALLOC_MEMORY);
+  auto f = make_tensor<TestType>({f_len}, MATX_HOST_MALLOC_MEMORY);
+  auto b = make_tensor<TestType>({batches, b_len}, MATX_HOST_MALLOC_MEMORY);
+
+  for (index_t i = 0; i < f_len; i++) {
+    f(i) = filter[static_cast<size_t>(i)];
+  }
+
+  std::vector<std::vector<TestType>> expected;
+  for (index_t batch = 0; batch < batches; batch++) {
+    std::vector<TestType> input(static_cast<size_t>(a_len));
+    for (index_t i = 0; i < a_len; i++) {
+      const double real = static_cast<double>(batch + 1) * 0.25 +
+        (static_cast<int>(i * 5 % 17) - 8) / 5.0;
+      const double imag = static_cast<double>(batch - 1) * 0.5 +
+        (static_cast<int>(i * 3 % 11) - 5) / 6.0;
+      input[static_cast<size_t>(i)] = MakeResampleTestValue<TestType>(real, imag);
+      a(batch, i) = input[static_cast<size_t>(i)];
+    }
+
+    expected.push_back(ReferenceResamplePoly(input, filter, up, down));
+  }
+
+  (b = resample_poly(shift<1>(shift<1>(a, 2), -2),
+                     shift<0>(shift<0>(f, 1), -1),
+                     up, down)).run(exec);
+
+  for (index_t batch = 0; batch < batches; batch++) {
+    for (index_t i = 0; i < b_len; i++) {
+      AssertNearResampleValue(b(batch, i),
+        expected[static_cast<size_t>(batch)][static_cast<size_t>(i)], 1.0e-5);
+    }
+  }
+
+  MATX_EXIT_HANDLER();
+}
+
 // SimpleOddLength tests use random input and filter values and
 // odd-length filters.
 TYPED_TEST(ResamplePolyTestNonHalfFloatTypes, SimpleOddLength)