NVIDIA · cliffburdick · May 13, 2026 · May 13, 2026 · May 13, 2026
diff --git a/AGENTS.md b/AGENTS.md
@@ -59,7 +59,7 @@ clang-format -style=file -i -fallback-style=none <files>
 
 ## Architecture
 
-**Single C++/CUDA shared library** (`libdaqiri.so`) exposing a C++ API through `#include <daqiri/daqiri.h>`. The public surface is intentionally flat free-function helpers (`get_rx_burst`, `get_packet_ptr`, `set_udp_header`, …) that all operate on an opaque `BurstParams*`. Applications never touch backend types directly.
+**Single C++/CUDA shared library** (`libdaqiri.so`) exposing a C++ API through `#include <daqiri/daqiri.h>`. The public surface is intentionally flat free-function helpers (`get_rx_burst`, `get_packet_ptr`, `set_udp_header`, …) that all operate on opaque DAQIRI-owned buffers. Applications never touch backend types directly.
 
 ### Manager abstraction
 `src/manager.h` defines `daqiri::Manager` — an (almost) ABC with ~50 virtual methods covering init, RX/TX burst dequeue/enqueue, header-fill helpers, buffer free, and RDMA connection setup. Backends live in `src/managers/<name>/` (`dpdk/`, `rdma/`, `socket/`) and are selected at CMake configure time via `DAQIRI_MGR`. Each backend produces its own static library (`daqiri_dpdk`, `daqiri_rdma`, `daqiri_socket`) linked into `daqiri_common`, and each adds a `DAQIRI_MGR_<NAME>=1` compile definition (see `src/CMakeLists.txt:156-183`).

diff --git a/cmake/daqiri.pc.in b/cmake/daqiri.pc.in
@@ -8,6 +8,6 @@ Description: Data Acquisition for Integrated Real-time Instruments
 Version: @CMAKE_PROJECT_VERSION@
 @DAQIRI_PC_REQUIRES_LINE@
 @DAQIRI_PC_REQUIRES_PRIVATE_LINE@
-Libs: -L${libdir} -ldaqiri @DAQIRI_PC_LIBS@
+Libs: -L${libdir} -Wl,-rpath-link,${libdir} -ldaqiri @DAQIRI_PC_LIBS@
 Libs.private: @DAQIRI_PC_LIBS_PRIVATE@
 Cflags: -I${includedir}@DAQIRI_PC_EXTRA_CFLAGS@
diff --git a/docs/api-guide.md b/docs/api-guide.md
@@ -190,21 +190,23 @@ daqiri::free_all_segment_packets(burst, seg);
 daqiri::free_rx_burst(burst);
 ```
 
-### GPU Packet Aggregation
+### GPU Packet Processing
 
-When using batched GPU mode, packets arrive in scattered buffers — each at an arbitrary
-GPU address. For workloads that need contiguous data, DAQIRI provides a CUDA reorder
-kernel (`simple_packet_reorder` in `src/kernels.cu`) that copies scattered packets into
-a flat output buffer:
+When using batched GPU mode, packets arrive in CUDA-addressable buffers — each at an
+arbitrary GPU address. Launch your own CUDA work directly on the packet pointers. Packet
+reordering and aggregation should be configured through `rx.reorder_configs`; see
+`raw_reorder_seq_bench.cpp` and `raw_reorder_quantize_bench.cpp` for complete examples
+that consume DAQIRI's built-in reordered bursts.
 
 ```cpp
-// Collect GPU pointers from the burst
-for (int p = 0; p < daqiri::get_num_packets(burst); p++) {
-    h_dev_ptrs[p] = daqiri::get_packet_ptr(burst, p);
+__global__ void noop_packet_kernel(void *packet) {
+    (void)packet;
 }
 
-// Reorder into a contiguous GPU buffer
-simple_packet_reorder(output_buffer, h_dev_ptrs, packet_len, num_packets);
+if (daqiri::get_num_packets(burst) > 0) {
+    void *packet = daqiri::get_packet_ptr(burst, 0);
+    noop_packet_kernel<<<1, 1, 0, stream>>>(packet);
+}
 
 // Free once the kernel completes
 daqiri::free_all_packets_and_burst_rx(burst);

diff --git a/docs/daqiri-api.html b/docs/daqiri-api.html
@@ -203,7 +203,6 @@
         <a href="#get-packet-flow-id" class="sb-link">get_packet_flow_id <span class="lb lb-fn">fn</span></a>
         <a href="#get-segment-packet-ptr" class="sb-link">get_segment_packet_ptr <span class="lb lb-fn">fn</span></a>
         <a href="#free-buffers" class="sb-link">Free Buffers <span class="lb lb-fn">fn</span></a>
-        <a href="#gpu-reorder" class="sb-link">GPU Reorder Kernel <span class="lb lb-fn">fn</span></a>
       </div>
 
       <div class="sb-group">
@@ -396,27 +395,6 @@ <h4 style="color:var(--text-pri);margin-bottom:.5rem;">Segments</h4>
           </div>
         </div>
 
-        <div class="method-card" id="gpu-reorder">
-          <div class="method-hdr" onclick="toggleMethod(this)">
-            <span class="m-tag mt-fn">fn</span>
-            <span class="m-name">simple_packet_reorder</span>
-            <span class="m-params">(output_buf, dev_ptrs, pkt_len, num_pkts)</span>
-            <span class="mt-ret">→ void (CUDA)</span>
-            <span class="m-expand">▼</span>
-          </div>
-          <div class="method-body">
-            <p class="m-desc">CUDA kernel (defined in <code>src/kernels.cu</code>) that copies scattered per-packet GPU pointers into a single flat contiguous GPU buffer. Useful in batched GPU mode where packets arrive at non-contiguous addresses.</p>
-            <pre><span class="cm">// Collect device pointers from the burst</span>
-<span class="kw">for</span> (<span class="kw">int</span> p = <span class="nm">0</span>; p &lt; n; p++)
-    h_dev_ptrs[p] = daqiri::<span class="fn">get_packet_ptr</span>(burst, p);
-
-<span class="cm">// Reorder into a contiguous output buffer (async)</span>
-<span class="fn">simple_packet_reorder</span>(output_buf, h_dev_ptrs, packet_len, n);
-
-<span class="cm">// Free once kernel completes</span>
-daqiri::<span class="fn">free_all_packets_and_burst_rx</span>(burst);</pre>
-          </div>
-        </div>
       </div>
 
       <hr class="divider" />

diff --git a/docs/index.html b/docs/index.html
@@ -450,25 +450,23 @@ <h2 class="section-title">Examples</h2>
         </div>
 
         <div class="example-card">
-          <div class="ex-hdr"><span class="ex-dot dot-cpp"></span><span class="ex-title">GPU Packet Reorder Kernel</span><span class="ex-lang">C++/CUDA</span></div>
-          <div class="ex-body"><pre><span class="cm">// Batched GPU: packets arrive scattered.
-// Collect device ptrs, then reorder
-// into a flat contiguous output buffer.</span>
-<span class="kw">for</span> (<span class="kw">int</span> p = <span class="nm">0</span>; p &lt; n; p++) {
-  h_dev_ptrs[p] =
-    daqiri::<span class="fn">get_packet_ptr</span>(burst, p);
+          <div class="ex-hdr"><span class="ex-dot dot-cpp"></span><span class="ex-title">GPU Packet Processing</span><span class="ex-lang">C++/CUDA</span></div>
+          <div class="ex-body"><pre><span class="cm">// Batched GPU: packets arrive in
+// CUDA-addressable buffers. Reordering
+// is configured with rx.reorder_configs.</span>
+<span class="kw">__global__</span> <span class="kw">void</span> noop_packet_kernel(<span class="kw">void</span>* pkt) {
+  (<span class="kw">void</span>)pkt;
 }
 
-<span class="cm">// CUDA kernel: src/kernels.cu</span>
-<span class="fn">simple_packet_reorder</span>(
-  output_buf,   <span class="cm">// contiguous GPU buffer</span>
-  h_dev_ptrs,
-  packet_len,
-  num_packets);
+<span class="kw">if</span> (daqiri::<span class="fn">get_num_packets</span>(burst) &gt; <span class="nm">0</span>) {
+  <span class="kw">void</span>* pkt =
+    daqiri::<span class="fn">get_packet_ptr</span>(burst, <span class="nm">0</span>);
+  noop_packet_kernel&lt;&lt;&lt;<span class="nm">1</span>, <span class="nm">1</span>, <span class="nm">0</span>, stream&gt;&gt;&gt;(pkt);
+}
 
 daqiri::<span class="fn">free_all_packets_and_burst_rx</span>(
   burst);</pre></div>
-          <div class="ex-footer"><span class="ex-desc">Scatter→gather reorder on the GPU</span><a href="https://github.com/NVIDIA/daqiri/blob/main/src/kernels.cu" class="ex-link" target="_blank">Open ↗</a></div>
+          <div class="ex-footer"><span class="ex-desc">Launch custom GPU work on packet buffers</span><a href="https://github.com/NVIDIA/daqiri/tree/main/examples" class="ex-link" target="_blank">Open ↗</a></div>
         </div>
 
         <div class="example-card">

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
@@ -139,6 +139,7 @@ set_target_properties(daqiri_common PROPERTIES
     CUDA_RESOLVE_DEVICE_SYMBOLS ON
     OUTPUT_NAME "daqiri"
     EXPORT_NAME "daqiri"
+    INSTALL_RPATH "$ORIGIN"
 )
 install(
     TARGETS daqiri_common
@@ -212,6 +213,7 @@ foreach(MGR IN LISTS DAQIRI_MGR_LIST)
     set_target_properties(${MGR_TARGET} PROPERTIES
         OUTPUT_NAME "daqiri_${MGR_LOWER}"
         EXPORT_NAME "${MGR_LOWER}"
+        INSTALL_RPATH "$ORIGIN"
     )
     install(TARGETS ${MGR_TARGET}
         EXPORT daqiriTargets

diff --git a/src/kernels.cu b/src/kernels.cu
@@ -21,49 +21,6 @@
 #include <cuda_bf16.h>
 #include <cuda_fp16.h>
 
-/**
- * @brief Simple packet reorder kernel to demonstrate reordering a batch of packets into
- *        contiguous memory
- *
- * @param out Output buffer
- * @param in Pointer to list of input packet pointers
- * @param pkt_len Length of each packet. All packets must be same length for this example
- * @param num_pkts Number of packets
- */
-__global__ void simple_packet_reorder_kernel(void* __restrict__ out,
-                                             const void* const* const __restrict__ in,
-                                             uint16_t pkt_len, uint32_t num_pkts) {
-  // Warmup
-  if (out == nullptr) return;
-
-  const int pkt_idx = blockIdx.x;
-  const int len = pkt_len;
-  const void* in_pkt = in[pkt_idx];
-
-  if (pkt_idx < num_pkts) {
-    for (int pos = threadIdx.x; pos < len / 4; pos += blockDim.x) {
-      const uint32_t* in_ptr = static_cast<const uint32_t*>(in_pkt) + pos;
-      uint32_t* out_ptr = (uint32_t*)((uint8_t*)out + pkt_idx * pkt_len) + pos;
-      *out_ptr = *in_ptr;
-    }
-  }
-}
-
-/**
- * @brief Wrapper to launch packet reorder kernel
- *
- * @param out Output buffer
- * @param in Pointer to list of input packet pointers
- * @param pkt_len Length of each packet in bytes. Must be a multiple of 4
- * @param num_pkts Number of packets
- * @param offset Offset into packet to start
- * @param stream CUDA stream
- */
-extern "C" void simple_packet_reorder(void* out, const void* const* const in, uint16_t pkt_len,
-                                      uint32_t num_pkts, cudaStream_t stream) {
-  simple_packet_reorder_kernel<<<num_pkts, 128, 0, stream>>>(out, in, pkt_len, num_pkts);
-}
-
 __device__ static inline uint32_t extract_bits_be(const uint8_t* data,
                                                   uint16_t bit_offset,
                                                   uint8_t bit_width) {

diff --git a/src/kernels.h b/src/kernels.h
@@ -16,20 +16,14 @@
  */
 
 #pragma once
-#include <stdint.h>
-#include <assert.h>
+
 #include <cuda_runtime.h>
+#include <stdint.h>
 
 #if __cplusplus
 extern "C" {
 #endif
 
-__attribute__((__visibility__("default"))) void simple_packet_reorder(void* out,
-                                                                      const void* const* const in,
-                                                                      uint16_t pkt_len,
-                                                                      uint32_t num_pkts,
-                                                                      cudaStream_t stream);
-
 __attribute__((__visibility__("default"))) void packet_reorder_copy_payload_by_sequence(
     void* out,
     const void* const* const in,