diff --git a/AGENTS.md b/AGENTS.md index 3d0bced..2af5983 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -59,7 +59,7 @@ clang-format -style=file -i -fallback-style=none ## Architecture -**Single C++/CUDA shared library** (`libdaqiri.so`) exposing a C++ API through `#include `. The public surface is intentionally flat free-function helpers (`get_rx_burst`, `get_packet_ptr`, `set_udp_header`, …) that all operate on an opaque `BurstParams*`. Applications never touch backend types directly. +**Single C++/CUDA shared library** (`libdaqiri.so`) exposing a C++ API through `#include `. The public surface is intentionally flat free-function helpers (`get_rx_burst`, `get_packet_ptr`, `set_udp_header`, …) that all operate on opaque DAQIRI-owned buffers. Applications never touch backend types directly. ### Manager abstraction `src/manager.h` defines `daqiri::Manager` — an (almost) ABC with ~50 virtual methods covering init, RX/TX burst dequeue/enqueue, header-fill helpers, buffer free, and RDMA connection setup. Backends live in `src/managers//` (`dpdk/`, `rdma/`, `socket/`) and are selected at CMake configure time via `DAQIRI_MGR`. Each backend produces its own static library (`daqiri_dpdk`, `daqiri_rdma`, `daqiri_socket`) linked into `daqiri_common`, and each adds a `DAQIRI_MGR_=1` compile definition (see `src/CMakeLists.txt:156-183`). diff --git a/cmake/daqiri.pc.in b/cmake/daqiri.pc.in index 258e424..d4312da 100644 --- a/cmake/daqiri.pc.in +++ b/cmake/daqiri.pc.in @@ -8,6 +8,6 @@ Description: Data Acquisition for Integrated Real-time Instruments Version: @CMAKE_PROJECT_VERSION@ @DAQIRI_PC_REQUIRES_LINE@ @DAQIRI_PC_REQUIRES_PRIVATE_LINE@ -Libs: -L${libdir} -ldaqiri @DAQIRI_PC_LIBS@ +Libs: -L${libdir} -Wl,-rpath-link,${libdir} -ldaqiri @DAQIRI_PC_LIBS@ Libs.private: @DAQIRI_PC_LIBS_PRIVATE@ Cflags: -I${includedir}@DAQIRI_PC_EXTRA_CFLAGS@ diff --git a/docs/api-guide.md b/docs/api-guide.md index 2dc2c0b..67b4477 100644 --- a/docs/api-guide.md +++ b/docs/api-guide.md @@ -190,21 +190,23 @@ daqiri::free_all_segment_packets(burst, seg); daqiri::free_rx_burst(burst); ``` -### GPU Packet Aggregation +### GPU Packet Processing -When using batched GPU mode, packets arrive in scattered buffers — each at an arbitrary -GPU address. For workloads that need contiguous data, DAQIRI provides a CUDA reorder -kernel (`simple_packet_reorder` in `src/kernels.cu`) that copies scattered packets into -a flat output buffer: +When using batched GPU mode, packets arrive in CUDA-addressable buffers — each at an +arbitrary GPU address. Launch your own CUDA work directly on the packet pointers. Packet +reordering and aggregation should be configured through `rx.reorder_configs`; see +`raw_reorder_seq_bench.cpp` and `raw_reorder_quantize_bench.cpp` for complete examples +that consume DAQIRI's built-in reordered bursts. ```cpp -// Collect GPU pointers from the burst -for (int p = 0; p < daqiri::get_num_packets(burst); p++) { - h_dev_ptrs[p] = daqiri::get_packet_ptr(burst, p); +__global__ void noop_packet_kernel(void *packet) { + (void)packet; } -// Reorder into a contiguous GPU buffer -simple_packet_reorder(output_buffer, h_dev_ptrs, packet_len, num_packets); +if (daqiri::get_num_packets(burst) > 0) { + void *packet = daqiri::get_packet_ptr(burst, 0); + noop_packet_kernel<<<1, 1, 0, stream>>>(packet); +} // Free once the kernel completes daqiri::free_all_packets_and_burst_rx(burst); diff --git a/docs/daqiri-api.html b/docs/daqiri-api.html index eff48e9..68bb3a6 100644 --- a/docs/daqiri-api.html +++ b/docs/daqiri-api.html @@ -203,7 +203,6 @@ get_packet_flow_id fn get_segment_packet_ptr fn Free Buffers fn - GPU Reorder Kernel fn
@@ -396,27 +395,6 @@

Segments

-
-
- fn - simple_packet_reorder - (output_buf, dev_ptrs, pkt_len, num_pkts) - → void (CUDA) - -
-
-

CUDA kernel (defined in src/kernels.cu) that copies scattered per-packet GPU pointers into a single flat contiguous GPU buffer. Useful in batched GPU mode where packets arrive at non-contiguous addresses.

-
// Collect device pointers from the burst
-for (int p = 0; p < n; p++)
-    h_dev_ptrs[p] = daqiri::get_packet_ptr(burst, p);
-
-// Reorder into a contiguous output buffer (async)
-simple_packet_reorder(output_buf, h_dev_ptrs, packet_len, n);
-
-// Free once kernel completes
-daqiri::free_all_packets_and_burst_rx(burst);
-
-

diff --git a/docs/index.html b/docs/index.html index dcfda6e..d01247f 100644 --- a/docs/index.html +++ b/docs/index.html @@ -450,25 +450,23 @@

Examples

-
GPU Packet Reorder KernelC++/CUDA
-
// Batched GPU: packets arrive scattered.
-// Collect device ptrs, then reorder
-// into a flat contiguous output buffer.
-for (int p = 0; p < n; p++) {
-  h_dev_ptrs[p] =
-    daqiri::get_packet_ptr(burst, p);
+          
GPU Packet ProcessingC++/CUDA
+
// Batched GPU: packets arrive in
+// CUDA-addressable buffers. Reordering
+// is configured with rx.reorder_configs.
+__global__ void noop_packet_kernel(void* pkt) {
+  (void)pkt;
 }
 
-// CUDA kernel: src/kernels.cu
-simple_packet_reorder(
-  output_buf,   // contiguous GPU buffer
-  h_dev_ptrs,
-  packet_len,
-  num_packets);
+if (daqiri::get_num_packets(burst) > 0) {
+  void* pkt =
+    daqiri::get_packet_ptr(burst, 0);
+  noop_packet_kernel<<<1, 1, 0, stream>>>(pkt);
+}
 
 daqiri::free_all_packets_and_burst_rx(
   burst);
- +
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 53aa5e4..67578b3 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -139,6 +139,7 @@ set_target_properties(daqiri_common PROPERTIES CUDA_RESOLVE_DEVICE_SYMBOLS ON OUTPUT_NAME "daqiri" EXPORT_NAME "daqiri" + INSTALL_RPATH "$ORIGIN" ) install( TARGETS daqiri_common @@ -212,6 +213,7 @@ foreach(MGR IN LISTS DAQIRI_MGR_LIST) set_target_properties(${MGR_TARGET} PROPERTIES OUTPUT_NAME "daqiri_${MGR_LOWER}" EXPORT_NAME "${MGR_LOWER}" + INSTALL_RPATH "$ORIGIN" ) install(TARGETS ${MGR_TARGET} EXPORT daqiriTargets diff --git a/src/kernels.cu b/src/kernels.cu index 7ca4024..d55046c 100644 --- a/src/kernels.cu +++ b/src/kernels.cu @@ -21,49 +21,6 @@ #include #include -/** - * @brief Simple packet reorder kernel to demonstrate reordering a batch of packets into - * contiguous memory - * - * @param out Output buffer - * @param in Pointer to list of input packet pointers - * @param pkt_len Length of each packet. All packets must be same length for this example - * @param num_pkts Number of packets - */ -__global__ void simple_packet_reorder_kernel(void* __restrict__ out, - const void* const* const __restrict__ in, - uint16_t pkt_len, uint32_t num_pkts) { - // Warmup - if (out == nullptr) return; - - const int pkt_idx = blockIdx.x; - const int len = pkt_len; - const void* in_pkt = in[pkt_idx]; - - if (pkt_idx < num_pkts) { - for (int pos = threadIdx.x; pos < len / 4; pos += blockDim.x) { - const uint32_t* in_ptr = static_cast(in_pkt) + pos; - uint32_t* out_ptr = (uint32_t*)((uint8_t*)out + pkt_idx * pkt_len) + pos; - *out_ptr = *in_ptr; - } - } -} - -/** - * @brief Wrapper to launch packet reorder kernel - * - * @param out Output buffer - * @param in Pointer to list of input packet pointers - * @param pkt_len Length of each packet in bytes. Must be a multiple of 4 - * @param num_pkts Number of packets - * @param offset Offset into packet to start - * @param stream CUDA stream - */ -extern "C" void simple_packet_reorder(void* out, const void* const* const in, uint16_t pkt_len, - uint32_t num_pkts, cudaStream_t stream) { - simple_packet_reorder_kernel<<>>(out, in, pkt_len, num_pkts); -} - __device__ static inline uint32_t extract_bits_be(const uint8_t* data, uint16_t bit_offset, uint8_t bit_width) { diff --git a/src/kernels.h b/src/kernels.h index 8ac8518..7989bff 100644 --- a/src/kernels.h +++ b/src/kernels.h @@ -16,20 +16,14 @@ */ #pragma once -#include -#include + #include +#include #if __cplusplus extern "C" { #endif -__attribute__((__visibility__("default"))) void simple_packet_reorder(void* out, - const void* const* const in, - uint16_t pkt_len, - uint32_t num_pkts, - cudaStream_t stream); - __attribute__((__visibility__("default"))) void packet_reorder_copy_payload_by_sequence( void* out, const void* const* const in,