Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion AGENTS.md
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ clang-format -style=file -i -fallback-style=none <files>

## Architecture

**Single C++/CUDA shared library** (`libdaqiri.so`) exposing a C++ API through `#include <daqiri/daqiri.h>`. The public surface is intentionally flat free-function helpers (`get_rx_burst`, `get_packet_ptr`, `set_udp_header`, …) that all operate on an opaque `BurstParams*`. Applications never touch backend types directly.
**Single C++/CUDA shared library** (`libdaqiri.so`) exposing a C++ API through `#include <daqiri/daqiri.h>`. The public surface is intentionally flat free-function helpers (`get_rx_burst`, `get_packet_ptr`, `set_udp_header`, …) that all operate on opaque DAQIRI-owned buffers. Applications never touch backend types directly.

### Manager abstraction
`src/manager.h` defines `daqiri::Manager` — an (almost) ABC with ~50 virtual methods covering init, RX/TX burst dequeue/enqueue, header-fill helpers, buffer free, and RDMA connection setup. Backends live in `src/managers/<name>/` (`dpdk/`, `rdma/`, `socket/`) and are selected at CMake configure time via `DAQIRI_MGR`. Each backend produces its own static library (`daqiri_dpdk`, `daqiri_rdma`, `daqiri_socket`) linked into `daqiri_common`, and each adds a `DAQIRI_MGR_<NAME>=1` compile definition (see `src/CMakeLists.txt:156-183`).
Expand Down
2 changes: 1 addition & 1 deletion cmake/daqiri.pc.in
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,6 @@ Description: Data Acquisition for Integrated Real-time Instruments
Version: @CMAKE_PROJECT_VERSION@
@DAQIRI_PC_REQUIRES_LINE@
@DAQIRI_PC_REQUIRES_PRIVATE_LINE@
Libs: -L${libdir} -ldaqiri @DAQIRI_PC_LIBS@
Libs: -L${libdir} -Wl,-rpath-link,${libdir} -ldaqiri @DAQIRI_PC_LIBS@
Libs.private: @DAQIRI_PC_LIBS_PRIVATE@
Cflags: -I${includedir}@DAQIRI_PC_EXTRA_CFLAGS@
22 changes: 12 additions & 10 deletions docs/api-guide.md
Original file line number Diff line number Diff line change
Expand Up @@ -190,21 +190,23 @@ daqiri::free_all_segment_packets(burst, seg);
daqiri::free_rx_burst(burst);
```

### GPU Packet Aggregation
### GPU Packet Processing

When using batched GPU mode, packets arrive in scattered buffers — each at an arbitrary
GPU address. For workloads that need contiguous data, DAQIRI provides a CUDA reorder
kernel (`simple_packet_reorder` in `src/kernels.cu`) that copies scattered packets into
a flat output buffer:
When using batched GPU mode, packets arrive in CUDA-addressable buffers — each at an
arbitrary GPU address. Launch your own CUDA work directly on the packet pointers. Packet
reordering and aggregation should be configured through `rx.reorder_configs`; see
`raw_reorder_seq_bench.cpp` and `raw_reorder_quantize_bench.cpp` for complete examples
that consume DAQIRI's built-in reordered bursts.

```cpp
// Collect GPU pointers from the burst
for (int p = 0; p < daqiri::get_num_packets(burst); p++) {
h_dev_ptrs[p] = daqiri::get_packet_ptr(burst, p);
__global__ void noop_packet_kernel(void *packet) {
(void)packet;
}

// Reorder into a contiguous GPU buffer
simple_packet_reorder(output_buffer, h_dev_ptrs, packet_len, num_packets);
if (daqiri::get_num_packets(burst) > 0) {
void *packet = daqiri::get_packet_ptr(burst, 0);
noop_packet_kernel<<<1, 1, 0, stream>>>(packet);
}

// Free once the kernel completes
daqiri::free_all_packets_and_burst_rx(burst);
Expand Down
22 changes: 0 additions & 22 deletions docs/daqiri-api.html
Original file line number Diff line number Diff line change
Expand Up @@ -203,7 +203,6 @@
<a href="#get-packet-flow-id" class="sb-link">get_packet_flow_id <span class="lb lb-fn">fn</span></a>
<a href="#get-segment-packet-ptr" class="sb-link">get_segment_packet_ptr <span class="lb lb-fn">fn</span></a>
<a href="#free-buffers" class="sb-link">Free Buffers <span class="lb lb-fn">fn</span></a>
<a href="#gpu-reorder" class="sb-link">GPU Reorder Kernel <span class="lb lb-fn">fn</span></a>
</div>

<div class="sb-group">
Expand Down Expand Up @@ -396,27 +395,6 @@ <h4 style="color:var(--text-pri);margin-bottom:.5rem;">Segments</h4>
</div>
</div>

<div class="method-card" id="gpu-reorder">
<div class="method-hdr" onclick="toggleMethod(this)">
<span class="m-tag mt-fn">fn</span>
<span class="m-name">simple_packet_reorder</span>
<span class="m-params">(output_buf, dev_ptrs, pkt_len, num_pkts)</span>
<span class="mt-ret">→ void (CUDA)</span>
<span class="m-expand">▼</span>
</div>
<div class="method-body">
<p class="m-desc">CUDA kernel (defined in <code>src/kernels.cu</code>) that copies scattered per-packet GPU pointers into a single flat contiguous GPU buffer. Useful in batched GPU mode where packets arrive at non-contiguous addresses.</p>
<pre><span class="cm">// Collect device pointers from the burst</span>
<span class="kw">for</span> (<span class="kw">int</span> p = <span class="nm">0</span>; p &lt; n; p++)
h_dev_ptrs[p] = daqiri::<span class="fn">get_packet_ptr</span>(burst, p);

<span class="cm">// Reorder into a contiguous output buffer (async)</span>
<span class="fn">simple_packet_reorder</span>(output_buf, h_dev_ptrs, packet_len, n);

<span class="cm">// Free once kernel completes</span>
daqiri::<span class="fn">free_all_packets_and_burst_rx</span>(burst);</pre>
</div>
</div>
</div>

<hr class="divider" />
Expand Down
26 changes: 12 additions & 14 deletions docs/index.html
Original file line number Diff line number Diff line change
Expand Up @@ -450,25 +450,23 @@ <h2 class="section-title">Examples</h2>
</div>

<div class="example-card">
<div class="ex-hdr"><span class="ex-dot dot-cpp"></span><span class="ex-title">GPU Packet Reorder Kernel</span><span class="ex-lang">C++/CUDA</span></div>
<div class="ex-body"><pre><span class="cm">// Batched GPU: packets arrive scattered.
// Collect device ptrs, then reorder
// into a flat contiguous output buffer.</span>
<span class="kw">for</span> (<span class="kw">int</span> p = <span class="nm">0</span>; p &lt; n; p++) {
h_dev_ptrs[p] =
daqiri::<span class="fn">get_packet_ptr</span>(burst, p);
<div class="ex-hdr"><span class="ex-dot dot-cpp"></span><span class="ex-title">GPU Packet Processing</span><span class="ex-lang">C++/CUDA</span></div>
<div class="ex-body"><pre><span class="cm">// Batched GPU: packets arrive in
// CUDA-addressable buffers. Reordering
// is configured with rx.reorder_configs.</span>
<span class="kw">__global__</span> <span class="kw">void</span> noop_packet_kernel(<span class="kw">void</span>* pkt) {
(<span class="kw">void</span>)pkt;
}

<span class="cm">// CUDA kernel: src/kernels.cu</span>
<span class="fn">simple_packet_reorder</span>(
output_buf, <span class="cm">// contiguous GPU buffer</span>
h_dev_ptrs,
packet_len,
num_packets);
<span class="kw">if</span> (daqiri::<span class="fn">get_num_packets</span>(burst) &gt; <span class="nm">0</span>) {
<span class="kw">void</span>* pkt =
daqiri::<span class="fn">get_packet_ptr</span>(burst, <span class="nm">0</span>);
noop_packet_kernel&lt;&lt;&lt;<span class="nm">1</span>, <span class="nm">1</span>, <span class="nm">0</span>, stream&gt;&gt;&gt;(pkt);
}

daqiri::<span class="fn">free_all_packets_and_burst_rx</span>(
burst);</pre></div>
<div class="ex-footer"><span class="ex-desc">Scatter→gather reorder on the GPU</span><a href="https://github.com/NVIDIA/daqiri/blob/main/src/kernels.cu" class="ex-link" target="_blank">Open ↗</a></div>
<div class="ex-footer"><span class="ex-desc">Launch custom GPU work on packet buffers</span><a href="https://github.com/NVIDIA/daqiri/tree/main/examples" class="ex-link" target="_blank">Open ↗</a></div>
</div>

<div class="example-card">
Expand Down
2 changes: 2 additions & 0 deletions src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,7 @@ set_target_properties(daqiri_common PROPERTIES
CUDA_RESOLVE_DEVICE_SYMBOLS ON
OUTPUT_NAME "daqiri"
EXPORT_NAME "daqiri"
INSTALL_RPATH "$ORIGIN"
)
install(
TARGETS daqiri_common
Expand Down Expand Up @@ -212,6 +213,7 @@ foreach(MGR IN LISTS DAQIRI_MGR_LIST)
set_target_properties(${MGR_TARGET} PROPERTIES
OUTPUT_NAME "daqiri_${MGR_LOWER}"
EXPORT_NAME "${MGR_LOWER}"
INSTALL_RPATH "$ORIGIN"
)
install(TARGETS ${MGR_TARGET}
EXPORT daqiriTargets
Expand Down
43 changes: 0 additions & 43 deletions src/kernels.cu
Original file line number Diff line number Diff line change
Expand Up @@ -21,49 +21,6 @@
#include <cuda_bf16.h>
#include <cuda_fp16.h>

/**
* @brief Simple packet reorder kernel to demonstrate reordering a batch of packets into
* contiguous memory
*
* @param out Output buffer
* @param in Pointer to list of input packet pointers
* @param pkt_len Length of each packet. All packets must be same length for this example
* @param num_pkts Number of packets
*/
__global__ void simple_packet_reorder_kernel(void* __restrict__ out,
const void* const* const __restrict__ in,
uint16_t pkt_len, uint32_t num_pkts) {
// Warmup
if (out == nullptr) return;

const int pkt_idx = blockIdx.x;
const int len = pkt_len;
const void* in_pkt = in[pkt_idx];

if (pkt_idx < num_pkts) {
for (int pos = threadIdx.x; pos < len / 4; pos += blockDim.x) {
const uint32_t* in_ptr = static_cast<const uint32_t*>(in_pkt) + pos;
uint32_t* out_ptr = (uint32_t*)((uint8_t*)out + pkt_idx * pkt_len) + pos;
*out_ptr = *in_ptr;
}
}
}

/**
* @brief Wrapper to launch packet reorder kernel
*
* @param out Output buffer
* @param in Pointer to list of input packet pointers
* @param pkt_len Length of each packet in bytes. Must be a multiple of 4
* @param num_pkts Number of packets
* @param offset Offset into packet to start
* @param stream CUDA stream
*/
extern "C" void simple_packet_reorder(void* out, const void* const* const in, uint16_t pkt_len,
uint32_t num_pkts, cudaStream_t stream) {
simple_packet_reorder_kernel<<<num_pkts, 128, 0, stream>>>(out, in, pkt_len, num_pkts);
}

__device__ static inline uint32_t extract_bits_be(const uint8_t* data,
uint16_t bit_offset,
uint8_t bit_width) {
Expand Down
10 changes: 2 additions & 8 deletions src/kernels.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,20 +16,14 @@
*/

#pragma once
#include <stdint.h>
#include <assert.h>

#include <cuda_runtime.h>
#include <stdint.h>

#if __cplusplus
extern "C" {
#endif

__attribute__((__visibility__("default"))) void simple_packet_reorder(void* out,
const void* const* const in,
uint16_t pkt_len,
uint32_t num_pkts,
cudaStream_t stream);

__attribute__((__visibility__("default"))) void packet_reorder_copy_payload_by_sequence(
void* out,
const void* const* const in,
Expand Down
Loading