NVIDIA · RamyaGuru · May 11, 2026 · May 13, 2026 · May 13, 2026 · May 13, 2026
diff --git a/.gitignore b/.gitignore
@@ -1,5 +1,9 @@
 build*/
 site/
+bench-results/
+
+# tune_system.py default output
+pcie_schematic.png
 
 # macOS
 .DS_Store
diff --git a/AGENTS.md b/AGENTS.md
@@ -93,14 +93,16 @@ The web docs live in `docs/` and are built with [MkDocs Material](https://squidf
 - `docs/index.html` — custom HTML landing page (not generated by MkDocs, hand-maintained)
 - `docs/daqiri-api.html` — standalone HTML API reference (hand-maintained)
 - `docs/api-guide.md`, `docs/getting-started.md`, `docs/configuration.md` — core markdown docs
+- `docs/performance-dgx-spark.md` — per-platform performance report (DGX Spark; more platforms to follow)
 - `docs/tutorials/` — tutorial walkthroughs (background, system config, benchmarking, config files)
 - `docs/stylesheets/extra.css` — custom theme overrides
 
 **Keeping docs in sync with code:** before committing changes, scan for the recurring drift hotspots:
 - **Backend list** (`src/managers/*/`) — README Backends table, `docs/getting-started.md`, `docs/configuration.md`
 - **CMake options / `DAQIRI_MGR` default** (`src/CMakeLists.txt:137`) — README Quick Start, `docs/getting-started.md`, this file's Build & run section
-- **Benchmark binary or YAML names** (`examples/`) — the benchmark table above, `docs/tutorials/benchmarking_examples.md`, and the "Choosing an example config" decision tree in `docs/tutorials/configuration-walkthrough.md` (every YAML must have a leaf; CI's `scripts/check_doc_refs.py` enforces coverage)
+- **Benchmark binary or YAML names** (`examples/`) — the benchmark table above, `docs/tutorials/benchmarking_examples.md`, the "Choosing an example config" decision tree in `docs/tutorials/configuration-walkthrough.md` (every YAML must have a leaf; CI's `scripts/check_doc_refs.py` enforces coverage), and per-platform performance docs (`docs/performance-*.md`)
 - **Public API** (`src/common.h`, `src/types.h`, `src/manager.h`) — `docs/api-guide.md`, `docs/daqiri-api.html`
+- **Bench CLI flags or output format** (`examples/raw_bench_common.{h,cpp}`, `*_bench.cpp`) — per-platform performance docs' Methodology section, `examples/run_spark_bench.sh` parsing logic
 - **Doc reorganization** (any rename in `docs/`) — `docs/index.html` landing page, `mkdocs.yml` nav, README Documentation table
 
 The full mapping with rationale lives in the docs-sync agent rule. Internal-link, anchor, and nav drift is enforced by CI (`.github/workflows/docs.yml`); content drift (stale binary names, defaults) is still a manual check at commit time.

diff --git a/README.md b/README.md
@@ -81,6 +81,7 @@ Reference material for the DAQIRI codebase:
 - [Getting Started](docs/getting-started.md) — System requirements, build/install instructions, and CMake options
 - [Configuration Reference](docs/configuration.md) — Full YAML config reference for all backends
 - [API Guide](docs/api-guide.md) — BurstParams, RX/TX workflows, buffer lifecycle, status codes
+- [Performance: DGX Spark](docs/performance-dgx-spark.md) — Per-platform throughput, drop, and utilization numbers for all backends
 - [Contributing](CONTRIBUTING.md) — Contribution guidelines, coding standards, DCO sign-off
 
 ## Tutorials

diff --git a/docs/index.html b/docs/index.html
@@ -595,6 +595,12 @@ <h2 class="section-title">News</h2>
         </div>
       </div>
       <div class="pub-grid">
+        <div class="pub-card">
+          <div class="pub-venue"><span class="pub-badge">Performance</span><span class="pub-year">2026</span></div>
+          <div class="pub-title">DAQIRI Performance on DGX Spark</div>
+          <div class="pub-authors">NVIDIA — Throughput, drops, and resource utilization for DPDK GPUDirect, RoCE, and socket backends measured on a DGX Spark (GB10) workstation. First in a series of per-platform performance reports.</div>
+          <div class="pub-links"><a href="performance-dgx-spark/" class="pub-link">Read report →</a></div>
+        </div>
         <div class="pub-card">
           <div class="pub-venue"><span class="pub-badge">GitHub</span><span class="pub-year">2025</span></div>
           <div class="pub-title">DAQIRI Open-Sourced on GitHub</div>

diff --git a/docs/performance-dgx-spark.md b/docs/performance-dgx-spark.md
diff --git a/docs/stylesheets/extra.css b/docs/stylesheets/extra.css
@@ -50,3 +50,64 @@
 [data-md-color-scheme="slate"] .md-footer {
   background: #111;
 }
+
+/* ── Performance-report heatmap cells ───────────────────────────────── */
+/* Used by the payload×batch and payload×target_gbps matrices in
+   docs/performance-*.md. Threshold logic (vs. matrix-global max):
+     green  = no drops AND Gbps ≥ 90% of max
+     yellow = no drops AND Gbps ≥ 70% of max
+     red    = any drops OR Gbps < 70% of max                            */
+.md-typeset table.perf-matrix {
+  width: 100%;
+  table-layout: auto;
+  border-collapse: separate;
+  border-spacing: 5px;
+  font-size: 0.64rem;
+}
+.md-typeset table.perf-matrix th,
+.md-typeset table.perf-matrix td {
+  text-align: center;
+  vertical-align: middle;
+  font-variant-numeric: tabular-nums;
+  padding: 0.55em 0.5em;
+  border-radius: 4px;
+  white-space: nowrap;
+}
+.md-typeset table.perf-matrix td small {
+  display: block;
+  opacity: 0.75;
+  font-size: 0.85em;
+  margin-top: 0.25em;
+}
+.md-typeset table.perf-matrix td.cell-green {
+  background-color: rgba(118, 185, 0, 0.28);
+  color: inherit;
+}
+.md-typeset table.perf-matrix td.cell-yellow {
+  background-color: rgba(255, 196, 0, 0.32);
+  color: inherit;
+}
+.md-typeset table.perf-matrix td.cell-red {
+  background-color: rgba(220, 60, 60, 0.32);
+  color: inherit;
+}
+.md-typeset table.perf-matrix th {
+  background-color: rgba(255, 255, 255, 0.05);
+  font-weight: 600;
+}
+/* Compact legend chips that pair with the matrix; colors mirror td.cell-*. */
+.md-typeset .perf-legend {
+  display: flex;
+  gap: 0.75em;
+  margin: 0.5em 0 1em 0;
+  font-size: 0.85em;
+  flex-wrap: wrap;
+}
+.md-typeset .perf-legend span {
+  padding: 0.1em 0.55em;
+  border-radius: 0.25em;
+  white-space: nowrap;
+}
+.md-typeset .perf-legend .cell-green  { background-color: rgba(118, 185, 0, 0.28); }
+.md-typeset .perf-legend .cell-yellow { background-color: rgba(255, 196, 0, 0.32); }
+.md-typeset .perf-legend .cell-red    { background-color: rgba(220, 60, 60, 0.32); }
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
@@ -71,14 +71,16 @@ add_daqiri_raw_bench(daqiri_bench_raw_reorder_quantize raw_reorder_quantize_benc
 add_daqiri_raw_bench(daqiri_example_gds_write gds_write_example.cpp)
 add_daqiri_raw_bench(daqiri_example_pcap_writer pcap_writer_example.cpp)
 
-add_executable(daqiri_bench_rdma rdma_bench.cpp)
+add_executable(daqiri_bench_rdma rdma_bench.cpp raw_bench_common.cpp)
 link_daqiri_bench(daqiri_bench_rdma)
+target_link_libraries(daqiri_bench_rdma PRIVATE CUDA::cudart)
 set_target_properties(daqiri_bench_rdma PROPERTIES
   BUILD_RPATH "$ORIGIN/../src;$ORIGIN/../src/third_party/yaml-cpp"
 )
 
-add_executable(daqiri_bench_socket socket_bench.cpp)
+add_executable(daqiri_bench_socket socket_bench.cpp raw_bench_common.cpp)
 link_daqiri_bench(daqiri_bench_socket)
+target_link_libraries(daqiri_bench_socket PRIVATE CUDA::cudart)
 
 foreach(cfg IN LISTS DAQIRI_BENCH_CONFIGS)
   configure_file(${CMAKE_CURRENT_SOURCE_DIR}/${cfg} ${CMAKE_CURRENT_BINARY_DIR}/${cfg} COPYONLY)

diff --git a/examples/bench_capture_environment.sh b/examples/bench_capture_environment.sh
@@ -0,0 +1,116 @@
+#!/usr/bin/env bash
+#
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Capture host/NIC/GPU/build state for a benchmark run, so numbers are
+# reproducible across machines and over time. Writes one structured text file
+# with named sections.
+#
+# Usage: ./bench_capture_environment.sh [output_dir]
+#        Default output dir: bench-results/<UTC timestamp>/
+
+set -u
+
+OUT_DIR="${1:-bench-results/$(date -u +%Y%m%dT%H%M%SZ)}"
+mkdir -p "$OUT_DIR"
+OUT="$OUT_DIR/environment.txt"
+
+# Append one section to $OUT: a banner with the label and command line, then
+# the command's combined stdout/stderr and exit status. When the command cannot
+# be resolved, say so explicitly instead of silently omitting the section.
+run_section() {
+  local label="$1"
+  shift
+  local rule="=========================================================="
+  {
+    printf '%s\n' "$rule" "[$label]" "  cmd: $*" "$rule"
+    # Absolute (/...) and ./relative paths are attempted even if not in PATH.
+    if ! command -v "$1" >/dev/null 2>&1 && [[ "$1" != /* && "$1" != ./* ]]; then
+      echo "  (command not found in PATH: $1)"
+    else
+      "$@" 2>&1
+      echo "  (exit: $?)"
+    fi
+    echo
+  } >> "$OUT"
+}
+
+# Append a section to $OUT with the contents of every file matched by the
+# given path patterns; the banner is written even when nothing matches.
+cat_section() {
+  local label="$1"; shift
+  local rule="=========================================================="
+  {
+    printf '%s\n' "$rule" "[$label]" "  paths: $*" "$rule"
+    for pattern in "$@"; do
+      if ! compgen -G "$pattern" >/dev/null; then
+        echo "  (no match: $pattern)"
+        continue
+      fi
+      # Left unquoted on purpose so the shell expands the glob.
+      for match in $pattern; do
+        echo "----- $match -----"
+        cat "$match" 2>&1
+      done
+    done
+    echo
+  } >> "$OUT"
+}
+
+: > "$OUT"
+
+echo "DAQIRI benchmark environment capture" >> "$OUT"
+echo "Generated: $(date -u +%Y-%m-%dT%H:%M:%SZ)" >> "$OUT"
+echo "Host:      $(hostname)" >> "$OUT"
+echo "Output:    $OUT" >> "$OUT"
+echo >> "$OUT"
+
+# --- Kernel / OS ---
+run_section "uname"           uname -a
+cat_section "kernel-cmdline"  /proc/cmdline
+cat_section "os-release"      /etc/os-release
+run_section "lsb-release"     lsb_release -a
+run_section "clocksource"     cat /sys/devices/system/clocksource/clocksource0/current_clocksource
+
+# --- CPU / NUMA / IRQ ---
+run_section "numactl"         numactl --show
+run_section "lscpu"           lscpu
+cat_section "cpu-isolated"    /sys/devices/system/cpu/isolated
+run_section "cpufreq-info"    cpupower frequency-info
+cat_section "cpu-governor"    /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor
+run_section "irq-mlx5"        bash -c "grep mlx5 /proc/interrupts || true"
+
+# --- Hugepages ---
+cat_section "hugepages"       /sys/kernel/mm/hugepages/*/nr_hugepages
+run_section "free-h"          free -h
+
+# --- PCIe topology ---
+run_section "lspci-mellanox"  bash -c "lspci -vvv -d 15b3: 2>/dev/null"
+run_section "lspci-nvidia"    bash -c "lspci -vvv -d 10de: 2>/dev/null"
+
+# --- NIC: OFED / firmware / DPDK binding ---
+run_section "ofed-info"       ofed_info -s
+run_section "mlxfwmanager"    mlxfwmanager --query
+run_section "dpdk-devbind"    dpdk-devbind.py --status
+# Per-iface ethtool — daqiri-tx/rx plus any enP*/enp*/eth* interface that exists.
+for iface in daqiri-tx daqiri-rx $(ls /sys/class/net 2>/dev/null | grep -E '^(enP|enp|eth)' || true); do
+  [[ -d "/sys/class/net/$iface" ]] || continue
+  run_section "ethtool-i:$iface"  ethtool -i "$iface"
+  run_section "ethtool-g:$iface"  ethtool -g "$iface"
+  run_section "ethtool-l:$iface"  ethtool -l "$iface"
+  cat_section "iface-mtu:$iface"  "/sys/class/net/$iface/mtu"
+  cat_section "iface-mac:$iface"  "/sys/class/net/$iface/address"
+done
+
+# --- GPU ---
+run_section "nvidia-smi-q"        nvidia-smi -q
+run_section "nvidia-smi-tempclk"  nvidia-smi --query-gpu=name,driver_version,temperature.gpu,clocks.current.sm,clocks.current.memory --format=csv
+
+# --- Build state ---
+DAQIRI_DIR="$(git -C "$(dirname "$0")/.." rev-parse --show-toplevel 2>/dev/null || pwd)"
+run_section "git-rev-parse"   git -C "$DAQIRI_DIR" rev-parse HEAD
+run_section "git-status"      git -C "$DAQIRI_DIR" status --short
+run_section "git-describe"    git -C "$DAQIRI_DIR" describe --always --dirty
+
+echo "Capture complete: $OUT"
diff --git a/examples/raw_bench_common.cpp b/examples/raw_bench_common.cpp
@@ -19,10 +19,12 @@
 
 #include <arpa/inet.h>
 
+#include <algorithm>
 #include <chrono>
 #include <csignal>
 #include <cstring>
 #include <iostream>
+#include <sstream>
 #include <stdexcept>
 #include <thread>
 
@@ -97,6 +99,44 @@ int parse_run_seconds(int argc, char **argv) {
   return run_seconds;
 }
 
+// Last "--target-gbps <v>" among the flag/value pairs after argv[1]; else 0.
+double parse_target_gbps(int argc, char **argv) {
+  double gbps = 0.0;
+  for (int idx = 2; idx + 1 < argc; idx += 2) {
+    const bool hit = std::strcmp(argv[idx], "--target-gbps") == 0;
+    gbps = hit ? std::stod(argv[idx + 1]) : gbps;
+  }
+  return gbps;
+}
+
+// Non-positive (or NaN) targets disable pacing; t0_ anchors the send schedule.
+TokenBucketPacer::TokenBucketPacer(double target_gbps)
+    : target_bps_{target_gbps > 0.0 ? target_gbps * 1e9 : 0.0},
+      t0_{std::chrono::steady_clock::now()} {}
+
+// Accounts `bytes` as sent, then blocks until t0_ + (total bits / target rate)
+// or until `stop` becomes true. Sleeps in <= 10 ms slices so a stop request
+// (--seconds expiry or Ctrl-C) is noticed promptly without shortening the wait.
+void TokenBucketPacer::wait_for_bytes(size_t bytes, std::atomic<bool> &stop) {
+  using Clock = std::chrono::steady_clock;
+  if (target_bps_ <= 0.0) {
+    return;
+  }
+  total_bytes_ += bytes;
+  const std::chrono::duration<double> offset{(total_bytes_ * 8.0) /
+                                             target_bps_};
+  const auto deadline =
+      t0_ + std::chrono::duration_cast<Clock::duration>(offset);
+  constexpr Clock::duration kMaxNap = std::chrono::milliseconds(10);
+  while (!stop.load()) {
+    const auto left = deadline - Clock::now();
+    if (left <= Clock::duration::zero()) {
+      return;
+    }
+    std::this_thread::sleep_for(std::min(left, kMaxNap));
+  }
+}
+
 bool has_bench_rx(const YAML::Node &root) {
   return root["bench_rx"] && root["bench_rx"]["interface_name"];
 }
@@ -287,6 +327,7 @@ void rx_count_worker(const RawBenchRxConfig &cfg, std::atomic<bool> &stop) {
   uint64_t pkts = 0;
   uint64_t bytes = 0;
   uint64_t bursts = 0;
+  const auto t0 = std::chrono::steady_clock::now();
   while (!stop.load()) {
     const auto num_rx_queues =
         static_cast<int>(daqiri::get_num_rx_queues(port_id));
@@ -307,9 +348,17 @@ void rx_count_worker(const RawBenchRxConfig &cfg, std::atomic<bool> &stop) {
       std::this_thread::sleep_for(std::chrono::microseconds(100));
     }
   }
-
-  std::cout << "RX complete: packets=" << pkts << " bytes=" << bytes
-            << " bursts=" << bursts << "\n";
+  const double secs =
+      std::chrono::duration<double>(std::chrono::steady_clock::now() - t0)
+          .count();
+
+  // Build the line in a stringstream so the print to stdout is a single
+  // write(). RX and TX workers race at end-of-run and naive `cout <<` can
+  // interleave their output (corrupting downstream parsers).
+  std::ostringstream oss;
+  oss << "RX complete: packets=" << pkts << " bytes=" << bytes
+      << " bursts=" << bursts << " seconds=" << secs << "\n";
+  std::cout << oss.str() << std::flush;
 }
 
 } // namespace daqiri::bench
diff --git a/examples/raw_bench_common.h b/examples/raw_bench_common.h
@@ -21,13 +21,42 @@
 #include <yaml-cpp/yaml.h>
 
 #include <atomic>
+#include <chrono>
 #include <cstddef>
 #include <cstdint>
 #include <string>
 #include <vector>
 
 namespace daqiri::bench {
 
+// Software pacer used by the bench TX workers (see --target-gbps). When
+// target_gbps <= 0 wait_for_bytes() returns immediately, so an unset flag adds
+// no pacing overhead. The schedule is cumulative from construction time, so
+// time spent idle before or between bursts is later made up at full speed.
+// Accuracy: ~5% at high rates due to Linux nanosleep granularity and scheduler
+// jitter. Acceptable for drop-curve sweeps; tighter pacing would require
+// hardware TX timestamping (DAQIRI's accurate_send YAML flag), deferred.
+class TokenBucketPacer {
+public:
+  TokenBucketPacer() = default;  // pacing disabled (target_bps_ stays 0)
+  explicit TokenBucketPacer(double target_gbps);
+
+  // Call after each TX burst. Sleeps in short slices until the pacer's notion
+  // of "time the configured target rate would have taken to send the
+  // accumulated bytes" catches up, OR `stop` flips true. Slicing keeps the
+  // bench responsive to --seconds expiry / Ctrl-C without truncating the total
+  // sleep (which would silently break pacing for low target rates).
+  void wait_for_bytes(size_t bytes, std::atomic<bool> &stop);
+
+  bool enabled() const { return target_bps_ > 0.0; }
+  double target_gbps() const { return target_bps_ / 1e9; }
+
+private:
+  double target_bps_ = 0.0;  // 0 means disabled
+  uint64_t total_bytes_ = 0;  // bytes passed to wait_for_bytes() so far
+  std::chrono::steady_clock::time_point t0_;  // schedule origin
+};
+
 struct RawBenchTxConfig {
   std::string interface_name = "tx_port";
   uint32_t batch_size = 1024;
@@ -68,6 +97,7 @@ class PinnedHostBuffer {
 };
 
 int parse_run_seconds(int argc, char **argv);
+double parse_target_gbps(int argc, char **argv);
 bool has_bench_rx(const YAML::Node &root);
 bool has_bench_tx(const YAML::Node &root);
 RawBenchRxConfig parse_rx(const YAML::Node &root);