diff --git a/docs/Doxyfile.in b/docs/Doxyfile.in
index 29b697c8..4cd44670 100644
--- a/docs/Doxyfile.in
+++ b/docs/Doxyfile.in
@@ -27,7 +27,8 @@ EXTRACT_STATIC         = YES
 ENABLE_PREPROCESSING   = YES
 MACRO_EXPANSION        = YES
 EXPAND_ONLY_PREDEF     = YES
-PREDEFINED             = "__qpu__="
+PREDEFINED             = "__qpu__=" \
+                         "__attribute__(x)="
 
 #---------------------------------------------------------------------------
 # Configuration options related to the HTML output
diff --git a/docs/sphinx/api/qec/cpp_api.rst b/docs/sphinx/api/qec/cpp_api.rst
index e93f4b4b..0b06ffda 100644
--- a/docs/sphinx/api/qec/cpp_api.rst
+++ b/docs/sphinx/api/qec/cpp_api.rst
@@ -56,16 +56,27 @@ Sliding Window Decoder
 
 .. include:: sliding_window_api.rst
 
+Real-Time Decoding
+==================
+
+.. include:: cpp_realtime_decoding_api.rst
+
+.. _parity_check_matrix_utilities:
+
 Parity Check Matrix Utilities
 =============================
 
 .. doxygenfunction:: cudaq::qec::dense_to_sparse(const cudaqx::tensor<uint8_t> &)
 .. doxygenfunction:: cudaq::qec::generate_random_pcm(std::size_t, std::size_t, std::size_t, int, std::mt19937_64 &&);
+.. doxygenfunction:: cudaq::qec::generate_timelike_sparse_detector_matrix(std::uint32_t num_syndromes_per_round, std::uint32_t num_rounds, bool include_first_round = false)
+.. doxygenfunction:: cudaq::qec::generate_timelike_sparse_detector_matrix(std::uint32_t num_syndromes_per_round, std::uint32_t num_rounds, std::vector<std::int64_t> first_round_matrix)
 .. doxygenfunction:: cudaq::qec::get_pcm_for_rounds(const cudaqx::tensor<uint8_t> &, std::uint32_t, std::uint32_t, std::uint32_t, bool, bool);
 .. doxygenfunction:: cudaq::qec::get_sorted_pcm_column_indices(const std::vector<std::vector<std::uint32_t>> &, std::uint32_t);
 .. doxygenfunction:: cudaq::qec::get_sorted_pcm_column_indices(const cudaqx::tensor<uint8_t> &, std::uint32_t);
 .. doxygenfunction:: cudaq::qec::pcm_extend_to_n_rounds(const cudaqx::tensor<uint8_t> &, std::size_t, std::uint32_t);
+.. doxygenfunction:: cudaq::qec::pcm_from_sparse_vec(const std::vector<std::int64_t>& sparse_vec, std::size_t num_rows, std::size_t num_cols)
 .. doxygenfunction:: cudaq::qec::pcm_is_sorted(const cudaqx::tensor<uint8_t> &, std::uint32_t);
+.. doxygenfunction:: cudaq::qec::pcm_to_sparse_vec(const cudaqx::tensor<uint8_t>& pcm)
 .. doxygenfunction:: cudaq::qec::reorder_pcm_columns(const cudaqx::tensor<uint8_t> &, const std::vector<std::uint32_t> &, uint32_t, uint32_t);
 .. doxygenfunction:: cudaq::qec::shuffle_pcm_columns(const cudaqx::tensor<uint8_t> &, std::mt19937_64 &&);
 .. doxygenfunction:: cudaq::qec::simplify_pcm(const cudaqx::tensor<uint8_t> &, const std::vector<double> &, std::uint32_t);
diff --git a/docs/sphinx/api/qec/cpp_realtime_decoding_api.rst b/docs/sphinx/api/qec/cpp_realtime_decoding_api.rst
new file mode 100644
index 00000000..2fd84dda
--- /dev/null
+++ b/docs/sphinx/api/qec/cpp_realtime_decoding_api.rst
@@ -0,0 +1,53 @@
+.. _cpp_realtime_decoding_api:
+
+
+The Real-Time Decoding API enables low-latency error correction on quantum hardware by allowing CUDA-Q quantum kernels to interact with decoders during circuit execution. This API is designed for use cases where corrections must be calculated and applied within qubit coherence times.
+
+The real-time decoding system supports simulation environments for local testing and hardware integration (e.g., on
+`Quantinuum's Helios QPU
+<https://www.quantinuum.com/products-solutions/quantinuum-systems/helios>`_).
+
+Core Decoding Functions
+------------------------
+
+These functions can be called from within CUDA-Q quantum kernels (``__qpu__`` functions) to interact with real-time decoders.
+
+.. doxygenfunction:: cudaq::qec::decoding::enqueue_syndromes
+.. doxygenfunction:: cudaq::qec::decoding::get_corrections
+.. doxygenfunction:: cudaq::qec::decoding::reset_decoder
+
+
+Configuration API
+-----------------
+
+The configuration API enables setting up decoders before circuit execution. Decoders are configured using YAML files or programmatically constructed configuration objects.
+
+.. doxygenfunction:: cudaq::qec::decoding::config::configure_decoders
+.. doxygenfunction:: cudaq::qec::decoding::config::configure_decoders_from_file
+.. doxygenfunction:: cudaq::qec::decoding::config::configure_decoders_from_str
+.. doxygenfunction:: cudaq::qec::decoding::config::finalize_decoders
+
+Helper Functions
+----------------
+
+Real-time decoding requires converting matrices to sparse format for efficient decoder configuration. The following utility functions are essential:
+
+- :cpp:func:`cudaq::qec::pcm_to_sparse_vec` for converting a dense PCM to a sparse PCM.
+   
+   **Usage in real-time decoding:**
+
+   .. code-block:: cpp
+
+      config.H_sparse = cudaq::qec::pcm_to_sparse_vec(dem.detector_error_matrix);
+      config.O_sparse = cudaq::qec::pcm_to_sparse_vec(dem.observables_flips_matrix);
+- :cpp:func:`cudaq::qec::pcm_from_sparse_vec` for converting a sparse PCM to a dense PCM.
+- :cpp:func:`cudaq::qec::generate_timelike_sparse_detector_matrix` for generating a sparse detector matrix.
+
+   **Usage in real-time decoding:**
+
+   .. code-block:: cpp
+
+      config.D_sparse = cudaq::qec::generate_timelike_sparse_detector_matrix(
+          numSyndromesPerRound, numRounds, false);
+
+See also :ref:`parity_check_matrix_utilities` for additional PCM manipulation functions.
diff --git a/docs/sphinx/api/qec/python_api.rst b/docs/sphinx/api/qec/python_api.rst
index 5934ee17..aae7a5bc 100644
--- a/docs/sphinx/api/qec/python_api.rst
+++ b/docs/sphinx/api/qec/python_api.rst
@@ -51,6 +51,11 @@ Tensor Network Decoder
 
 .. include:: tensor_network_decoder_api.rst
 
+Real-Time Decoding
+==================
+
+.. include:: python_realtime_decoding_api.rst
+
 
 Common
 =============
@@ -60,13 +65,15 @@ Common
 .. autofunction:: cudaq_qec.sample_code_capacity
 
 Parity Check Matrix Utilities
--------------
+=============================
 
 .. autofunction:: cudaq_qec.generate_random_pcm
+.. autofunction:: cudaq_qec.generate_timelike_sparse_detector_matrix
 .. autofunction:: cudaq_qec.get_pcm_for_rounds
 .. autofunction:: cudaq_qec.get_sorted_pcm_column_indices
 .. autofunction:: cudaq_qec.pcm_extend_to_n_rounds
 .. autofunction:: cudaq_qec.pcm_is_sorted
+.. autofunction:: cudaq_qec.pcm_to_sparse_vec
 .. autofunction:: cudaq_qec.reorder_pcm_columns
 .. autofunction:: cudaq_qec.shuffle_pcm_columns
 .. autofunction:: cudaq_qec.simplify_pcm
diff --git a/docs/sphinx/api/qec/python_realtime_decoding_api.rst b/docs/sphinx/api/qec/python_realtime_decoding_api.rst
new file mode 100644
index 00000000..99ddd7e6
--- /dev/null
+++ b/docs/sphinx/api/qec/python_realtime_decoding_api.rst
@@ -0,0 +1,144 @@
+.. _python_realtime_decoding_api:
+
+
+The Real-Time Decoding API enables low-latency error correction on quantum hardware by allowing CUDA-Q quantum kernels to interact with decoders during circuit execution. This API is designed for use cases where corrections must be calculated and applied within qubit coherence times.
+
+The real-time decoding system supports simulation environments for local testing and hardware integration (e.g., on
+`Quantinuum's Helios QPU
+<https://www.quantinuum.com/products-solutions/quantinuum-systems/helios>`_).
+
+Core Decoding Functions
+------------------------
+
+These functions can be called from within CUDA-Q quantum kernels (``@cudaq.kernel`` decorated functions) to interact with real-time decoders.
+
+.. py:function:: cudaq_qec.qec.enqueue_syndromes(decoder_id, syndromes, tag=0)
+
+   Enqueue syndrome measurements for decoding.
+
+   :param decoder_id: Unique identifier for the decoder instance (matches configured decoder ID)
+   :param syndromes: List of syndrome measurement results from stabilizer measurements
+   :param tag: Optional tag for logging and debugging (default: 0)
+
+   **Example:**
+
+   .. code-block:: python
+
+      import cudaq
+      import cudaq_qec as qec
+      from cudaq_qec import patch
+
+      @cudaq.kernel
+      def measure_and_decode(logical: patch, decoder_id: int):
+          syndromes = measure_stabilizers(logical)
+          qec.enqueue_syndromes(decoder_id, syndromes, 0)
+
+.. py:function:: cudaq_qec.qec.get_corrections(decoder_id, return_size, reset=False)
+
+   Retrieve calculated corrections from the decoder.
+
+   :param decoder_id: Unique identifier for the decoder instance
+   :param return_size: Number of correction bits to return (typically equals number of logical observables)
+   :param reset: Whether to reset accumulated corrections after retrieval (default: False)
+   :returns: List of boolean values indicating detected bit flips for each logical observable
+
+   **Example:**
+
+   .. code-block:: python
+
+      @cudaq.kernel
+      def apply_corrections(logical: patch, decoder_id: int):
+          corrections = qec.get_corrections(decoder_id, 1, False)
+          if corrections[0]:
+              x(logical.data)  # Apply transversal X correction
+
+.. py:function:: cudaq_qec.qec.reset_decoder(decoder_id)
+
+   Reset decoder state, clearing all queued syndromes and accumulated corrections.
+
+   :param decoder_id: Unique identifier for the decoder instance to reset
+
+   **Example:**
+
+   .. code-block:: python
+
+      @cudaq.kernel
+      def run_experiment(decoder_id: int):
+          qec.reset_decoder(decoder_id)  # Reset at start of each shot
+          # ... perform experiment ...
+
+Configuration API
+-----------------
+
+The configuration API enables setting up decoders before circuit execution. Decoders are configured using YAML files or programmatically constructed configuration objects.
+
+.. py:function:: cudaq_qec.configure_decoders(config)
+
+   Configure decoders from a multi_decoder_config object.
+
+   :param config: multi_decoder_config object containing decoder specifications
+   :returns: 0 on success, non-zero error code on failure
+
+.. py:function:: cudaq_qec.configure_decoders_from_file(config_file)
+
+   Configure decoders from a YAML file.
+
+   :param config_file: Path to YAML configuration file
+   :returns: 0 on success, non-zero error code on failure
+
+.. py:function:: cudaq_qec.configure_decoders_from_str(config_str)
+
+   Configure decoders from a YAML string.
+
+   :param config_str: YAML configuration as a string
+   :returns: 0 on success, non-zero error code on failure
+
+.. py:function:: cudaq_qec.finalize_decoders()
+
+   Finalize and clean up decoder resources. Should be called before program exit.
+
+Helper Functions
+----------------
+
+Real-time decoding requires converting matrices to sparse format for efficient decoder configuration. The following utility functions are essential:
+
+.. py:function:: cudaq_qec.pcm_to_sparse_vec(pcm)
+
+   Convert a parity check matrix (PCM) to sparse vector representation for decoder configuration.
+
+   :param pcm: Dense binary matrix as numpy array (e.g., ``dem.detector_error_matrix`` or ``dem.observables_flips_matrix``)
+   :returns: Sparse vector (list of integers) where -1 separates rows
+
+   **Usage in real-time decoding:**
+
+   .. code-block:: python
+
+      config.H_sparse = qec.pcm_to_sparse_vec(dem.detector_error_matrix)
+      config.O_sparse = qec.pcm_to_sparse_vec(dem.observables_flips_matrix)
+
+.. py:function:: cudaq_qec.pcm_from_sparse_vec(sparse_vec, num_rows, num_cols)
+
+   Convert sparse vector representation back to a dense parity check matrix.
+
+   :param sparse_vec: Sparse representation (from YAML or decoder config)
+   :param num_rows: Number of rows in the output matrix
+   :param num_cols: Number of columns in the output matrix
+   :returns: Dense binary matrix as numpy array
+
+.. py:function:: cudaq_qec.generate_timelike_sparse_detector_matrix(num_syndromes_per_round, num_rounds, include_first_round)
+
+   Generate the D_sparse matrix that encodes how detectors relate across syndrome measurement rounds.
+
+   :param num_syndromes_per_round: Number of syndrome measurements per round (typically code distance squared)
+   :param num_rounds: Total number of syndrome measurement rounds
+   :param include_first_round: Boolean (False for standard memory experiments) or list for custom first round
+   :returns: Sparse matrix encoding detector relationships
+
+   **Usage in real-time decoding:**
+
+   .. code-block:: python
+
+      config.D_sparse = qec.generate_timelike_sparse_detector_matrix(
+          numSyndromesPerRound, numRounds, False)
+
+See also :ref:`Parity Check Matrix Utilities <python_api:Parity Check Matrix Utilities>` for additional PCM manipulation functions.
diff --git a/docs/sphinx/components/qec/introduction.rst b/docs/sphinx/components/qec/introduction.rst
index 67fa4382..3297ed76 100644
--- a/docs/sphinx/components/qec/introduction.rst
+++ b/docs/sphinx/components/qec/introduction.rst
@@ -7,11 +7,18 @@ The ``cudaq-qec`` library provides a comprehensive framework for quantum
 error correction research and development. It leverages GPU acceleration
 for efficient syndrome decoding and error correction simulations (coming soon).
 
+The library supports both offline analysis and real-time error correction on quantum hardware,
+enabling low-latency decoding for practical quantum computing applications.
+
 Core Components
 ----------------
-``cudaq-qec`` is composed of two main interfaces - the :code:`cudaq::qec::code` and
-:code:`cudaq::qec::decoder` types. These types are meant to be extended by developers
-to provide new error correcting codes and new decoding strategies.
+``cudaq-qec`` is composed of three main interfaces:
+
+1. **QEC Codes** (:code:`cudaq::qec::code`) - Define quantum error correcting codes with logical operations
+2. **Decoders** (:code:`cudaq::qec::decoder`) - Implement syndrome decoding algorithms
+3. **Real-Time Decoding** (:code:`cudaq::qec::decoding`) - Enable online error correction on quantum hardware
+
+These types are meant to be extended by developers to provide new error correcting codes and decoding strategies.
 
 QEC Code Framework :code:`cudaq::qec::code`
 -------------------------------------------
@@ -631,7 +638,32 @@ Usage Example
 Pre-built QEC Decoders
 ----------------------
 
-CUDA-Q QEC provides pre-built decoders. Here's a detailed overview of each:
+CUDA-Q QEC provides pre-built decoders for a variety of use cases.
+
++------------------------+-----------------------------+----------+----------+-------------------+--------------------------------------------------+
+| Decoder                | Decoder String Identifier   | Python   | C++      | Real-Time Enabled | Notes                                            |
++========================+=============================+==========+==========+===================+==================================================+
+| NVIDIA QLDPC Decoder¹  | `"nv-qldpc-decoder"`        | Yes      | Yes      | Yes               | Supports Relay BP and BP+OSD                     |
++------------------------+-----------------------------+----------+----------+-------------------+--------------------------------------------------+
+| Tensor Network Decoder¹| `"tensor_network_decoder"`  | Yes²     | No       | No                | Exact Maximum Likelihood Decoder                 |
++------------------------+-----------------------------+----------+----------+-------------------+--------------------------------------------------+
+| TensorRT Decoder¹      | `"trt_decoder"`             | Yes³     | Yes      | Not yet           | AI decoder. Bring your own model.                |
++------------------------+-----------------------------+----------+----------+-------------------+--------------------------------------------------+
+| Look-Up Table Decoder  | `"single_error_lut"`        | Yes      | Yes      | Yes               | Simple decoder with no configurable options      |
++                        +-----------------------------+----------+----------+-------------------+--------------------------------------------------+
+|                        | `"multi_error_lut"`         | Yes      | Yes      | Yes               | Multi-error decoder that                         |
+|                        |                             |          |          |                   | can handle up to "lut_error_depth" errors        |
++------------------------+-----------------------------+----------+----------+-------------------+--------------------------------------------------+
+| Sliding Window Decoder | `"sliding_window"`          | Yes      | Yes      | Not yet           | Decodes syndromes in a sliding window fashion.   |
+|                        |                             |          |          |                   | May be paired with any other decoder as an       |
+|                        |                             |          |          |                   | inner decoder except the TensorRT Decoder        |
++------------------------+-----------------------------+----------+----------+-------------------+--------------------------------------------------+
+
+| ¹ GPU-accelerated decoder
+| ² Requires installation with `pip install cudaq-qec[tensor-network-decoder]` for Python
+| ³ Requires installation with `pip install cudaq-qec[trt-decoder]` for Python
+
+Here's a detailed overview of each:
 
 Quantum Low-Density Parity-Check Decoder
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -777,6 +809,63 @@ The decoder returns the probability that the logical observable has flipped for
     CUDA-Q 0.5.0 is released.
 
 
+Real-Time Decoding
+------------------
+
+CUDA-Q QEC provides real-time decoding capabilities for quantum error correction on actual quantum hardware.
+Real-time decoding enables decoders to process syndromes and compute corrections within qubit coherence times,
+making active error correction practical for real quantum computers.
+
+Key Features
+^^^^^^^^^^^^
+
+* **In-Kernel Operation**: Syndrome decoding within CUDA-Q kernels.
+* **Hardware Integration**: Direct integration with quantum hardware backends (`Quantinuum's Helios QPU <https://www.quantinuum.com/products-solutions/quantinuum-systems/helios>`_).
+* **Simulation Support**: Test real-time workflows locally before deploying to hardware.
+* **Multiple Decoder Types**: For real-time decoders, see the table `Pre-built QEC Decoders <https://nvidia.github.io/cudaqx/components/qec/introduction.html#pre-built-qec-decoders>`__.
+* **GPU Acceleration**: Leverage CUDA for high-performance syndrome decoding.
+
+Note: The real-time decoding interfaces are experimental, and subject to change. Real-time decoding on Quantinuum's Helios-1 device is currently only available to partners and collaborators. Please email QCSupport@quantinuum.com for more information.
+
+Workflow and Terminology
+^^^^^^^^^^^^^^^^^^^^^^^^
+
+The real-time decoding workflow involves configuring a decoder (or many) before CUDA-Q kernel launch, and communicating to the decoders with special in-kernel functions.
+A decoder is a single software instance of a decoding algorithm, and all its relevant inputs (parity-check matrices, error rates, etc.) which will remain static for the execution of the quantum program.
+A decoder config may contain many decoders, each with different algorithms and input parameters.
+
+In a quantum kernel, a user interacts with the decoders via the `enqueue_syndromes` and `get_corrections` interfaces.
+The behavior of these functions depends on their configuration and their usage.
+
+The real-time decoding workflow can be described with respect to the offline decoding workflow.
+The non-real-time decoders require a detector error model, which is specified via a detector error matrix (the parity check matrix `H` of the decoding problem) and a vector of weights (error rates).
+This matrix has dimensions of `[numDetectors, numErrors]`, where each row is a detector, and each column is a possible error.
+For real-time decoding, we first need to convert the circuit measurements into detectors.
+This is specified via the detector matrix `D`, which has dimensions `[numDetectors, numMeasurements]`.
+Each column of the detector matrix defines which detectors a measurement participates in by including an entry of `1`.
+This way, once all `numMeasurements` measurements are enqueued, a matrix-vector multiply can convert this buffer of raw measurements into detectors, which are then passed into the decoding algorithm.
+
+Similarly, an observables flips matrix `O` of size `[numObs, numErrors]` must be provided.
+Each column of the observables flips matrix corresponds to one error, and an entry of `1` marks each observable that is flipped by that error.
+Once the decoding algorithm has processed the detectors, it provides a vector of predicted errors of length `numErrors`.
+This vector then executes a matrix-vector multiply with the observables flips matrix to yield a new vector of length `numObs` which contains an entry of `1` if the observable is predicted to have flipped.
+
+Thus once a decoder is configured, we can view the real-time decoder as a transformation of data starting from a vector of raw measurements, then transformed into detectors via `D`, then error predictions via `H`, then observable flip predictions via `O`. This last step is what is returned via `get_corrections`. The user configures how many bits of information are returned, and what they represent via the `O` matrix in the decoder config.
+
+Similarly, the user determines how many measurements are needed for the decoder via the `D` matrix in the decoder config, and they are sent to the decoder via `enqueue_syndromes`.
+For flexibility, the user can choose to send all measurements with a single `enqueue_syndromes` call, or send them over several calls.
+However they are split up, the decoder will not begin decoding until all `numMeasurements` have been enqueued, and will throw an error if too many are sent.
+Thus it is the final `enqueue_syndromes` call that kicks off the decoder, and that call is asynchronous.
+Additional quantum gates can be applied, and only when `get_corrections` is called does the kernel sync and wait for the corrections.
+
+For detailed information on real-time decoding, see:
+
+* :doc:`/examples_rst/qec/realtime_decoding` - Complete Guide with Examples
+* :doc:`/api/qec/cpp_api` - C++ API Reference (see Real-Time Decoding section)
+* :doc:`/api/qec/python_api` - Python API Reference (see Real-Time Decoding section)
+
+
+
 Numerical Experiments
 ---------------------
 
@@ -1067,4 +1156,3 @@ Additional Noise Models
       noise.add_all_qubit_channel(
           "x", cudaq::depolarization2(/*probability*/ 0.01),
           /*numControls*/ 1);
-
diff --git a/docs/sphinx/examples/qec/cpp/real_time_complete.cpp b/docs/sphinx/examples/qec/cpp/real_time_complete.cpp
new file mode 100644
index 00000000..1e42a22e
--- /dev/null
+++ b/docs/sphinx/examples/qec/cpp/real_time_complete.cpp
@@ -0,0 +1,157 @@
+/*******************************************************************************
+ * Copyright (c) 2025 NVIDIA Corporation & Affiliates.                         *
+ * All rights reserved.                                                        *
+ *                                                                             *
+ * This source code and the accompanying materials are made available under    *
+ * the terms of the Apache License 2.0 which accompanies this distribution.    *
+ ******************************************************************************/
+
+// clang-format off
+// Compile and run:
+// nvq++ --target=stim -lcudaq-qec -lcudaq-qec-realtime-decoding -lcudaq-qec-realtime-decoding-simulation real_time_complete.cpp
+// NOTE: This must be on one line for the CI system to parse it correctly.
+// clang-format on
+
+// [Begin Documentation]
+
+// Simple 3-qubit repetition code with real-time decoding
+// This is the most basic QEC example possible
+
+#include "cudaq.h"
+#include "cudaq/qec/code.h"
+#include "cudaq/qec/experiments.h"
+#include "cudaq/qec/pcm_utils.h"
+#include "cudaq/qec/realtime/decoding.h"
+#include "cudaq/qec/realtime/decoding_config.h"
+#include <common/NoiseModel.h>
+#include <fstream>
+
+// [Begin Save DEM]
+// Build a decoder configuration from `dem` and save it to YAML file `filename`
+void save_dem(const cudaq::qec::detector_error_model &dem,
+              const std::string &filename) {
+  // Create decoder config; id 0 is the decoder id used inside qec_circuit()
+  cudaq::qec::decoding::config::decoder_config config;
+  config.id = 0;
+  config.type = "multi_error_lut";
+  config.block_size = dem.num_error_mechanisms();
+  config.syndrome_size = dem.num_detectors();
+  config.H_sparse = cudaq::qec::pcm_to_sparse_vec(dem.detector_error_matrix);
+  config.O_sparse = cudaq::qec::pcm_to_sparse_vec(dem.observables_flips_matrix);
+
+  // Calculate numRounds from DEM (we send 1 additional round, so add 1)
+  uint64_t numSyndromesPerRound = 2; // Z0Z1 and Z1Z2
+  auto numRounds = dem.num_detectors() / numSyndromesPerRound + 1;
+  config.D_sparse = cudaq::qec::generate_timelike_sparse_detector_matrix(
+      numSyndromesPerRound, numRounds, false);
+
+  cudaq::qec::decoding::config::multi_error_lut_config lut_config;
+  lut_config.lut_error_depth = 2; // decoder-specific option for the LUT
+  config.decoder_custom_args = lut_config;
+
+  cudaq::qec::decoding::config::multi_decoder_config multi_config;
+  multi_config.decoders.push_back(config); // a config may hold many decoders
+
+  std::ofstream file(filename); // NOTE(review): open failure is not checked
+  file << multi_config.to_yaml_str(200); // presumably column width; confirm
+  file.close();
+  printf("Saved config to %s\n", filename.c_str());
+}
+// [End Save DEM]
+
+// [Begin Load DEM]
+// Read YAML file `filename`, parse it, and configure the decoders from it
+void load_dem(const std::string &filename) {
+  std::ifstream file(filename); // NOTE(review): open failure is not checked
+  std::string yaml((std::istreambuf_iterator<char>(file)),
+                   std::istreambuf_iterator<char>());
+  auto config =
+      cudaq::qec::decoding::config::multi_decoder_config::from_yaml_str(yaml);
+  cudaq::qec::decoding::config::configure_decoders(config);
+  printf("Loaded config from %s\n", filename.c_str());
+}
+// [End Load DEM]
+
+// Prepare logical |0⟩ by resetting every data qubit of the patch
+__qpu__ void prep0(cudaq::qec::patch logical) {
+  for (std::size_t i = 0; i < logical.data.size(); ++i) {
+    cudaq::reset(logical.data[i]);
+  }
+}
+
+// Measure both ZZ stabilizers of the 3-qubit repetition code: {Z0Z1, Z1Z2}
+__qpu__ std::vector<cudaq::measure_result>
+measure_stabilizers(cudaq::qec::patch logical) {
+  for (std::size_t i = 0; i < logical.ancz.size(); ++i) {
+    cudaq::reset(logical.ancz[i]); // start each call from reset ancillas
+  }
+
+  // Z0Z1 stabilizer: CNOTs from data[0] and data[1] onto ancz[0]
+  cudaq::x<cudaq::ctrl>(logical.data[0], logical.ancz[0]);
+  cudaq::x<cudaq::ctrl>(logical.data[1], logical.ancz[0]);
+
+  // Z1Z2 stabilizer: CNOTs from data[1] and data[2] onto ancz[1]
+  cudaq::x<cudaq::ctrl>(logical.data[1], logical.ancz[1]);
+  cudaq::x<cudaq::ctrl>(logical.data[2], logical.ancz[1]);
+
+  return {mz(logical.ancz[0]), mz(logical.ancz[1])};
+}
+
+// [Begin QEC Circuit]
+// QEC circuit with real-time decoding; returns the data measurement as integer
+__qpu__ int64_t qec_circuit() {
+  cudaq::qec::decoding::reset_decoder(0); // reset decoder 0 at start of shot
+
+  cudaq::qvector data(3);
+  cudaq::qvector ancz(2);
+  cudaq::qvector ancx; // Empty for repetition code
+  cudaq::qec::patch logical(data, ancx, ancz);
+
+  prep0(logical);
+
+  // 3 rounds of syndrome measurement, each enqueued to decoder 0
+  for (int round = 0; round < 3; ++round) {
+    auto syndromes = measure_stabilizers(logical);
+    cudaq::qec::decoding::enqueue_syndromes(0, syndromes);
+  }
+
+  // Get 3 correction bits from decoder 0; apply X to data[i] where bit i is set
+  auto corrections = cudaq::qec::decoding::get_corrections(0, 3);
+  for (std::size_t i = 0; i < 3; ++i) {
+    if (corrections[i])
+      cudaq::x(data[i]);
+  }
+
+  return cudaq::to_integer(mz(data));
+}
+// [End QEC Circuit]
+
+int main() {
+  auto code = cudaq::qec::get_code("repetition",
+                                   cudaqx::heterogeneous_map{{"distance", 3}});
+
+  // [Begin DEM Generation]
+  // Step 1: Generate detector error model (DEM) under a simple noise model
+  printf("Step 1: Generating DEM...\n");
+  cudaq::noise_model noise;
+  noise.add_all_qubit_channel("x", cudaq::depolarization2(0.01), 1);
+
+  auto dem = cudaq::qec::z_dem_from_memory_circuit(
+      *code, cudaq::qec::operation::prep0, 3, noise);
+  // [End DEM Generation]
+
+  save_dem(dem, "config.yaml"); // write the decoder config to disk
+
+  // Step 2: Load config (configures the decoders) and run circuit
+  printf("\nStep 2: Running circuit with decoding...\n");
+  load_dem("config.yaml");
+
+  cudaq::run(10, qec_circuit); // return value is not inspected here
+  printf("Ran 10 shots\n");
+
+  cudaq::qec::decoding::config::finalize_decoders();
+
+  printf("\nDone!\n");
+  return 0;
+}
+// [End Documentation]
diff --git a/docs/sphinx/examples/qec/python/real_time_complete.py b/docs/sphinx/examples/qec/python/real_time_complete.py
new file mode 100644
index 00000000..6b21991d
--- /dev/null
+++ b/docs/sphinx/examples/qec/python/real_time_complete.py
@@ -0,0 +1,138 @@
+# ============================================================================ #
+# Copyright (c) 2025 NVIDIA Corporation & Affiliates.                          #
+# All rights reserved.                                                         #
+#                                                                              #
+# This source code and the accompanying materials are made available under     #
+# the terms of the Apache License 2.0 which accompanies this distribution.     #
+# ============================================================================ #
+
+# [Begin Documentation]
+
+#!/usr/bin/env python3
+"""
+Simple 3-qubit repetition code with real-time decoding.
+This is the most basic QEC example possible.
+"""
+
+import os
+
+os.environ["CUDAQ_DEFAULT_SIMULATOR"] = "stim"
+
+import cudaq
+import cudaq_qec as qec
+from cudaq_qec import patch
+
+# Prepare logical |0⟩ by resetting every data qubit of the patch
+@cudaq.kernel
+def prep0(logical: qec.patch):
+    for i in range(logical.data.size()):
+        reset(logical.data[i])
+
+
+# Measure both ZZ stabilizers of the 3-qubit repetition code: [Z0Z1, Z1Z2]
+@cudaq.kernel
+def measure_stabilizers(logical: qec.patch) -> list[bool]:
+    for i in range(logical.ancz.size()):
+        reset(logical.ancz[i])  # start each call from reset ancillas
+
+    # Z0Z1 stabilizer: CNOTs from data[0] and data[1] onto ancz[0]
+    cx(logical.data[0], logical.ancz[0])
+    cx(logical.data[1], logical.ancz[0])
+
+    # Z1Z2 stabilizer: CNOTs from data[1] and data[2] onto ancz[1]
+    cx(logical.data[1], logical.ancz[1])
+    cx(logical.data[2], logical.ancz[1])
+
+    return [mz(logical.ancz[0]), mz(logical.ancz[1])]
+
+
+# [Begin QEC Circuit]
+# QEC circuit with real-time decoding; returns the final data measurements
+@cudaq.kernel
+def qec_circuit() -> list[bool]:
+    qec.reset_decoder(0)  # reset decoder 0 at the start of each shot
+
+    data = cudaq.qvector(3)
+    ancz = cudaq.qvector(2)
+    ancx = cudaq.qvector(0)  # empty for the repetition code
+    logical = patch(data, ancx, ancz)
+
+    prep0(logical)
+
+    # 3 rounds of syndrome measurement, each enqueued to decoder 0
+    for _ in range(3):
+        syndromes = measure_stabilizers(logical)
+        qec.enqueue_syndromes(0, syndromes, 0)  # (decoder_id, syndromes, tag)
+
+    # Get 3 correction bits from decoder 0; apply X to data[i] where bit i is set
+    corrections = qec.get_corrections(0, 3, False)  # (id, return_size, reset)
+    for i in range(3):
+        if corrections[i]:
+            x(data[i])
+
+    return mz(data)
+
+
+# [End QEC Circuit]
+
+
+def main():
+    # Get the 3-qubit (distance-3) repetition code
+    code = qec.get_code("repetition", distance=3)
+
+    # [Begin DEM Generation]
+    # Step 1: Generate detector error model (DEM) under a simple noise model
+    print("Step 1: Generating DEM...")
+    cudaq.set_target("stim")
+
+    noise = cudaq.NoiseModel()
+    noise.add_all_qubit_channel("x", cudaq.Depolarization2(0.01), 1)
+
+    dem = qec.z_dem_from_memory_circuit(code, qec.operation.prep0, 3, noise)
+    # [End DEM Generation]
+
+    # [Begin Save DEM]
+    # Build the decoder config (id 0 is the id used in qec_circuit) and save it
+    config = qec.decoder_config()
+    config.id = 0
+    config.type = "multi_error_lut"
+    config.block_size = dem.detector_error_matrix.shape[1]  # columns
+    config.syndrome_size = dem.detector_error_matrix.shape[0]  # rows
+    config.H_sparse = qec.pcm_to_sparse_vec(dem.detector_error_matrix)
+    config.O_sparse = qec.pcm_to_sparse_vec(dem.observables_flips_matrix)
+
+    # Calculate numRounds from DEM (we send 1 additional round, so add 1)
+    num_syndromes_per_round = 2  # Z0Z1 and Z1Z2
+    num_rounds = dem.detector_error_matrix.shape[
+        0] // num_syndromes_per_round + 1
+    config.D_sparse = qec.generate_timelike_sparse_detector_matrix(
+        num_syndromes_per_round, num_rounds, False)
+    lut_config = qec.multi_error_lut_config()
+    lut_config.lut_error_depth = 2  # decoder-specific option for the LUT
+    config.set_decoder_custom_args(lut_config)
+
+    multi_config = qec.multi_decoder_config()
+    multi_config.decoders = [config]  # a config may hold many decoders
+
+    with open("config.yaml", 'w') as f:
+        f.write(multi_config.to_yaml_str(200))  # presumably width; confirm
+    print("Saved config to config.yaml")
+    # [End Save DEM]
+
+    # Step 2: Load config (configures the decoders) and run circuit
+    print("\nStep 2: Running circuit with decoding...")
+    # [Begin Load DEM]
+    qec.configure_decoders_from_file("config.yaml")
+    # [End Load DEM]
+
+    run_result = cudaq.run(qec_circuit, shots_count=10)  # not inspected here
+    print("Ran 10 shots")
+
+    qec.finalize_decoders()  # clean up decoder resources before exit
+
+    print("\nDone!")
+
+
+if __name__ == "__main__":
+    main()
+# [End Documentation]
diff --git a/docs/sphinx/examples_rst/qec/examples.rst b/docs/sphinx/examples_rst/qec/examples.rst
index 1cfeaf38..79247213 100644
--- a/docs/sphinx/examples_rst/qec/examples.rst
+++ b/docs/sphinx/examples_rst/qec/examples.rst
@@ -9,4 +9,5 @@ Examples that illustrate how to use CUDA-QX for application development are avai
 
       Code-Capacity QEC <code_capacity_noise.rst>
       Circuit-Level QEC <circuit_level_noise.rst>
-      Decoders <decoders.rst>
\ No newline at end of file
+      Decoders <decoders.rst>
+      Real-Time Decoding <realtime_decoding.rst>
\ No newline at end of file
diff --git a/docs/sphinx/examples_rst/qec/realtime_decoding.rst b/docs/sphinx/examples_rst/qec/realtime_decoding.rst
new file mode 100644
index 00000000..0c14d180
--- /dev/null
+++ b/docs/sphinx/examples_rst/qec/realtime_decoding.rst
@@ -0,0 +1,577 @@
+Real-Time Decoding
+==================
+
+Real-time decoding enables CUDA-Q QEC decoders to operate in low-latency, online environments where decoders run concurrently with quantum computations. This capability is essential for quantum error correction on real quantum hardware, where corrections must be calculated and applied within qubit coherence times.
+
+The real-time decoding framework supports two primary deployment scenarios:
+
+1. **Hardware Integration**: Decoders running on classical computers connected to real quantum processing units (QPUs) via low-latency networks
+2. **Simulation Mode**: Decoders operating in simulated environments for testing and development on local systems
+
+Workflow Overview
+-----------------
+
+Real-time decoding integrates seamlessly into quantum error correction pipelines through a carefully designed four-stage workflow. This workflow separates the computationally intensive characterization phase from the latency-critical runtime phase, ensuring that decoders can operate efficiently during quantum circuit execution.
+
+The workflow consists of four stages:
+
+1. **Detector Error Model (DEM) Generation**: Before running a quantum program, the user first characterizes how errors propagate through the quantum circuit. The library internally uses Memory Syndrome Matrix (MSM) representations to track error propagation, but this complexity is abstracted through helper functions like ``z_dem_from_memory_circuit``. The user simply provides a quantum code, noise model, and circuit parameters, and receives a complete detector error model that maps error mechanisms to syndrome patterns. This step is performed once during development.
+
+2. **Decoder Configuration and Saving**: Using the DEM, the user configures decoder instances with the specific error model data. This includes converting parity check matrices to sparse format, setting decoder-specific parameters (like lookup table depth or BP iterations), and assigning unique IDs to each logical qubit's decoder. The configuration is then saved to a YAML file, capturing all the information decoders need to interpret syndrome measurements correctly. This creates a portable, reusable configuration that separates characterization from execution.
+
+3. **Decoder Loading and Initialization**: Just before circuit execution, the user loads the saved YAML configuration file. The library parses the configuration, instantiates the appropriate decoder implementations, initializes internal data structures, and registers the decoders with the CUDA-Q runtime. For GPU-based decoders, matrices are transferred to device memory; for lookup table decoders, syndrome-to-correction mappings are constructed. This initialization takes milliseconds to seconds depending on code size and happens before quantum operations begin.
+
+4. **Real-Time Decoding**: During quantum circuit execution, the decoding API is used within quantum kernels to interact with decoders. As the circuit measures stabilizers, syndromes are enqueued to the decoder, which processes them concurrently. When corrections are needed, the decoder is queried and the suggested operations are applied to the logical qubits. This entire process happens within the coherence time constraints of the quantum hardware.
+
+Real-Time Decoding Example
+--------------------------
+
+Here are two examples demonstrating real-time decoding in Python and C++:
+
+.. tab:: Python
+
+   .. literalinclude:: ../../examples/qec/python/real_time_complete.py
+      :language: python
+      :start-after: # [Begin Documentation]
+
+.. tab:: C++
+
+   .. literalinclude:: ../../examples/qec/cpp/real_time_complete.cpp
+      :language: cpp
+      :start-after: // [Begin Documentation]
+
+The examples above showcase the main components of the real-time decoding workflow:
+
+- Decoder configuration file: Initializes and configures the decoders before circuit execution.
+
+- Quantum kernel: Uses the real-time decoding API to interact with the decoders, primarily through ``reset_decoder``, ``enqueue_syndromes``, and ``get_corrections``.
+
+- Syndrome extraction: Measures the stabilizers of the logical qubits.
+
+- Correction application: Applies the corrections to the logical qubits.
+
+- Logical observable measurement: Measures the logical observables of the logical qubits.
+
+- Decoder finalization: Frees up resources after circuit execution.
+
+The API is designed to be called from within quantum kernels (marked with ``@cudaq.kernel`` in Python or ``__qpu__``  in C++). The runtime automatically routes these calls to the appropriate backend—whether a simulation environment on the local machine or a low-latency connection to quantum hardware. The API is device-agnostic, so the same kernel code works across different deployment scenarios.
+
+The user is required to provide a configuration file or generate one if it is not present. The generation process depends on the decoder type and the detector error model studied in other sections of the documentation. Moreover, the user must write an appropriate kernel that describes the correct syndrome extraction and correction application logic.
+
+The next section provides instructions to generate a configuration file, write a quantum kernel, and compile and run the examples correctly.
+
+
+Configuration
+-------------
+
+The configuration process transforms a quantum circuit's error characteristics into a format that decoders can efficiently process. This section walks through each step in detail, showing how to go from circuit simulation to a fully configured real-time decoder.
+
+Step 1: Generate Detector Error Model
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The first step is to characterize the quantum circuit's behavior under noise. 
+A detector error model (DEM) captures the relationship between physical errors and the syndrome patterns they produce. 
+This characterization is circuit-specific and depends on the code structure, noise model, and measurement schedule.
+
+Under the hood, the CUDA-Q QEC library uses the Memory Syndrome Matrix (MSM) representation to efficiently encode error propagation information. The MSM captures all possible error chains and their syndrome signatures, tracking how errors propagate through the circuit over time. However, this complexity is abstracted away from the user through convenient helper functions.
+
+The library provides a family of ``dem_from_memory_circuit`` functions that automatically handle the MSM generation and processing:
+
+* ``z_dem_from_memory_circuit``: For circuits measuring Z-basis stabilizers (used in the example below)
+* ``x_dem_from_memory_circuit``: For circuits measuring X-basis stabilizers
+* ``dem_from_memory_circuit``: General-purpose function for arbitrary stabilizer measurements
+
+These functions take a quantum code, an initial state preparation operation, the number of measurement rounds, and a noise model, then return a complete detector error model ready for decoder configuration. The user simply needs to configure the noise model and specify the circuit structure—the library handles all the error tracking and matrix construction automatically.
+
+Here is how to generate a DEM for a circuit:
+
+.. tab:: Python
+
+   .. literalinclude:: ../../examples/qec/python/real_time_complete.py
+      :language: python
+      :start-after: # [Begin DEM Generation]
+      :end-before: # [End DEM Generation]
+
+.. tab:: C++
+
+   .. literalinclude:: ../../examples/qec/cpp/real_time_complete.cpp
+      :language: cpp
+      :start-after: // [Begin DEM Generation]
+      :end-before: // [End DEM Generation]
+
+Step 2: Configure and Save Decoder
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Once a DEM has been generated, the next step is to package this information into a decoder configuration and save it to a YAML file. 
+The configuration structure holds all the parameters a decoder needs: the parity check matrix (H_sparse), 
+the observable flip matrix (O_sparse), the detector matrix (D_sparse),
+and decoder-specific tuning parameters. 
+
+These matrices are generated in sparse matrix format, which is crucial for performance. 
+They can be large for error-correcting codes with many physical qubits. Moreover,
+real-time decoders process thousands of syndrome measurements per second and make decisions based on these matrices, so compact representations are essential.
+The helper function ``pcm_to_sparse_vec`` is used to convert the dense binary matrices into a space-efficient format where -1 marks row boundaries and integers represent column indices of non-zero elements.
+
+Each decoder type has its own configuration structure with specific parameters. 
+For lookup table decoders, the user specifies how many simultaneous errors to consider. 
+For belief propagation decoders, the user sets iteration limits and convergence criteria. 
+The configuration API provides type-safe structures for each decoder, ensuring that all required parameters are included.
+
+The configuration is then saved to a YAML file for reuse. The YAML format is human-readable, making it easy to inspect, modify, and share configurations across different execution environments.
+
+Here is how to create and save a decoder configuration:
+
+.. tab:: Python
+
+   .. literalinclude:: ../../examples/qec/python/real_time_complete.py
+      :language: python
+      :start-after: # [Begin Save DEM]
+      :end-before: # [End Save DEM]
+
+.. tab:: C++
+
+   .. literalinclude:: ../../examples/qec/cpp/real_time_complete.cpp
+      :language: cpp
+      :start-after: // [Begin Save DEM]
+      :end-before: // [End Save DEM]
+
+Step 3: Load Configuration
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Before running quantum circuits with real-time decoding, the saved decoder configuration must be loaded and initialized. 
+This step bridges the gap between the offline characterization phase (Steps 1-2) and the online execution phase (Step 4), 
+preparing the decoder instances for real-time operation.
+
+The configuration loading process performs several important operations:
+
+1. **YAML Parsing**: The configuration file is parsed and validated to ensure all required fields are present and properly formatted. This includes checking matrix dimensions, decoder parameters, and metadata.
+
+2. **Decoder Instantiation**: Based on the decoder type specified in the configuration (e.g., ``multi_error_lut``, ``nv-qldpc-decoder``), the appropriate decoder implementation is instantiated and allocated resources on the GPU or CPU.
+
+3. **Matrix Initialization**: The sparse matrices (H_sparse, O_sparse, D_sparse) are loaded into the decoder's internal data structures. For GPU-based decoders, this includes transferring data to device memory.
+
+4. **Decoder-Specific Initialization**: Each decoder type performs its own preparation: lookup table decoders build syndrome-to-correction mappings, belief propagation decoders initialize message-passing structures, and sliding window decoders configure their buffering mechanisms.
+
+5. **Backend Registration**: The decoder instances are registered with the CUDA-Q runtime so they can be accessed from quantum kernels using their unique IDs.
+
+This initialization happens quickly, typically only a few milliseconds for small codes and up to a few seconds for large distance codes with complex decoders. Since it occurs before quantum circuit execution, it does not impact the latency-critical decoding operations.
+
+The separation of configuration from execution provides significant benefits: users can maintain a library of configurations for different code distances, noise levels, and decoder types, then simply load the appropriate one when running experiments. Configurations can be version-controlled alongside code, shared across research teams, and validated offline before deployment to quantum hardware.
+
+Here is how to load a decoder configuration:
+
+.. tab:: Python
+
+   .. literalinclude:: ../../examples/qec/python/real_time_complete.py
+      :language: python
+      :start-after: # [Begin Load DEM]
+      :end-before: # [End Load DEM]
+
+.. tab:: C++
+
+   .. literalinclude:: ../../examples/qec/cpp/real_time_complete.cpp
+      :language: cpp
+      :start-after: // [Begin Load DEM]
+      :end-before: // [End Load DEM]
+
+Step 4: Use in Quantum Kernels
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+With decoders configured and initialized, they can be used within quantum kernels. The real-time decoding API provides three key functions that integrate seamlessly with CUDA-Q's quantum programming model: ``reset_decoder`` prepares a decoder for a new shot, ``enqueue_syndromes`` sends syndrome measurements to the decoder for processing, and ``get_corrections`` retrieves the decoder's recommended corrections.
+
+These functions are designed to be called from within quantum kernels (marked with ``@cudaq.kernel`` in Python or ``__qpu__`` in C++). The runtime automatically routes these calls to the appropriate backend - whether that is a simulation environment on the local machine or a low-latency connection to quantum hardware. The API is device-agnostic, so the same kernel code works across different deployment scenarios.
+
+The typical usage pattern is: reset the decoder at the start of each shot, enqueue
+syndromes after each stabilizer measurement round, then get corrections before
+measuring the logical observables. Decoders process syndromes asynchronously, so
+by the time ``get_corrections`` is called, the decoder has usually finished its
+analysis. If decoding takes longer than expected, ``get_corrections`` will block
+until results are available.
+
+.. note::
+   While resetting the decoder at the beginning of each shot isn't strictly
+   required, it is **strongly** recommended to ensure that when running on a
+   remote QPU, any potential errors encountered in one shot do not affect future
+   shot results.
+
+Here is how to use the real-time decoding API in quantum kernels:
+
+.. tab:: Python
+
+   .. literalinclude:: ../../examples/qec/python/real_time_complete.py
+      :language: python
+      :start-after: # [Begin QEC Circuit]
+      :end-before: # [End QEC Circuit]
+
+.. tab:: C++
+
+   .. literalinclude:: ../../examples/qec/cpp/real_time_complete.cpp
+      :language: cpp
+      :start-after: // [Begin QEC Circuit]
+      :end-before: // [End QEC Circuit]
+
+Backend Selection
+-----------------
+
+CUDA-Q QEC's real-time decoding system is designed to work seamlessly across different execution environments. The backend selection determines where quantum circuits run and how decoders communicate with the quantum processor. Understanding the differences between simulation and hardware backends helps the user develop efficiently and deploy confidently.
+
+Simulation Backend
+^^^^^^^^^^^^^^^^^^
+
+The simulation backend is the primary tool during development, testing, and
+algorithm validation. It runs entirely on the local machine, using quantum
+simulators like Stim to execute circuits while decoders process syndromes and
+calculate corrections. This setup is ideal for rapid iteration: the user can
+test decoder configurations, validate circuit logic, and debug syndrome
+processing without waiting for hardware access or paying for compute time.
+
+The simulation backend mimics real-time decoding's concurrent operation by
+running the decoder(s) within the same process as the simulator. This means that
+other than GPU hardware differences between the local environment and the remote
+NVQLink decoders, the decoders behave the same way whether testing locally or
+running on a quantum computer. The main difference is that simulation does not
+have the same strict latency constraints, making it easier to experiment with
+complex decoder configurations.
+
+Use the simulation backend for local development and testing:
+
+.. tab:: Python
+
+   .. code-block:: python
+
+      import cudaq
+      import cudaq_qec as qec
+      
+      cudaq.set_target("stim")  # Or other simulator
+      qec.configure_decoders_from_file("config.yaml")
+      
+      # Run circuit with noise model
+      results = cudaq.run(my_circuit, shots_count=100, 
+                         noise_model=cudaq.NoiseModel())
+
+.. tab:: C++
+
+   .. code-block:: bash
+
+      # Compile with simulation support
+      nvq++ -std=c++20 my_circuit.cpp -lcudaq-qec \
+            -lcudaq-qec-realtime-decoding \
+            -lcudaq-qec-realtime-decoding-simulation
+      
+      ./a.out
+
+Quantinuum Hardware Backend
+^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The Quantinuum hardware backend connects quantum circuits to real ion-trap quantum computers. Unlike the simulation backend where decoders run on the local machine, **the Quantinuum backend uploads the decoder configuration to Quantinuum's infrastructure**, where decoders run on GPU-equipped servers co-located with the quantum hardware. This architecture minimizes latency between syndrome measurements and correction application.
+
+**Important Setup Requirements:**
+
+1. **Configuration Upload**: When ``configure_decoders_from_file()`` or ``configure_decoders()`` is called, the decoder configuration is automatically base64-encoded and uploaded to Quantinuum's REST API (``api/gpu_decoder_configs/v1beta/``). This happens before job submission. The configuration includes all decoder parameters, error models, and sparse matrices.
+
+2. **Extra Payload Provider**: The user **must** specify ``extra_payload_provider="decoder"`` when setting the target. This registers a payload provider that injects the decoder configuration UUID into each job request, telling Quantinuum which decoder configuration to use for the circuit.
+
+3. **Backend Compilation**: For C++, the user must link against ``-lcudaq-qec-realtime-decoding-quantinuum`` instead of the simulation library. This library implements the Quantinuum-specific communication protocol for syndrome transmission.
+
+4. **Configuration Lifetime**: Decoder configurations persist on Quantinuum's servers and are referenced by UUID. If the configuration is modified, it must be uploaded again - the system will generate a new UUID and use the new configuration for subsequent jobs.
+
+Note: The real-time decoding interfaces are experimental, and subject to change. Real-time decoding on Quantinuum's Helios-1 device is currently only available to partners and collaborators. Please email QCSupport@quantinuum.com for more information.
+
+**Emulation vs. Hardware Modes:**
+
+Emulation mode (``emulate=True``) is particularly valuable for testing the deployment setup without consuming hardware credits. Running with this flag performs a local, noise-free simulation without any actual submission to Quantinuum's servers.
+
+Use the Quantinuum backend for hardware or emulation:
+
+.. tab:: Python
+
+   .. code-block:: python
+
+      cudaq.set_target("quantinuum",
+                       emulate=False,  # True for emulation
+                       machine="Helios-1",
+                       extra_payload_provider="decoder")
+      
+      qec.configure_decoders_from_file("config.yaml")
+      results = cudaq.run(my_circuit, shots_count=100)
+
+.. tab:: C++
+
+   .. code-block:: bash
+
+      # Compile for Quantinuum
+      nvq++ --target quantinuum --quantinuum-machine Helios-1 \
+            my_circuit.cpp -lcudaq-qec \
+            -lcudaq-qec-realtime-decoding \
+            -lcudaq-qec-realtime-decoding-quantinuum
+      
+      ./a.out
+
+Compilation and Execution Examples
+-----------------------------------
+
+This section provides **complete, tested compilation and execution commands** for both simulation and hardware backends, extracted from the CUDA-Q QEC test infrastructure. The section begins with common usage patterns that guide decoder and compilation choices, then provides the specific commands needed for each backend.
+
+Common Use Cases
+^^^^^^^^^^^^^^^^^^^^^^
+
+Before diving into compilation details, it is helpful to understand the typical scenarios and how they map to decoder choices and workflow parameters. 
+A full set of common examples is provided to guide development.
+These examples describe the complete workflow for developing an application that uses real-time decoding in a single file.
+The relevant C++ and Python examples can be found at the following path:
+`libs/qec/unittests/realtime/app_examples <https://github.com/NVIDIA/cudaqx/tree/main/libs/qec/unittests/realtime/app_examples>`_.
+The files have names like ``surface_code-1.cpp`` and ``surface_code-1.py``. The rest of this section shows how to compile and run these two examples.
+
+These examples provide comprehensive support for application development with real-time decoding.
+The subsequent step, once the user has chosen the appropriate decoder and the appropriate backend, is to compile and execute the application.
+Instructions are provided below for both the simulation and the hardware backends.
+
+C++ Compilation
+^^^^^^^^^^^^^^^
+
+**Simulation Backend (Stim)**
+
+Compile with the simulation backend for local testing:
+
+.. code-block:: bash
+
+   nvq++ --target stim surface_code-1.cpp         \
+         -lcudaq-qec                              \
+         -lcudaq-qec-realtime-decoding            \
+         -lcudaq-qec-realtime-decoding-simulation \
+         -o surface_code-1
+
+   # Execute
+   ./surface_code-1 --distance 3 --num_shots 1000 --save_dem config.yaml
+
+**Key Points:**
+
+- ``--target stim``: Use the Stim quantum simulator
+- ``-lcudaq-qec``: Core QEC library with codes and experiments
+- ``-lcudaq-qec-realtime-decoding``: Real-time decoding core API
+- ``-lcudaq-qec-realtime-decoding-simulation``: Simulation-specific decoder backend
+
+**Quantinuum Backend (Hardware)**
+
+Compile for actual Quantinuum hardware:
+
+.. code-block:: bash
+
+   nvq++ --target quantinuum                         \
+         --quantinuum-machine Helios-1               \
+         --quantinuum-extra-payload-provider decoder \
+         surface_code-1.cpp                          \
+         -lcudaq-qec                                 \
+         -lcudaq-qec-realtime-decoding               \
+         -lcudaq-qec-realtime-decoding-quantinuum    \
+         -Wl,--export-dynamic                        \
+         -o surface_code-1-quantinuum-hardware
+
+   # Execute
+   export CUDAQ_QUANTINUUM_CREDENTIALS=<credentials_file_path>
+   ./surface_code-1-quantinuum-hardware --distance 3 --num_shots 100 --load_dem config.yaml
+
+**Key Points:**
+
+- Use Quantinuum target names: ``Helios-1``, ``Helios-1E``, ``Helios-1SC``, etc.
+- Currently only ``Helios-1`` will run the GPU decoders. The ``Helios-1E`` emulator will not run the GPU decoders.
+- Set ``CUDAQ_QUANTINUUM_CREDENTIALS`` environment variable with the user's credentials.
+  Check out the `Quantinuum hardware backend documentation <https://nvidia.github.io/cuda-quantum/latest/using/backends/hardware/iontrap.html#quantinuum>`_ for more information.
+
+**Emulated Quantinuum Compilation Workflow**
+
+Compile for Quantinuum emulation mode:
+
+.. code-block:: bash
+
+   nvq++ --target quantinuum --emulate            \
+         --quantinuum-machine Helios-Fake         \
+         surface_code-1.cpp                       \
+         -lcudaq-qec                              \
+         -lcudaq-qec-realtime-decoding            \
+         -lcudaq-qec-realtime-decoding-quantinuum \
+         -Wl,--export-dynamic                     \
+         -o surface_code-1-quantinuum-emulate
+
+   # Execute
+   ./surface_code-1-quantinuum-emulate --distance 3 --num_shots 1000 --load_dem config.yaml
+
+**Key Points:**
+
+- ``--target quantinuum --emulate``: Emulate Quantinuum compilation path
+- ``--quantinuum-machine Helios-Fake``: Specify machine (``Helios-Fake`` for emulation)
+- ``-lcudaq-qec-realtime-decoding-quantinuum``: Quantinuum-specific decoder backend (replaces ``-simulation``)
+- ``-Wl,--export-dynamic``: **Required** linker flag for dynamic symbol resolution
+
+.. note::
+  When running with ``--emulate``, there is no noise being applied because there
+  is currently no way to express noise in target-specific QIR. Therefore, when
+  running with emulation, users will see noise-free sample data.
+
+Python Execution
+^^^^^^^^^^^^^^^^
+
+**Simulation Backend (Stim)**
+
+.. code-block:: bash
+
+   # Generate a decoder configuration file
+   python3 surface_code-1.py --distance 3 --save_dem config.yaml
+   # Run the circuit with the decoder configuration
+   python3 surface_code-1.py --distance 3 --load_dem config.yaml --num_shots 1000
+
+
+**Quantinuum Backend (Hardware)**
+
+.. code-block:: bash
+
+   python3 surface_code-1.py --distance 3 --load_dem config.yaml --num_shots 1000 --target quantinuum --machine-name Helios-1
+
+**Key Points:**
+
+- Use real machine names (check Quantinuum portal for available machines)
+- Reduce shot count for hardware experiments (hardware time is expensive)
+
+**Emulated Quantinuum Compilation Workflow**
+
+.. code-block:: bash
+
+   python3 surface_code-1.py --distance 3 --load_dem config.yaml --num_shots 1000 --target quantinuum --emulate
+
+**Key Points:**
+
+- ``emulate=True``: Emulate Quantinuum compilation path
+- Decoder config is automatically uploaded to Quantinuum's servers when
+  :py:func:`cudaq_qec.configure_decoders_from_file` (Python) or
+  :cpp:func:`cudaq::qec::decoding::config::configure_decoders_from_file` (C++) is called
+
+Complete Workflow Example
+^^^^^^^^^^^^^^^^^^^^^^^^^^
+Given that the user follows the structure of the examples provided, where each executable takes terminal arguments to configure the application, the following workflow can be used to compile and execute the application.
+
+
+.. code-block:: bash
+
+   # Phase 1: Generate Detector Error Model (DEM)
+   # This is done once per code/distance/noise configuration
+   
+   ## C++
+   ./surface_code-1 --distance 3 --num_shots 1000 --p_spam 0.01 \
+                    --save_dem config_d3.yaml --num_rounds 12 --decoder_window 6
+   
+   ## Python
+   python surface_code-1.py --distance 3 --num_shots 1000 --p_spam 0.01 \
+                            --save_dem config_d3.yaml --num_rounds 12 --decoder_window 6
+   
+   # Phase 2: Run with Real-Time Decoding
+   # Use the saved DEM configuration
+   
+   ## Simulation
+   ./surface_code-1 --distance 3 --num_shots 1000 --load_dem config_d3.yaml \
+                    --num_rounds 12 --decoder_window 6
+   
+   ## Quantinuum Emulation
+   ./surface_code-1-quantinuum-emulate --distance 3 --num_shots 1000 --load_dem config_d3.yaml \
+                           --num_rounds 12 --decoder_window 6
+   
+   ## Quantinuum Hardware
+   export CUDAQ_QUANTINUUM_CREDENTIALS=credentials.json
+   ./surface_code-1-quantinuum-hardware --distance 3 --num_shots 100 --load_dem config_d3.yaml \
+                         --num_rounds 12 --decoder_window 6
+
+**Application Parameters:**
+
+- ``--distance``: Code distance (3, 5, 7, etc.)
+- ``--num_shots``: Number of circuit repetitions
+- ``--p_spam``: Physical error rate for noise model (DEM generation only)
+- ``--save_dem``: Generate and save DEM configuration to file
+- ``--load_dem``: Load existing DEM configuration from file
+- ``--num_rounds``: Total number of syndrome measurement rounds
+- ``--decoder_window``: Number of rounds processed per decoding window
+
+Debugging and Environment Variables
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ 
+**Useful Environment Variables:**
+
+.. code-block:: bash
+
+   # Enable decoder configuration debugging
+   export CUDAQ_QEC_DEBUG_DECODER=1
+   
+   # Set default simulator
+   export CUDAQ_DEFAULT_SIMULATOR=stim
+   
+   # Dump JIT IR for debugging compilation issues
+   export CUDAQ_DUMP_JIT_IR=1
+   
+   # Set Quantinuum credentials file
+   export CUDAQ_QUANTINUUM_CREDENTIALS=/path/to/credentials.json
+
+The variables can be set in the user's environment or in a script.
+They are valid for both Python and C++ applications; however, they must be set before importing the cudaq or cudaq_qec libraries.
+
+**Common Compilation Issues:**
+
+1. **Missing libraries**: Ensure all ``-lcudaq-qec-*`` libraries are linked
+2. **Wrong backend library**: Use ``-simulation`` for Stim, ``-quantinuum`` for Quantinuum
+3. **Missing** ``--export-dynamic`` **flag**: Required for Quantinuum targets
+4. **Wrong target flags**: ``--emulate`` with ``Helios-Fake`` for emulation, remove for hardware
+
+**Common Runtime Issues:**
+
+1. **"Decoder X not found"**: Call ``configure_decoders_from_file()`` before circuit execution
+2. **"Configuration upload failed"**: Check network connectivity and Quantinuum credentials
+3. **Dimension mismatch errors**: Verify DEM dimensions match the circuit's syndrome count
+4. **High error rates**: Check decoder window size matches DEM generation window
+
+
+Decoder Selection
+^^^^^^^^^^^^^^^^^
+The page `CUDA-Q QEC Decoders <https://nvidia.github.io/cudaqx/components/qec/introduction.html#pre-built-qec-decoders>`_ provides information about which decoders are compatible with real-time decoding.
+
+Troubleshooting
+---------------
+
+Even with careful configuration, issues may be encountered during real-time decoding. This section covers the most common problems and their solutions, organized by symptom. When troubleshooting, start by isolating whether the issue is in DEM generation, decoder configuration, or runtime execution.
+
+Configuration Upload Failures (Quantinuum Backend)
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+When using the Quantinuum backend, the decoder configuration must be uploaded to their REST API before job submission. Upload failures prevent the quantum program from running and can be difficult to diagnose without knowing what to look for.
+
+**Possible Issues:**
+
+* **Network connectivity problems**: Connection to Quantinuum's servers is interrupted or unstable
+* **Configuration too large**: Decoder configuration exceeds Quantinuum's upload size limits (typically happens with large distance codes and lookup tables)
+* **Invalid credentials**: API authentication fails due to expired or incorrect credentials
+* **Malformed configuration**: YAML structure is invalid or contains unsupported parameters
+
+**Solutions**:
+
+* **Enable debug logging**: Set ``CUDAQ_QEC_DEBUG_DECODER=1`` environment variable to see the exact configuration being uploaded and any error messages from the REST API
+* **Check network**: Verify that Quantinuum's API endpoints can be reached before running the program. Test with a simple job submission first.
+* **Reduce configuration size**: If uploads fail due to size, switch from lookup table decoders to QLDPC (much more compact), or use sliding window with smaller windows
+* **Validate YAML locally**: Before uploading, test that ``multi_decoder_config::from_yaml_str()`` can parse the configuration file without errors
+* **Check credentials**: Ensure the Quantinuum API credentials are valid and have not expired. Refresh tokens if necessary.
+* **Test with emulation**: Try ``emulate=True`` first - emulation uses the same upload infrastructure but provides faster feedback if there are configuration issues
+
+**Verification**:
+
+After fixing configuration issues, the following log messages should appear:
+
+.. code-block:: text
+
+   [info] Initializing realtime decoding library with config file: config.yaml
+   [info] Initializing decoders...
+   [info] Creating decoder 0 of type multi_error_lut
+   [info] Done initializing decoder 0 in 0.234 seconds
+
+If errors appear instead, check the full error message - it often contains specific details about what failed (network timeout, size limit, parsing error, etc.).
+
+See Also
+--------
+
+* :doc:`/api/qec/cpp_api` - C++ API Reference (includes Real-Time Decoding)
+* :doc:`/api/qec/python_api` - Python API Reference (includes Real-Time Decoding)
+* Example source code: ``libs/qec/unittests/realtime/app_examples/``
+
diff --git a/docs/sphinx/index.rst b/docs/sphinx/index.rst
index 15153ebf..54feaf52 100644
--- a/docs/sphinx/index.rst
+++ b/docs/sphinx/index.rst
@@ -49,7 +49,8 @@ APIs for common quantum-classical solver workflows.
 * **cudaq-qec**: Quantum Error Correction Library
     * Extensible framework describing quantum error correcting codes as a collection of CUDA-Q kernels.
     * Extensible framework for describing syndrome decoders
-    * State-of-the-art, performant decoder implementations on NVIDIA GPUs (coming soon)
+    * State-of-the-art, performant decoder implementations on NVIDIA GPUs
+    * Real-time decoding for active error correction on quantum hardware
     * Pre-built numerical experiment APIs
 
 * **cudaq-solvers**: Performant Quantum-Classical Simulation Workflows
diff --git a/libs/qec/include/cudaq/qec/pcm_utils.h b/libs/qec/include/cudaq/qec/pcm_utils.h
index 350d57ae..c9387450 100644
--- a/libs/qec/include/cudaq/qec/pcm_utils.h
+++ b/libs/qec/include/cudaq/qec/pcm_utils.h
@@ -37,7 +37,8 @@ cudaqx::tensor<uint8_t> pcm_from_sparse_string(const std::string &sparse_str,
                                                std::size_t num_cols);
 
 /// @brief Return a PCM from a sparse representation.
-/// @param sparse_vec The sparse representation of the PCM.
+/// @param sparse_vec The sparse representation of the PCM, where -1 separates
+/// rows.
 /// @param num_rows The number of rows in the PCM.
 /// @param num_cols The number of columns in the PCM.
 /// @return A PCM tensor.
@@ -47,7 +48,8 @@ pcm_from_sparse_vec(const std::vector<std::int64_t> &sparse_vec,
 
 /// @brief Return a sparse representation of the PCM.
 /// @param pcm The PCM to convert to a sparse representation.
-/// @return A vector of integers that represents the PCM in a sparse format.
+/// @return A vector of integers that represents the PCM in a sparse format,
+/// where -1 separates rows.
 std::vector<std::int64_t> pcm_to_sparse_vec(const cudaqx::tensor<uint8_t> &pcm);
 
 /// @brief Generate a sparse detector matrix for a given number of syndromes per
diff --git a/libs/qec/python/bindings/py_decoder.cpp b/libs/qec/python/bindings/py_decoder.cpp
index 3c01054b..a5acd730 100644
--- a/libs/qec/python/bindings/py_decoder.cpp
+++ b/libs/qec/python/bindings/py_decoder.cpp
@@ -813,10 +813,12 @@ void bindDecoder(py::module &mod) {
         Generate a sparse detector matrix for a given number of syndromes per round
         and number of rounds. Time-like here means that each round of syndrome measurement
         bits are xor'd against the preceding round.
+        
         Args:
             num_syndromes_per_round: The number of syndrome measurements per round
             num_rounds: The number of rounds to generate the sparse detector matrix for
             include_first_round: Whether to include the first round of syndrome measurements
+
         Returns:
             The detector matrix format is CSR-like, with -1 values indicating the end of each row.
       )pbdoc",
diff --git a/scripts/ci/test_examples.sh b/scripts/ci/test_examples.sh
index 84ac1462..653c48ce 100755
--- a/scripts/ci/test_examples.sh
+++ b/scripts/ci/test_examples.sh
@@ -70,7 +70,17 @@ if [[ "$LIB" == "qec" || "$LIB" == "all" ]]; then
     done
     
     for file in examples/qec/cpp/*.cpp; do
-        run_cpp_test "$file" "--target=stim -lcudaq-qec"
+        # Bare filename (no directory); it is stripped from the nvq++ line below.
+        filename=$(basename "$file")
+        # If the cpp file mentions an nvq++ command, reuse the command line
+        # options found after "nvq++ " (minus the trailing filename). If there
+        # is no nvq++ command, fall back to the default options.
+        nvqpp_options=$(grep -F 'nvq++' "$file" | sed -re "s/.*nvq\+\+ //" | sed -re "s/ $filename//")
+        if [ -n "$nvqpp_options" ]; then
+            run_cpp_test "$file" "$nvqpp_options"
+        else
+            run_cpp_test "$file" "--target=stim -lcudaq-qec"
+        fi
     done
 fi