diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 3ac02c917a..0bf2b0f3f7 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -314,6 +314,81 @@ create_logger_macros(CUOPT "cuopt::default_logger()" include/cuopt)
 
 find_package(CUDSS REQUIRED)
 
+# ##################################################################################################
+# - NCCL (multi-GPU distributed PDLP) -------------------------------------------------------------
+# NCCL is shipped via the conda env; no canonical CMake config target, so look it
+# up by name in the standard lib paths (plus CONDA_PREFIX as a hint).
+set(NCCL_HINT_PREFIXES "")
+if (DEFINED ENV{CONDA_PREFIX} AND NOT "$ENV{CONDA_PREFIX}" STREQUAL "")
+    list(APPEND NCCL_HINT_PREFIXES "$ENV{CONDA_PREFIX}")
+endif ()
+find_path(NCCL_INCLUDE_DIR
+    NAMES nccl.h
+    HINTS ${NCCL_HINT_PREFIXES}
+    PATH_SUFFIXES include
+)
+find_library(NCCL_LIBRARY
+    NAMES nccl
+    HINTS ${NCCL_HINT_PREFIXES}
+    PATH_SUFFIXES lib lib64
+)
+if (NOT NCCL_INCLUDE_DIR OR NOT NCCL_LIBRARY)
+    message(FATAL_ERROR "NCCL not found. Looked in ${NCCL_HINT_PREFIXES}. Install nccl-dev / libnccl-dev in the active env.")
+endif ()
+add_library(nccl_external UNKNOWN IMPORTED GLOBAL)
+set_target_properties(nccl_external PROPERTIES
+    IMPORTED_LOCATION "${NCCL_LIBRARY}"
+    INTERFACE_INCLUDE_DIRECTORIES "${NCCL_INCLUDE_DIR}"
+)
+message(STATUS "Using NCCL: ${NCCL_LIBRARY}")
+
+# ##################################################################################################
+# - METIS (graph partitioning for distributed PDLP) -----------------------------------------------
+# Found by searching CONDA_PREFIX first, then CUOPT_METIS_ROOT (cmake var or env)
+# if the user wants to pull METIS from a different conda env / system path.
+set(METIS_HINT_PREFIXES "")
+if (DEFINED ENV{CONDA_PREFIX} AND NOT "$ENV{CONDA_PREFIX}" STREQUAL "")
+    list(APPEND METIS_HINT_PREFIXES "$ENV{CONDA_PREFIX}")
+endif ()
+if (DEFINED CUOPT_METIS_ROOT AND NOT "${CUOPT_METIS_ROOT}" STREQUAL "")
+    list(APPEND METIS_HINT_PREFIXES "${CUOPT_METIS_ROOT}")
+endif ()
+if (DEFINED ENV{CUOPT_METIS_ROOT} AND NOT "$ENV{CUOPT_METIS_ROOT}" STREQUAL "")
+    list(APPEND METIS_HINT_PREFIXES "$ENV{CUOPT_METIS_ROOT}")
+endif ()
+find_path(METIS_INCLUDE_DIR
+    NAMES metis.h
+    HINTS ${METIS_HINT_PREFIXES}
+    PATH_SUFFIXES include
+)
+find_library(METIS_LIBRARY
+    NAMES metis libmetis
+    HINTS ${METIS_HINT_PREFIXES}
+    PATH_SUFFIXES lib lib64
+)
+if (NOT METIS_INCLUDE_DIR OR NOT METIS_LIBRARY)
+    message(FATAL_ERROR "METIS not found. Looked in: ${METIS_HINT_PREFIXES}. "
+                        "Install it via 'conda install -c conda-forge metis' in the active env, "
+                        "or set CUOPT_METIS_ROOT to a prefix containing include/metis.h and lib/libmetis.{so,a}.")
+endif ()
+add_library(metis_external UNKNOWN IMPORTED GLOBAL)
+set_target_properties(metis_external PROPERTIES
+    IMPORTED_LOCATION "${METIS_LIBRARY}"
+    INTERFACE_INCLUDE_DIRECTORIES "${METIS_INCLUDE_DIR}"
+)
+message(STATUS "Using METIS: ${METIS_LIBRARY}")
+
+# ##################################################################################################
+# - KaMinPar (multi-threaded partitioning for distributed PDLP) ------------------------------------
+# Brought in the RAPIDS way (rapids_cpm_find): uses an installed KaMinPar (deb/rpm/conda,
+# discovered via its CMake config) if present, otherwise builds the pinned source via CPM.
+# Distributed PDLP prefers KaMinPar over METIS.
+include(cmake/thirdparty/get_kaminpar.cmake)
+if (NOT TARGET KaMinPar::KaMinPar)
+    message(FATAL_ERROR "KaMinPar::KaMinPar was not made available by get_kaminpar.cmake")
+endif ()
+message(STATUS "Using KaMinPar (distributed PDLP prefers KaMinPar over METIS)")
+
 # ##################################################################################################
 # - gRPC and Protobuf setup -----------------------------------------------------------------------
 
@@ -576,6 +651,9 @@ target_link_libraries(cuopt
         ${CUDSS_LIB_FILE}
         PRIVATE
         ${CUOPT_PRIVATE_CUDA_LIBS}
+        nccl_external
+        metis_external
+        KaMinPar::KaMinPar
         $<$<BOOL:${CUOPT_ENABLE_GRPC}>:protobuf::libprotobuf>
         $<$<BOOL:${CUOPT_ENABLE_GRPC}>:gRPC::grpc++>
 )
diff --git a/cpp/cmake/thirdparty/get_kaminpar.cmake b/cpp/cmake/thirdparty/get_kaminpar.cmake
new file mode 100644
index 0000000000..d548a76115
--- /dev/null
+++ b/cpp/cmake/thirdparty/get_kaminpar.cmake
@@ -0,0 +1,48 @@
+# cmake-format: off
+# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+# cmake-format: on
+
+# Multi-threaded graph partitioner for distributed PDLP.
+# Uses rapids_cpm_find so a system / conda / .deb install of KaMinPar (which ships a
+# CMake config package exporting KaMinPar::KaMinPar) is used when available, and
+# otherwise the pinned source is cloned and built via CPM. KaMinPar depends on TBB,
+# which cuOpt already requires (see find_package(TBB) for papilo).
+function(find_and_configure_kaminpar)
+    set(oneValueArgs VERSION PINNED_TAG)
+    cmake_parse_arguments(PKG "" "${oneValueArgs}" "" ${ARGN})
+
+    rapids_cpm_find(KaMinPar ${PKG_VERSION}
+        GLOBAL_TARGETS KaMinPar::KaMinPar
+        CPM_ARGS
+        GIT_REPOSITORY https://github.com/KaHIP/KaMinPar.git
+        GIT_TAG ${PKG_PINNED_TAG}
+        EXCLUDE_FROM_ALL
+        OPTIONS
+            "KAMINPAR_BUILD_APPS OFF"
+            "KAMINPAR_BUILD_TOOLS OFF"
+            "KAMINPAR_BUILD_TESTS OFF"
+            "KAMINPAR_BUILD_BENCHMARKS OFF"
+            "KAMINPAR_BUILD_EXAMPLES OFF"
+            "KAMINPAR_BUILD_DISTRIBUTED OFF"
+            # Timers use global state and force single-threaded use of the library
+            # interface; disable so cuOpt can call the partitioner freely.
+            "KAMINPAR_ENABLE_TIMERS OFF"
+            # Avoid an extra hard dependency on Google Sparsehash.
+            "KAMINPAR_BUILD_WITH_SPARSEHASH OFF"
+            # cuOpt's TBB is discovered via a legacy find that only exposes TBB::tbb
+            # (no TBB::tbbmalloc target); disable KaMinPar's optional tbbmalloc use.
+            "KAMINPAR_ENABLE_TBB_MALLOC OFF"
+            # Large LP constraint graphs can exceed 2^31 directed edges.
+            "KAMINPAR_64BIT_EDGE_IDS ON"
+            "INSTALL_KAMINPAR OFF"
+    )
+
+    if(KaMinPar_ADDED)
+        message(VERBOSE "CUOPT: Using KaMinPar located in ${KaMinPar_SOURCE_DIR}")
+    else()
+        message(VERBOSE "CUOPT: Using KaMinPar located in ${KaMinPar_DIR}")
+    endif()
+endfunction()
+
+find_and_configure_kaminpar(VERSION 3.7.3 PINNED_TAG v3.7.3)
diff --git a/cpp/cuopt_cli.cpp b/cpp/cuopt_cli.cpp
index 39aab47170..b730067a28 100644
--- a/cpp/cuopt_cli.cpp
+++ b/cpp/cuopt_cli.cpp
@@ -176,7 +176,12 @@ int run_single_file(const std::string& file_path,
       auto solution = cuopt::linear_programming::solve_mip(problem_interface.get(), mip_settings);
     } else {
       auto& lp_settings = settings.get_pdlp_settings();
-      auto solution     = cuopt::linear_programming::solve_lp(problem_interface.get(), lp_settings);
+
+      if (lp_settings.hyper_params.use_distributed_pdlp) {
+        cuopt::linear_programming::solve_lp(handle_ptr.get(), mps_data_model, lp_settings);
+      } else {
+        cuopt::linear_programming::solve_lp(problem_interface.get(), lp_settings);
+      }
     }
   } catch (const std::exception& e) {
     fprintf(stderr, "cuopt_cli error: %s\n", e.what());
@@ -426,10 +431,21 @@ int main(int argc, char* argv[])
   std::vector<rmm::mr::cuda_async_memory_resource> memory_resources;
 
   if (memory_backend == cuopt::linear_programming::memory_backend_t::GPU) {
-    const int num_gpus = settings.get_parameter<int>(CUOPT_NUM_GPUS);
+    // Distributed PDLP scales one shard per GPU and uses its own knob; everything else
+    // (concurrent, batch, MIP) uses num_gpus which is capped at 2.
+    // For distributed PDLP, -1 means "auto-detect": resolve to the visible device
+    // count so the RMM memory pools match what solve.cu will eventually dispatch.
+    const bool use_distributed_pdlp = settings.get_parameter<bool>(CUOPT_USE_DISTRIBUTED_PDLP);
+    int requested_gpus              = use_distributed_pdlp
+                                        ? settings.get_parameter<int>(CUOPT_DISTRIBUTED_PDLP_NUM_GPUS)
+                                        : settings.get_parameter<int>(CUOPT_NUM_GPUS);
+    if (use_distributed_pdlp && requested_gpus == -1) {
+      requested_gpus = raft::device_setter::get_device_count();
+    }
+    const int provisioned_gpus = std::min(raft::device_setter::get_device_count(), requested_gpus);
 
-    memory_resources.reserve(std::min(raft::device_setter::get_device_count(), num_gpus));
-    for (int i = 0; i < std::min(raft::device_setter::get_device_count(), num_gpus); ++i) {
+    memory_resources.reserve(provisioned_gpus);
+    for (int i = 0; i < provisioned_gpus; ++i) {
       RAFT_CUDA_TRY(cudaSetDevice(i));
       memory_resources.emplace_back();
       rmm::mr::set_per_device_resource(rmm::cuda_device_id{i}, memory_resources.back());
diff --git a/cpp/include/cuopt/linear_programming/constants.h b/cpp/include/cuopt/linear_programming/constants.h
index 3342ba375c..29648d1a0f 100644
--- a/cpp/include/cuopt/linear_programming/constants.h
+++ b/cpp/include/cuopt/linear_programming/constants.h
@@ -80,14 +80,20 @@
 #define CUOPT_MIP_STRONG_BRANCHING_SIMPLEX_ITERATION_LIMIT \
   "mip_strong_branching_simplex_iteration_limit"
 
-#define CUOPT_SOLUTION_FILE            "solution_file"
-#define CUOPT_NUM_CPU_THREADS          "num_cpu_threads"
-#define CUOPT_NUM_GPUS                 "num_gpus"
-#define CUOPT_USER_PROBLEM_FILE        "user_problem_file"
-#define CUOPT_PRESOLVE_FILE            "presolve_file"
-#define CUOPT_RANDOM_SEED              "random_seed"
-#define CUOPT_PDLP_PRECISION           "pdlp_precision"
-#define CUOPT_MIP_SEMICONTINUOUS_BIG_M "mip_semi_continuous_big_m"
+#define CUOPT_SOLUTION_FILE                   "solution_file"
+#define CUOPT_NUM_CPU_THREADS                 "num_cpu_threads"
+#define CUOPT_NUM_GPUS                        "num_gpus"
+#define CUOPT_DISTRIBUTED_PDLP_NUM_GPUS       "distributed_pdlp_num_gpus"
+#define CUOPT_MULTI_GPU_PARTITION_FILE        "multi_gpu_partition_file"
+#define CUOPT_MULTI_GPU_EXPORT_PARTITION_FILE "multi_gpu_export_partition_file"
+#define CUOPT_DISTRIBUTED_PDLP_PARTITIONER    "distributed_pdlp_partitioner"
+#define CUOPT_USE_DISTRIBUTED_PDLP            "use_distributed_pdlp"
+#define CUOPT_PDLP_DISABLE_GRAPH              "pdlp_disable_graph"
+#define CUOPT_USER_PROBLEM_FILE               "user_problem_file"
+#define CUOPT_PRESOLVE_FILE                   "presolve_file"
+#define CUOPT_RANDOM_SEED                     "random_seed"
+#define CUOPT_PDLP_PRECISION                  "pdlp_precision"
+#define CUOPT_MIP_SEMICONTINUOUS_BIG_M        "mip_semi_continuous_big_m"
 
 #define CUOPT_MIP_HYPER_HEURISTIC_POPULATION_SIZE     "mip_hyper_heuristic_population_size"
 #define CUOPT_MIP_HYPER_HEURISTIC_NUM_CPUFJ_THREADS   "mip_hyper_heuristic_num_cpufj_threads"
diff --git a/cpp/include/cuopt/linear_programming/pdlp/pdlp_hyper_params.cuh b/cpp/include/cuopt/linear_programming/pdlp/pdlp_hyper_params.cuh
index 282e91d7ef..0ce90e7228 100644
--- a/cpp/include/cuopt/linear_programming/pdlp/pdlp_hyper_params.cuh
+++ b/cpp/include/cuopt/linear_programming/pdlp/pdlp_hyper_params.cuh
@@ -47,12 +47,16 @@ struct pdlp_hyper_params_t {
   bool bound_objective_rescaling                                  = true;
   bool use_reflected_primal_dual                                  = true;
   bool use_fixed_point_error                                      = true;
-  double reflection_coefficient                                   = 1.0;
-  double restart_k_p                                              = 0.99;
-  double restart_k_i                                              = 0.01;
-  double restart_k_d                                              = 0.0;
-  double restart_i_smooth                                         = 0.3;
-  bool use_conditional_major                                      = true;
+  bool use_distributed_pdlp                                       = false;
+  // Debug/diagnostic knob: when true, PDLP bypasses CUDA-graph capture in
+  // ping_pong_graph_t and executes each iteration eagerly
+  bool pdlp_disable_graph       = false;
+  double reflection_coefficient = 1.0;
+  double restart_k_p            = 0.99;
+  double restart_k_i            = 0.01;
+  double restart_k_d            = 0.0;
+  double restart_i_smooth       = 0.3;
+  bool use_conditional_major    = true;
 };
 
 // TODO most likely we want to get rid of pdlp_solver_mode and just have prebuilt
diff --git a/cpp/include/cuopt/linear_programming/pdlp/solver_settings.hpp b/cpp/include/cuopt/linear_programming/pdlp/solver_settings.hpp
index b30286f9ce..42ef1f592a 100644
--- a/cpp/include/cuopt/linear_programming/pdlp/solver_settings.hpp
+++ b/cpp/include/cuopt/linear_programming/pdlp/solver_settings.hpp
@@ -307,6 +307,25 @@ class pdlp_solver_settings_t {
   presolver_t presolver{presolver_t::Default};
   bool dual_postsolve{true};
   int num_gpus{1};
+  // Number of GPUs to use specifically for distributed PDLP (use_distributed_pdlp=true).
+  // -1 means auto-detect
+  int distributed_pdlp_num_gpus{-1};
+  std::string multi_gpu_partition_file{""};
+  // If non-empty, the partition computed for distributed PDLP is written to this
+  // path (one part-id per line) right after partitioning. The file can be fed
+  // back via multi_gpu_partition_file. Exposed as the multi_gpu_export_partition_file
+  // parameter (CLI: --multi-gpu-export-partition-file <path>).
+  std::string multi_gpu_export_partition_file{""};
+  // Which graph partitioner distributed PDLP uses. One of:
+  //   "auto"     - 1 GPU => Dummy; otherwise KaMinPar
+  //   "dummy"    - round-robin, no graph (trivial)
+  //   "metis"    - serial METIS_PartGraphKway
+  //   "kaminpar" - multi-threaded KaMinPar
+  // Exposed as the distributed_pdlp_partitioner parameter
+  // (CLI: --distributed-pdlp-partitioner <auto|dummy|metis|kaminpar>).
+  std::string distributed_pdlp_partitioner{"auto"};
+  // Set to true inside the shards
+  bool is_distributed_sub_pdlp{false};
   method_t method{method_t::Concurrent};
   bool inside_mip{false};
   // For concurrent termination
diff --git a/cpp/src/math_optimization/solver_settings.cu b/cpp/src/math_optimization/solver_settings.cu
index 0b4c360b10..254a3afb38 100644
--- a/cpp/src/math_optimization/solver_settings.cu
+++ b/cpp/src/math_optimization/solver_settings.cu
@@ -140,6 +140,7 @@ solver_settings_t<i_t, f_t>::solver_settings_t() : pdlp_settings(), mip_settings
     {CUOPT_MIP_REDUCED_COST_STRENGTHENING, &mip_settings.reduced_cost_strengthening, -1, std::numeric_limits<i_t>::max(), -1},
     {CUOPT_NUM_GPUS, &pdlp_settings.num_gpus, 1, 2, 1},
     {CUOPT_NUM_GPUS, &mip_settings.num_gpus, 1, 2, 1},
+    {CUOPT_DISTRIBUTED_PDLP_NUM_GPUS, &pdlp_settings.distributed_pdlp_num_gpus, -1, 576, -1},
     {CUOPT_MIP_BATCH_PDLP_STRONG_BRANCHING, &mip_settings.mip_batch_pdlp_strong_branching, 0, 2, 0},
     {CUOPT_MIP_BATCH_PDLP_RELIABILITY_BRANCHING, &mip_settings.mip_batch_pdlp_reliability_branching, 0, 2, 0},
     {CUOPT_MIP_STRONG_BRANCHING_SIMPLEX_ITERATION_LIMIT, &mip_settings.strong_branching_simplex_iteration_limit, -1,std::numeric_limits<i_t>::max(), -1},
@@ -177,6 +178,8 @@ solver_settings_t<i_t, f_t>::solver_settings_t() : pdlp_settings(), mip_settings
     {CUOPT_DUAL_POSTSOLVE, &pdlp_settings.dual_postsolve, true},
     {CUOPT_BARRIER_ITERATIVE_REFINEMENT, &pdlp_settings.barrier_iterative_refinement, true},
     {CUOPT_MIP_PROBING, &mip_settings.probing, true},
+    {CUOPT_USE_DISTRIBUTED_PDLP, &pdlp_settings.hyper_params.use_distributed_pdlp, false},
+    {CUOPT_PDLP_DISABLE_GRAPH, &pdlp_settings.hyper_params.pdlp_disable_graph, false},
   };
   // String parameters
   string_parameters = {
@@ -187,7 +190,10 @@ solver_settings_t<i_t, f_t>::solver_settings_t() : pdlp_settings(), mip_settings
     {CUOPT_USER_PROBLEM_FILE, &mip_settings.user_problem_file, ""},
     {CUOPT_USER_PROBLEM_FILE, &pdlp_settings.user_problem_file, ""},
     {CUOPT_PRESOLVE_FILE, &mip_settings.presolve_file, ""},
-    {CUOPT_PRESOLVE_FILE, &pdlp_settings.presolve_file, ""}
+    {CUOPT_PRESOLVE_FILE, &pdlp_settings.presolve_file, ""},
+    {CUOPT_MULTI_GPU_PARTITION_FILE, &pdlp_settings.multi_gpu_partition_file, ""},
+    {CUOPT_MULTI_GPU_EXPORT_PARTITION_FILE, &pdlp_settings.multi_gpu_export_partition_file, ""},
+    {CUOPT_DISTRIBUTED_PDLP_PARTITIONER, &pdlp_settings.distributed_pdlp_partitioner, "auto"},
   };
   // clang-format on
 }
diff --git a/cpp/src/pdlp/CMakeLists.txt b/cpp/src/pdlp/CMakeLists.txt
index f5f26837b6..12f2550203 100644
--- a/cpp/src/pdlp/CMakeLists.txt
+++ b/cpp/src/pdlp/CMakeLists.txt
@@ -29,6 +29,12 @@ set(LP_CORE_FILES
   ${CMAKE_CURRENT_SOURCE_DIR}/termination_strategy/convergence_information.cu
   ${CMAKE_CURRENT_SOURCE_DIR}/optimal_batch_size_handler/optimal_batch_size_handler.cu
   ${CMAKE_CURRENT_SOURCE_DIR}/utilities/ping_pong_graph.cu
+  ${CMAKE_CURRENT_SOURCE_DIR}/distributed_pdlp/shard.cu
+  ${CMAKE_CURRENT_SOURCE_DIR}/distributed_pdlp/multi_gpu_engine.cu
+  ${CMAKE_CURRENT_SOURCE_DIR}/distributed_pdlp/partition_loader.cu
+  ${CMAKE_CURRENT_SOURCE_DIR}/distributed_pdlp/partitioner.cu
+  ${CMAKE_CURRENT_SOURCE_DIR}/distributed_pdlp/metis_partitioner.cu
+  ${CMAKE_CURRENT_SOURCE_DIR}/distributed_pdlp/kaminpar_partitioner.cpp
 )
 
 # C and Python adapter files
diff --git a/cpp/src/pdlp/cusparse_view.cu b/cpp/src/pdlp/cusparse_view.cu
index 396fd27499..1e3638cdbd 100644
--- a/cpp/src/pdlp/cusparse_view.cu
+++ b/cpp/src/pdlp/cusparse_view.cu
@@ -498,14 +498,17 @@ cusparse_view_t<i_t, f_t>::cusparse_view_t(
   // setup cusparse view
   A.create(op_problem_scaled.n_constraints,
            op_problem_scaled.n_variables,
-           op_problem_scaled.nnz,
+           static_cast<int64_t>(A_.size()),
            const_cast<i_t*>(op_problem_scaled.offsets.data()),
            const_cast<i_t*>(op_problem_scaled.variables.data()),
            const_cast<f_t*>(op_problem_scaled.coefficients.data()));
 
+  // A_T can have a different nnz than A in multi-GPU shards
+  // A is just what is needed to compute A_x for owned constraints
+  // A_T is just what is needed to compute A_T_y for owned variables
   A_T.create(op_problem_scaled.n_variables,
              op_problem_scaled.n_constraints,
-             op_problem_scaled.nnz,
+             static_cast<int64_t>(A_T_.size()),
              const_cast<i_t*>(A_T_offsets_.data()),
              const_cast<i_t*>(A_T_indices_.data()),
              const_cast<f_t*>(A_T_.data()));
@@ -914,14 +917,14 @@ cusparse_view_t<i_t, f_t>::cusparse_view_t(
   // setup cusparse view
   A.create(op_problem.n_constraints,
            op_problem.n_variables,
-           op_problem.nnz,
+           static_cast<int64_t>(A_.size()),
            const_cast<i_t*>(op_problem.offsets.data()),
            const_cast<i_t*>(op_problem.variables.data()),
            const_cast<f_t*>(op_problem.coefficients.data()));
 
   A_T.create(op_problem.n_variables,
              op_problem.n_constraints,
-             op_problem.nnz,
+             static_cast<int64_t>(A_T_.size()),
              const_cast<i_t*>(A_T_offsets_.data()),
              const_cast<i_t*>(A_T_indices_.data()),
              const_cast<f_t*>(A_T_.data()));
@@ -1129,16 +1132,18 @@ cusparse_view_t<i_t, f_t>::cusparse_view_t(
   // Copying them from the existing cuSparse view is a bad practice and creates segfault post
   // CUDA 12.4 Using the saved pointer of the existing cusparse view to make sure we capture the
   // correct pointer
+  // See comment in the PDHG cusparse_view_t ctor: bind the descriptor nnz to
+  // the actual value-buffer length so A and A_T stay symmetric and shard-safe.
   A.create(op_problem.n_constraints,
            op_problem.n_variables,
-           op_problem.nnz,
+           static_cast<int64_t>(A_.size()),
            const_cast<i_t*>(A_offsets_.data()),
            const_cast<i_t*>(A_indices_.data()),
            const_cast<f_t*>(A_.data()));
 
   A_T.create(op_problem.n_variables,
              op_problem.n_constraints,
-             op_problem.nnz,
+             static_cast<int64_t>(existing_cusparse_view.A_T_.size()),
              const_cast<i_t*>(existing_cusparse_view.A_T_offsets_.data()),
              const_cast<i_t*>(existing_cusparse_view.A_T_indices_.data()),
              const_cast<f_t*>(existing_cusparse_view.A_T_.data()));
diff --git a/cpp/src/pdlp/distributed_pdlp/kaminpar_partitioner.cpp b/cpp/src/pdlp/distributed_pdlp/kaminpar_partitioner.cpp
new file mode 100644
index 0000000000..e7bf943f92
--- /dev/null
+++ b/cpp/src/pdlp/distributed_pdlp/kaminpar_partitioner.cpp
@@ -0,0 +1,142 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+// Plain C++ translation unit (not .cu): KaMinPar's public header is C++20 host code
+// and pulls in TBB; keeping it out of nvcc avoids device-compiler friction.
+
+#include <pdlp/distributed_pdlp/kaminpar_partitioner.hpp>
+#include <pdlp/distributed_pdlp/partitioner.hpp>
+
+#include <utilities/logger.hpp>
+
+#include <cuopt/error.hpp>
+
+#include <kaminpar.h>
+
+#include <chrono>
+#include <cstddef>
+#include <span>
+#include <thread>
+#include <vector>
+
+namespace cuopt::linear_programming::detail {
+
+// Builds the bipartite constraint/variable graph induced by A (identical layout
+// to metis_partitioner_t) and runs the multi-threaded KaMinPar k-way kernel.
+//   * nodes [0, nb_cstr)              : constraint nodes
+//   * nodes [nb_cstr, nb_cstr+nb_vars): variable nodes
+//   * undirected edges from each A nonzero (one half via A, one via A_t)
+template <typename i_t, typename f_t>
+std::vector<i_t> kaminpar_partitioner_t<i_t, f_t>::partition(
+  partitioner_input_t<i_t, f_t> const& input) const
+{
+  cuopt_expects(input.nb_parts >= 1,
+                error_type_t::ValidationError,
+                "kaminpar_partitioner: nb_parts must be >= 1");
+  cuopt_expects(input.nb_cstr >= 0 && input.nb_vars >= 0,
+                error_type_t::ValidationError,
+                "kaminpar_partitioner: invalid problem dimensions");
+
+  // The k-way kernel needs at least 2 blocks. For the single-shard case the
+  // partition is trivial (everything in block 0); short-circuit so KaMinPar can
+  // still be selected with distributed_pdlp_num_gpus == 1 without crashing.
+  if (input.nb_parts == 1) {
+    CUOPT_LOG_INFO("KaMinPar: nb_parts == 1, returning trivial single-block partition");
+    return std::vector<i_t>(static_cast<std::size_t>(input.nb_cstr + input.nb_vars), i_t{0});
+  }
+  cuopt_expects(input.A.row_offsets != nullptr && input.A.col_indices != nullptr,
+                error_type_t::ValidationError,
+                "kaminpar_partitioner: A.row_offsets and A.col_indices are required");
+  cuopt_expects(input.A_t.row_offsets != nullptr && input.A_t.col_indices != nullptr,
+                error_type_t::ValidationError,
+                "kaminpar_partitioner: A_t.row_offsets and A_t.col_indices are required");
+
+  auto const& A_offsets   = *input.A.row_offsets;
+  auto const& A_cols      = *input.A.col_indices;
+  auto const& A_t_offsets = *input.A_t.row_offsets;
+  auto const& A_t_cols    = *input.A_t.col_indices;
+
+  cuopt_expects(static_cast<i_t>(A_offsets.size()) == input.nb_cstr + 1,
+                error_type_t::ValidationError,
+                "kaminpar_partitioner: A.row_offsets size mismatch (expected nb_cstr+1)");
+  cuopt_expects(static_cast<i_t>(A_t_offsets.size()) == input.nb_vars + 1,
+                error_type_t::ValidationError,
+                "kaminpar_partitioner: A_t.row_offsets size mismatch (expected nb_vars+1)");
+  cuopt_expects(A_cols.size() == A_t_cols.size(),
+                error_type_t::ValidationError,
+                "kaminpar_partitioner: A and A_t nnz mismatch");
+
+  const i_t nb_cstr = input.nb_cstr;
+  const i_t nb_vars = input.nb_vars;
+  const i_t nnz     = static_cast<i_t>(A_cols.size());
+  const i_t nvtx    = nb_cstr + nb_vars;
+
+  // Resolve thread count: <= 0 => all hardware threads (1 as a last resort).
+  int nthreads = input.nb_threads > 0 ? static_cast<int>(input.nb_threads) : 0;
+  if (nthreads <= 0) {
+    nthreads = static_cast<int>(std::thread::hardware_concurrency());
+    if (nthreads <= 0) { nthreads = 1; }
+  }
+
+  // Bipartite CSR using KaMinPar index types (EdgeID for offsets, NodeID for neighbours).
+  std::vector<kaminpar::shm::EdgeID> xadj(static_cast<std::size_t>(nvtx) + 1);
+  std::vector<kaminpar::shm::NodeID> adjncy(2 * static_cast<std::size_t>(nnz));
+
+  for (i_t i = 0; i <= nb_cstr; ++i) {
+    xadj[i] = static_cast<kaminpar::shm::EdgeID>(A_offsets[i]);
+  }
+  for (i_t i = 0; i <= nb_vars; ++i) {
+    xadj[nb_cstr + i] =
+      static_cast<kaminpar::shm::EdgeID>(A_t_offsets[i]) + static_cast<kaminpar::shm::EdgeID>(nnz);
+  }
+  for (i_t k = 0; k < nnz; ++k) {
+    adjncy[k] =
+      static_cast<kaminpar::shm::NodeID>(A_cols[k]) + static_cast<kaminpar::shm::NodeID>(nb_cstr);
+  }
+  for (i_t k = 0; k < nnz; ++k) {
+    adjncy[nnz + k] = static_cast<kaminpar::shm::NodeID>(A_t_cols[k]);
+  }
+
+  std::vector<kaminpar::shm::BlockID> block_of(static_cast<std::size_t>(nvtx));
+
+  kaminpar::KaMinPar engine(nthreads, kaminpar::shm::create_default_context());
+  engine.copy_graph(std::span<const kaminpar::shm::EdgeID>(xadj),
+                    std::span<const kaminpar::shm::NodeID>(adjncy));
+  engine.set_k(static_cast<kaminpar::shm::BlockID>(input.nb_parts));
+  // ~3% imbalance, matching METIS_PartGraphKway's default balance constraint.
+  engine.set_uniform_max_block_weights(0.03);
+
+  auto t0 = std::chrono::high_resolution_clock::now();
+  const kaminpar::shm::EdgeWeight edge_cut =
+    engine.compute_partition(std::span<kaminpar::shm::BlockID>(block_of));
+  auto t1         = std::chrono::high_resolution_clock::now();
+  const double dt = std::chrono::duration<double>(t1 - t0).count();
+
+  CUOPT_LOG_INFO(
+    "KaMinPar partitioned bipartite graph: nvtx=%d nnz=%d nb_parts=%d nthreads=%d edge_cut=%lld "
+    "in %.3fs",
+    static_cast<int>(nvtx),
+    static_cast<int>(nnz),
+    static_cast<int>(input.nb_parts),
+    nthreads,
+    static_cast<long long>(edge_cut),
+    dt);
+
+  std::vector<i_t> parts(static_cast<std::size_t>(nvtx));
+  for (i_t i = 0; i < nvtx; ++i) {
+    parts[i] = static_cast<i_t>(block_of[i]);
+  }
+
+  validate_partition(parts,
+                     static_cast<int>(nb_cstr),
+                     static_cast<int>(nb_vars),
+                     static_cast<int>(input.nb_parts),
+                     "kaminpar_partitioner");
+  return parts;
+}
+
+template class kaminpar_partitioner_t<int, double>;
+
+}  // namespace cuopt::linear_programming::detail
diff --git a/cpp/src/pdlp/distributed_pdlp/kaminpar_partitioner.hpp b/cpp/src/pdlp/distributed_pdlp/kaminpar_partitioner.hpp
new file mode 100644
index 0000000000..43fda76f9f
--- /dev/null
+++ b/cpp/src/pdlp/distributed_pdlp/kaminpar_partitioner.hpp
@@ -0,0 +1,23 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#pragma once
+
+#include <pdlp/distributed_pdlp/partitioner.hpp>
+
+namespace cuopt::linear_programming::detail {
+
+// Multi-threaded k-way partitioner backed by KaMinPar. Builds the same
+// constraint/variable bipartite graph as metis_partitioner_t, but runs the
+// shared-memory parallel KaMinPar kernel so partitioning scales across all CPU
+// cores of a node (set via partitioner_input_t::nb_threads; <= 0 => all
+// hardware threads).
+template <typename i_t, typename f_t>
+class kaminpar_partitioner_t : public partitioner_i<i_t, f_t> {
+ public:
+  std::vector<i_t> partition(partitioner_input_t<i_t, f_t> const& input) const override;
+};
+
+}  // namespace cuopt::linear_programming::detail
diff --git a/cpp/src/pdlp/distributed_pdlp/metis_partitioner.cu b/cpp/src/pdlp/distributed_pdlp/metis_partitioner.cu
new file mode 100644
index 0000000000..9a4f0f50b1
--- /dev/null
+++ b/cpp/src/pdlp/distributed_pdlp/metis_partitioner.cu
@@ -0,0 +1,151 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include <pdlp/distributed_pdlp/metis_partitioner.hpp>
+#include <pdlp/distributed_pdlp/partitioner.hpp>
+
+#include <utilities/logger.hpp>
+
+#include <cuopt/error.hpp>
+
+#include <metis.h>
+
+#include <chrono>
+#include <cstddef>
+#include <vector>
+
+namespace cuopt::linear_programming::detail {
+
+// Builds the bipartite constraint/variable graph induced by A and runs
+// METIS_PartGraphKway to assign each of the (nb_cstr + nb_vars) nodes to a
+// part in [0, nb_parts). Layout matches metis_tests:
+//   * nodes [0, nb_cstr)              : constraint nodes
+//   * nodes [nb_cstr, nb_cstr+nb_vars): variable nodes
+//   * undirected edges from each A nonzero (one half via A, one via A_t)
+// The output is consumed by partition_loader_t::create_rank_data_from_parts.
+template <typename i_t, typename f_t>
+std::vector<i_t> metis_partitioner_t<i_t, f_t>::partition(
+  partitioner_input_t<i_t, f_t> const& input) const
+{
+  cuopt_expects(input.nb_parts > 0,
+                error_type_t::ValidationError,
+                "metis_partitioner: nb_parts must be positive");
+  cuopt_expects(input.nb_cstr >= 0 && input.nb_vars >= 0,
+                error_type_t::ValidationError,
+                "metis_partitioner: invalid problem dimensions");
+
+  if (input.nb_parts == 1) {
+    CUOPT_LOG_INFO("METIS: nb_parts == 1, returning trivial single-block partition");
+    return std::vector<i_t>(static_cast<std::size_t>(input.nb_cstr + input.nb_vars), i_t{0});
+  }
+
+  cuopt_expects(input.A.row_offsets != nullptr && input.A.col_indices != nullptr,
+                error_type_t::ValidationError,
+                "metis_partitioner: A.row_offsets and A.col_indices are required");
+  cuopt_expects(input.A_t.row_offsets != nullptr && input.A_t.col_indices != nullptr,
+                error_type_t::ValidationError,
+                "metis_partitioner: A_t.row_offsets and A_t.col_indices are required");
+
+  auto const& A_offsets   = *input.A.row_offsets;
+  auto const& A_cols      = *input.A.col_indices;
+  auto const& A_t_offsets = *input.A_t.row_offsets;
+  auto const& A_t_cols    = *input.A_t.col_indices;
+
+  cuopt_expects(static_cast<i_t>(A_offsets.size()) == input.nb_cstr + 1,
+                error_type_t::ValidationError,
+                "metis_partitioner: A.row_offsets size mismatch (expected nb_cstr+1)");
+  cuopt_expects(static_cast<i_t>(A_t_offsets.size()) == input.nb_vars + 1,
+                error_type_t::ValidationError,
+                "metis_partitioner: A_t.row_offsets size mismatch (expected nb_vars+1)");
+  cuopt_expects(A_cols.size() == A_t_cols.size(),
+                error_type_t::ValidationError,
+                "metis_partitioner: A and A_t nnz mismatch");
+
+  const i_t nb_cstr = input.nb_cstr;
+  const i_t nb_vars = input.nb_vars;
+  const i_t nnz     = static_cast<i_t>(A_cols.size());
+  const i_t nvtx    = nb_cstr + nb_vars;
+
+  // Bipartite CSR. Same construction as metis_tests/src/main.cpp:
+  //   xadj   has length nvtx + 1
+  //   adjncy has length 2 * nnz (each A nonzero contributes one half-edge
+  //          from cstr side via A and one half-edge from var side via A_t)
+  std::vector<idx_t> xadj(nvtx + 1);
+  std::vector<idx_t> adjncy(2 * static_cast<std::size_t>(nnz));
+
+  // cstr-side row offsets: A_offsets[0..nb_cstr] (no shift).
+  for (i_t i = 0; i <= nb_cstr; ++i) {
+    xadj[i] = static_cast<idx_t>(A_offsets[i]);
+  }
+  // var-side row offsets: A_t_offsets[0..nb_vars], shifted by +nnz so that
+  // they index into the second half of adjncy.
+  for (i_t i = 0; i <= nb_vars; ++i) {
+    xadj[nb_cstr + i] = static_cast<idx_t>(A_t_offsets[i]) + static_cast<idx_t>(nnz);
+  }
+
+  // cstr-side neighbours: A_cols[i] shifted by +nb_cstr to index into the
+  // variable node block.
+  for (i_t k = 0; k < nnz; ++k) {
+    adjncy[k] = static_cast<idx_t>(A_cols[k]) + static_cast<idx_t>(nb_cstr);
+  }
+  // var-side neighbours: A_t_cols[i] already in [0, nb_cstr).
+  for (i_t k = 0; k < nnz; ++k) {
+    adjncy[nnz + k] = static_cast<idx_t>(A_t_cols[k]);
+  }
+
+  idx_t metis_options[METIS_NOPTIONS];
+  METIS_SetDefaultOptions(metis_options);
+  metis_options[METIS_OPTION_OBJTYPE] = METIS_OBJTYPE_CUT;
+
+  idx_t metis_nvtx = static_cast<idx_t>(nvtx);
+  idx_t ncon       = 1;
+  idx_t nparts     = static_cast<idx_t>(input.nb_parts);
+  idx_t objval     = 0;
+  std::vector<idx_t> metis_parts(nvtx);
+
+  auto t0          = std::chrono::high_resolution_clock::now();
+  const int status = METIS_PartGraphKway(&metis_nvtx,
+                                         &ncon,
+                                         xadj.data(),
+                                         adjncy.data(),
+                                         /*vwgt=*/nullptr,
+                                         /*vsize=*/nullptr,
+                                         /*adjwgt=*/nullptr,
+                                         &nparts,
+                                         /*tpwgts=*/nullptr,
+                                         /*ubvec=*/nullptr,
+                                         metis_options,
+                                         &objval,
+                                         metis_parts.data());
+  auto t1          = std::chrono::high_resolution_clock::now();
+  const double dt  = std::chrono::duration<double>(t1 - t0).count();
+  cuopt_expects(status == METIS_OK,
+                error_type_t::RuntimeError,
+                "METIS_PartGraphKway failed (status=%d)",
+                status);
+  CUOPT_LOG_INFO(
+    "METIS partitioned bipartite graph: nvtx=%d nnz=%d nb_parts=%d edge_cut=%lld in %.3fs",
+    static_cast<int>(nvtx),
+    static_cast<int>(nnz),
+    static_cast<int>(input.nb_parts),
+    static_cast<long long>(objval),
+    dt);
+
+  std::vector<i_t> parts(static_cast<std::size_t>(nvtx));
+  for (i_t i = 0; i < nvtx; ++i) {
+    parts[i] = static_cast<i_t>(metis_parts[i]);
+  }
+
+  validate_partition(parts,
+                     static_cast<int>(nb_cstr),
+                     static_cast<int>(nb_vars),
+                     static_cast<int>(input.nb_parts),
+                     "metis_partitioner");
+  return parts;
+}
+
+template class metis_partitioner_t<int, double>;
+
+}  // namespace cuopt::linear_programming::detail
diff --git a/cpp/src/pdlp/distributed_pdlp/metis_partitioner.hpp b/cpp/src/pdlp/distributed_pdlp/metis_partitioner.hpp
new file mode 100644
index 0000000000..c4e37f57a9
--- /dev/null
+++ b/cpp/src/pdlp/distributed_pdlp/metis_partitioner.hpp
@@ -0,0 +1,24 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#pragma once
+
+#include <pdlp/distributed_pdlp/partitioner.hpp>
+
+namespace cuopt::linear_programming::detail {
+
+// METIS k-way partitioner on the constraint/variable bipartite graph induced by A.
+// Requires partitioner_input_t::A and A_t (or A row_offsets/col_indices only — the
+// implementation builds the bipartite adjacency the same way as metis_tests:
+// cstr nodes [0, nb_cstr), var nodes [nb_cstr, nb_cstr+nb_vars), edges from A and A_t).
+//
+// Wire into make_partitioner() once METIS is an optional cuOpt dependency.
+template <typename i_t, typename f_t>
+class metis_partitioner_t : public partitioner_i<i_t, f_t> {
+ public:
+  std::vector<i_t> partition(partitioner_input_t<i_t, f_t> const& input) const override;
+};
+
+}  // namespace cuopt::linear_programming::detail
diff --git a/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.cu b/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.cu
new file mode 100644
index 0000000000..98f33b6c88
--- /dev/null
+++ b/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.cu
@@ -0,0 +1,102 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include <pdlp/distributed_pdlp/multi_gpu_engine.hpp>
+// compute_A_x() / compute_At_y() (defined inline in the engine header) call
+// shard.sub_pdlp->pdhg_solver_.compute_* — pdlp_solver_t must be complete at
+// the explicit instantiation point below.
+#include <pdlp/pdlp.cuh>
+
+#include <cuopt/error.hpp>
+
+#include <raft/core/device_setter.hpp>
+
+#include <nccl.h>
+
+#include <numeric>
+
+namespace cuopt::linear_programming::detail {
+
+template <typename i_t, typename f_t>
+multi_gpu_engine_t<i_t, f_t>::multi_gpu_engine_t(
+  std::vector<rank_data_t<i_t, f_t>>&& rank_data,
+  std::vector<f_t> const& h_global_obj,
+  std::vector<f_t> const& h_global_var_lower,
+  std::vector<f_t> const& h_global_var_upper,
+  std::vector<f_t> const& h_global_cstr_lower,
+  std::vector<f_t> const& h_global_cstr_upper,
+  std::vector<f_t> const& h_global_obj_scaled,
+  std::vector<f_t> const& h_global_var_lower_scaled,
+  std::vector<f_t> const& h_global_var_upper_scaled,
+  std::vector<f_t> const& h_global_cstr_lower_scaled,
+  std::vector<f_t> const& h_global_cstr_upper_scaled,
+  std::vector<f_t> const& h_global_cummulative_cstr_scaling,
+  std::vector<f_t> const& h_global_cummulative_var_scaling,
+  f_t h_bound_rescaling,
+  f_t h_objective_rescaling,
+  bool maximize,
+  f_t objective_offset,
+  f_t objective_scaling_factor,
+  pdlp_solver_settings_t<i_t, f_t> const& sub_solver_settings)
+  : stream()
+{
+  const int nb_parts = static_cast<int>(rank_data.size());
+  cuopt_expects(
+    nb_parts > 0, error_type_t::ValidationError, "multi_gpu_engine_t: rank_data must be non-empty");
+
+  shards.reserve(nb_parts);
+  std::vector<int> devices(nb_parts);
+  std::iota(devices.begin(), devices.end(), 0);
+
+  // Create NCCL Comms then let shards own them
+  std::vector<ncclComm_t> raw_comms(nb_parts);
+  cuopt_expects(ncclCommInitAll(raw_comms.data(), nb_parts, devices.data()) == ncclSuccess,
+                error_type_t::RuntimeError,
+                "ncclCommInitAll failed");
+
+  // 3. Construct one shard per rank, pinned to its device.
+  for (int r = 0; r < nb_parts; ++r) {
+    raft::device_setter guard(devices[r]);  // shard ctor needs device set
+    shards.emplace_back(std::make_unique<pdlp_shard_t<i_t, f_t>>(devices[r],
+                                                                 std::move(rank_data[r]),
+                                                                 raw_comms[r],
+                                                                 h_global_obj,
+                                                                 h_global_var_lower,
+                                                                 h_global_var_upper,
+                                                                 h_global_cstr_lower,
+                                                                 h_global_cstr_upper,
+                                                                 h_global_obj_scaled,
+                                                                 h_global_var_lower_scaled,
+                                                                 h_global_var_upper_scaled,
+                                                                 h_global_cstr_lower_scaled,
+                                                                 h_global_cstr_upper_scaled,
+                                                                 h_global_cummulative_cstr_scaling,
+                                                                 h_global_cummulative_var_scaling,
+                                                                 h_bound_rescaling,
+                                                                 h_objective_rescaling,
+                                                                 maximize,
+                                                                 objective_offset,
+                                                                 objective_scaling_factor,
+                                                                 sub_solver_settings));
+  }
+
+  // Two different events
+  // capture_*_event_ are used inside graph capture
+  // ext_*_event_ are used when sync is needed outside of graph
+  graph_master_ready_event_ = std::make_unique<cuopt::event_handler_t>();
+  sync_master_ready_event_  = std::make_unique<cuopt::event_handler_t>();
+  graph_shard_ready_events_.reserve(nb_parts);
+  sync_shard_ready_events_.reserve(nb_parts);
+  for (int r = 0; r < nb_parts; ++r) {
+    raft::device_setter guard(devices[r]);
+    graph_shard_ready_events_.emplace_back(std::make_unique<cuopt::event_handler_t>());
+    sync_shard_ready_events_.emplace_back(std::make_unique<cuopt::event_handler_t>());
+  }
+}
+
+template struct multi_gpu_engine_t<int, double>;
+// template struct multi_gpu_engine_t<int, float>;
+
+}  // namespace cuopt::linear_programming::detail
diff --git a/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.hpp b/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.hpp
new file mode 100644
index 0000000000..89153e8bd7
--- /dev/null
+++ b/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.hpp
@@ -0,0 +1,1191 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#pragma once
+
+#include <pdlp/distributed_pdlp/rank_data.hpp>
+#include <pdlp/distributed_pdlp/shard.hpp>
+#include <pdlp/pdhg.hpp>
+#include <utilities/cuda_helpers.cuh>
+#include <utilities/event_handler.cuh>
+
+#include <cuopt/linear_programming/pdlp/solver_settings.hpp>
+
+#include <raft/sparse/detail/cusparse_wrappers.h>
+#include <raft/core/cusparse_macros.hpp>
+#include <raft/core/device_setter.hpp>
+#include <raft/linalg/detail/cublas_wrappers.hpp>
+
+#include <rmm/cuda_stream.hpp>
+#include <rmm/device_buffer.hpp>
+#include <rmm/exec_policy.hpp>
+
+#include <thrust/execution_policy.h>
+#include <thrust/gather.h>
+#include <thrust/iterator/transform_iterator.h>
+#include <thrust/iterator/zip_iterator.h>
+#include <thrust/scatter.h>
+#include <cub/device/device_reduce.cuh>
+#include <cub/device/device_transform.cuh>
+#include <cuda/std/tuple>
+
+#include <nccl.h>
+
+#include <cmath>
+#include <memory>
+#include <random>
+#include <tuple>
+#include <vector>
+
+namespace cuopt::linear_programming::detail {
+
+// Element-wise sqrt functor. Defined at namespace scope (not as a local
+// extended HD lambda) because nvcc disallows extended __host__ __device__
+// lambdas appearing inside templates whose template arguments are
+// themselves local lambda types (which happens when distributed_l2_norm is
+// invoked with closure accessors).
+template <typename f_t>
+struct sqrt_inplace_op_t {
+  __host__ __device__ f_t operator()(f_t x) const { return raft::sqrt(x); }
+};
+
+// Squared-norm contribution of a constraint's [lower, upper] bound pair, used to
+// build the distributed bound rescaling (mirrors rhs_sum_of_squares_t). Defined
+// at namespace scope to avoid extended-lambda-in-template restrictions.
+template <typename f_t>
+struct mgpu_rhs_sq_op_t {
+  __host__ __device__ f_t operator()(const thrust::tuple<f_t, f_t>& t) const
+  {
+    const f_t lower = thrust::get<0>(t);
+    const f_t upper = thrust::get<1>(t);
+    f_t sum         = f_t(0);
+    if (isfinite(lower) && (lower != upper)) sum += lower * lower;
+    if (isfinite(upper)) sum += upper * upper;
+    return sum;
+  }
+};
+
+// Weighted square of an objective coefficient (mirrors weighted_square_op).
+template <typename f_t>
+struct mgpu_weighted_sq_op_t {
+  f_t weight;
+  __host__ __device__ f_t operator()(f_t v) const { return v * v * weight; }
+};
+
+template <typename i_t, typename f_t>
+struct multi_gpu_engine_t {
+  // Constructs shards from rank_data
+  multi_gpu_engine_t(std::vector<rank_data_t<i_t, f_t>>&& rank_data,
+                     std::vector<f_t> const& h_global_obj,
+                     std::vector<f_t> const& h_global_var_lower,
+                     std::vector<f_t> const& h_global_var_upper,
+                     std::vector<f_t> const& h_global_cstr_lower,
+                     std::vector<f_t> const& h_global_cstr_upper,
+                     std::vector<f_t> const& h_global_obj_scaled,
+                     std::vector<f_t> const& h_global_var_lower_scaled,
+                     std::vector<f_t> const& h_global_var_upper_scaled,
+                     std::vector<f_t> const& h_global_cstr_lower_scaled,
+                     std::vector<f_t> const& h_global_cstr_upper_scaled,
+                     std::vector<f_t> const& h_global_cummulative_cstr_scaling,
+                     std::vector<f_t> const& h_global_cummulative_var_scaling,
+                     f_t h_bound_rescaling,
+                     f_t h_objective_rescaling,
+                     bool maximize,
+                     f_t objective_offset,
+                     f_t objective_scaling_factor,
+                     pdlp_solver_settings_t<i_t, f_t> const& sub_solver_settings);
+
+  multi_gpu_engine_t(const multi_gpu_engine_t&)            = delete;
+  multi_gpu_engine_t& operator=(const multi_gpu_engine_t&) = delete;
+
+  template <typename Fn>
+  void for_each_shard(Fn&& fn)
+  {
+    for (auto& s : shards) {
+      raft::device_setter guard(s->device_id);
+      fn(*s);
+    }
+  }
+
+  template <typename... InAccess, typename OutAccess, typename SizeAccess, typename Op>
+  void distributed_transform(std::tuple<InAccess...> in_accessors,
+                             OutAccess out,
+                             SizeAccess sz,
+                             Op op)
+  {
+    for_each_shard([&](auto& shard) {
+      auto& sub = *shard.sub_pdlp;
+      // turns the Tuple of lambdas into a tuple of rmm::device_uvector
+      auto cub_inputs = std::apply(
+        [&sub](auto&... acc) { return cuda::std::make_tuple(acc(sub)...); }, in_accessors);
+
+      cub::DeviceTransform::Transform(cub_inputs, out(sub), sz(sub), op, shard.stream.view());
+    });
+  }
+  // --- 2) convenience: single input accessor (delegates) ---
+  template <typename InAccess, typename OutAccess, typename SizeAccess, typename Op>
+  void distributed_transform(InAccess in, OutAccess out, SizeAccess sz, Op op)
+  {
+    distributed_transform(std::make_tuple(in), out, sz, op);
+  }
+
+  // -------- Halo exchange (variables / x) ---------------------------------
+  // Fills the halo slice [owned_var_size, total_var_size) of the per-shard
+  // input buffer returned by `buf_access(pdhg)` (the buffer A @ x will read).
+  // Step 1: thrust::gather per-peer outgoing values into staging buffers.
+  // Step 2: a single NCCL group with matched ncclSend / ncclRecv across all
+  // (rank, peer) pairs.
+  template <typename BufAccess>
+  void halo_exchange_var(BufAccess&& buf_access)
+  {
+    const int nb = static_cast<int>(shards.size());
+
+    // Step 1: gather owned values that each peer needs into per-peer staging.
+    for (int r = 0; r < nb; ++r) {
+      auto& s = *shards[r];
+      raft::device_setter guard(s.device_id);
+      auto& x = buf_access(s.sub_pdlp->pdhg_solver_);
+      for (int peer = 0; peer < nb; ++peer) {
+        if (peer == r) continue;
+        if (s.var_send_indices_d[peer].size() == 0) continue;
+        thrust::gather(rmm::exec_policy_nosync(s.stream.view()),
+                       s.var_send_indices_d[peer].begin(),
+                       s.var_send_indices_d[peer].end(),
+                       x.begin(),
+                       s.var_send_buf_d[peer].begin());
+      }
+    }
+
+    // Step 2: matched send / recv across the whole topology in one NCCL group.
+    ncclGroupStart();
+    for (int r = 0; r < nb; ++r) {
+      auto& s = *shards[r];
+      raft::device_setter guard(s.device_id);
+      for (int peer = 0; peer < nb; ++peer) {
+        if (peer == r) continue;
+        ncclSend(s.var_send_buf_d[peer].data(),
+                 s.var_send_buf_d[peer].size(),
+                 ncclFloat64,
+                 peer,
+                 s.comm.get(),
+                 s.stream.view().value());
+      }
+    }
+    for (int r = 0; r < nb; ++r) {
+      auto& s  = *shards[r];
+      auto& rd = s.rank_data;
+      raft::device_setter guard(s.device_id);
+      auto& x = buf_access(s.sub_pdlp->pdhg_solver_);
+      for (int peer = 0; peer < nb; ++peer) {
+        if (peer == r) continue;
+        f_t* recv_ptr = x.data() + rd.owned_var_size + rd.var_recv_offsets[peer];
+        ncclRecv(recv_ptr,
+                 static_cast<size_t>(rd.var_recv_counts[peer]),
+                 ncclFloat64,
+                 peer,
+                 s.comm.get(),
+                 s.stream.view().value());
+      }
+    }
+    ncclGroupEnd();
+  }
+
+  // -------- Halo exchange (constraints / y) -------------------------------
+  // Same as halo_exchange_var but for a constraint-shaped buffer (the input
+  // A_T @ y will read) and constraint halos.
+  template <typename BufAccess>
+  void halo_exchange_cstr(BufAccess&& buf_access)
+  {
+    const int nb = static_cast<int>(shards.size());
+
+    for (int r = 0; r < nb; ++r) {
+      auto& s = *shards[r];
+      raft::device_setter guard(s.device_id);
+      auto& y = buf_access(s.sub_pdlp->pdhg_solver_);
+      for (int peer = 0; peer < nb; ++peer) {
+        if (peer == r) continue;
+        if (s.cstr_send_indices_d[peer].size() == 0) continue;
+        thrust::gather(rmm::exec_policy_nosync(s.stream.view()),
+                       s.cstr_send_indices_d[peer].begin(),
+                       s.cstr_send_indices_d[peer].end(),
+                       y.begin(),
+                       s.cstr_send_buf_d[peer].begin());
+      }
+    }
+
+    ncclGroupStart();
+    for (int r = 0; r < nb; ++r) {
+      auto& s = *shards[r];
+      raft::device_setter guard(s.device_id);
+      for (int peer = 0; peer < nb; ++peer) {
+        if (peer == r) continue;
+        ncclSend(s.cstr_send_buf_d[peer].data(),
+                 s.cstr_send_buf_d[peer].size(),
+                 ncclFloat64,
+                 peer,
+                 s.comm.get(),
+                 s.stream.view().value());
+      }
+    }
+    for (int r = 0; r < nb; ++r) {
+      auto& s  = *shards[r];
+      auto& rd = s.rank_data;
+      raft::device_setter guard(s.device_id);
+      auto& y = buf_access(s.sub_pdlp->pdhg_solver_);
+      for (int peer = 0; peer < nb; ++peer) {
+        if (peer == r) continue;
+        f_t* recv_ptr = y.data() + rd.owned_cstr_size + rd.cstr_recv_offsets[peer];
+        ncclRecv(recv_ptr,
+                 static_cast<size_t>(rd.cstr_recv_counts[peer]),
+                 ncclFloat64,
+                 peer,
+                 s.comm.get(),
+                 s.stream.view().value());
+      }
+    }
+    ncclGroupEnd();
+  }
+
+  // -------- Broadcast owned constraint (row) scaling into halo ------------
+  void broadcast_constraint_scaling_to_halo()
+  {
+    const int nb    = static_cast<int>(shards.size());
+    auto buf_access = [](pdlp_shard_t<i_t, f_t>& s) -> rmm::device_uvector<f_t>& {
+      return s.sub_pdlp->get_initial_scaling_strategy().get_cummulative_constraint_matrix_scaling();
+    };
+
+    // Gather each owner's owned scaling values that peers need.
+    for (int r = 0; r < nb; ++r) {
+      auto& s = *shards[r];
+      raft::device_setter guard(s.device_id);
+      auto& y = buf_access(s);
+      for (int peer = 0; peer < nb; ++peer) {
+        if (peer == r) continue;
+        if (s.cstr_send_indices_d[peer].size() == 0) continue;
+        thrust::gather(rmm::exec_policy_nosync(s.stream.view()),
+                       s.cstr_send_indices_d[peer].begin(),
+                       s.cstr_send_indices_d[peer].end(),
+                       y.begin(),
+                       s.cstr_send_buf_d[peer].begin());
+      }
+    }
+
+    ncclGroupStart();
+    for (int r = 0; r < nb; ++r) {
+      auto& s = *shards[r];
+      raft::device_setter guard(s.device_id);
+      for (int peer = 0; peer < nb; ++peer) {
+        if (peer == r) continue;
+        ncclSend(s.cstr_send_buf_d[peer].data(),
+                 s.cstr_send_buf_d[peer].size(),
+                 ncclFloat64,
+                 peer,
+                 s.comm.get(),
+                 s.stream.view().value());
+      }
+    }
+    for (int r = 0; r < nb; ++r) {
+      auto& s  = *shards[r];
+      auto& rd = s.rank_data;
+      raft::device_setter guard(s.device_id);
+      auto& y = buf_access(s);
+      for (int peer = 0; peer < nb; ++peer) {
+        if (peer == r) continue;
+        f_t* recv_ptr = y.data() + rd.owned_cstr_size + rd.cstr_recv_offsets[peer];
+        ncclRecv(recv_ptr,
+                 static_cast<size_t>(rd.cstr_recv_counts[peer]),
+                 ncclFloat64,
+                 peer,
+                 s.comm.get(),
+                 s.stream.view().value());
+      }
+    }
+    ncclGroupEnd();
+  }
+
+  // -------- NCCL allreduce (sum, in place) --------------------------------
+  // Per-shard in-place sum-allreduce. Each shard's stream issues an
+  // ncclAllReduce(buf, buf, count, ncclFloat64, ncclSum, ...) inside a single
+  // group. After this returns, every shard's buffer holds the global sum.
+  //
+  // PtrAccess: pdlp_solver_t<i_t,f_t>& -> f_t*  (e.g. into step_size_strategy_).
+  template <typename PtrAccess>
+  void allreduce_sum_inplace(PtrAccess&& ptr_access, size_t count = 1)
+  {
+    ncclGroupStart();
+    for (auto& s : shards) {
+      raft::device_setter guard(s->device_id);
+      f_t* buf = ptr_access(*s->sub_pdlp);
+      ncclAllReduce(buf, buf, count, ncclFloat64, ncclSum, s->comm.get(), s->stream.view().value());
+    }
+    ncclGroupEnd();
+  }
+
+  // -------- Distributed L2 norm ------------------------------------------
+  // Computes sqrt(Σ_k Σ_{i ∈ owned_k} buf_k[i]²) and writes the scalar into
+  // the buffer returned by `out_access` on EVERY shard.
+  //
+  // Algorithm:
+  //   1) per shard: out = cublasdot(buf[0:n_owned], buf[0:n_owned])  (partial Σ²)
+  //   2) NCCL allreduce SUM on out (count = 1)                       (global Σ²)
+  //   3) per shard: out = sqrt(out)
+  //
+  // The caller is responsible for clipping correctness via `size_access`
+  // (which picks `rank_data.owned_var_size` or `rank_data.owned_cstr_size`
+  // depending on the shape of the input buffer), and for mirroring the
+  // result back to master if downstream code needs it there.
+  //
+  // BufAccess  : pdlp_solver_t<i_t,f_t>& -> rmm::device_uvector<f_t>&
+  // OutAccess  : pdlp_solver_t<i_t,f_t>& -> f_t*   (single scalar in shard memory)
+  // SizeAccess : pdlp_shard_t<i_t,f_t>&  -> i_t    (owned slice length)
+  template <typename BufAccess, typename OutAccess, typename SizeAccess>
+  void distributed_l2_norm(BufAccess&& buf_access, OutAccess&& out_access, SizeAccess&& size_access)
+  {
+    for_each_shard([&](auto& shard) {
+      auto& sub   = *shard.sub_pdlp;
+      auto& buf   = buf_access(sub);
+      const i_t n = size_access(shard);
+      f_t* out    = out_access(sub);
+      RAFT_CUBLAS_TRY(raft::linalg::detail::cublasdot(shard.handle.get_cublas_handle(),
+                                                      static_cast<int>(n),
+                                                      buf.data(),
+                                                      1,
+                                                      buf.data(),
+                                                      1,
+                                                      out,
+                                                      shard.stream.view().value()));
+    });
+
+    allreduce_sum_inplace(out_access, /*count=*/1);
+
+    for_each_shard([&](auto& shard) {
+      f_t* out = out_access(*shard.sub_pdlp);
+      cub::DeviceTransform::Transform(
+        out, out, 1, sqrt_inplace_op_t<f_t>{}, shard.stream.view().value());
+    });
+  }
+
+  // -------- Distributed bound / objective rescaling -----------------------
+  void distributed_bound_objective_rescaling(f_t c_scaling_weight)
+  {
+    const int nb = static_cast<int>(shards.size());
+
+    std::vector<rmm::device_uvector<f_t>> bound_sq;
+    std::vector<rmm::device_uvector<f_t>> obj_sq;
+    bound_sq.reserve(nb);
+    obj_sq.reserve(nb);
+
+    // 1) per-shard partial squared norms over OWNED entries only (halo rhs is
+    //    +/-inf and would otherwise double-count owned entries shared as halo).
+    for (int r = 0; r < nb; ++r) {
+      auto& s = *shards[r];
+      raft::device_setter guard(s.device_id);
+      bound_sq.emplace_back(1, s.stream.view());
+      obj_sq.emplace_back(1, s.stream.view());
+
+      const auto& scaled     = s.sub_pdlp->get_initial_scaling_strategy().get_scaled_op_problem();
+      const int n_owned_cstr = static_cast<int>(s.rank_data.owned_cstr_size);
+      const int n_owned_var  = static_cast<int>(s.rank_data.owned_var_size);
+
+      auto bound_in = thrust::make_transform_iterator(
+        thrust::make_zip_iterator(scaled.constraint_lower_bounds.data(),
+                                  scaled.constraint_upper_bounds.data()),
+        mgpu_rhs_sq_op_t<f_t>{});
+      size_t tmp_bytes_b = 0;
+      cub::DeviceReduce::Sum(
+        nullptr, tmp_bytes_b, bound_in, bound_sq[r].data(), n_owned_cstr, s.stream.view().value());
+      rmm::device_buffer scratch_b(tmp_bytes_b, s.stream.view());
+      cub::DeviceReduce::Sum(scratch_b.data(),
+                             tmp_bytes_b,
+                             bound_in,
+                             bound_sq[r].data(),
+                             n_owned_cstr,
+                             s.stream.view().value());
+
+      auto obj_in        = thrust::make_transform_iterator(scaled.objective_coefficients.data(),
+                                                    mgpu_weighted_sq_op_t<f_t>{c_scaling_weight});
+      size_t tmp_bytes_o = 0;
+      cub::DeviceReduce::Sum(
+        nullptr, tmp_bytes_o, obj_in, obj_sq[r].data(), n_owned_var, s.stream.view().value());
+      rmm::device_buffer scratch_o(tmp_bytes_o, s.stream.view());
+      cub::DeviceReduce::Sum(scratch_o.data(),
+                             tmp_bytes_o,
+                             obj_in,
+                             obj_sq[r].data(),
+                             n_owned_var,
+                             s.stream.view().value());
+    }
+
+    // 2) NCCL allreduce SUM -> every shard holds the global squared norms.
+    ncclGroupStart();
+    for (int r = 0; r < nb; ++r) {
+      auto& s = *shards[r];
+      raft::device_setter guard(s.device_id);
+      ncclAllReduce(bound_sq[r].data(),
+                    bound_sq[r].data(),
+                    1,
+                    ncclFloat64,
+                    ncclSum,
+                    s.comm.get(),
+                    s.stream.view().value());
+      ncclAllReduce(obj_sq[r].data(),
+                    obj_sq[r].data(),
+                    1,
+                    ncclFloat64,
+                    ncclSum,
+                    s.comm.get(),
+                    s.stream.view().value());
+    }
+    ncclGroupEnd();
+
+    // 3) derive the identical scalars and apply on every shard.
+    for (int r = 0; r < nb; ++r) {
+      auto& s = *shards[r];
+      raft::device_setter guard(s.device_id);
+      f_t h_bound_sq = f_t(0);
+      f_t h_obj_sq   = f_t(0);
+      raft::copy(&h_bound_sq, bound_sq[r].data(), 1, s.stream.view());
+      raft::copy(&h_obj_sq, obj_sq[r].data(), 1, s.stream.view());
+      s.stream.synchronize();
+      const f_t bound_rescaling     = f_t(1) / (std::sqrt(h_bound_sq) + f_t(1));
+      const f_t objective_rescaling = f_t(1) / (std::sqrt(h_obj_sq) + f_t(1));
+      s.sub_pdlp->get_initial_scaling_strategy().apply_distributed_bound_objective_rescaling(
+        bound_rescaling, objective_rescaling);
+    }
+    for (int r = 0; r < nb; ++r) {
+      auto& s = *shards[r];
+      raft::device_setter guard(s.device_id);
+      s.stream.synchronize();
+    }
+  }
+
+  // -------- Generic distributed SpMVs -------------------------------------
+  // distributed_spmv_A : halo-update the var-shaped input buffer returned by
+  // `in_buf(pdhg)`, then per-shard A @ in_buf -> out_desc.
+  // distributed_spmv_At: halo-update the cstr-shaped input buffer returned by
+  // `in_buf(pdhg)`, then per-shard A_T @ in_buf -> out_desc.
+  //
+  // Accessor signatures:
+  //   in_buf  (pdhg_solver_t<i_t,f_t>&) -> rmm::device_uvector<f_t>&
+  //   out_desc(pdhg_solver_t<i_t,f_t>&) -> cusparseDnVecDescr_t
+  template <typename InBufAccess, typename OutDescAccess>
+  void distributed_spmv_A(InBufAccess&& in_buf, OutDescAccess&& out_desc)
+  {
+    halo_exchange_var(in_buf);
+    for_each_shard([&](auto& shard) {
+      auto& sub_pdhg = shard.sub_pdlp->pdhg_solver_;
+      sub_pdhg.spmv_A_into(in_buf(sub_pdhg), out_desc(sub_pdhg));
+    });
+  }
+
+  template <typename InBufAccess, typename OutDescAccess>
+  void distributed_spmv_At(InBufAccess&& in_buf, OutDescAccess&& out_desc)
+  {
+    halo_exchange_cstr(in_buf);
+    for_each_shard([&](auto& shard) {
+      auto& sub_pdhg = shard.sub_pdlp->pdhg_solver_;
+      sub_pdhg.spmv_At_into(in_buf(sub_pdhg), out_desc(sub_pdhg));
+    });
+  }
+
+  // -------- High-level: A @ x and A_T @ y ---------------------------------
+  // Thin wrappers used from pdhg_solver_t::compute_A_x / compute_At_y when an
+  // engine is wired in. They drive the per-shard plan-based SpMV via the
+  // canonical cusparse_view bindings (no rebinding) so the descriptor binding
+  // is never disturbed by mGPU machinery.
+  //
+  // The halo-exchange MUST target the exact buffer the canonical descriptor
+  // is bound to in the PDHG cusparse_view (see cusparse_view.cu lines 516-519
+  // and 595-599):
+  //   - cv.reflected_primal_solution -> reflected_primal_ (var-shaped)
+  //   - cv.dual_solution             -> current.dual_solution_ (cstr-shaped)
+  // For 1 shard the halo-exchange is a no-op, but the buffer choice is what
+  // makes multi-shard correctness work, so we keep it accurate either way.
+  void distributed_compute_A_x()
+  {
+    halo_exchange_var(
+      [](auto& pdhg) -> rmm::device_uvector<f_t>& { return pdhg.get_reflected_primal(); });
+    for_each_shard([](auto& shard) { shard.sub_pdlp->pdhg_solver_.spmvop_A_x(); });
+  }
+
+  void distributed_compute_At_y()
+  {
+    halo_exchange_cstr(
+      [](auto& pdhg) -> rmm::device_uvector<f_t>& { return pdhg.get_dual_solution(); });
+    for_each_shard([](auto& shard) { shard.sub_pdlp->pdhg_solver_.spmvop_At_y(); });
+  }
+
+  // -------- Distributed Ruiz inf-scaling -----------------------------------
+  void alloc_global_var_scratch(i_t n_global_vars,
+                                std::vector<rmm::device_uvector<f_t>>& global_var_buf,
+                                std::vector<rmm::device_uvector<i_t>>& local_to_global_var_d)
+  {
+    const int nb = static_cast<int>(shards.size());
+    global_var_buf.reserve(nb);
+    local_to_global_var_d.reserve(nb);
+    for (int r = 0; r < nb; ++r) {
+      auto& s = *shards[r];
+      raft::device_setter guard(s.device_id);
+      global_var_buf.emplace_back(static_cast<std::size_t>(n_global_vars), s.stream.view());
+      local_to_global_var_d.emplace_back(static_cast<std::size_t>(s.rank_data.total_var_size),
+                                         s.stream.view());
+      if (s.rank_data.total_var_size > 0) {
+        RAFT_CUDA_TRY(cudaMemcpyAsync(local_to_global_var_d.back().data(),
+                                      s.rank_data.local_to_global_var.data(),
+                                      sizeof(i_t) * s.rank_data.local_to_global_var.size(),
+                                      cudaMemcpyHostToDevice,
+                                      s.stream.view().value()));
+      }
+    }
+  }
+
+  void reduce_iteration_variable_scaling_across_shards(
+    ncclRedOp_t op,
+    i_t n_global_vars,
+    std::vector<rmm::device_uvector<f_t>>& global_var_buf,
+    std::vector<rmm::device_uvector<i_t>>& local_to_global_var_d)
+  {
+    const int nb = static_cast<int>(shards.size());
+
+    // Zero global buffers, then scatter each shard's local values into their
+    // global column indices.
+    for (int r = 0; r < nb; ++r) {
+      auto& s = *shards[r];
+      raft::device_setter guard(s.device_id);
+      RAFT_CUDA_TRY(cudaMemsetAsync(global_var_buf[r].data(),
+                                    0,
+                                    sizeof(f_t) * static_cast<std::size_t>(n_global_vars),
+                                    s.stream.view().value()));
+      auto& iter_var_scaling =
+        s.sub_pdlp->get_initial_scaling_strategy().get_iteration_variable_scaling();
+      if (s.rank_data.total_var_size > 0) {
+        thrust::scatter(rmm::exec_policy_nosync(s.stream.view()),
+                        iter_var_scaling.begin(),
+                        iter_var_scaling.begin() + s.rank_data.total_var_size,
+                        local_to_global_var_d[r].begin(),
+                        global_var_buf[r].begin());
+      }
+    }
+
+    ncclGroupStart();
+    for (int r = 0; r < nb; ++r) {
+      auto& s = *shards[r];
+      raft::device_setter guard(s.device_id);
+      ncclAllReduce(global_var_buf[r].data(),
+                    global_var_buf[r].data(),
+                    static_cast<size_t>(n_global_vars),
+                    ncclFloat64,
+                    op,
+                    s.comm.get(),
+                    s.stream.view().value());
+    }
+    ncclGroupEnd();
+
+    // Gather the global per-column value back into each shard's local iter vector.
+    for (int r = 0; r < nb; ++r) {
+      auto& s = *shards[r];
+      raft::device_setter guard(s.device_id);
+      auto& iter_var_scaling =
+        s.sub_pdlp->get_initial_scaling_strategy().get_iteration_variable_scaling();
+      if (s.rank_data.total_var_size > 0) {
+        thrust::gather(rmm::exec_policy_nosync(s.stream.view()),
+                       local_to_global_var_d[r].begin(),
+                       local_to_global_var_d[r].begin() + s.rank_data.total_var_size,
+                       global_var_buf[r].begin(),
+                       iter_var_scaling.begin());
+      }
+    }
+  }
+
+  void distributed_ruiz_inf_scaling(int num_iter, i_t n_global_vars)
+  {
+    if (num_iter <= 0 || n_global_vars <= 0) return;
+    raft::common::nvtx::range scope("distributed_ruiz_inf_scaling");
+
+    std::vector<rmm::device_uvector<f_t>> global_var_buf;
+    std::vector<rmm::device_uvector<i_t>> local_to_global_var_d;
+    alloc_global_var_scratch(n_global_vars, global_var_buf, local_to_global_var_d);
+
+    for (int it = 0; it < num_iter; ++it) {
+      // 1) per-shard local kernel: writes iteration_variable_scaling (per-column
+      //    inf-norm partial) and iteration_constraint_matrix_scaling (row, complete).
+      for_each_shard([](auto& shard) {
+        shard.sub_pdlp->get_initial_scaling_strategy().ruiz_iter_compute_local_iteration_vectors();
+      });
+
+      // 2) cross-shard column inf-norm reduction (MAX).
+      reduce_iteration_variable_scaling_across_shards(
+        ncclMax, n_global_vars, global_var_buf, local_to_global_var_d);
+
+      // 3) per-shard fold into cumulative + reset iter vectors.
+      for_each_shard([](auto& shard) {
+        shard.sub_pdlp->get_initial_scaling_strategy().ruiz_iter_apply_cumulative_update();
+      });
+    }
+
+    // Make sure per-shard cumulative writes are observable on subsequent
+    // calls (e.g., the next distributed_max_singular_value).
+    for_each_shard([](auto& shard) { shard.stream.synchronize(); });
+  }
+
+  // Distributed Pock-Chambolle: one pass, mirroring single-GPU
+  // pock_chambolle_scaling but with the per-column sum-of-powers reduced across
+  // shards (SUM) between the local kernels and the cumulative fold. Rows are
+  // owned exclusively, so the row half stays local. Runs after the distributed
+  // Ruiz pass, matching the single-GPU order (Ruiz then Pock-Chambolle).
+  void distributed_pock_chambolle_scaling(f_t alpha, i_t n_global_vars)
+  {
+    if (n_global_vars <= 0) return;
+    raft::common::nvtx::range scope("distributed_pock_chambolle_scaling");
+
+    std::vector<rmm::device_uvector<f_t>> global_var_buf;
+    std::vector<rmm::device_uvector<i_t>> local_to_global_var_d;
+    alloc_global_var_scratch(n_global_vars, global_var_buf, local_to_global_var_d);
+
+    // 1) per-shard local kernels: row sum (complete) + column sum (partial).
+    for_each_shard([alpha](auto& shard) {
+      shard.sub_pdlp->get_initial_scaling_strategy().pock_chambolle_compute_local_iteration_vectors(
+        alpha);
+    });
+
+    // 2) cross-shard column sum-of-powers reduction (SUM).
+    reduce_iteration_variable_scaling_across_shards(
+      ncclSum, n_global_vars, global_var_buf, local_to_global_var_d);
+
+    // 3) per-shard fold into cumulative (cumulative /= sqrt(iteration)).
+    for_each_shard([](auto& shard) {
+      shard.sub_pdlp->get_initial_scaling_strategy().pock_chambolle_apply_cumulative_update();
+    });
+
+    for_each_shard([](auto& shard) { shard.stream.synchronize(); });
+  }
+
+  // -------- Distributed σ_max(A) via power iteration ----------------------
+  f_t distributed_max_singular_value(i_t n_global_cstrs,
+                                     int max_iterations = 5000,
+                                     f_t tolerance      = 1e-4)
+  {
+    raft::common::nvtx::range scope("distributed_max_singular_value");
+
+    const int nb = static_cast<int>(shards.size());
+
+    // Generate the GLOBAL z[] sequence in cstr-index order from a fresh
+    // mt19937(1), once per call. It's m doubles regardless of N (cheap).
+    // Each shard then keeps only z[global_idx_for_owned_local_i].
+    std::vector<f_t> h_global_z(static_cast<std::size_t>(n_global_cstrs));
+    {
+      std::mt19937 gen(1);
+      std::normal_distribution<f_t> dist(f_t(0.0), f_t(1.0));
+      for (i_t i = 0; i < n_global_cstrs; ++i) {
+        h_global_z[i] = dist(gen);
+      }
+    }
+
+    // Per-shard scratch lives on each shard's device. We use total (owned +
+    // halo) sizes for q/z/atq because they're SpMV inputs that need halo
+    // space. Norms / dot are scalars.
+    // We use size-1 rmm::device_uvector instead of rmm::device_scalar for the
+    // per-shard scratch scalars: nvcc + libcudacxx <cuda/basic_any> fail the
+    // copy_constructible concept check when device_scalar<T> appears in a
+    // std::vector (the check transitively touches rmm::cuda_stream, which is
+    // non-copyable). device_uvector<T> avoids that path.
+    std::vector<rmm::device_uvector<f_t>> q;
+    std::vector<rmm::device_uvector<f_t>> z;
+    std::vector<rmm::device_uvector<f_t>> atq;
+    std::vector<rmm::device_uvector<f_t>> sigma_sq;
+    std::vector<rmm::device_uvector<f_t>> norm_q;
+    std::vector<rmm::device_uvector<f_t>> residual_norm;
+    std::vector<cusparseDnVecDescr_t> z_dn(nb, nullptr);
+    std::vector<cusparseDnVecDescr_t> atq_dn(nb, nullptr);
+    q.reserve(nb);
+    z.reserve(nb);
+    atq.reserve(nb);
+    sigma_sq.reserve(nb);
+    norm_q.reserve(nb);
+    residual_norm.reserve(nb);
+
+    for (int r = 0; r < nb; ++r) {
+      auto& s = *shards[r];
+      raft::device_setter guard(s.device_id);
+      const i_t cstr_total = s.rank_data.total_cstr_size;
+      const i_t var_total  = s.rank_data.total_var_size;
+      q.emplace_back(static_cast<std::size_t>(cstr_total), s.stream.view());
+      z.emplace_back(static_cast<std::size_t>(cstr_total), s.stream.view());
+      atq.emplace_back(static_cast<std::size_t>(var_total), s.stream.view());
+      sigma_sq.emplace_back(std::size_t{1}, s.stream.view());
+      norm_q.emplace_back(std::size_t{1}, s.stream.view());
+      residual_norm.emplace_back(std::size_t{1}, s.stream.view());
+      RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsecreatednvec(
+        &z_dn[r], static_cast<int64_t>(cstr_total), z.back().data()));
+      RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsecreatednvec(
+        &atq_dn[r], static_cast<int64_t>(var_total), atq.back().data()));
+
+      std::vector<f_t> h_owned_z(static_cast<std::size_t>(s.rank_data.owned_cstr_size));
+      for (i_t i = 0; i < s.rank_data.owned_cstr_size; ++i) {
+        const i_t g  = s.rank_data.local_to_global_cstr[i];
+        h_owned_z[i] = h_global_z[g];
+      }
+      if (s.rank_data.owned_cstr_size > 0) {
+        RAFT_CUDA_TRY(
+          cudaMemcpyAsync(z.back().data(),
+                          h_owned_z.data(),
+                          sizeof(f_t) * static_cast<std::size_t>(s.rank_data.owned_cstr_size),
+                          cudaMemcpyHostToDevice,
+                          s.stream.view().value()));
+      }
+      if (cstr_total > s.rank_data.owned_cstr_size) {
+        RAFT_CUDA_TRY(cudaMemsetAsync(
+          z.back().data() + s.rank_data.owned_cstr_size,
+          0,
+          sizeof(f_t) * static_cast<std::size_t>(cstr_total - s.rank_data.owned_cstr_size),
+          s.stream.view().value()));
+      }
+      // Sync to ensure h_owned_z stays valid through the H2D copy (it goes
+      // out of scope at end of this iteration of the per-shard loop).
+      s.stream.synchronize();
+    }
+
+    // Local halo-exchange helpers that work directly on per-shard external
+    // buffers (the engine's halo_exchange_var/cstr expect accessors that
+    // resolve through pdhg_solver_t, which doesn't see our scratch).
+    auto halo_exchange_cstr_bufs = [&](std::vector<rmm::device_uvector<f_t>>& bufs) {
+      for (int r = 0; r < nb; ++r) {
+        auto& s = *shards[r];
+        raft::device_setter guard(s.device_id);
+        auto& y = bufs[r];
+        for (int peer = 0; peer < nb; ++peer) {
+          if (peer == r) continue;
+          if (s.cstr_send_indices_d[peer].size() == 0) continue;
+          thrust::gather(rmm::exec_policy_nosync(s.stream.view()),
+                         s.cstr_send_indices_d[peer].begin(),
+                         s.cstr_send_indices_d[peer].end(),
+                         y.begin(),
+                         s.cstr_send_buf_d[peer].begin());
+        }
+      }
+      ncclGroupStart();
+      for (int r = 0; r < nb; ++r) {
+        auto& s = *shards[r];
+        raft::device_setter guard(s.device_id);
+        for (int peer = 0; peer < nb; ++peer) {
+          if (peer == r) continue;
+          ncclSend(s.cstr_send_buf_d[peer].data(),
+                   s.cstr_send_buf_d[peer].size(),
+                   ncclFloat64,
+                   peer,
+                   s.comm.get(),
+                   s.stream.view().value());
+        }
+      }
+      for (int r = 0; r < nb; ++r) {
+        auto& s  = *shards[r];
+        auto& rd = s.rank_data;
+        raft::device_setter guard(s.device_id);
+        auto& y = bufs[r];
+        for (int peer = 0; peer < nb; ++peer) {
+          if (peer == r) continue;
+          f_t* recv_ptr = y.data() + rd.owned_cstr_size + rd.cstr_recv_offsets[peer];
+          ncclRecv(recv_ptr,
+                   static_cast<size_t>(rd.cstr_recv_counts[peer]),
+                   ncclFloat64,
+                   peer,
+                   s.comm.get(),
+                   s.stream.view().value());
+        }
+      }
+      ncclGroupEnd();
+    };
+    auto halo_exchange_var_bufs = [&](std::vector<rmm::device_uvector<f_t>>& bufs) {
+      for (int r = 0; r < nb; ++r) {
+        auto& s = *shards[r];
+        raft::device_setter guard(s.device_id);
+        auto& x = bufs[r];
+        for (int peer = 0; peer < nb; ++peer) {
+          if (peer == r) continue;
+          if (s.var_send_indices_d[peer].size() == 0) continue;
+          thrust::gather(rmm::exec_policy_nosync(s.stream.view()),
+                         s.var_send_indices_d[peer].begin(),
+                         s.var_send_indices_d[peer].end(),
+                         x.begin(),
+                         s.var_send_buf_d[peer].begin());
+        }
+      }
+      ncclGroupStart();
+      for (int r = 0; r < nb; ++r) {
+        auto& s = *shards[r];
+        raft::device_setter guard(s.device_id);
+        for (int peer = 0; peer < nb; ++peer) {
+          if (peer == r) continue;
+          ncclSend(s.var_send_buf_d[peer].data(),
+                   s.var_send_buf_d[peer].size(),
+                   ncclFloat64,
+                   peer,
+                   s.comm.get(),
+                   s.stream.view().value());
+        }
+      }
+      for (int r = 0; r < nb; ++r) {
+        auto& s  = *shards[r];
+        auto& rd = s.rank_data;
+        raft::device_setter guard(s.device_id);
+        auto& x = bufs[r];
+        for (int peer = 0; peer < nb; ++peer) {
+          if (peer == r) continue;
+          f_t* recv_ptr = x.data() + rd.owned_var_size + rd.var_recv_offsets[peer];
+          ncclRecv(recv_ptr,
+                   static_cast<size_t>(rd.var_recv_counts[peer]),
+                   ncclFloat64,
+                   peer,
+                   s.comm.get(),
+                   s.stream.view().value());
+        }
+      }
+      ncclGroupEnd();
+    };
+
+    // Per-shard partial reductions over the OWNED cstr slice + NCCL allreduce.
+    // For norm: out := sqrt(Σ_r ||bufs[r][0:owned_cstr]||²).
+    // For dot : out := Σ_r <a[r][0:owned_cstr], b[r][0:owned_cstr]>.
+    auto distributed_norm_owned_cstr = [&](std::vector<rmm::device_uvector<f_t>>& bufs,
+                                           std::vector<rmm::device_uvector<f_t>>& out) {
+      for (int r = 0; r < nb; ++r) {
+        auto& s = *shards[r];
+        raft::device_setter guard(s.device_id);
+        const i_t n_owned = s.rank_data.owned_cstr_size;
+        RAFT_CUBLAS_TRY(raft::linalg::detail::cublasdot(s.handle.get_cublas_handle(),
+                                                        static_cast<int>(n_owned),
+                                                        bufs[r].data(),
+                                                        1,
+                                                        bufs[r].data(),
+                                                        1,
+                                                        out[r].data(),
+                                                        s.stream.view().value()));
+      }
+      ncclGroupStart();
+      for (int r = 0; r < nb; ++r) {
+        auto& s = *shards[r];
+        raft::device_setter guard(s.device_id);
+        ncclAllReduce(out[r].data(),
+                      out[r].data(),
+                      1,
+                      ncclFloat64,
+                      ncclSum,
+                      s.comm.get(),
+                      s.stream.view().value());
+      }
+      ncclGroupEnd();
+      for (int r = 0; r < nb; ++r) {
+        auto& s = *shards[r];
+        raft::device_setter guard(s.device_id);
+        cub::DeviceTransform::Transform(
+          out[r].data(), out[r].data(), 1, sqrt_inplace_op_t<f_t>{}, s.stream.view().value());
+      }
+    };
+    auto distributed_dot_owned_cstr = [&](std::vector<rmm::device_uvector<f_t>>& a,
+                                          std::vector<rmm::device_uvector<f_t>>& b,
+                                          std::vector<rmm::device_uvector<f_t>>& out) {
+      for (int r = 0; r < nb; ++r) {
+        auto& s = *shards[r];
+        raft::device_setter guard(s.device_id);
+        const i_t n_owned = s.rank_data.owned_cstr_size;
+        RAFT_CUBLAS_TRY(raft::linalg::detail::cublasdot(s.handle.get_cublas_handle(),
+                                                        static_cast<int>(n_owned),
+                                                        a[r].data(),
+                                                        1,
+                                                        b[r].data(),
+                                                        1,
+                                                        out[r].data(),
+                                                        s.stream.view().value()));
+      }
+      ncclGroupStart();
+      for (int r = 0; r < nb; ++r) {
+        auto& s = *shards[r];
+        raft::device_setter guard(s.device_id);
+        ncclAllReduce(out[r].data(),
+                      out[r].data(),
+                      1,
+                      ncclFloat64,
+                      ncclSum,
+                      s.comm.get(),
+                      s.stream.view().value());
+      }
+      ncclGroupEnd();
+    };
+
+    // ===== Power iteration =====
+    // Mirrors single-GPU compute_initial_step_size: z is the carried iterate
+    // (A Aᵀ q each step); at the top of each iteration q := z then q is
+    // normalized; the residual z − σ²q is written back into q only to drive
+    // the convergence check (next iteration's q := z discards it).
+    for (int it = 0; it < max_iterations; ++it) {
+      // q := z on the owned slice (the carried iterate), then normalize.
+      for (int r = 0; r < nb; ++r) {
+        auto& s = *shards[r];
+        raft::device_setter guard(s.device_id);
+        const i_t n_owned = s.rank_data.owned_cstr_size;
+        raft::copy(q[r].data(), z[r].data(), n_owned, s.stream.view());
+      }
+
+      // ||q||₂ over the global OWNED cstr slice (one allreduce-sum + sqrt).
+      distributed_norm_owned_cstr(q, norm_q);
+
+      // q /= ||q||₂ on owned slice (halo gets refreshed by next exchange).
+      for (int r = 0; r < nb; ++r) {
+        auto& s = *shards[r];
+        raft::device_setter guard(s.device_id);
+        const i_t n_owned = s.rank_data.owned_cstr_size;
+        cub::DeviceTransform::Transform(
+          q[r].data(),
+          q[r].data(),
+          n_owned,
+          [n = norm_q[r].data()] __device__(f_t v) { return v / *n; },
+          s.stream.view().value());
+      }
+
+      // atq = A^T q : halo-exchange q, then per-shard SpMV. spmv_At_into
+      // rebinds the dual_solution dnvec to q[r].data() and restores the
+      // canonical binding after the call (see pdhg.cu:643-644).
+      halo_exchange_cstr_bufs(q);
+      for (int r = 0; r < nb; ++r) {
+        auto& s = *shards[r];
+        raft::device_setter guard(s.device_id);
+        s.sub_pdlp->pdhg_solver_.spmv_At_into(q[r], atq_dn[r]);
+      }
+
+      // z = A atq : halo-exchange atq, then per-shard SpMV.
+      halo_exchange_var_bufs(atq);
+      for (int r = 0; r < nb; ++r) {
+        auto& s = *shards[r];
+        raft::device_setter guard(s.device_id);
+        s.sub_pdlp->pdhg_solver_.spmv_A_into(atq[r], z_dn[r]);
+      }
+
+      // σ² = q · z over the global OWNED cstr slice (= q^T A A^T q = σ_max²
+      // when q is the dominant left-singular vector).
+      distributed_dot_owned_cstr(q, z, sigma_sq);
+
+      // q := -σ² q + z (owned slice) — residual of the eigen-equation.
+      for (int r = 0; r < nb; ++r) {
+        auto& s = *shards[r];
+        raft::device_setter guard(s.device_id);
+        const i_t n_owned = s.rank_data.owned_cstr_size;
+        cub::DeviceTransform::Transform(
+          cuda::std::make_tuple(q[r].data(), z[r].data()),
+          q[r].data(),
+          n_owned,
+          [s2 = sigma_sq[r].data()] __device__(f_t qv, f_t zv) { return -(*s2) * qv + zv; },
+          s.stream.view().value());
+      }
+
+      // Convergence check via global residual norm.
+      distributed_norm_owned_cstr(q, residual_norm);
+      auto& s0 = *shards[0];
+      raft::device_setter guard0(s0.device_id);
+      f_t h_res{};
+      RAFT_CUDA_TRY(cudaMemcpyAsync(&h_res,
+                                    residual_norm[0].data(),
+                                    sizeof(f_t),
+                                    cudaMemcpyDeviceToHost,
+                                    s0.stream.view().value()));
+      s0.stream.synchronize();
+      if (h_res < tolerance) break;
+    }
+
+    // σ_max² is the same on every shard after the last allreduce.
+    auto& s0 = *shards[0];
+    raft::device_setter guard0(s0.device_id);
+    f_t sigma_sq_h{};
+    RAFT_CUDA_TRY(cudaMemcpyAsync(&sigma_sq_h,
+                                  sigma_sq[0].data(),
+                                  sizeof(f_t),
+                                  cudaMemcpyDeviceToHost,
+                                  s0.stream.view().value()));
+    s0.stream.synchronize();
+
+    for (int r = 0; r < nb; ++r) {
+      raft::device_setter guard(shards[r]->device_id);
+      RAFT_CUSPARSE_TRY(cusparseDestroyDnVec(z_dn[r]));
+      RAFT_CUSPARSE_TRY(cusparseDestroyDnVec(atq_dn[r]));
+    }
+
+    return std::sqrt(std::max(sigma_sq_h, f_t(0)));
+  }
+
+  // -------- Solution gather (shards -> master) ----------------------------
+  // Assembles the global potential_next primal/dual solutions and the
+  // reduced_cost on the master from the owned slices distributed across
+  // shards. Each shard's first owned_var_size (resp. owned_cstr_size) entries
+  // of its potential_next_primal_solution_ / reduced_cost_ (resp.
+  // potential_next_dual_solution_) are the live, up-to-date owned values; the
+  // master buffers are not updated during iterations and would otherwise
+  // return stale data.
+  //
+  // Used right before fill_return_problem_solution() at the return sites in
+  // pdlp_solver_t::check_termination() and pdlp_solver_t::check_limits(): the
+  // user-visible solution must contain gathered global values for primal,
+  // dual, and reduced_cost.
+  //
+  // Mirrors the metis_tests engine::get_x_output / get_y_output pattern:
+  // per shard: alloc small host tmp, copy owned slice device->host, sync,
+  // host-scatter via rank_data.local_to_global_{var,cstr} into a contiguous
+  // host buffer. Then one host->device copy into the master buffer per field.
+  //
+  // master_pdhg          : provides destinations for primal / dual.
+  // master_reduced_cost  : destination for the reduced_cost (var-shaped, lives
+  //                        in the master pdlp_solver_t's termination strategy
+  //                        convergence_information_).
+  void gather_potential_next_solutions_to_master(pdhg_solver_t<i_t, f_t>& master_pdhg,
+                                                 rmm::device_uvector<f_t>& master_reduced_cost)
+  {
+    const std::size_t total_vars  = master_pdhg.get_potential_next_primal_solution().size();
+    const std::size_t total_cstrs = master_pdhg.get_potential_next_dual_solution().size();
+
+    std::vector<f_t> h_primal(total_vars);
+    std::vector<f_t> h_dual(total_cstrs);
+    std::vector<f_t> h_reduced_cost(total_vars);
+
+    for (auto& s_uptr : shards) {
+      auto& s = *s_uptr;
+      raft::device_setter guard(s.device_id);
+      const i_t nv = s.rank_data.owned_var_size;
+      const i_t nc = s.rank_data.owned_cstr_size;
+
+      std::vector<f_t> tmp_primal(nv);
+      std::vector<f_t> tmp_dual(nc);
+      std::vector<f_t> tmp_reduced_cost(nv);
+
+      auto& sub_reduced_cost = s.sub_pdlp->get_current_termination_strategy()
+                                 .get_convergence_information()
+                                 .get_reduced_cost();
+
+      if (nv > 0) {
+        RAFT_CUDA_TRY(
+          cudaMemcpyAsync(tmp_primal.data(),
+                          s.sub_pdlp->pdhg_solver_.get_potential_next_primal_solution().data(),
+                          static_cast<std::size_t>(nv) * sizeof(f_t),
+                          cudaMemcpyDeviceToHost,
+                          s.stream.view().value()));
+        RAFT_CUDA_TRY(cudaMemcpyAsync(tmp_reduced_cost.data(),
+                                      sub_reduced_cost.data(),
+                                      static_cast<std::size_t>(nv) * sizeof(f_t),
+                                      cudaMemcpyDeviceToHost,
+                                      s.stream.view().value()));
+      }
+      if (nc > 0) {
+        RAFT_CUDA_TRY(
+          cudaMemcpyAsync(tmp_dual.data(),
+                          s.sub_pdlp->pdhg_solver_.get_potential_next_dual_solution().data(),
+                          static_cast<std::size_t>(nc) * sizeof(f_t),
+                          cudaMemcpyDeviceToHost,
+                          s.stream.view().value()));
+      }
+      RAFT_CUDA_TRY(cudaStreamSynchronize(s.stream.view().value()));
+
+      if (nv > 0) {
+        thrust::scatter(thrust::host,
+                        tmp_primal.begin(),
+                        tmp_primal.end(),
+                        s.rank_data.local_to_global_var.begin(),
+                        h_primal.begin());
+        thrust::scatter(thrust::host,
+                        tmp_reduced_cost.begin(),
+                        tmp_reduced_cost.end(),
+                        s.rank_data.local_to_global_var.begin(),
+                        h_reduced_cost.begin());
+      }
+      if (nc > 0) {
+        thrust::scatter(thrust::host,
+                        tmp_dual.begin(),
+                        tmp_dual.end(),
+                        s.rank_data.local_to_global_cstr.begin(),
+                        h_dual.begin());
+      }
+    }
+
+    // Host -> master device. engine.stream lives on the master device
+    // (created at engine construction when master device was current).
+    RAFT_CUDA_TRY(cudaMemcpyAsync(master_pdhg.get_potential_next_primal_solution().data(),
+                                  h_primal.data(),
+                                  total_vars * sizeof(f_t),
+                                  cudaMemcpyHostToDevice,
+                                  stream.view().value()));
+    RAFT_CUDA_TRY(cudaMemcpyAsync(master_pdhg.get_potential_next_dual_solution().data(),
+                                  h_dual.data(),
+                                  total_cstrs * sizeof(f_t),
+                                  cudaMemcpyHostToDevice,
+                                  stream.view().value()));
+    RAFT_CUDA_TRY(cudaMemcpyAsync(master_reduced_cost.data(),
+                                  h_reduced_cost.data(),
+                                  total_vars * sizeof(f_t),
+                                  cudaMemcpyHostToDevice,
+                                  stream.view().value()));
+    RAFT_CUDA_TRY(cudaStreamSynchronize(stream.view().value()));
+  }
+
+  // Engine-level stream for fork/join orchestration (master side).
+  rmm::cuda_stream stream;
+
+  // Shards stored by unique_ptr because pdlp_shard_t is immovable
+  // (owns device-affine resources: handle, NCCL comm, RMM buffers).
+  std::vector<std::unique_ptr<pdlp_shard_t<i_t, f_t>>> shards;
+
+  // ===== Cross-stream synchronization events =====
+  // two different events
+  // capture_*_event_ are used inside graph capture
+  // ext_*_event_ are used when sync is needed outside of graph
+  std::unique_ptr<cuopt::event_handler_t> graph_master_ready_event_;
+  std::vector<std::unique_ptr<cuopt::event_handler_t>> graph_shard_ready_events_;
+  std::unique_ptr<cuopt::event_handler_t> sync_master_ready_event_;
+  std::vector<std::unique_ptr<cuopt::event_handler_t>> sync_shard_ready_events_;
+
+  // Forks master stream to shards, so that the captured graph can see the work on the shards
+  void graph_capture_fork_to_shards(rmm::cuda_stream_view master_stream)
+  {
+    graph_master_ready_event_->record(master_stream);
+    for (auto& s : shards) {
+      raft::device_setter guard(s->device_id);
+      graph_master_ready_event_->stream_wait(s->stream.view());
+    }
+  }
+
+  // Joins shards back to master stream for correct graph capture
+  void graph_capture_join_from_shards(rmm::cuda_stream_view master_stream)
+  {
+    const int nb = static_cast<int>(shards.size());
+    for (int r = 0; r < nb; ++r) {
+      raft::device_setter guard(shards[r]->device_id);
+      graph_shard_ready_events_[r]->record(shards[r]->stream.view());
+    }
+    for (auto& e : graph_shard_ready_events_) {
+      e->stream_wait(master_stream);
+    }
+  }
+
+  // Functionnaly same as graph_capture_fork_to_shards but on a different event to avoid race
+  // conditions Can be used as a way to sync shards with master stream
+  void sync_await_master(rmm::cuda_stream_view master_stream)
+  {
+    sync_master_ready_event_->record(master_stream);
+    for (auto& s : shards) {
+      raft::device_setter guard(s->device_id);
+      sync_master_ready_event_->stream_wait(s->stream.view());
+    }
+  }
+
+  // Same as sync_await_master
+  // Can be used as a way to sync master stream with shards
+  void sync_await_shards(rmm::cuda_stream_view master_stream)
+  {
+    const int nb = static_cast<int>(shards.size());
+    for (int r = 0; r < nb; ++r) {
+      raft::device_setter guard(shards[r]->device_id);
+      sync_shard_ready_events_[r]->record(shards[r]->stream.view());
+    }
+    for (auto& e : sync_shard_ready_events_) {
+      e->stream_wait(master_stream);
+    }
+  }
+};
+
+}  // namespace cuopt::linear_programming::detail
diff --git a/cpp/src/pdlp/distributed_pdlp/partition_loader.cu b/cpp/src/pdlp/distributed_pdlp/partition_loader.cu
new file mode 100644
index 0000000000..a6db3a9fe8
--- /dev/null
+++ b/cpp/src/pdlp/distributed_pdlp/partition_loader.cu
@@ -0,0 +1,262 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include <pdlp/distributed_pdlp/partition_loader.hpp>
+
+#include <cuopt/error.hpp>
+
+#include <fstream>
+#include <set>
+#include <utility>
+
+namespace cuopt::linear_programming::detail {
+
+template <typename i_t, typename f_t>
+std::vector<i_t> partition_loader_t<i_t, f_t>::parse_distributed_pdlp_partition_file(
+  std::string const& file)
+{
+  std::ifstream part_file(file);
+  cuopt_expects(part_file.is_open(),
+                error_type_t::ValidationError,
+                "Failed to open partition file: %s",
+                file.c_str());
+
+  // One integer per line; operator>> skips whitespace so blank lines and
+  // trailing newlines are tolerated.
+  std::vector<i_t> parts;
+  i_t part = 0;
+  while (part_file >> part) {
+    parts.push_back(part);
+  }
+
+  // We must have hit EOF cleanly; any other state means a malformed token.
+  cuopt_expects(part_file.eof(),
+                error_type_t::ValidationError,
+                "Malformed partition file (expected one integer per line): %s",
+                file.c_str());
+
+  return parts;
+}
+
+template <typename i_t, typename f_t>
+void partition_loader_t<i_t, f_t>::export_distributed_pdlp_partition_file(
+  std::string const& file, std::vector<i_t> const& parts)
+{
+  std::ofstream part_file(file);
+  cuopt_expects(part_file.is_open(),
+                error_type_t::ValidationError,
+                "Failed to open partition file for export: %s",
+                file.c_str());
+  for (auto const& part : parts) {
+    part_file << part << "\n";
+  }
+  cuopt_expects(part_file.good(),
+                error_type_t::RuntimeError,
+                "Failed while writing partition file: %s",
+                file.c_str());
+}
+
+template <typename i_t, typename f_t>
+std::vector<rank_data_t<i_t, f_t>> partition_loader_t<i_t, f_t>::create_rank_data_from_parts(
+  const std::vector<i_t>& parts,
+  const std::vector<i_t>& A_row_offsets,
+  const std::vector<i_t>& A_col_indices,
+  const std::vector<f_t>& A_values,
+  const std::vector<f_t>& A_values_scaled,
+  const std::vector<i_t>& A_t_row_offsets,
+  const std::vector<i_t>& A_t_col_indices,
+  const std::vector<f_t>& A_t_values,
+  const std::vector<f_t>& A_t_values_scaled,
+  i_t nb_parts,
+  i_t nb_cstr,
+  i_t nb_vars,
+  i_t nnz)
+{
+  cuopt_expects(A_values.size() == A_values_scaled.size(),
+                error_type_t::ValidationError,
+                "A_values and A_values_scaled must have the same length");
+  cuopt_expects(A_t_values.size() == A_t_values_scaled.size(),
+                error_type_t::ValidationError,
+                "A_t_values and A_t_values_scaled must have the same length");
+
+  std::vector<rank_data_t<i_t, f_t>> rank_data(nb_parts, rank_data_t<i_t, f_t>(nb_parts));
+  std::vector<i_t> cstr_parts(parts.begin(), parts.begin() + nb_cstr);
+  std::vector<i_t> var_parts(parts.begin() + nb_cstr, parts.begin() + nb_cstr + nb_vars);
+
+  // 1. Compute ownership
+  for (i_t i = 0; i < nb_cstr; i++) {
+    rank_data[cstr_parts[i]].owned_cstr_indices.push_back(i);
+  }
+  for (i_t i = 0; i < nb_vars; i++) {
+    rank_data[var_parts[i]].owned_var_indices.push_back(i);
+  }
+
+  // 2. Compute local matrices and rank_data
+  for (i_t rank = 0; rank < nb_parts; rank++) {
+    auto& rd           = rank_data[rank];
+    rd.owned_var_size  = rd.owned_var_indices.size();
+    rd.owned_cstr_size = rd.owned_cstr_indices.size();
+    // ---- A side ----
+    std::vector<i_t> local_A_row_offsets;
+    std::vector<i_t> local_A_col_indices;
+    std::vector<f_t> local_A_values;
+    std::vector<f_t> local_A_values_scaled;
+
+    i_t local_A_nnz = 0;
+    local_A_row_offsets.push_back(local_A_nnz);
+
+    // For each owned constraint, build local matrix A. We walk both the
+    // unscaled and scaled global value arrays in lockstep so the produced
+    // local arrays share identical (offsets, col_indices) and differ only
+    // in values.
+    for (auto owned_cstr : rd.owned_cstr_indices) {
+      i_t cstr_len  = A_row_offsets[owned_cstr + 1] - A_row_offsets[owned_cstr];
+      i_t row_start = A_row_offsets[owned_cstr];
+      for (i_t v = 0; v < cstr_len; v++) {
+        local_A_col_indices.push_back(A_col_indices[row_start + v]);
+        local_A_values.push_back(A_values[row_start + v]);
+        local_A_values_scaled.push_back(A_values_scaled[row_start + v]);
+      }
+      local_A_nnz += cstr_len;
+      local_A_row_offsets.push_back(local_A_nnz);
+    }
+
+    std::set<i_t> needed_vars;
+    for (auto indice : local_A_col_indices) {
+      if (var_parts[indice] != rank) needed_vars.insert(indice);
+    }
+
+    for (i_t peer = 0; peer < nb_parts; peer++) {
+      std::vector<i_t> needed_var_from_peer;
+      for (auto needed_var : needed_vars) {
+        if (var_parts[needed_var] == peer) needed_var_from_peer.push_back(needed_var);
+      }
+      i_t nb_recv_from_peer    = needed_var_from_peer.size();
+      rd.var_recv_counts[peer] = nb_recv_from_peer;
+      rd.var_recv_offsets[peer] =
+        peer == 0 ? 0 : rd.var_recv_offsets[peer - 1] + rd.var_recv_counts[peer - 1];
+      rank_data[peer].var_send_per_peer[rank] = std::move(needed_var_from_peer);
+    }
+
+    rd.h_A_row_offsets   = std::move(local_A_row_offsets);
+    rd.h_A_col_indices   = std::move(local_A_col_indices);
+    rd.h_A_values        = std::move(local_A_values);
+    rd.h_A_values_scaled = std::move(local_A_values_scaled);
+
+    // ---- A_t side ----
+    std::vector<i_t> local_A_t_row_offsets;
+    std::vector<i_t> local_A_t_col_indices;
+    std::vector<f_t> local_A_t_values;
+    std::vector<f_t> local_A_t_values_scaled;
+    i_t local_A_t_nnz = 0;
+    local_A_t_row_offsets.push_back(local_A_t_nnz);
+
+    for (auto owned_var : rd.owned_var_indices) {
+      i_t var_len   = A_t_row_offsets[owned_var + 1] - A_t_row_offsets[owned_var];
+      i_t row_start = A_t_row_offsets[owned_var];
+      for (i_t v = 0; v < var_len; v++) {
+        local_A_t_col_indices.push_back(A_t_col_indices[row_start + v]);
+        local_A_t_values.push_back(A_t_values[row_start + v]);
+        local_A_t_values_scaled.push_back(A_t_values_scaled[row_start + v]);
+      }
+      local_A_t_nnz += var_len;
+      local_A_t_row_offsets.push_back(local_A_t_nnz);
+    }
+
+    std::set<i_t> needed_cstrs;
+    for (auto indice : local_A_t_col_indices) {
+      if (cstr_parts[indice] != rank) needed_cstrs.insert(indice);
+    }
+
+    for (i_t peer = 0; peer < nb_parts; peer++) {
+      std::vector<i_t> needed_cstr_from_peer;
+      for (auto needed_cstr : needed_cstrs) {
+        if (cstr_parts[needed_cstr] == peer) needed_cstr_from_peer.push_back(needed_cstr);
+      }
+      i_t nb_recv_from_peer     = needed_cstr_from_peer.size();
+      rd.cstr_recv_counts[peer] = nb_recv_from_peer;
+      rd.cstr_recv_offsets[peer] =
+        peer == 0 ? 0 : rd.cstr_recv_offsets[peer - 1] + rd.cstr_recv_counts[peer - 1];
+      rank_data[peer].cstr_send_per_peer[rank] = std::move(needed_cstr_from_peer);
+    }
+
+    rd.h_A_t_row_offsets   = std::move(local_A_t_row_offsets);
+    rd.h_A_t_col_indices   = std::move(local_A_t_col_indices);
+    rd.h_A_t_values        = std::move(local_A_t_values);
+    rd.h_A_t_values_scaled = std::move(local_A_t_values_scaled);
+
+    rd.total_var_size  = rd.owned_var_size + needed_vars.size();
+    rd.total_cstr_size = rd.owned_cstr_size + needed_cstrs.size();
+
+    // Pad row-offset arrays so cuSPARSE sees the local matrices as
+    // (total_cstr x total_var) for A and (total_var x total_cstr) for A_T
+    const i_t a_last_nnz = rd.h_A_row_offsets.empty() ? i_t{0} : rd.h_A_row_offsets.back();
+    rd.h_A_row_offsets.resize(rd.total_cstr_size + 1, a_last_nnz);
+
+    const i_t at_last_nnz = rd.h_A_t_row_offsets.empty() ? i_t{0} : rd.h_A_t_row_offsets.back();
+    rd.h_A_t_row_offsets.resize(rd.total_var_size + 1, at_last_nnz);
+  }
+
+  // 3. Generate local indices for contiguous [[self], [peer1], ..., [peer_k]]
+  //    Build scatter_gather_maps
+  for (i_t rank = 0; rank < nb_parts; rank++) {
+    auto& rd = rank_data[rank];
+
+    i_t curr_id = 0;
+    for (auto owned_cstr : rd.owned_cstr_indices) {
+      rd.global_to_local_cstr[owned_cstr] = curr_id;
+      rd.local_to_global_cstr.push_back(owned_cstr);
+      curr_id++;
+    }
+    for (i_t peer = 0; peer < nb_parts; peer++) {
+      if (peer == rank) continue;
+      for (auto recv_cstr : rank_data[peer].cstr_send_per_peer[rank]) {
+        rd.global_to_local_cstr[recv_cstr] = curr_id;
+        rd.local_to_global_cstr.push_back(recv_cstr);
+        curr_id++;
+      }
+    }
+
+    curr_id = 0;
+    for (auto owned_var : rd.owned_var_indices) {
+      rd.global_to_local_var[owned_var] = curr_id;
+      rd.local_to_global_var.push_back(owned_var);
+      curr_id++;
+    }
+    for (i_t peer = 0; peer < nb_parts; peer++) {
+      if (peer == rank) continue;
+      for (auto recv_var : rank_data[peer].var_send_per_peer[rank]) {
+        rd.global_to_local_var[recv_var] = curr_id;
+        rd.local_to_global_var.push_back(recv_var);
+        curr_id++;
+      }
+    }
+  }
+
+  // 4. Remap global -> local everywhere
+  for (i_t rank = 0; rank < nb_parts; rank++) {
+    auto& rd = rank_data[rank];
+
+    for (auto& send_vec : rd.var_send_per_peer) {
+      for (auto& v : send_vec)
+        v = rd.global_to_local_var.at(v);
+    }
+    for (auto& send_vec : rd.cstr_send_per_peer) {
+      for (auto& v : send_vec)
+        v = rd.global_to_local_cstr.at(v);
+    }
+
+    for (auto& v : rd.h_A_col_indices)
+      v = rd.global_to_local_var.at(v);
+    for (auto& v : rd.h_A_t_col_indices)
+      v = rd.global_to_local_cstr.at(v);
+  }
+
+  return rank_data;
+}
+
+template struct partition_loader_t<int, double>;
+
+}  // namespace cuopt::linear_programming::detail
diff --git a/cpp/src/pdlp/distributed_pdlp/partition_loader.hpp b/cpp/src/pdlp/distributed_pdlp/partition_loader.hpp
new file mode 100644
index 0000000000..ce12d241f9
--- /dev/null
+++ b/cpp/src/pdlp/distributed_pdlp/partition_loader.hpp
@@ -0,0 +1,45 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#pragma once
+
+#include <pdlp/distributed_pdlp/rank_data.hpp>
+
+#include <string>
+#include <vector>
+
+namespace cuopt::linear_programming::detail {
+
+template <typename i_t, typename f_t>
+struct partition_loader_t {
+  // Read a Metis-style partition file: one part-id per line (whitespace-tolerant),
+  // ASCII integers in [0, nb_parts). Returns a flat vector of length
+  // nb_cstr + nb_vars, indexed as in create_rank_data_from_parts (cstrs first, then vars).
+  static std::vector<i_t> parse_distributed_pdlp_partition_file(std::string const& file);
+
+  // Write a partition vector to file in the same format parse_... reads back:
+  // one part-id per line. Useful for inspecting / reusing a computed partition
+  // (e.g. CLI --distributed-pdlp-export-parts).
+  static void export_distributed_pdlp_partition_file(std::string const& file,
+                                                     std::vector<i_t> const& parts);
+
+  // Slices the data to prepare a split from metis partitionning with halo communication
+  static std::vector<rank_data_t<i_t, f_t>> create_rank_data_from_parts(
+    const std::vector<i_t>& parts,
+    const std::vector<i_t>& A_row_offsets,
+    const std::vector<i_t>& A_col_indices,
+    const std::vector<f_t>& A_values,
+    const std::vector<f_t>& A_values_scaled,
+    const std::vector<i_t>& A_t_row_offsets,
+    const std::vector<i_t>& A_t_col_indices,
+    const std::vector<f_t>& A_t_values,
+    const std::vector<f_t>& A_t_values_scaled,
+    i_t nb_parts,
+    i_t nb_cstr,
+    i_t nb_vars,
+    i_t nnz);
+};
+
+}  // namespace cuopt::linear_programming::detail
diff --git a/cpp/src/pdlp/distributed_pdlp/partitioner.cu b/cpp/src/pdlp/distributed_pdlp/partitioner.cu
new file mode 100644
index 0000000000..727a8b56f9
--- /dev/null
+++ b/cpp/src/pdlp/distributed_pdlp/partitioner.cu
@@ -0,0 +1,87 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include <pdlp/distributed_pdlp/kaminpar_partitioner.hpp>
+#include <pdlp/distributed_pdlp/metis_partitioner.hpp>
+#include <pdlp/distributed_pdlp/partitioner.hpp>
+
+#include <cuopt/error.hpp>
+
+#include <algorithm>
+#include <cstddef>
+
+namespace cuopt::linear_programming::detail {
+
+template <typename i_t, typename f_t>
+std::vector<i_t> dummy_partitioner_t<i_t, f_t>::partition(
+  partitioner_input_t<i_t, f_t> const& input) const
+{
+  cuopt_expects(input.nb_parts > 0,
+                error_type_t::ValidationError,
+                "dummy_partitioner: nb_parts must be positive");
+  cuopt_expects(input.nb_cstr >= 0 && input.nb_vars >= 0,
+                error_type_t::ValidationError,
+                "dummy_partitioner: invalid problem dimensions");
+
+  const std::size_t nvtx =
+    static_cast<std::size_t>(input.nb_cstr) + static_cast<std::size_t>(input.nb_vars);
+  std::vector<i_t> parts(nvtx);
+  for (std::size_t i = 0; i < nvtx; ++i) {
+    parts[i] = static_cast<i_t>(i % static_cast<std::size_t>(input.nb_parts));
+  }
+  validate_partition(parts,
+                     static_cast<int>(input.nb_cstr),
+                     static_cast<int>(input.nb_vars),
+                     static_cast<int>(input.nb_parts),
+                     "dummy_partitioner");
+  return parts;
+}
+
+void validate_partition(
+  std::vector<int> const& parts, int nb_cstr, int nb_vars, int nb_parts, char const* context)
+{
+  const std::size_t expected =
+    static_cast<std::size_t>(nb_cstr) + static_cast<std::size_t>(nb_vars);
+  cuopt_expects(parts.size() == expected,
+                error_type_t::ValidationError,
+                "%s: expected %zu part entries (cstrs + vars), got %zu",
+                context,
+                expected,
+                parts.size());
+  cuopt_expects(
+    nb_parts > 0, error_type_t::ValidationError, "%s: nb_parts must be positive", context);
+  if (parts.empty()) { return; }
+  const auto [min_it, max_it] = std::minmax_element(parts.begin(), parts.end());
+  cuopt_expects(*min_it >= 0,
+                error_type_t::ValidationError,
+                "%s: partition ids must be non-negative (min=%d)",
+                context,
+                static_cast<int>(*min_it));
+  cuopt_expects(*max_it < nb_parts,
+                error_type_t::ValidationError,
+                "%s: partition ids must be in [0, %d) (max=%d)",
+                context,
+                static_cast<int>(nb_parts),
+                static_cast<int>(*max_it));
+}
+
+template <typename i_t, typename f_t>
+std::unique_ptr<partitioner_i<i_t, f_t>> make_partitioner(partitioner_kind_t kind)
+{
+  switch (kind) {
+    case partitioner_kind_t::Dummy: return std::make_unique<dummy_partitioner_t<i_t, f_t>>();
+    case partitioner_kind_t::Metis: return std::make_unique<metis_partitioner_t<i_t, f_t>>();
+    case partitioner_kind_t::KaMinPar: return std::make_unique<kaminpar_partitioner_t<i_t, f_t>>();
+  }
+  cuopt_expects(
+    false, error_type_t::RuntimeError, "make_partitioner: unsupported partitioner kind");
+  return nullptr;
+}
+
+template class dummy_partitioner_t<int, double>;
+template std::unique_ptr<partitioner_i<int, double>> make_partitioner<int, double>(
+  partitioner_kind_t);
+
+}  // namespace cuopt::linear_programming::detail
diff --git a/cpp/src/pdlp/distributed_pdlp/partitioner.hpp b/cpp/src/pdlp/distributed_pdlp/partitioner.hpp
new file mode 100644
index 0000000000..70b2e34c06
--- /dev/null
+++ b/cpp/src/pdlp/distributed_pdlp/partitioner.hpp
@@ -0,0 +1,70 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <vector>
+
+namespace cuopt::linear_programming::detail {
+
+// Non-owning view of a host CSR matrix (A or A_t).
+template <typename i_t, typename f_t>
+struct csr_host_view_t {
+  std::vector<i_t> const* row_offsets{nullptr};
+  std::vector<i_t> const* col_indices{nullptr};
+  std::vector<f_t> const* values{nullptr};  // optional; unused by topology-only partitioners
+  i_t num_rows{0};
+  i_t num_cols{0};
+};
+
+// Inputs shared by all distributed-PDLP partitioners.
+// Returns a flat vector of length (nb_cstr + nb_vars): constraint part-ids first,
+// then variable part-ids, each in [0, nb_parts).
+template <typename i_t, typename f_t>
+struct partitioner_input_t {
+  i_t nb_cstr{0};
+  i_t nb_vars{0};
+  i_t nb_parts{0};
+  // Number of CPU threads the partitioner may use. Only honored by the
+  // multi-threaded KaMinPar backend; <= 0 means "auto" (all hardware threads).
+  // Serial backends (METIS, Dummy) ignore it.
+  i_t nb_threads{0};
+  // Constraint matrix A (rows = constraints, cols = variables).
+  csr_host_view_t<i_t, f_t> A{};
+  // Transpose A_t (rows = variables, cols = constraints). Optional for partitioners
+  // that build a bipartite graph (e.g. METIS); dummy partitioner ignores both matrices.
+  csr_host_view_t<i_t, f_t> A_t{};
+};
+
+// Dummy: round-robin, no graph (single-shard / debugging).
+// Metis: serial METIS_PartGraphKway.
+// KaMinPar: multi-threaded KaMinPar (preferred for multi-shard partitioning).
+enum class partitioner_kind_t { Dummy, Metis, KaMinPar };
+
+template <typename i_t, typename f_t>
+class partitioner_i {
+ public:
+  virtual ~partitioner_i()                                                             = default;
+  virtual std::vector<i_t> partition(partitioner_input_t<i_t, f_t> const& input) const = 0;
+};
+
+template <typename i_t, typename f_t>
+class dummy_partitioner_t : public partitioner_i<i_t, f_t> {
+ public:
+  std::vector<i_t> partition(partitioner_input_t<i_t, f_t> const& input) const override;
+};
+
+void validate_partition(std::vector<int> const& parts,
+                        int nb_cstr,
+                        int nb_vars,
+                        int nb_parts,
+                        char const* context = "partition");
+
+template <typename i_t, typename f_t>
+std::unique_ptr<partitioner_i<i_t, f_t>> make_partitioner(partitioner_kind_t kind);
+
+}  // namespace cuopt::linear_programming::detail
diff --git a/cpp/src/pdlp/distributed_pdlp/rank_data.hpp b/cpp/src/pdlp/distributed_pdlp/rank_data.hpp
new file mode 100644
index 0000000000..d52d277116
--- /dev/null
+++ b/cpp/src/pdlp/distributed_pdlp/rank_data.hpp
@@ -0,0 +1,61 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#pragma once
+
+#include <unordered_map>
+#include <vector>
+
+namespace cuopt::linear_programming::detail {
+template <typename i_t, typename f_t>
+struct rank_data_t {
+  rank_data_t(std::size_t nb_parts)
+    : var_send_per_peer(nb_parts),
+      cstr_send_per_peer(nb_parts),
+      var_recv_counts(nb_parts, 0),
+      var_recv_offsets(nb_parts, 0),
+      cstr_recv_counts(nb_parts, 0),
+      cstr_recv_offsets(nb_parts, 0)
+  {
+  }
+
+  i_t owned_var_size{0};
+  i_t total_var_size{0};
+  i_t owned_cstr_size{0};
+  i_t total_cstr_size{0};
+
+  // === Ownership ===
+  std::vector<i_t> owned_var_indices;
+  std::vector<i_t> owned_cstr_indices;
+
+  // === Send plan: per peer, indices to gather + send ===
+  std::vector<std::vector<i_t>> var_send_per_peer;
+  std::vector<std::vector<i_t>> cstr_send_per_peer;
+
+  // === Recv plan: per peer, contiguous slot in halo region ===
+  std::vector<i_t> var_recv_counts;
+  std::vector<i_t> var_recv_offsets;
+  std::vector<i_t> cstr_recv_counts;
+  std::vector<i_t> cstr_recv_offsets;
+
+  // === Mappings ===
+  std::unordered_map<i_t, i_t> global_to_local_var;
+  std::unordered_map<i_t, i_t> global_to_local_cstr;
+  std::vector<i_t> local_to_global_var;
+  std::vector<i_t> local_to_global_cstr;
+
+  // === Local host CSR matrices ===
+  // A
+  std::vector<i_t> h_A_row_offsets;
+  std::vector<i_t> h_A_col_indices;
+  std::vector<f_t> h_A_values;
+  std::vector<f_t> h_A_values_scaled;
+  // A_t
+  std::vector<i_t> h_A_t_row_offsets;
+  std::vector<i_t> h_A_t_col_indices;
+  std::vector<f_t> h_A_t_values;
+  std::vector<f_t> h_A_t_values_scaled;
+};
+}  // namespace cuopt::linear_programming::detail
diff --git a/cpp/src/pdlp/distributed_pdlp/shard.cu b/cpp/src/pdlp/distributed_pdlp/shard.cu
new file mode 100644
index 0000000000..356e10a03c
--- /dev/null
+++ b/cpp/src/pdlp/distributed_pdlp/shard.cu
@@ -0,0 +1,242 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include <pdlp/distributed_pdlp/shard.hpp>
+#include <pdlp/pdlp.cuh>
+#include <pdlp/utils.cuh>
+
+#include <utilities/copy_helpers.hpp>
+
+#include <raft/core/copy.hpp>
+#include <raft/core/device_setter.hpp>
+
+#include <cassert>
+#include <limits>
+
+namespace cuopt::linear_programming::detail {
+
+// This must be done in .cu file because the pdlp_solver_t is not already complete in the hpp file
+template <typename i_t, typename f_t>
+pdlp_shard_t<i_t, f_t>::~pdlp_shard_t() = default;
+
+template <typename i_t, typename f_t>
+pdlp_shard_t<i_t, f_t>::pdlp_shard_t(int device_id,
+                                     rank_data_t<i_t, f_t>&& rd,
+                                     ncclComm_t raw_comm,
+                                     std::vector<f_t> const& h_global_obj,
+                                     std::vector<f_t> const& h_global_var_lower,
+                                     std::vector<f_t> const& h_global_var_upper,
+                                     std::vector<f_t> const& h_global_cstr_lower,
+                                     std::vector<f_t> const& h_global_cstr_upper,
+                                     std::vector<f_t> const& h_global_obj_scaled,
+                                     std::vector<f_t> const& h_global_var_lower_scaled,
+                                     std::vector<f_t> const& h_global_var_upper_scaled,
+                                     std::vector<f_t> const& h_global_cstr_lower_scaled,
+                                     std::vector<f_t> const& h_global_cstr_upper_scaled,
+                                     std::vector<f_t> const& h_global_cummulative_cstr_scaling,
+                                     std::vector<f_t> const& h_global_cummulative_var_scaling,
+                                     f_t h_bound_rescaling,
+                                     f_t h_objective_rescaling,
+                                     bool maximize,
+                                     f_t objective_offset,
+                                     f_t objective_scaling_factor,
+                                     pdlp_solver_settings_t<i_t, f_t> const& settings)
+  : device_id(device_id),
+    stream(),
+    handle(stream.view()),
+    comm(raw_comm, nccl_comm_deleter_t{device_id}),
+    rank_data(std::move(rd)),
+    opt_problem(std::nullopt),
+    sub_problem(std::nullopt),
+    sub_pdlp(nullptr)
+{
+  assert(raft::device_setter::get_current_device() == device_id &&
+         "Right device must be set before building the shard");
+
+  // ---- 1. Gather per-shard host slices using rank_data's index maps. ----
+  // All vectors are sized to TOTAL (owned + halo). Owned slots get real
+  // values; halo slots keep defaults because they should not be accessed
+  std::vector<f_t> h_obj(rank_data.total_var_size, f_t{0});
+  std::vector<f_t> h_var_lower(rank_data.total_var_size, -std::numeric_limits<f_t>::infinity());
+  std::vector<f_t> h_var_upper(rank_data.total_var_size, std::numeric_limits<f_t>::infinity());
+  std::vector<f_t> h_cstr_lower(rank_data.total_cstr_size, -std::numeric_limits<f_t>::infinity());
+  std::vector<f_t> h_cstr_upper(rank_data.total_cstr_size, std::numeric_limits<f_t>::infinity());
+
+  std::vector<f_t> h_obj_scaled(rank_data.total_var_size, f_t{0});
+  std::vector<f_t> h_var_lower_scaled(rank_data.total_var_size,
+                                      -std::numeric_limits<f_t>::infinity());
+  std::vector<f_t> h_var_upper_scaled(rank_data.total_var_size,
+                                      std::numeric_limits<f_t>::infinity());
+  std::vector<f_t> h_cstr_lower_scaled(rank_data.total_cstr_size,
+                                       -std::numeric_limits<f_t>::infinity());
+  std::vector<f_t> h_cstr_upper_scaled(rank_data.total_cstr_size,
+                                       std::numeric_limits<f_t>::infinity());
+
+  for (i_t i = 0; i < rank_data.owned_var_size; ++i) {
+    const auto g          = rank_data.local_to_global_var[i];
+    h_obj[i]              = h_global_obj[g];
+    h_var_lower[i]        = h_global_var_lower[g];
+    h_var_upper[i]        = h_global_var_upper[g];
+    h_obj_scaled[i]       = h_global_obj_scaled[g];
+    h_var_lower_scaled[i] = h_global_var_lower_scaled[g];
+    h_var_upper_scaled[i] = h_global_var_upper_scaled[g];
+  }
+  for (i_t i = 0; i < rank_data.owned_cstr_size; ++i) {
+    const auto g           = rank_data.local_to_global_cstr[i];
+    h_cstr_lower[i]        = h_global_cstr_lower[g];
+    h_cstr_upper[i]        = h_global_cstr_upper[g];
+    h_cstr_lower_scaled[i] = h_global_cstr_lower_scaled[g];
+    h_cstr_upper_scaled[i] = h_global_cstr_upper_scaled[g];
+  }
+
+  // Get local scaling factors
+  std::vector<f_t> h_cstr_scaling_local(rank_data.total_cstr_size, f_t{1});
+  std::vector<f_t> h_var_scaling_local(rank_data.total_var_size, f_t{1});
+  for (i_t i = 0; i < rank_data.owned_cstr_size; ++i) {
+    h_cstr_scaling_local[i] = h_global_cummulative_cstr_scaling[rank_data.local_to_global_cstr[i]];
+  }
+  for (i_t i = 0; i < rank_data.owned_var_size; ++i) {
+    h_var_scaling_local[i] = h_global_cummulative_var_scaling[rank_data.local_to_global_var[i]];
+  }
+
+  // ---- 2. Build optimization_problem_t on this shard's device (UNSCALED). ----
+  opt_problem.emplace(&handle);
+  opt_problem->set_csr_constraint_matrix(rank_data.h_A_values.data(),
+                                         static_cast<i_t>(rank_data.h_A_values.size()),
+                                         rank_data.h_A_col_indices.data(),
+                                         static_cast<i_t>(rank_data.h_A_col_indices.size()),
+                                         rank_data.h_A_row_offsets.data(),
+                                         static_cast<i_t>(rank_data.h_A_row_offsets.size()));
+
+  // Primal axis: TOTAL (owned + halo). Halo slots have neutral defaults.
+  opt_problem->set_objective_coefficients(h_obj.data(), rank_data.total_var_size);
+  opt_problem->set_variable_lower_bounds(h_var_lower.data(), rank_data.total_var_size);
+  opt_problem->set_variable_upper_bounds(h_var_upper.data(), rank_data.total_var_size);
+
+  // Dual axis: TOTAL (owned + halo). Halo slots have ±inf so trivially satisfied.
+  opt_problem->set_constraint_lower_bounds(h_cstr_lower.data(), rank_data.total_cstr_size);
+  opt_problem->set_constraint_upper_bounds(h_cstr_upper.data(), rank_data.total_cstr_size);
+
+  opt_problem->set_maximize(maximize);
+  opt_problem->set_objective_offset(objective_offset);
+  opt_problem->set_objective_scaling_factor(objective_scaling_factor);
+  opt_problem->set_problem_category(problem_category_t::LP);
+
+  // ---- 3. Build problem_t from opt_problem (still UNSCALED). ----
+  sub_problem.emplace(*opt_problem);
+
+  // ---- 4. Override reverse_* with the real local A_T from rank_data. ----
+  // problem_t's ctor computes the transpose of the LOCAL A, which is wrong
+  // in multi-GPU: A_local is owned_cstr x total_var, and A_t_local is the
+  // pre-sliced owned_var x total_cstr matrix we built during partitioning.
+  auto stream_view = handle.get_stream();
+  sub_problem->reverse_offsets.resize(rank_data.h_A_t_row_offsets.size(), stream_view);
+  sub_problem->reverse_constraints.resize(rank_data.h_A_t_col_indices.size(), stream_view);
+  sub_problem->reverse_coefficients.resize(rank_data.h_A_t_values.size(), stream_view);
+  raft::copy(sub_problem->reverse_offsets.data(),
+             rank_data.h_A_t_row_offsets.data(),
+             rank_data.h_A_t_row_offsets.size(),
+             stream_view);
+  raft::copy(sub_problem->reverse_constraints.data(),
+             rank_data.h_A_t_col_indices.data(),
+             rank_data.h_A_t_col_indices.size(),
+             stream_view);
+  raft::copy(sub_problem->reverse_coefficients.data(),
+             rank_data.h_A_t_values.data(),
+             rank_data.h_A_t_values.size(),
+             stream_view);
+  handle.sync_stream(stream_view);
+
+  // ---- 5. Build sub_pdlp (single-GPU mode; multi_gpu flags cleared by caller). ----
+  sub_pdlp = std::make_unique<pdlp_solver_t<i_t, f_t>>(
+    *sub_problem, settings, /*is_legacy_batch_mode=*/false);
+
+  sub_pdlp->pdhg_solver_.set_is_multi_gpu(true);
+
+  // Re-inject master-scaled buffers inside sub_pdlp.
+  // Need to also re-inject the offsets and variables arrays to revert
+  // the csrsort done by problem_t's constructor.
+  auto& scaled = sub_pdlp->get_op_problem_scaled();
+  raft::copy(scaled.offsets.data(),
+             rank_data.h_A_row_offsets.data(),
+             rank_data.h_A_row_offsets.size(),
+             stream_view);
+  raft::copy(scaled.variables.data(),
+             rank_data.h_A_col_indices.data(),
+             rank_data.h_A_col_indices.size(),
+             stream_view);
+  raft::copy(scaled.coefficients.data(),
+             rank_data.h_A_values_scaled.data(),
+             rank_data.h_A_values_scaled.size(),
+             stream_view);
+  // A_T side: all three arrays were already overridden together from
+  // rank_data on sub_problem (see step 4 above) and deep-copied into the
+  // scaled problem, so reverse_offsets / reverse_constraints already match
+  // h_A_t_values_scaled's order. Only the values need a SCALED swap-in.
+  raft::copy(scaled.reverse_coefficients.data(),
+             rank_data.h_A_t_values_scaled.data(),
+             rank_data.h_A_t_values_scaled.size(),
+             stream_view);
+  raft::copy(
+    scaled.objective_coefficients.data(), h_obj_scaled.data(), h_obj_scaled.size(), stream_view);
+  raft::copy(scaled.constraint_lower_bounds.data(),
+             h_cstr_lower_scaled.data(),
+             h_cstr_lower_scaled.size(),
+             stream_view);
+  raft::copy(scaled.constraint_upper_bounds.data(),
+             h_cstr_upper_scaled.data(),
+             h_cstr_upper_scaled.size(),
+             stream_view);
+
+  using f_t2 = typename type_2<f_t>::type;
+  std::vector<f_t2> h_var_bounds_scaled_packed(rank_data.total_var_size);
+  for (i_t i = 0; i < rank_data.total_var_size; ++i) {
+    h_var_bounds_scaled_packed[i].x = h_var_lower_scaled[i];
+    h_var_bounds_scaled_packed[i].y = h_var_upper_scaled[i];
+  }
+  raft::copy(scaled.variable_bounds.data(),
+             h_var_bounds_scaled_packed.data(),
+             h_var_bounds_scaled_packed.size(),
+             stream_view);
+
+  combine_constraint_bounds<i_t, f_t>(scaled, scaled.combined_bounds);
+
+  // Inject master-scaled buffers inside sub_pdlp.initil_strategy
+  auto& scaling = sub_pdlp->get_initial_scaling_strategy();
+  scaling.set_cummulative_scaling(h_cstr_scaling_local, h_var_scaling_local);
+  scaling.set_h_bound_rescaling(h_bound_rescaling);
+  scaling.set_h_objective_rescaling(h_objective_rescaling);
+
+  sub_pdlp->pdhg_solver_.get_cusparse_view().create_spmv_op_plans(
+    /* is_reflected */ true);
+  // ---- 6. Build per-peer halo-exchange plans (ported from metis_tests). ----
+  // For each peer p, we precompute:
+  //   send_indices_d[p] : local indices to gather (uploaded from host send plan)
+  //   send_buf_d[p]     : f_t staging buffer sized to match
+  // Self-peer slot is present but empty (size 0). Used in engine halo exchange.
+  auto build_send_plan = [&](auto const& send_per_peer, auto& indices_d, auto& buf_d) {
+    const std::size_t n_peers = send_per_peer.size();
+    indices_d.reserve(n_peers);
+    buf_d.reserve(n_peers);
+    for (auto const& send_to_peer : send_per_peer) {
+      rmm::device_uvector<i_t> idx(send_to_peer.size(), stream_view);
+      rmm::device_uvector<f_t> buf(send_to_peer.size(), stream_view);
+      if (!send_to_peer.empty()) {
+        raft::copy(idx.data(), send_to_peer.data(), send_to_peer.size(), stream_view);
+      }
+      indices_d.emplace_back(std::move(idx));
+      buf_d.emplace_back(std::move(buf));
+    }
+  };
+  build_send_plan(rank_data.var_send_per_peer, var_send_indices_d, var_send_buf_d);
+  build_send_plan(rank_data.cstr_send_per_peer, cstr_send_indices_d, cstr_send_buf_d);
+
+  handle.sync_stream(stream_view);
+}
+
+template struct pdlp_shard_t<int, double>;
+template struct pdlp_shard_t<int, float>;
+
+}  // namespace cuopt::linear_programming::detail
diff --git a/cpp/src/pdlp/distributed_pdlp/shard.hpp b/cpp/src/pdlp/distributed_pdlp/shard.hpp
new file mode 100644
index 0000000000..35babc12db
--- /dev/null
+++ b/cpp/src/pdlp/distributed_pdlp/shard.hpp
@@ -0,0 +1,99 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#pragma once
+
+#include <pdlp/distributed_pdlp/rank_data.hpp>
+
+#include <cuopt/linear_programming/optimization_problem.hpp>
+#include <cuopt/linear_programming/pdlp/solver_settings.hpp>
+#include <mip_heuristics/problem/problem.cuh>
+
+#include <raft/core/device_setter.hpp>
+#include <raft/core/handle.hpp>
+#include <rmm/cuda_stream.hpp>
+#include <rmm/device_uvector.hpp>
+
+#include <nccl.h>
+
+#include <memory>
+#include <optional>
+#include <vector>
+
+namespace cuopt::linear_programming::detail {
+
+// Forward-declare to break the cyclic include with pdlp.cuh
+// (pdlp.cuh -> multi_gpu_engine.hpp -> shard.hpp -> pdlp.cuh).
+// Definitions of out-of-line members live in shard.cu, which includes pdlp.cuh.
+template <typename i_t, typename f_t>
+class pdlp_solver_t;
+
+// RAII deleter for ncclComm_t; sets the right device before destroy.
+struct nccl_comm_deleter_t {
+  int device_id{-1};
+  void operator()(ncclComm* comm) const noexcept
+  {
+    if (comm == nullptr) return;
+    raft::device_setter guard(device_id);
+    ncclCommDestroy(comm);
+  }
+};
+using nccl_comm_unique_ptr_t = std::unique_ptr<ncclComm, nccl_comm_deleter_t>;
+
+template <typename i_t, typename f_t>
+struct pdlp_shard_t {
+  // Out-of-line (in shard.cu) because pdlp_solver_t is incomplete here.
+  ~pdlp_shard_t();
+
+  // sub worker for distributed pdlp. Owns its own view on scaled problem and unscaled problem
+  // Owns necessary multi-gpu data (rank_data, device_id, nccl_comm)
+  pdlp_shard_t(int device_id,
+               rank_data_t<i_t, f_t>&& rd,
+               ncclComm_t raw_comm,
+               std::vector<f_t> const& h_global_obj,
+               std::vector<f_t> const& h_global_var_lower,
+               std::vector<f_t> const& h_global_var_upper,
+               std::vector<f_t> const& h_global_cstr_lower,
+               std::vector<f_t> const& h_global_cstr_upper,
+               std::vector<f_t> const& h_global_obj_scaled,
+               std::vector<f_t> const& h_global_var_lower_scaled,
+               std::vector<f_t> const& h_global_var_upper_scaled,
+               std::vector<f_t> const& h_global_cstr_lower_scaled,
+               std::vector<f_t> const& h_global_cstr_upper_scaled,
+               std::vector<f_t> const& h_global_cummulative_cstr_scaling,
+               std::vector<f_t> const& h_global_cummulative_var_scaling,
+               f_t h_bound_rescaling,
+               f_t h_objective_rescaling,
+               bool maximize,
+               f_t objective_offset,
+               f_t objective_scaling_factor,
+               pdlp_solver_settings_t<i_t, f_t> const& settings);
+
+  pdlp_shard_t(const pdlp_shard_t&)            = delete;
+  pdlp_shard_t& operator=(const pdlp_shard_t&) = delete;
+  // Move ops are implicitly deleted (user-declared dtor + deleted copy).
+  // Intentional: shard owns device-affine resources and must never move.
+  // Store as std::unique_ptr in any container.
+
+  int device_id;
+  rmm::cuda_stream stream;
+  raft::handle_t handle;
+  nccl_comm_unique_ptr_t comm;
+  rank_data_t<i_t, f_t> rank_data;
+  std::optional<optimization_problem_t<i_t, f_t>> opt_problem;
+  std::optional<problem_t<i_t, f_t>> sub_problem;
+  std::unique_ptr<pdlp_solver_t<i_t, f_t>> sub_pdlp;
+
+  // Per-peer halo-exchange state. Inner index = peer rank.
+  // Slot for self (peer == this rank) is present but unused (size 0).
+  // var_send_indices_d[peer] : local indices into primal vector to gather and ncclSend
+  // var_send_buf_d    [peer] : staging buffer for outgoing variable values
+  // cstr_send_indices_d/cstr_send_buf_d : same, for dual vector
+  std::vector<rmm::device_uvector<i_t>> var_send_indices_d;
+  std::vector<rmm::device_uvector<f_t>> var_send_buf_d;
+  std::vector<rmm::device_uvector<i_t>> cstr_send_indices_d;
+  std::vector<rmm::device_uvector<f_t>> cstr_send_buf_d;
+};
+
+}  // namespace cuopt::linear_programming::detail
diff --git a/cpp/src/pdlp/initial_scaling_strategy/initial_scaling.cu b/cpp/src/pdlp/initial_scaling_strategy/initial_scaling.cu
index c79249c45d..cb498b3756 100644
--- a/cpp/src/pdlp/initial_scaling_strategy/initial_scaling.cu
+++ b/cpp/src/pdlp/initial_scaling_strategy/initial_scaling.cu
@@ -142,6 +142,10 @@ void pdlp_initial_scaling_strategy_t<i_t, f_t>::compute_scaling_vectors(
 {
   raft::common::nvtx::range fun_scope("compute_scaling_vectors");
 
+  // Skip scaling entirely for a shape-0 problem (distributed PDLP builds the
+  // master pdlp_solver_t from a shape-0 placeholder)
+  if (primal_size_h_ == 0 || dual_size_h_ == 0) return;
+
   if (hyper_params_.do_ruiz_scaling) { ruiz_inf_scaling(number_of_ruiz_iterations); }
   if (hyper_params_.do_pock_chambolle_scaling) { pock_chambolle_scaling(alpha); }
 }
@@ -213,6 +217,72 @@ __global__ void inf_norm_row_and_col_kernel(
   }
 }
 
+template <typename i_t, typename f_t>
+void pdlp_initial_scaling_strategy_t<i_t, f_t>::ruiz_iter_compute_local_iteration_vectors()
+{
+  // find inf norm over rows and columns of the scaled matrix in given iteration
+  i_t number_of_blocks = op_problem_scaled_.n_constraints / block_size;
+  if (op_problem_scaled_.n_constraints % block_size) number_of_blocks++;
+  i_t number_of_threads = std::min(op_problem_scaled_.n_variables, (i_t)block_size);
+  inf_norm_row_and_col_kernel<i_t, f_t><<<number_of_blocks, number_of_threads, 0, stream_view_>>>(
+    op_problem_scaled_.view(), this->view());
+  RAFT_CUDA_TRY(cudaPeekAtLastError());
+
+  if (running_mip_) { reset_integer_variables(); }
+}
+
+template <typename i_t, typename f_t>
+void pdlp_initial_scaling_strategy_t<i_t, f_t>::ruiz_iter_apply_cumulative_update()
+{
+  raft::linalg::binaryOp(cummulative_constraint_matrix_scaling_.data(),
+                         cummulative_constraint_matrix_scaling_.data(),
+                         iteration_constraint_matrix_scaling_.data(),
+                         dual_size_h_,
+                         a_divides_sqrt_b_bounded<f_t>(),
+                         stream_view_);
+
+  raft::linalg::binaryOp(cummulative_variable_scaling_.data(),
+                         cummulative_variable_scaling_.data(),
+                         iteration_variable_scaling_.data(),
+                         primal_size_h_,
+                         a_divides_sqrt_b_bounded<f_t>(),
+                         stream_view_);
+
+  // Reset the iteration_scaling vectors to all 0
+  RAFT_CUDA_TRY(cudaMemsetAsync(
+    iteration_constraint_matrix_scaling_.data(), 0.0, sizeof(f_t) * dual_size_h_, stream_view_));
+  RAFT_CUDA_TRY(cudaMemsetAsync(
+    iteration_variable_scaling_.data(), 0.0, sizeof(f_t) * primal_size_h_, stream_view_));
+}
+
+template <typename i_t, typename f_t>
+void pdlp_initial_scaling_strategy_t<i_t, f_t>::reset_scaling_state_for_distributed()
+{
+  if (primal_size_h_ == 0 || dual_size_h_ == 0) return;
+
+  // Re-allocate the iteration vectors the ctor shrank to 0 and zero them.
+  iteration_constraint_matrix_scaling_.resize(static_cast<size_t>(dual_size_h_), stream_view_);
+  iteration_variable_scaling_.resize(static_cast<size_t>(primal_size_h_), stream_view_);
+  RAFT_CUDA_TRY(cudaMemsetAsync(
+    iteration_constraint_matrix_scaling_.data(), 0, sizeof(f_t) * dual_size_h_, stream_view_));
+  RAFT_CUDA_TRY(cudaMemsetAsync(
+    iteration_variable_scaling_.data(), 0, sizeof(f_t) * primal_size_h_, stream_view_));
+
+  // Reset cumulative scaling + rescaling to identity (the ctor's stray
+  // Pock-Chambolle pass and shard.cu's set_cummulative_scaling left these in
+  // an arbitrary state; distributed scaling recomputes from a clean slate).
+  thrust::fill(handle_ptr_->get_thrust_policy(),
+               cummulative_constraint_matrix_scaling_.begin(),
+               cummulative_constraint_matrix_scaling_.end(),
+               f_t(1));
+  thrust::fill(handle_ptr_->get_thrust_policy(),
+               cummulative_variable_scaling_.begin(),
+               cummulative_variable_scaling_.end(),
+               f_t(1));
+  set_h_bound_rescaling(f_t(1));
+  set_h_objective_rescaling(f_t(1));
+}
+
 template <typename i_t, typename f_t>
 void pdlp_initial_scaling_strategy_t<i_t, f_t>::ruiz_inf_scaling(i_t number_of_ruiz_iterations)
 {
@@ -221,36 +291,8 @@ void pdlp_initial_scaling_strategy_t<i_t, f_t>::ruiz_inf_scaling(i_t number_of_r
   std::cout << "Doing ruiz_inf_scaling" << std::endl;
 #endif
   for (int i = 0; i < number_of_ruiz_iterations; i++) {
-    // find inf norm over rows and columns of the scaled matrix in given iteration (matrix is not
-    // actually updated, but the scaled value is computed and evaluated)
-    i_t number_of_blocks = op_problem_scaled_.n_constraints / block_size;
-    if (op_problem_scaled_.n_constraints % block_size) number_of_blocks++;
-    i_t number_of_threads = std::min(op_problem_scaled_.n_variables, (i_t)block_size);
-    inf_norm_row_and_col_kernel<i_t, f_t><<<number_of_blocks, number_of_threads, 0, stream_view_>>>(
-      op_problem_scaled_.view(), this->view());
-    RAFT_CUDA_TRY(cudaPeekAtLastError());
-
-    if (running_mip_) { reset_integer_variables(); }
-
-    raft::linalg::binaryOp(cummulative_constraint_matrix_scaling_.data(),
-                           cummulative_constraint_matrix_scaling_.data(),
-                           iteration_constraint_matrix_scaling_.data(),
-                           dual_size_h_,
-                           a_divides_sqrt_b_bounded<f_t>(),
-                           stream_view_);
-
-    raft::linalg::binaryOp(cummulative_variable_scaling_.data(),
-                           cummulative_variable_scaling_.data(),
-                           iteration_variable_scaling_.data(),
-                           primal_size_h_,
-                           a_divides_sqrt_b_bounded<f_t>(),
-                           stream_view_);
-
-    // Reset the iteration_scaling vectors to all 0
-    RAFT_CUDA_TRY(cudaMemsetAsync(
-      iteration_constraint_matrix_scaling_.data(), 0.0, sizeof(f_t) * dual_size_h_, stream_view_));
-    RAFT_CUDA_TRY(cudaMemsetAsync(
-      iteration_variable_scaling_.data(), 0.0, sizeof(f_t) * primal_size_h_, stream_view_));
+    ruiz_iter_compute_local_iteration_vectors();
+    ruiz_iter_apply_cumulative_update();
   }
 }
 
@@ -343,8 +385,12 @@ __global__ void pock_chambolle_scaling_kernel_col(
   if (threadIdx.x == 0) initial_scaling_view.iteration_variable_scaling[col] = accumulated_value;
 }
 
+// Local half of one Pock-Chambolle pass: writes the per-row and per-column
+// sums-of-powers into iteration_constraint_matrix_scaling_ /
+// iteration_variable_scaling_
 template <typename i_t, typename f_t>
-void pdlp_initial_scaling_strategy_t<i_t, f_t>::pock_chambolle_scaling(f_t alpha)
+void pdlp_initial_scaling_strategy_t<i_t, f_t>::pock_chambolle_compute_local_iteration_vectors(
+  f_t alpha)
 {
   // Reset the iteration_scaling vectors to all 0
   RAFT_CUDA_TRY(cudaMemsetAsync(
@@ -379,7 +425,12 @@ void pdlp_initial_scaling_strategy_t<i_t, f_t>::pock_chambolle_scaling(f_t alpha
       A_T_offsets_.data(),
       A_T_indices_.data());
   RAFT_CUDA_TRY(cudaPeekAtLastError());
+}
 
+// Fold half of one Pock-Chambolle pass: cumulative /= sqrt(iteration).
+template <typename i_t, typename f_t>
+void pdlp_initial_scaling_strategy_t<i_t, f_t>::pock_chambolle_apply_cumulative_update()
+{
   if (running_mip_) { reset_integer_variables(); }
 
   // divide the sqrt of the vectors of the sums from above to the respective scaling vectors
@@ -398,6 +449,13 @@ void pdlp_initial_scaling_strategy_t<i_t, f_t>::pock_chambolle_scaling(f_t alpha
                          stream_view_);
 }
 
+template <typename i_t, typename f_t>
+void pdlp_initial_scaling_strategy_t<i_t, f_t>::pock_chambolle_scaling(f_t alpha)
+{
+  pock_chambolle_compute_local_iteration_vectors(alpha);
+  pock_chambolle_apply_cumulative_update();
+}
+
 template <typename i_t, typename f_t>
 __global__ void scale_problem_kernel(
   const typename pdlp_initial_scaling_strategy_t<i_t, f_t>::view_t initial_scaling_view,
@@ -586,7 +644,8 @@ void pdlp_initial_scaling_strategy_t<i_t, f_t>::scale_problem()
     cuda::std::multiplies<f_t>{},
     stream_view_);
 
-  if (hyper_params_.bound_objective_rescaling && !running_mip_) {
+  if (hyper_params_.bound_objective_rescaling && !running_mip_ &&
+      !skip_distributed_local_rescaling_) {
     // Coefficients are computed on the already scaled values
     bound_objective_rescaling();
 
@@ -899,6 +958,50 @@ const problem_t<i_t, f_t>& pdlp_initial_scaling_strategy_t<i_t, f_t>::get_scaled
   return op_problem_scaled_;
 }
 
+template <typename i_t, typename f_t>
+void pdlp_initial_scaling_strategy_t<i_t, f_t>::apply_distributed_bound_objective_rescaling(
+  f_t bound_rescaling, f_t objective_rescaling)
+{
+  using f_t2 = typename type_2<f_t>::type;
+
+  // constraint bounds *= bound_rescaling  (matches scale_problem() bound block)
+  cub::DeviceTransform::Transform(
+    cuda::std::make_tuple(op_problem_scaled_.constraint_lower_bounds.data(),
+                          op_problem_scaled_.constraint_upper_bounds.data()),
+    thrust::make_zip_iterator(op_problem_scaled_.constraint_lower_bounds.data(),
+                              op_problem_scaled_.constraint_upper_bounds.data()),
+    op_problem_scaled_.constraint_upper_bounds.size(),
+    [bound_rescaling] __device__(f_t lower, f_t upper) -> thrust::tuple<f_t, f_t> {
+      return {lower * bound_rescaling, upper * bound_rescaling};
+    },
+    stream_view_.value());
+
+  // variable bounds *= bound_rescaling (batch-1 path only; distributed is batch 1)
+  cub::DeviceTransform::Transform(
+    op_problem_scaled_.variable_bounds.data(),
+    op_problem_scaled_.variable_bounds.data(),
+    op_problem_scaled_.variable_bounds.size(),
+    [bound_rescaling] __device__(f_t2 variable_bounds) -> f_t2 {
+      return {variable_bounds.x * bound_rescaling, variable_bounds.y * bound_rescaling};
+    },
+    stream_view_);
+
+  // objective *= objective_rescaling
+  cub::DeviceTransform::Transform(
+    op_problem_scaled_.objective_coefficients.data(),
+    op_problem_scaled_.objective_coefficients.data(),
+    op_problem_scaled_.objective_coefficients.size(),
+    [objective_rescaling] __device__(f_t c) -> f_t { return c * objective_rescaling; },
+    stream_view_);
+
+  // Store the factors (sets both host copies and the device rescaling vectors)
+  // so unscale_solutions() / scale_solutions() apply them consistently. The flag
+  // hyper_params_.bound_objective_rescaling stays true on shards so those paths
+  // are active; only scale_problem()'s local recompute is skipped.
+  set_h_bound_rescaling(bound_rescaling);
+  set_h_objective_rescaling(objective_rescaling);
+}
+
 template <typename i_t, typename f_t>
 const rmm::device_uvector<f_t>&
 pdlp_initial_scaling_strategy_t<i_t, f_t>::get_constraint_matrix_scaling_vector() const
@@ -913,6 +1016,46 @@ pdlp_initial_scaling_strategy_t<i_t, f_t>::get_variable_scaling_vector() const
   return cummulative_variable_scaling_;
 }
 
+template <typename i_t, typename f_t>
+void pdlp_initial_scaling_strategy_t<i_t, f_t>::set_cummulative_scaling(
+  const std::vector<f_t>& h_cummulative_constraint_matrix_scaling,
+  const std::vector<f_t>& h_cummulative_variable_scaling)
+{
+  cuopt_expects(static_cast<i_t>(h_cummulative_constraint_matrix_scaling.size()) == dual_size_h_,
+                error_type_t::ValidationError,
+                "set_cummulative_scaling: host constraint scaling vector size mismatch");
+  cuopt_expects(static_cast<i_t>(h_cummulative_variable_scaling.size()) == primal_size_h_,
+                error_type_t::ValidationError,
+                "set_cummulative_scaling: host variable scaling vector size mismatch");
+
+  raft::copy(cummulative_constraint_matrix_scaling_.data(),
+             h_cummulative_constraint_matrix_scaling.data(),
+             h_cummulative_constraint_matrix_scaling.size(),
+             stream_view_);
+  raft::copy(cummulative_variable_scaling_.data(),
+             h_cummulative_variable_scaling.data(),
+             h_cummulative_variable_scaling.size(),
+             stream_view_);
+}
+
+template <typename i_t, typename f_t>
+void pdlp_initial_scaling_strategy_t<i_t, f_t>::set_h_bound_rescaling(f_t value)
+{
+  std::fill(h_bound_rescaling_.begin(), h_bound_rescaling_.end(), value);
+  thrust::fill(
+    handle_ptr_->get_thrust_policy(), bound_rescaling_.begin(), bound_rescaling_.end(), value);
+}
+
+template <typename i_t, typename f_t>
+void pdlp_initial_scaling_strategy_t<i_t, f_t>::set_h_objective_rescaling(f_t value)
+{
+  std::fill(h_objective_rescaling_.begin(), h_objective_rescaling_.end(), value);
+  thrust::fill(handle_ptr_->get_thrust_policy(),
+               objective_rescaling_.begin(),
+               objective_rescaling_.end(),
+               value);
+}
+
 template <typename i_t, typename f_t>
 typename pdlp_initial_scaling_strategy_t<i_t, f_t>::view_t
 pdlp_initial_scaling_strategy_t<i_t, f_t>::view()
diff --git a/cpp/src/pdlp/initial_scaling_strategy/initial_scaling.cuh b/cpp/src/pdlp/initial_scaling_strategy/initial_scaling.cuh
index 99d1472b6f..13f639079d 100644
--- a/cpp/src/pdlp/initial_scaling_strategy/initial_scaling.cuh
+++ b/cpp/src/pdlp/initial_scaling_strategy/initial_scaling.cuh
@@ -75,6 +75,11 @@ class pdlp_initial_scaling_strategy_t {
                          rmm::device_uvector<f_t>& dual_slack) const;
   void unscale_solutions(solution_t<i_t, f_t>& solution) const;
   const rmm::device_uvector<f_t>& get_constraint_matrix_scaling_vector() const;
+  // Mutable access needed by distributed PDLP to broadcast owned constraint
+  rmm::device_uvector<f_t>& get_cummulative_constraint_matrix_scaling()
+  {
+    return cummulative_constraint_matrix_scaling_;
+  }
   const rmm::device_uvector<f_t>& get_variable_scaling_vector() const;
   const problem_t<i_t, f_t>& get_scaled_op_problem();
 
@@ -85,8 +90,40 @@ class pdlp_initial_scaling_strategy_t {
   void swap_context(const thrust::universal_host_pinned_vector<swap_pair_t<i_t>>& swap_pairs);
   void resize_context(i_t new_size);
 
+  // Inject scaling state computed by another pdlp_initial_scaling_strategy_t
+  // Needed by distributed PDLP
+  void set_cummulative_scaling(const std::vector<f_t>& h_cummulative_constraint_matrix_scaling,
+                               const std::vector<f_t>& h_cummulative_variable_scaling);
+  void set_h_bound_rescaling(f_t value);
+  void set_h_objective_rescaling(f_t value);
+
   void bound_objective_rescaling();
 
+  // Distributed PDLP: apply an externally-computed GLOBAL bound / objective
+  // rescaling to the already-scaled problem.
+  void apply_distributed_bound_objective_rescaling(f_t bound_rescaling, f_t objective_rescaling);
+
+  // Distributed PDLP: skip the LOCAL bound/objective rescaling inside
+  // scale_problem()
+  void set_skip_distributed_local_rescaling(bool value)
+  {
+    skip_distributed_local_rescaling_ = value;
+  }
+
+  // Public for distributed PDLP
+  void compute_scaling_vectors(i_t number_of_ruiz_iterations, f_t alpha);
+
+  // ----- Distributed-PDLP hooks -----
+
+  void ruiz_iter_compute_local_iteration_vectors();
+  void ruiz_iter_apply_cumulative_update();
+  void pock_chambolle_compute_local_iteration_vectors(f_t alpha);
+  void pock_chambolle_apply_cumulative_update();
+  rmm::device_uvector<f_t>& get_iteration_variable_scaling() { return iteration_variable_scaling_; }
+
+  // Restore the clean pre-scaling state for the distributed path.
+  void reset_scaling_state_for_distributed();
+
   /**
    * @brief Gets the device-side view (with raw pointers), for ease of access
    *        inside cuda kernels
@@ -94,7 +131,6 @@ class pdlp_initial_scaling_strategy_t {
   view_t view();
 
  private:
-  void compute_scaling_vectors(i_t number_of_ruiz_iterations, f_t alpha);
   void ruiz_inf_scaling(i_t number_of_ruiz_iterations);
   void pock_chambolle_scaling(f_t alpha);
   void reset_integer_variables();
@@ -124,5 +160,8 @@ class pdlp_initial_scaling_strategy_t {
   rmm::device_uvector<i_t>& A_T_indices_;
   const pdlp_hyper_params::pdlp_hyper_params_t& hyper_params_;
   bool running_mip_;
+  // Distributed PDLP: when true, scale_problem() skips its local
+  // bound/objective rescaling (the global factor is applied separately).
+  bool skip_distributed_local_rescaling_{false};
 };
 }  // namespace cuopt::linear_programming::detail
diff --git a/cpp/src/pdlp/pdhg.cu b/cpp/src/pdlp/pdhg.cu
index e88366a295..b1f1a59ada 100644
--- a/cpp/src/pdlp/pdhg.cu
+++ b/cpp/src/pdlp/pdhg.cu
@@ -5,6 +5,11 @@
  */
 /* clang-format on */
 #include <pdlp/pdhg.hpp>
+// pdlp.cuh defines pdlp_solver_t which the engine's compute_A_x/compute_At_y
+// template bodies dereference via shard.sub_pdlp->pdhg_solver_. Must be a
+// complete type at the point of template instantiation below.
+#include <pdlp/distributed_pdlp/multi_gpu_engine.hpp>
+#include <pdlp/pdlp.cuh>
 #include <pdlp/pdlp_climber_strategy.hpp>
 #include <pdlp/pdlp_constants.hpp>
 #include <pdlp/swap_and_resize_helper.cuh>
@@ -506,6 +511,15 @@ void pdhg_solver_t<i_t, f_t>::compute_At_y()
 {
   // A_t @ y
 
+  // Multi-GPU dispatch: when the master pdhg has an engine, drive halo
+  // exchange + per-shard SpMV via the engine. Shards' pdhg_solver_ have no
+  // engine pointer set, so their compute_At_y falls through to the cusparse
+  // path below on each shard's local A_t.
+  if (mgpu_engine_ != nullptr) {
+    mgpu_engine_->distributed_compute_At_y();
+    return;
+  }
+
   if (!batch_mode_) {
     if constexpr (std::is_same_v<f_t, double>) {
       if (cusparse_view_.mixed_precision_enabled_) {
@@ -555,6 +569,15 @@ template <typename i_t, typename f_t>
 void pdhg_solver_t<i_t, f_t>::compute_A_x()
 {
   // A @ x
+
+  // Multi-GPU dispatch: see compute_At_y. The engine halo-updates the
+  // reflected_primal vector (the buffer this SpMV reads) and then drives
+  // per-shard local cusparse SpMV.
+  if (mgpu_engine_ != nullptr) {
+    mgpu_engine_->distributed_compute_A_x();
+    return;
+  }
+
   if (!batch_mode_) {
     if constexpr (std::is_same_v<f_t, double>) {
       if (cusparse_view_.mixed_precision_enabled_) {
@@ -600,6 +623,50 @@ void pdhg_solver_t<i_t, f_t>::compute_A_x()
   }
 }
 
+template <typename i_t, typename f_t>
+void pdhg_solver_t<i_t, f_t>::spmv_At_into(rmm::device_uvector<f_t>& in_buf,
+                                           cusparseDnVecDescr_t out_desc)
+{
+  RAFT_CUSPARSE_TRY(cusparseDnVecSetValues(cusparse_view_.dual_solution, in_buf.data()));
+  RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsespmv(handle_ptr_->get_cusparse_handle(),
+                                                       CUSPARSE_OPERATION_NON_TRANSPOSE,
+                                                       reusable_device_scalar_value_1_.data(),
+                                                       cusparse_view_.A_T,
+                                                       cusparse_view_.dual_solution,
+                                                       reusable_device_scalar_value_0_.data(),
+                                                       out_desc,
+                                                       CUSPARSE_SPMV_CSR_ALG2,
+                                                       (f_t*)cusparse_view_.buffer_transpose.data(),
+                                                       stream_view_));
+  // Restore the canonical binding so subsequent code on this shard that reads
+  // cv.dual_solution sees the dual_solution_ buffer it was constructed with.
+  RAFT_CUSPARSE_TRY(cusparseDnVecSetValues(cusparse_view_.dual_solution,
+                                           current_saddle_point_state_.get_dual_solution().data()));
+}
+
+template <typename i_t, typename f_t>
+void pdhg_solver_t<i_t, f_t>::spmv_A_into(rmm::device_uvector<f_t>& in_buf,
+                                          cusparseDnVecDescr_t out_desc)
+{
+  RAFT_CUSPARSE_TRY(
+    cusparseDnVecSetValues(cusparse_view_.reflected_primal_solution, in_buf.data()));
+  RAFT_CUSPARSE_TRY(
+    raft::sparse::detail::cusparsespmv(handle_ptr_->get_cusparse_handle(),
+                                       CUSPARSE_OPERATION_NON_TRANSPOSE,
+                                       reusable_device_scalar_value_1_.data(),
+                                       cusparse_view_.A,
+                                       cusparse_view_.reflected_primal_solution,
+                                       reusable_device_scalar_value_0_.data(),
+                                       out_desc,
+                                       CUSPARSE_SPMV_CSR_ALG2,
+                                       (f_t*)cusparse_view_.buffer_non_transpose.data(),
+                                       stream_view_));
+  // Restore the canonical binding so subsequent code on this shard that reads
+  // cv.reflected_primal_solution sees the reflected_primal_ buffer.
+  RAFT_CUSPARSE_TRY(
+    cusparseDnVecSetValues(cusparse_view_.reflected_primal_solution, reflected_primal_.data()));
+}
+
 template <typename i_t, typename f_t>
 void pdhg_solver_t<i_t, f_t>::compute_primal_projection_with_gradient(
   rmm::device_uvector<f_t>& primal_step_size)
@@ -694,6 +761,26 @@ struct primal_reflected_major_projection {
   const f_t* scalar_;
 };
 
+// Pure cub-transform extract — body byte-identical to the non-batch inline
+// path in compute_next_primal_dual_solution_reflected. The platform dispatch
+// (single-GPU vs per-shard fan-out) lives at the call site, not here.
+// Placed after primal_reflected_major_projection so the functor is visible.
+template <typename i_t, typename f_t>
+void pdhg_solver_t<i_t, f_t>::primal_reflected_major_projection_transform(
+  rmm::device_uvector<f_t>& primal_step_size)
+{
+  cub::DeviceTransform::Transform(
+    cuda::std::make_tuple(current_saddle_point_state_.get_primal_solution().data(),
+                          problem_ptr->objective_coefficients.data(),
+                          current_saddle_point_state_.get_current_AtY().data(),
+                          problem_ptr->variable_bounds.data()),
+    thrust::make_zip_iterator(
+      potential_next_primal_solution_.data(), dual_slack_.data(), reflected_primal_.data()),
+    primal_size_h_,
+    primal_reflected_major_projection<f_t>(primal_step_size.data()),
+    stream_view_.value());
+}
+
 template <typename f_t>
 struct primal_reflected_major_projection_batch {
   using f_t2 = typename type_2<f_t>::type;
@@ -722,6 +809,21 @@ struct primal_reflected_projection {
   const f_t* scalar_;
 };
 
+template <typename i_t, typename f_t>
+void pdhg_solver_t<i_t, f_t>::primal_reflected_projection_transform(
+  rmm::device_uvector<f_t>& primal_step_size)
+{
+  cub::DeviceTransform::Transform(
+    cuda::std::make_tuple(current_saddle_point_state_.get_primal_solution().data(),
+                          problem_ptr->objective_coefficients.data(),
+                          current_saddle_point_state_.get_current_AtY().data(),
+                          problem_ptr->variable_bounds.data()),
+    reflected_primal_.data(),
+    primal_size_h_,
+    primal_reflected_projection<f_t>(primal_step_size.data()),
+    stream_view_.value());
+}
+
 template <typename f_t>
 struct primal_reflected_projection_batch {
   using f_t2 = typename type_2<f_t>::type;
@@ -751,6 +853,21 @@ struct dual_reflected_major_projection {
   const f_t* scalar_;
 };
 
+template <typename i_t, typename f_t>
+void pdhg_solver_t<i_t, f_t>::dual_reflected_major_projection_transform(
+  rmm::device_uvector<f_t>& dual_step_size)
+{
+  cub::DeviceTransform::Transform(
+    cuda::std::make_tuple(current_saddle_point_state_.get_dual_solution().data(),
+                          current_saddle_point_state_.get_dual_gradient().data(),
+                          problem_ptr->constraint_lower_bounds.data(),
+                          problem_ptr->constraint_upper_bounds.data()),
+    thrust::make_zip_iterator(potential_next_dual_solution_.data(), reflected_dual_.data()),
+    dual_size_h_,
+    dual_reflected_major_projection<f_t>(dual_step_size.data()),
+    stream_view_.value());
+}
+
 template <typename f_t>
 struct dual_reflected_major_projection_batch {
   HDI thrust::tuple<f_t, f_t> operator()(
@@ -779,6 +896,21 @@ struct dual_reflected_projection {
   const f_t* scalar_;
 };
 
+template <typename i_t, typename f_t>
+void pdhg_solver_t<i_t, f_t>::dual_reflected_projection_transform(
+  rmm::device_uvector<f_t>& dual_step_size)
+{
+  cub::DeviceTransform::Transform(
+    cuda::std::make_tuple(current_saddle_point_state_.get_dual_solution().data(),
+                          current_saddle_point_state_.get_dual_gradient().data(),
+                          problem_ptr->constraint_lower_bounds.data(),
+                          problem_ptr->constraint_upper_bounds.data()),
+    reflected_dual_.data(),
+    dual_size_h_,
+    dual_reflected_projection<f_t>(dual_step_size.data()),
+    stream_view_.value());
+}
+
 template <typename f_t>
 struct dual_reflected_projection_batch {
   HDI f_t
@@ -1113,22 +1245,30 @@ void pdhg_solver_t<i_t, f_t>::compute_next_primal_dual_solution_reflected(
 
   using f_t2 = typename type_2<f_t>::type;
 
+  if (mgpu_engine_ != nullptr) { mgpu_engine_->sync_await_shards(stream_view_); }
+
   // Compute next primal solution reflected.
 
   if (should_major) {
     graph_all.run(should_major, [&]() {
+      // Multi-GPU: splice shard streams into the capture so their kernels and
+      // NCCL collectives are recorded into the same graph. Without this, work
+      // issued on shard.stream from inside this lambda would either invalidate
+      // the capture or run outside the graph, leaving the captured graph
+      // empty (or broken) -- which produces the cycling/stall behavior we
+      // observed on larger problems. Mirrors metis_tests bench.cu fork/join.
+      if (mgpu_engine_ != nullptr) { mgpu_engine_->graph_capture_fork_to_shards(stream_view_); }
+
       compute_At_y();
-      if (!batch_mode_) {
-        cub::DeviceTransform::Transform(
-          cuda::std::make_tuple(current_saddle_point_state_.get_primal_solution().data(),
-                                problem_ptr->objective_coefficients.data(),
-                                current_saddle_point_state_.get_current_AtY().data(),
-                                problem_ptr->variable_bounds.data()),
-          thrust::make_zip_iterator(
-            potential_next_primal_solution_.data(), dual_slack_.data(), reflected_primal_.data()),
-          primal_size_h_,
-          primal_reflected_major_projection<f_t>(primal_step_size.data()),
-          stream_view_.value());
+      if (mgpu_engine_ != nullptr) {
+        for (auto& shard : mgpu_engine_->shards) {
+          raft::device_setter guard(shard->device_id);
+          auto& sub_pdlp = *shard->sub_pdlp;
+          sub_pdlp.pdhg_solver_.primal_reflected_major_projection_transform(
+            sub_pdlp.get_primal_step_size());
+        }
+      } else if (!batch_mode_) {
+        primal_reflected_major_projection_transform(primal_step_size);
       } else {
         cub::DeviceFor::Bulk(
           potential_next_primal_solution_.size(),
@@ -1187,16 +1327,15 @@ void pdhg_solver_t<i_t, f_t>::compute_next_primal_dual_solution_reflected(
       // Compute next dual
       compute_A_x();
 
-      if (!batch_mode_) {
-        cub::DeviceTransform::Transform(
-          cuda::std::make_tuple(current_saddle_point_state_.get_dual_solution().data(),
-                                current_saddle_point_state_.get_dual_gradient().data(),
-                                problem_ptr->constraint_lower_bounds.data(),
-                                problem_ptr->constraint_upper_bounds.data()),
-          thrust::make_zip_iterator(potential_next_dual_solution_.data(), reflected_dual_.data()),
-          dual_size_h_,
-          dual_reflected_major_projection<f_t>(dual_step_size.data()),
-          stream_view_.value());
+      if (mgpu_engine_ != nullptr) {
+        for (auto& shard : mgpu_engine_->shards) {
+          raft::device_setter guard(shard->device_id);
+          auto& sub_pdlp = *shard->sub_pdlp;
+          sub_pdlp.pdhg_solver_.dual_reflected_major_projection_transform(
+            sub_pdlp.get_dual_step_size());
+        }
+      } else if (!batch_mode_) {
+        dual_reflected_major_projection_transform(dual_step_size);
       } else {
         cub::DeviceFor::Bulk(
           potential_next_dual_solution_.size(),
@@ -1217,10 +1356,17 @@ void pdhg_solver_t<i_t, f_t>::compute_next_primal_dual_solution_reflected(
       print("potential_next_dual_solution_", potential_next_dual_solution_);
       print("reflected_dual_", reflected_dual_);
 #endif
+
+      // Multi-GPU: close the fork by joining every shard stream back into
+      // the master stream so cudaStreamEndCapture sees a single graph
+      // spanning all streams.
+      if (mgpu_engine_ != nullptr) { mgpu_engine_->graph_capture_join_from_shards(stream_view_); }
     });
 
   } else {
     graph_all.run(should_major, [&]() {
+      if (mgpu_engine_ != nullptr) { mgpu_engine_->graph_capture_fork_to_shards(stream_view_); }
+
       // Compute next primal
       compute_At_y();
 
@@ -1232,16 +1378,15 @@ void pdhg_solver_t<i_t, f_t>::compute_next_primal_dual_solution_reflected(
             current_saddle_point_state_.get_current_AtY());
 #endif
 
-      if (!batch_mode_) {
-        cub::DeviceTransform::Transform(
-          cuda::std::make_tuple(current_saddle_point_state_.get_primal_solution().data(),
-                                problem_ptr->objective_coefficients.data(),
-                                current_saddle_point_state_.get_current_AtY().data(),
-                                problem_ptr->variable_bounds.data()),
-          reflected_primal_.data(),
-          primal_size_h_,
-          primal_reflected_projection<f_t>(primal_step_size.data()),
-          stream_view_.value());
+      if (mgpu_engine_ != nullptr) {
+        for (auto& shard : mgpu_engine_->shards) {
+          raft::device_setter guard(shard->device_id);
+          auto& sub_pdlp = *shard->sub_pdlp;
+          sub_pdlp.pdhg_solver_.primal_reflected_projection_transform(
+            sub_pdlp.get_primal_step_size());
+        }
+      } else if (!batch_mode_) {
+        primal_reflected_projection_transform(primal_step_size);
       } else {
         cub::DeviceFor::Bulk(
           reflected_primal_.size(),
@@ -1301,16 +1446,14 @@ void pdhg_solver_t<i_t, f_t>::compute_next_primal_dual_solution_reflected(
       // Compute next dual
       compute_A_x();
 
-      if (!batch_mode_) {
-        cub::DeviceTransform::Transform(
-          cuda::std::make_tuple(current_saddle_point_state_.get_dual_solution().data(),
-                                current_saddle_point_state_.get_dual_gradient().data(),
-                                problem_ptr->constraint_lower_bounds.data(),
-                                problem_ptr->constraint_upper_bounds.data()),
-          reflected_dual_.data(),
-          dual_size_h_,
-          dual_reflected_projection<f_t>(dual_step_size.data()),
-          stream_view_.value());
+      if (mgpu_engine_ != nullptr) {
+        for (auto& shard : mgpu_engine_->shards) {
+          raft::device_setter guard(shard->device_id);
+          auto& sub_pdlp = *shard->sub_pdlp;
+          sub_pdlp.pdhg_solver_.dual_reflected_projection_transform(sub_pdlp.get_dual_step_size());
+        }
+      } else if (!batch_mode_) {
+        dual_reflected_projection_transform(dual_step_size);
       } else {
         cub::DeviceFor::Bulk(
           reflected_dual_.size(),
@@ -1328,8 +1471,13 @@ void pdhg_solver_t<i_t, f_t>::compute_next_primal_dual_solution_reflected(
 #ifdef CUPDLP_DEBUG_MODE
       print("reflected_dual_", reflected_dual_);
 #endif
+
+      if (mgpu_engine_ != nullptr) { mgpu_engine_->graph_capture_join_from_shards(stream_view_); }
     });
   }
+
+  // sync to master stream after the graph is captured
+  if (mgpu_engine_ != nullptr) { mgpu_engine_->sync_await_master(stream_view_); }
 }
 
 template <typename i_t, typename f_t>
diff --git a/cpp/src/pdlp/pdhg.hpp b/cpp/src/pdlp/pdhg.hpp
index 01eabe967a..e4d16360a7 100644
--- a/cpp/src/pdlp/pdhg.hpp
+++ b/cpp/src/pdlp/pdhg.hpp
@@ -24,6 +24,12 @@
 #include <vector>
 
 namespace cuopt::linear_programming::detail {
+
+// Forward-declared to avoid include cycle: multi_gpu_engine.hpp itself includes pdhg.hpp
+// (engine calls per-shard pdhg compute_*). pdhg.cu does the full include.
+template <typename i_t, typename f_t>
+struct multi_gpu_engine_t;
+
 template <typename i_t, typename f_t>
 class pdhg_solver_t {
  public:
@@ -76,6 +82,53 @@ class pdhg_solver_t {
   void update_solution(cusparse_view_t<i_t, f_t>& current_op_problem_evaluation_cusparse_view_);
   void refine_initial_primal_projection(const rmm::device_uvector<f_t>& bound_rescaling);
 
+  // SpMV primitives. Public so the multi-GPU engine can drive them per-shard
+  // after halo-exchanging the relevant vector. Single-GPU PDLP still calls
+  // them internally via take_step / compute_next_*.
+  //
+  // If set_multi_gpu_engine() has been called, these dispatch to the engine
+  // (halo exchange + per-shard SpMV). Otherwise they run the single-GPU
+  // cusparse path on the local matrix.
+  void compute_At_y();
+  void compute_A_x();
+  void spmvop_At_y();
+  void spmvop_A_x();
+
+  // Parameterized SpMVs used by the multi-GPU engine.
+  // Both temporarily hijack a canonical input descriptor in cusparse_view_
+  // (cv.dual_solution for At, cv.reflected_primal_solution for A) to point at
+  // `in_buf.data()`, run the local SpMV into `out_desc`, then restore the
+  // descriptor to its original buffer so other code on this shard is unaffected.
+  // No multi-GPU dispatch inside — the engine is the orchestrator.
+  void spmv_At_into(rmm::device_uvector<f_t>& in_buf, cusparseDnVecDescr_t out_desc);
+  void spmv_A_into(rmm::device_uvector<f_t>& in_buf, cusparseDnVecDescr_t out_desc);
+
+  // Pure cub-transform extractions. Each one is byte-identical to the inline
+  // cub call it replaces — no platform dispatch inside. Callers handle the
+  // single-GPU vs per-shard branching at the call site (see the
+  // "if (mgpu_engine_) for shard..." blocks in compute_next_*).
+  void primal_reflected_major_projection_transform(rmm::device_uvector<f_t>& primal_step_size);
+  void dual_reflected_major_projection_transform(rmm::device_uvector<f_t>& dual_step_size);
+  void primal_reflected_projection_transform(rmm::device_uvector<f_t>& primal_step_size);
+  void dual_reflected_projection_transform(rmm::device_uvector<f_t>& dual_step_size);
+
+  // Master PDLP wires up the engine pointer here after the engine is built.
+  // Shards' pdhg_solver_ leaves this null so each shard runs single-GPU SpMV
+  // on its local matrix. Also flips is_multi_gpu_ — convenience flag that any
+  // pdhg participating in a distributed run (master OR shard) carries true.
+  void set_multi_gpu_engine(multi_gpu_engine_t<i_t, f_t>* engine)
+  {
+    mgpu_engine_  = engine;
+    is_multi_gpu_ = (engine != nullptr);
+  }
+
+  // Mark a shard's pdhg_solver_ as part of a distributed run without giving it
+  // an engine (shards don't orchestrate; they only run local SpMV on owned
+  // rows). Called from shard.cu right after sub_pdlp is constructed.
+  void set_is_multi_gpu(bool v) { is_multi_gpu_ = v; }
+  bool is_multi_gpu() const { return is_multi_gpu_; }
+  multi_gpu_engine_t<i_t, f_t>* get_mgpu_engine() const { return mgpu_engine_; }
+
   i_t total_pdhg_iterations_;
 
  private:
@@ -93,12 +146,9 @@ class pdhg_solver_t {
 
   void compute_primal_projection_with_gradient(rmm::device_uvector<f_t>& primal_step_size);
   void compute_primal_projection(rmm::device_uvector<f_t>& primal_step_size);
-  void compute_At_y();
-  void compute_A_x();
-  void spmvop_At_y();
-  void spmvop_A_x();
 
   bool batch_mode_{false};
+  bool is_multi_gpu_{false};
   raft::handle_t const* handle_ptr_{nullptr};
   rmm::cuda_stream_view stream_view_;
 
@@ -150,6 +200,10 @@ class pdhg_solver_t {
   rmm::device_uvector<f_t> new_bounds_lower_;
   rmm::device_uvector<f_t> new_bounds_upper_;
   cuda::fast_mod_div<size_t> batch_size_divisor_;
+
+  // Non-owning. Set on the master pdhg_solver_ in distributed mode; null
+  // (default) means single-GPU path. See compute_At_y / compute_A_x.
+  multi_gpu_engine_t<i_t, f_t>* mgpu_engine_{nullptr};
 };
 
 }  // namespace cuopt::linear_programming::detail
diff --git a/cpp/src/pdlp/pdlp.cu b/cpp/src/pdlp/pdlp.cu
index dadc54cc08..71c6b0a48c 100644
--- a/cpp/src/pdlp/pdlp.cu
+++ b/cpp/src/pdlp/pdlp.cu
@@ -11,12 +11,16 @@
 #include <cuopt/linear_programming/solver_settings.hpp>
 
 #include <pdlp/cusparse_view.hpp>
+#include <pdlp/distributed_pdlp/partition_loader.hpp>
+#include <pdlp/distributed_pdlp/partitioner.hpp>
 #include <pdlp/pdlp.cuh>
 #include <pdlp/swap_and_resize_helper.cuh>
 #include <pdlp/utils.cuh>
 
+#include <dual_simplex/sparse_matrix.hpp>
 #include <mip_heuristics/mip_constants.hpp>
 #include "cuopt/linear_programming/pdlp/solver_solution.hpp"
+#include "distributed_pdlp/multi_gpu_engine.hpp"
 
 #include <utilities/copy_helpers.hpp>
 #include <utilities/macros.cuh>
@@ -40,9 +44,11 @@
 #include <thrust/logical.h>
 
 #include <algorithm>
+#include <cctype>
 #include <cmath>
 #include <optional>
 #include <tuple>
+#include <type_traits>
 #include <unordered_set>
 
 namespace cuopt::linear_programming::detail {
@@ -371,6 +377,376 @@ pdlp_solver_t<i_t, f_t>::pdlp_solver_t(problem_t<i_t, f_t>& op_problem,
   }
 }
 
+// ============================================================================
+// Distributed multi-GPU ctor.
+// needs placeholder_problem to be a shape-0 problem
+// reads the problem from mps_data_model directly
+// builds internal attributes from the placeholder_problem
+// builds the engine from the mps_data_model
+template <typename i_t, typename f_t>
+pdlp_solver_t<i_t, f_t>::pdlp_solver_t(
+  problem_t<i_t, f_t>& placeholder_problem,
+  cuopt::linear_programming::io::mps_data_model_t<i_t, f_t> const& mps,
+  pdlp_solver_settings_t<i_t, f_t> const& settings)
+  // Makes all inner feilds of master 0 size
+  : pdlp_solver_t(placeholder_problem, settings, /*is_legacy_batch_mode=*/false)
+{
+  cuopt_expects(placeholder_problem.n_variables == 0 && placeholder_problem.n_constraints == 0 &&
+                  placeholder_problem.nnz == 0,
+                error_type_t::ValidationError,
+                "Distributed mGPU pdlp_solver_t ctor requires a shape-0 "
+                "placeholder problem (n_variables == n_constraints == nnz == 0)");
+  const int distributed_pdlp_num_gpus = settings.distributed_pdlp_num_gpus;
+  CUOPT_LOG_INFO("Solving with distributed PDLP on %d GPU (mps direct path)",
+                 distributed_pdlp_num_gpus);
+
+  if constexpr (!std::is_same_v<f_t, double>) {
+    cuopt_expects(
+      false, error_type_t::ValidationError, "Distributed PDLP currently requires double precision");
+    return;
+  }
+  // ----- 1. Read problem shape and bulk data directly from mps (host) -----
+  const i_t n_vars = static_cast<i_t>(mps.get_objective_coefficients().size());
+  const i_t n_cstr = static_cast<i_t>(mps.get_constraint_lower_bounds().size());
+  const i_t nnz    = static_cast<i_t>(mps.get_constraint_matrix_values().size());
+  cuopt_expects(n_vars > 0,
+                error_type_t::ValidationError,
+                "Distributed PDLP from mps requires a non-empty objective");
+  cuopt_expects(n_cstr > 0,
+                error_type_t::ValidationError,
+                "Distributed PDLP from mps requires at least one constraint");
+  cuopt_expects(static_cast<i_t>(mps.get_constraint_matrix_offsets().size()) == n_cstr + 1,
+                error_type_t::ValidationError,
+                "mps constraint_matrix_offsets size must equal n_constraints + 1");
+  cuopt_expects(
+    static_cast<i_t>(mps.get_constraint_matrix_indices().size()) == nnz,
+    error_type_t::ValidationError,
+    "mps constraint_matrix_indices size must equal nnz (constraint_matrix_values size)");
+  cuopt_expects(static_cast<i_t>(mps.get_constraint_upper_bounds().size()) == n_cstr,
+                error_type_t::ValidationError,
+                "mps constraint_upper_bounds size must equal n_constraints");
+  cuopt_expects(static_cast<i_t>(mps.get_variable_lower_bounds().size()) == n_vars,
+                error_type_t::ValidationError,
+                "mps variable_lower_bounds size must equal n_variables");
+  cuopt_expects(static_cast<i_t>(mps.get_variable_upper_bounds().size()) == n_vars,
+                error_type_t::ValidationError,
+                "mps variable_upper_bounds size must equal n_variables");
+
+  const bool maximize          = mps.get_sense();
+  f_t objective_offset         = mps.get_objective_offset();
+  f_t objective_scaling_factor = mps.get_objective_scaling_factor();
+
+  // Objective: copy (mutable so we can negate for maximize, matching
+  // problem_helpers.cuh::convert_to_maximization_problem).
+  std::vector<f_t> h_obj = mps.get_objective_coefficients();
+  if (maximize) {
+    for (auto& v : h_obj)
+      v = -v;
+    objective_offset         = -objective_offset;
+    objective_scaling_factor = -objective_scaling_factor;
+  }
+
+  // Bounds (copy from mps; engine ctor takes by const ref to std::vector).
+  std::vector<f_t> h_var_lower  = mps.get_variable_lower_bounds();
+  std::vector<f_t> h_var_upper  = mps.get_variable_upper_bounds();
+  std::vector<f_t> h_cstr_lower = mps.get_constraint_lower_bounds();
+  std::vector<f_t> h_cstr_upper = mps.get_constraint_upper_bounds();
+
+  // A (CSR) — mutable copies for the engine + partitioner consumers below.
+  std::vector<i_t> h_A_row_offsets = mps.get_constraint_matrix_offsets();
+  std::vector<i_t> h_A_col_indices = mps.get_constraint_matrix_indices();
+  std::vector<f_t> h_A_values      = mps.get_constraint_matrix_values();
+
+  // ----- 2. Transpose A -> A^T on the host (one-shot CSR transpose) -----
+  // CSC(A) and CSR(A^T) share the same memory layout, so the CSC produced
+  // by dual_simplex::csr_matrix_t::to_compressed_col IS the CSR of A^T.
+  // O(nnz + n_vars) counting sort, same as problem_t::compute_transpose.
+  namespace ds = cuopt::linear_programming::dual_simplex;
+  ds::csr_matrix_t<i_t, f_t> A_csr(n_cstr, n_vars, nnz);
+  A_csr.row_start = h_A_row_offsets;
+  A_csr.j         = h_A_col_indices;
+  A_csr.x         = h_A_values;
+  ds::csc_matrix_t<i_t, f_t> AT_as_csc(n_vars, n_cstr, nnz);
+  A_csr.to_compressed_col(AT_as_csc);
+  std::vector<i_t> h_A_t_row_offsets = std::move(AT_as_csc.col_start);
+  std::vector<i_t> h_A_t_col_indices = std::move(AT_as_csc.i);
+  std::vector<f_t> h_A_t_values      = std::move(AT_as_csc.x);
+
+  // ----- 3. Identity scaling for V1 -----
+  // Real multi-GPU scaling is a TODO; ship the unscaled problem to shards as
+  // both "unscaled" and "scaled" so the engine and per-shard pdlp_solver_t
+  // can run end-to-end. Scaling factor vectors are 1.0 everywhere so the
+  // shard-side unscale at the end is a no-op.
+  std::vector<f_t> h_A_values_scaled   = h_A_values;
+  std::vector<f_t> h_A_t_values_scaled = h_A_t_values;
+  std::vector<f_t> h_obj_scaled        = h_obj;
+  std::vector<f_t> h_var_lower_scaled  = h_var_lower;
+  std::vector<f_t> h_var_upper_scaled  = h_var_upper;
+  std::vector<f_t> h_cstr_lower_scaled = h_cstr_lower;
+  std::vector<f_t> h_cstr_upper_scaled = h_cstr_upper;
+  std::vector<f_t> h_cummulative_cstr_scaling(n_cstr, f_t(1.0));
+  std::vector<f_t> h_cummulative_var_scaling(n_vars, f_t(1.0));
+  const f_t h_bound_rescaling     = f_t(1.0);
+  const f_t h_objective_rescaling = f_t(1.0);
+
+  // ----- 4. Partition -----
+  std::vector<i_t> parts;
+  if (!settings.multi_gpu_partition_file.empty()) {
+    parts = partition_loader_t<i_t, f_t>::parse_distributed_pdlp_partition_file(
+      settings.multi_gpu_partition_file);
+    validate_partition(parts, n_cstr, n_vars, distributed_pdlp_num_gpus, "partition file");
+  } else {
+    partitioner_input_t<i_t, f_t> partition_input;
+    partition_input.nb_cstr  = n_cstr;
+    partition_input.nb_vars  = n_vars;
+    partition_input.nb_parts = distributed_pdlp_num_gpus;
+
+    // Resolve which partitioner to use.
+    std::string partitioner_choice = settings.distributed_pdlp_partitioner;
+    std::transform(partitioner_choice.begin(),
+                   partitioner_choice.end(),
+                   partitioner_choice.begin(),
+                   [](unsigned char c) { return std::tolower(c); });
+    partitioner_kind_t kind;
+    if (partitioner_choice.empty() || partitioner_choice == "auto") {
+      kind =
+        (distributed_pdlp_num_gpus == 1) ? partitioner_kind_t::Dummy : partitioner_kind_t::KaMinPar;
+    } else if (partitioner_choice == "dummy") {
+      kind = partitioner_kind_t::Dummy;
+    } else if (partitioner_choice == "metis") {
+      kind = partitioner_kind_t::Metis;
+    } else if (partitioner_choice == "kaminpar") {
+      kind = partitioner_kind_t::KaMinPar;
+    } else {
+      cuopt_expects(
+        false,
+        error_type_t::ValidationError,
+        "Unknown distributed_pdlp_partitioner '%s' (expected auto|dummy|metis|kaminpar)",
+        settings.distributed_pdlp_partitioner.c_str());
+      kind = partitioner_kind_t::Dummy;  // unreachable; silences -Wmaybe-uninitialized
+    }
+    const bool needs_graph =
+      (kind == partitioner_kind_t::Metis || kind == partitioner_kind_t::KaMinPar);
+    if (needs_graph) {
+      // partitioner_input_t holds non-const std::vector<i_t>* pointers; we
+      // already have the data in our local mutable buffers above.
+      partition_input.A.row_offsets   = &h_A_row_offsets;
+      partition_input.A.col_indices   = &h_A_col_indices;
+      partition_input.A.num_rows      = n_cstr;
+      partition_input.A.num_cols      = n_vars;
+      partition_input.A_t.row_offsets = &h_A_t_row_offsets;
+      partition_input.A_t.col_indices = &h_A_t_col_indices;
+      partition_input.A_t.num_rows    = n_vars;
+      partition_input.A_t.num_cols    = n_cstr;
+      // 0 => KaMinPar auto-detects and uses all hardware threads (ignored by METIS).
+      partition_input.nb_threads = 0;
+    }
+    const char* kind_name = (kind == partitioner_kind_t::Dummy)      ? "dummy"
+                            : (kind == partitioner_kind_t::Metis)    ? "metis"
+                            : (kind == partitioner_kind_t::KaMinPar) ? "kaminpar"
+                                                                     : "unknown";
+    CUOPT_LOG_INFO(
+      "Partitioning %d constraints + %d variables into %d part(s) using the %s "
+      "partitioner",
+      n_cstr,
+      n_vars,
+      distributed_pdlp_num_gpus,
+      kind_name);
+    auto partitioner = make_partitioner<i_t, f_t>(kind);
+    parts            = partitioner->partition(partition_input);
+  }
+
+  // Optionally dump the partition right after computing it (one part-id per line).
+  if (!settings.multi_gpu_export_partition_file.empty()) {
+    partition_loader_t<i_t, f_t>::export_distributed_pdlp_partition_file(
+      settings.multi_gpu_export_partition_file, parts);
+    std::cout << "Exported " << parts.size() << " part-ids to "
+              << settings.multi_gpu_export_partition_file << std::endl;
+  }
+
+  // ----- 5. Build per-rank data -----
+  std::vector<rank_data_t<i_t, f_t>> sub_pdlp_rank_data =
+    partition_loader_t<i_t, f_t>::create_rank_data_from_parts(parts,
+                                                              h_A_row_offsets,
+                                                              h_A_col_indices,
+                                                              h_A_values,
+                                                              h_A_values_scaled,
+                                                              h_A_t_row_offsets,
+                                                              h_A_t_col_indices,
+                                                              h_A_t_values,
+                                                              h_A_t_values_scaled,
+                                                              settings.distributed_pdlp_num_gpus,
+                                                              n_cstr,
+                                                              n_vars,
+                                                              nnz);
+
+  // ----- 6. Per-shard settings -----
+  pdlp_solver_settings_t<i_t, f_t> sub_pdlp_settings                    = settings;
+  sub_pdlp_settings.num_gpus                                            = 1;
+  sub_pdlp_settings.distributed_pdlp_num_gpus                           = 1;
+  sub_pdlp_settings.multi_gpu_partition_file                            = "";
+  sub_pdlp_settings.multi_gpu_export_partition_file                     = "";
+  sub_pdlp_settings.is_distributed_sub_pdlp                             = true;
+  sub_pdlp_settings.hyper_params.default_l_inf_ruiz_iterations          = 0;
+  sub_pdlp_settings.hyper_params.default_alpha_pock_chambolle_rescaling = 0.0;
+
+  // ----- 7. Construct the engine: NCCL comms + per-shard pdlp_solver_t -----
+  multi_gpu_engine.emplace(std::move(sub_pdlp_rank_data),
+                           h_obj,
+                           h_var_lower,
+                           h_var_upper,
+                           h_cstr_lower,
+                           h_cstr_upper,
+                           h_obj_scaled,
+                           h_var_lower_scaled,
+                           h_var_upper_scaled,
+                           h_cstr_lower_scaled,
+                           h_cstr_upper_scaled,
+                           h_cummulative_cstr_scaling,
+                           h_cummulative_var_scaling,
+                           h_bound_rescaling,
+                           h_objective_rescaling,
+                           maximize,
+                           objective_offset,
+                           objective_scaling_factor,
+                           sub_pdlp_settings);
+
+  // ----- 8 Distributed Scaling -----
+  for (auto& shard : multi_gpu_engine->shards) {
+    raft::device_setter guard(shard->device_id);
+    shard->sub_pdlp->get_initial_scaling_strategy().reset_scaling_state_for_distributed();
+  }
+  for (auto& shard : multi_gpu_engine->shards) {
+    raft::device_setter guard(shard->device_id);
+    shard->stream.synchronize();
+  }
+
+  // Distributed scaling
+  if (settings_.hyper_params.do_ruiz_scaling) {
+    multi_gpu_engine->distributed_ruiz_inf_scaling(
+      settings_.hyper_params.default_l_inf_ruiz_iterations, n_vars);
+  }
+  // push local scaling to halo
+  multi_gpu_engine->broadcast_constraint_scaling_to_halo();
+  if (settings_.hyper_params.do_pock_chambolle_scaling) {
+    multi_gpu_engine->distributed_pock_chambolle_scaling(
+      static_cast<f_t>(settings_.hyper_params.default_alpha_pock_chambolle_rescaling), n_vars);
+  }
+  // Refresh the halo constraint scaling after Pock-Chambolle
+  multi_gpu_engine->broadcast_constraint_scaling_to_halo();
+
+  for (auto& shard : multi_gpu_engine->shards) {
+    raft::device_setter guard(shard->device_id);
+    auto& scaling = shard->sub_pdlp->get_initial_scaling_strategy();
+    // Skip the per-shard local bound/objective rescaling; the global factor is
+    // applied below. Keeps the unscale path active (flag stays true).
+    scaling.set_skip_distributed_local_rescaling(true);
+    scaling.scale_problem();
+
+    shard->sub_pdlp->pdhg_solver_.get_cusparse_view().create_spmv_op_plans(
+      /*is_reflected=*/settings_.hyper_params.use_reflected_primal_dual);
+  }
+  for (auto& shard : multi_gpu_engine->shards) {
+    raft::device_setter guard(shard->device_id);
+    shard->stream.synchronize();
+  }
+
+  // Global bound/objective rescaling: allreduce the owned partial squared-norms
+  if (settings_.hyper_params.bound_objective_rescaling && !inside_mip_) {
+    multi_gpu_engine->distributed_bound_objective_rescaling(
+      static_cast<f_t>(settings_.hyper_params.initial_primal_weight_c_scaling));
+  }
+
+  // ----- 8b. Seed initial step-size / primal-weight (distributed, scales to N shards) -----
+  constexpr f_t kStepSizeScale = f_t{0.998};
+  const f_t sigma_max          = multi_gpu_engine->distributed_max_singular_value(n_cstr);
+  const f_t h_primal_weight    = f_t{1};
+  const f_t h_step_size        = (sigma_max > f_t{0}) ? kStepSizeScale / sigma_max : f_t{1};
+  // With primal_weight = 1 the adaptive step-size strategy collapses to
+  // primal_step_size = step_size / primal_weight = step_size
+  // dual_step_size   = step_size * primal_weight = step_size.
+  const f_t h_primal_step_size = h_step_size;
+  const f_t h_dual_step_size   = h_step_size;
+
+  // Put the values on master
+  raft::copy(step_size_.data(), &h_step_size, 1, stream_view_);
+  raft::copy(primal_weight_.data(), &h_primal_weight, 1, stream_view_);
+  raft::copy(best_primal_weight_.data(), &h_primal_weight, 1, stream_view_);
+  raft::copy(primal_step_size_.data(), &h_primal_step_size, 1, stream_view_);
+  raft::copy(dual_step_size_.data(), &h_dual_step_size, 1, stream_view_);
+  handle_ptr_->sync_stream(stream_view_);
+
+  // put the values on each shard
+  for (auto& shard : multi_gpu_engine->shards) {
+    raft::device_setter guard(shard->device_id);
+    auto& sub = *shard->sub_pdlp;
+    raft::copy(sub.step_size_.data(), &h_step_size, 1, shard->stream);
+    raft::copy(sub.primal_weight_.data(), &h_primal_weight, 1, shard->stream);
+    raft::copy(sub.best_primal_weight_.data(), &h_primal_weight, 1, shard->stream);
+    raft::copy(sub.get_primal_step_size().data(), &h_primal_step_size, 1, shard->stream);
+    raft::copy(sub.get_dual_step_size().data(), &h_dual_step_size, 1, shard->stream);
+  }
+  for (auto& shard : multi_gpu_engine->shards) {
+    raft::device_setter guard(shard->device_id);
+    shard->stream.synchronize();
+  }
+
+  // Wire the engine into master's pdhg_solver_; shards keep mgpu_engine_ == nullptr.
+  pdhg_solver_.set_multi_gpu_engine(&*multi_gpu_engine);
+
+  // ----- 9. Resize master gather destinations to the full problem size -----
+  pdhg_solver_.get_potential_next_primal_solution().resize(n_vars, stream_view_);
+  pdhg_solver_.get_potential_next_dual_solution().resize(n_cstr, stream_view_);
+  current_termination_strategy_.get_convergence_information().get_reduced_cost().resize(
+    n_vars, stream_view_);
+  primal_size_h_ = n_vars;
+  dual_size_h_   = n_cstr;
+
+  // Distributed conergence_information::init_l2_norms
+  for (auto& shard : multi_gpu_engine->shards) {
+    raft::device_setter guard(shard->device_id);
+    shard->sub_pdlp->get_current_termination_strategy()
+      .get_convergence_information()
+      .compute_owned_reference_norm_partials(shard->rank_data.owned_var_size,
+                                             shard->rank_data.owned_cstr_size);
+  }
+  multi_gpu_engine->allreduce_sum_inplace([](pdlp_solver_t<i_t, f_t>& sp) -> f_t* {
+    return sp.get_current_termination_strategy()
+      .get_convergence_information()
+      .l2_norm_primal_right_hand_side_data();
+  });
+  multi_gpu_engine->allreduce_sum_inplace([](pdlp_solver_t<i_t, f_t>& sp) -> f_t* {
+    return sp.get_current_termination_strategy()
+      .get_convergence_information()
+      .l2_norm_primal_linear_objective_data();
+  });
+  for (auto& shard : multi_gpu_engine->shards) {
+    raft::device_setter guard(shard->device_id);
+    shard->sub_pdlp->get_current_termination_strategy()
+      .get_convergence_information()
+      .sqrt_reference_norms_inplace();
+    shard->stream.synchronize();
+  }
+  // Broadcast the values to the master
+  {
+    auto& s0      = *multi_gpu_engine->shards[0];
+    auto& s0_conv = s0.sub_pdlp->get_current_termination_strategy().get_convergence_information();
+    raft::device_setter guard(s0.device_id);
+    for (auto* ts : {&current_termination_strategy_, &average_termination_strategy_}) {
+      auto& ci = ts->get_convergence_information();
+      raft::copy(ci.l2_norm_primal_right_hand_side_data(),
+                 s0_conv.l2_norm_primal_right_hand_side_data(),
+                 1,
+                 stream_view_);
+      raft::copy(ci.l2_norm_primal_linear_objective_data(),
+                 s0_conv.l2_norm_primal_linear_objective_data(),
+                 1,
+                 stream_view_);
+    }
+  }
+  handle_ptr_->sync_stream(stream_view_);
+}
+
 template <typename i_t, typename f_t>
 void pdlp_solver_t<i_t, f_t>::set_initial_primal_weight(f_t initial_primal_weight)
 {
@@ -442,6 +818,11 @@ std::optional<optimization_problem_solution_t<i_t, f_t>> pdlp_solver_t<i_t, f_t>
     RAFT_CUDA_TRY(cudaDeviceSynchronize());
     std::cout << "Time Limit reached, returning current solution" << std::endl;
 #endif
+    if (auto* engine = pdhg_solver_.get_mgpu_engine()) {
+      engine->gather_potential_next_solutions_to_master(
+        pdhg_solver_,
+        current_termination_strategy_.get_convergence_information().get_reduced_cost());
+    }
     return current_termination_strategy_.fill_return_problem_solution(
       internal_solver_iterations_,
       pdhg_solver_,
@@ -475,6 +856,11 @@ std::optional<optimization_problem_solution_t<i_t, f_t>> pdlp_solver_t<i_t, f_t>
       return finalize_batch_return_with_limit_reached(pdlp_termination_status_t::IterationLimit);
     }
 
+    if (auto* engine = pdhg_solver_.get_mgpu_engine()) {
+      engine->gather_potential_next_solutions_to_master(
+        pdhg_solver_,
+        current_termination_strategy_.get_convergence_information().get_reduced_cost());
+    }
     return current_termination_strategy_.fill_return_problem_solution(
       internal_solver_iterations_,
       pdhg_solver_,
@@ -1152,6 +1538,11 @@ std::optional<optimization_problem_solution_t<i_t, f_t>> pdlp_solver_t<i_t, f_t>
 #endif
     print_final_termination_criteria(
       timer, current_termination_strategy_.get_convergence_information(), termination_current);
+    if (auto* engine = pdhg_solver_.get_mgpu_engine()) {
+      engine->gather_potential_next_solutions_to_master(
+        pdhg_solver_,
+        current_termination_strategy_.get_convergence_information().get_reduced_cost());
+    }
     return current_termination_strategy_.fill_return_problem_solution(
       internal_solver_iterations_,
       pdhg_solver_,
@@ -2008,34 +2399,112 @@ void pdlp_solver_t<i_t, f_t>::compute_fixed_error(std::vector<int>& has_restarte
 
   // Computing the deltas
   // TODO batch mdoe: this only works if everyone restarts
-  cub::DeviceTransform::Transform(cuda::std::make_tuple(pdhg_solver_.get_reflected_primal().data(),
-                                                        pdhg_solver_.get_primal_solution().data()),
-                                  pdhg_solver_.get_saddle_point_state().get_delta_primal().data(),
-                                  pdhg_solver_.get_primal_solution().size(),
-                                  cuda::std::minus<f_t>{},
-                                  stream_view_.value());
-  cub::DeviceTransform::Transform(cuda::std::make_tuple(pdhg_solver_.get_reflected_dual().data(),
-                                                        pdhg_solver_.get_dual_solution().data()),
-                                  pdhg_solver_.get_saddle_point_state().get_delta_dual().data(),
-                                  pdhg_solver_.get_dual_solution().size(),
-                                  cuda::std::minus<f_t>{},
-                                  stream_view_.value());
+  if (multi_gpu_engine) {
+    // Go faire une fonction compute_delta_primal, compute_delta primal ?
+    for (auto& shard : multi_gpu_engine->shards) {
+      raft::device_setter guard(shard->device_id);
+      auto& sub_pdhg = shard->sub_pdlp->pdhg_solver_;
+      cub::DeviceTransform::Transform(cuda::std::make_tuple(sub_pdhg.get_reflected_primal().data(),
+                                                            sub_pdhg.get_primal_solution().data()),
+                                      sub_pdhg.get_saddle_point_state().get_delta_primal().data(),
+                                      sub_pdhg.get_primal_solution().size(),
+                                      cuda::std::minus<f_t>{},
+                                      shard->stream.view());
+      cub::DeviceTransform::Transform(cuda::std::make_tuple(sub_pdhg.get_reflected_dual().data(),
+                                                            sub_pdhg.get_dual_solution().data()),
+                                      sub_pdhg.get_saddle_point_state().get_delta_dual().data(),
+                                      sub_pdhg.get_dual_solution().size(),
+                                      cuda::std::minus<f_t>{},
+                                      shard->stream.view());
+    }
+  } else {
+    cub::DeviceTransform::Transform(
+      cuda::std::make_tuple(pdhg_solver_.get_reflected_primal().data(),
+                            pdhg_solver_.get_primal_solution().data()),
+      pdhg_solver_.get_saddle_point_state().get_delta_primal().data(),
+      pdhg_solver_.get_primal_solution().size(),
+      cuda::std::minus<f_t>{},
+      stream_view_.value());
+    cub::DeviceTransform::Transform(cuda::std::make_tuple(pdhg_solver_.get_reflected_dual().data(),
+                                                          pdhg_solver_.get_dual_solution().data()),
+                                    pdhg_solver_.get_saddle_point_state().get_delta_dual().data(),
+                                    pdhg_solver_.get_dual_solution().size(),
+                                    cuda::std::minus<f_t>{},
+                                    stream_view_.value());
+  }
 
   auto& cusparse_view = pdhg_solver_.get_cusparse_view();
-  // Sync to make sure all previous cuSparse operations are finished before setting the
-  // potential_next_dual_solution
-  RAFT_CUDA_TRY(cudaStreamSynchronize(stream_view_));
 
-  // Make potential_next_dual_solution point towards reflected dual solution to reuse the code
-  RAFT_CUSPARSE_TRY(cusparseDnVecSetValues(cusparse_view.potential_next_dual_solution,
-                                           (void*)pdhg_solver_.get_reflected_dual().data()));
+  if (multi_gpu_engine) {
+    // SpMV is the first operation in compute_interaction_and_movement so we can do halo before and
+    // call it naturally we then reduce the local dot products
+    multi_gpu_engine->halo_exchange_cstr(
+      [](auto& pdhg) -> rmm::device_uvector<f_t>& { return pdhg.get_reflected_dual(); });
 
-  if (batch_mode_)
-    RAFT_CUSPARSE_TRY(cusparseDnMatSetValues(cusparse_view.batch_potential_next_dual_solution,
+    for (auto& shard : multi_gpu_engine->shards) {
+      raft::device_setter guard(shard->device_id);
+      auto& sub_pdlp = *shard->sub_pdlp;
+      auto& sub_cv   = sub_pdlp.pdhg_solver_.get_cusparse_view();
+
+      RAFT_CUSPARSE_TRY(
+        cusparseDnVecSetValues(sub_cv.potential_next_dual_solution,
+                               (void*)sub_pdlp.pdhg_solver_.get_reflected_dual().data()));
+
+      // Ensure norm is on owned size
+      sub_pdlp.step_size_strategy_.compute_interaction_and_movement(
+        sub_pdlp.pdhg_solver_.get_primal_tmp_resource(),
+        sub_cv,
+        sub_pdlp.pdhg_solver_.get_saddle_point_state(),
+        shard->rank_data.owned_var_size,
+        shard->rank_data.owned_cstr_size);
+
+      RAFT_CUSPARSE_TRY(cusparseDnVecSetValues(
+        sub_cv.potential_next_dual_solution,
+        (void*)sub_pdlp.pdhg_solver_.get_potential_next_dual_solution().data()));
+    }
+
+    multi_gpu_engine->allreduce_sum_inplace(
+      [](auto& sp) -> f_t* { return sp.step_size_strategy_.get_interaction().data(); });
+    multi_gpu_engine->allreduce_sum_inplace([](auto& sp) -> f_t* {
+      return sp.step_size_strategy_.get_norm_squared_delta_primal().data();
+    });
+    multi_gpu_engine->allreduce_sum_inplace(
+      [](auto& sp) -> f_t* { return sp.step_size_strategy_.get_norm_squared_delta_dual().data(); });
+
+    auto& s0 = *multi_gpu_engine->shards[0];
+    {
+      raft::device_setter guard(s0.device_id);
+      RAFT_CUDA_TRY(cudaStreamSynchronize(s0.stream.view().value()));
+    }
+    auto& src_sp = s0.sub_pdlp->step_size_strategy_;
+    raft::copy(step_size_strategy_.get_interaction().data(),
+               src_sp.get_interaction().data(),
+               1,
+               stream_view_);
+    raft::copy(step_size_strategy_.get_norm_squared_delta_primal().data(),
+               src_sp.get_norm_squared_delta_primal().data(),
+               1,
+               stream_view_);
+    raft::copy(step_size_strategy_.get_norm_squared_delta_dual().data(),
+               src_sp.get_norm_squared_delta_dual().data(),
+               1,
+               stream_view_);
+  } else {
+    // Sync to make sure all previous cuSparse operations are finished before setting the
+    // potential_next_dual_solution
+    RAFT_CUDA_TRY(cudaStreamSynchronize(stream_view_));
+
+    // Make potential_next_dual_solution point towards reflected dual solution to reuse the code
+    RAFT_CUSPARSE_TRY(cusparseDnVecSetValues(cusparse_view.potential_next_dual_solution,
                                              (void*)pdhg_solver_.get_reflected_dual().data()));
 
-  step_size_strategy_.compute_interaction_and_movement(
-    pdhg_solver_.get_primal_tmp_resource(), cusparse_view, pdhg_solver_.get_saddle_point_state());
+    if (batch_mode_)
+      RAFT_CUSPARSE_TRY(cusparseDnMatSetValues(cusparse_view.batch_potential_next_dual_solution,
+                                               (void*)pdhg_solver_.get_reflected_dual().data()));
+
+    step_size_strategy_.compute_interaction_and_movement(
+      pdhg_solver_.get_primal_tmp_resource(), cusparse_view, pdhg_solver_.get_saddle_point_state());
+  }
 
   if (batch_mode_) {
     const auto [grid_size, block_size] = kernel_config_from_batch_size(climber_strategies_.size());
@@ -2066,11 +2535,12 @@ void pdlp_solver_t<i_t, f_t>::compute_fixed_error(std::vector<int>& has_restarte
   // potential_next_dual_solution
   RAFT_CUDA_TRY(cudaStreamSynchronize(stream_view_));
 
-  // Put back
-  RAFT_CUSPARSE_TRY(
-    cusparseDnVecSetValues(cusparse_view.potential_next_dual_solution,
-                           (void*)pdhg_solver_.get_potential_next_dual_solution().data()));
-
+  // Put back, already done in multi-gpu side
+  if (!multi_gpu_engine) {
+    RAFT_CUSPARSE_TRY(
+      cusparseDnVecSetValues(cusparse_view.potential_next_dual_solution,
+                             (void*)pdhg_solver_.get_potential_next_dual_solution().data()));
+  }
   if (batch_mode_) {
     RAFT_CUSPARSE_TRY(
       cusparseDnMatSetValues(cusparse_view.batch_potential_next_dual_solution,
@@ -2309,245 +2779,250 @@ optimization_problem_solution_t<i_t, f_t> pdlp_solver_t<i_t, f_t>::run_solver(co
   std::cout << "Starting PDLP loop:" << std::endl;
 #endif
 
-  // TODO handle that properly
-  if (settings_.hyper_params.compute_initial_step_size_before_scaling &&
-      !settings_.get_initial_step_size().has_value())
-    compute_initial_step_size();
-  if (settings_.hyper_params.compute_initial_primal_weight_before_scaling &&
-      !settings_.get_initial_primal_weight().has_value())
-    compute_initial_primal_weight();
-
-  initial_scaling_strategy_.scale_problem();
-  if constexpr (std::is_same_v<f_t, double>) {
-    if (!batch_mode_ && !pdhg_solver_.get_cusparse_view().mixed_precision_enabled_) {
-      pdhg_solver_.get_cusparse_view().create_spmv_op_plans(
-        settings_.hyper_params.use_reflected_primal_dual);
-    }
-  }
-
-  // Update FP32 matrix copies for mixed precision SpMV after scaling
-  pdhg_solver_.get_cusparse_view().update_mixed_precision_matrices();
+  bool warm_start_was_given = settings_.get_pdlp_warm_start_data().is_populated();
 
-  // Redirect cuSPARSE descriptors to use the original problem's structural data (offsets, indices),
-  // then free the duplicated structural vectors from the scaled copy to save device memory.
-  pdhg_solver_.get_cusparse_view().redirect_cusparse_csr_structure_pointers(*problem_ptr);
-  op_problem_scaled_.variables.resize(0, stream_view_);
-  op_problem_scaled_.offsets.resize(0, stream_view_);
-  op_problem_scaled_.reverse_constraints.resize(0, stream_view_);
-  op_problem_scaled_.reverse_offsets.resize(0, stream_view_);
+  // In distributed mode, skip all setup, it is done before
+  if (!settings_.hyper_params.use_distributed_pdlp) {
+    // TODO handle that properly
+    if (settings_.hyper_params.compute_initial_step_size_before_scaling &&
+        !settings_.get_initial_step_size().has_value())
+      compute_initial_step_size();
+    if (settings_.hyper_params.compute_initial_primal_weight_before_scaling &&
+        !settings_.get_initial_primal_weight().has_value())
+      compute_initial_primal_weight();
+
+    initial_scaling_strategy_.scale_problem();
+    if constexpr (std::is_same_v<f_t, double>) {
+      if (!batch_mode_ && !pdhg_solver_.get_cusparse_view().mixed_precision_enabled_) {
+        pdhg_solver_.get_cusparse_view().create_spmv_op_plans(
+          settings_.hyper_params.use_reflected_primal_dual);
+      }
+    }
 
-  if (!settings_.hyper_params.compute_initial_step_size_before_scaling &&
-      !settings_.get_initial_step_size().has_value())
-    compute_initial_step_size();
-  if (!settings_.hyper_params.compute_initial_primal_weight_before_scaling &&
-      !settings_.get_initial_primal_weight().has_value())
-    compute_initial_primal_weight();
+    // Update FP32 matrix copies for mixed precision SpMV after scaling
+    pdhg_solver_.get_cusparse_view().update_mixed_precision_matrices();
+
+    // Redirect cuSPARSE descriptors to use the original problem's structural data (offsets,
+    // indices), then free the duplicated structural vectors from the scaled copy to save device
+    // memory.
+    pdhg_solver_.get_cusparse_view().redirect_cusparse_csr_structure_pointers(*problem_ptr);
+    op_problem_scaled_.variables.resize(0, stream_view_);
+    op_problem_scaled_.offsets.resize(0, stream_view_);
+    op_problem_scaled_.reverse_constraints.resize(0, stream_view_);
+    op_problem_scaled_.reverse_offsets.resize(0, stream_view_);
+
+    if (!settings_.hyper_params.compute_initial_step_size_before_scaling &&
+        !settings_.get_initial_step_size().has_value())
+      compute_initial_step_size();
+    if (!settings_.hyper_params.compute_initial_primal_weight_before_scaling &&
+        !settings_.get_initial_primal_weight().has_value())
+      compute_initial_primal_weight();
 
 #ifdef PDLP_DEBUG_MODE
-  std::cout << "Initial Scaling done" << std::endl;
+    std::cout << "Initial Scaling done" << std::endl;
 #endif
-
-  // Needs to be performed here before the below line to make sure the initial primal_weight / step
-  // size are used as previous point when potentially updating them in this next call
-  if (settings_.get_initial_step_size().has_value() || initial_step_size_.has_value()) {
-    if (initial_step_size_.has_value())
-      thrust::uninitialized_fill(handle_ptr_->get_thrust_policy(),
-                                 step_size_.begin(),
-                                 step_size_.end(),
-                                 initial_step_size_.value());
-    else
-      thrust::uninitialized_fill(handle_ptr_->get_thrust_policy(),
-                                 step_size_.begin(),
-                                 step_size_.end(),
-                                 settings_.get_initial_step_size().value());
-  }
-  if (settings_.get_initial_primal_weight().has_value() || initial_primal_weight_.has_value()) {
-    if (initial_primal_weight_.has_value()) {
-      thrust::uninitialized_fill(handle_ptr_->get_thrust_policy(),
-                                 primal_weight_.begin(),
-                                 primal_weight_.end(),
-                                 initial_primal_weight_.value());
-      if (is_cupdlpx_restart<i_t, f_t>(settings_.hyper_params))
+    // Needs to be performed here before the below line to make sure the initial primal_weight /
+    // step size are used as previous point when potentially updating them in this next call
+    if (settings_.get_initial_step_size().has_value() || initial_step_size_.has_value()) {
+      if (initial_step_size_.has_value())
+        thrust::uninitialized_fill(handle_ptr_->get_thrust_policy(),
+                                   step_size_.begin(),
+                                   step_size_.end(),
+                                   initial_step_size_.value());
+      else
         thrust::uninitialized_fill(handle_ptr_->get_thrust_policy(),
-                                   best_primal_weight_.begin(),
-                                   best_primal_weight_.end(),
+                                   step_size_.begin(),
+                                   step_size_.end(),
+                                   settings_.get_initial_step_size().value());
+    }
+    if (settings_.get_initial_primal_weight().has_value() || initial_primal_weight_.has_value()) {
+      if (initial_primal_weight_.has_value()) {
+        thrust::uninitialized_fill(handle_ptr_->get_thrust_policy(),
+                                   primal_weight_.begin(),
+                                   primal_weight_.end(),
                                    initial_primal_weight_.value());
-    } else {
-      thrust::uninitialized_fill(handle_ptr_->get_thrust_policy(),
-                                 primal_weight_.begin(),
-                                 primal_weight_.end(),
-                                 settings_.get_initial_primal_weight().value());
-      if (is_cupdlpx_restart<i_t, f_t>(settings_.hyper_params))
+        if (is_cupdlpx_restart<i_t, f_t>(settings_.hyper_params))
+          thrust::uninitialized_fill(handle_ptr_->get_thrust_policy(),
+                                     best_primal_weight_.begin(),
+                                     best_primal_weight_.end(),
+                                     initial_primal_weight_.value());
+      } else {
         thrust::uninitialized_fill(handle_ptr_->get_thrust_policy(),
-                                   best_primal_weight_.begin(),
-                                   best_primal_weight_.end(),
+                                   primal_weight_.begin(),
+                                   primal_weight_.end(),
                                    settings_.get_initial_primal_weight().value());
+        if (is_cupdlpx_restart<i_t, f_t>(settings_.hyper_params))
+          thrust::uninitialized_fill(handle_ptr_->get_thrust_policy(),
+                                     best_primal_weight_.begin(),
+                                     best_primal_weight_.end(),
+                                     settings_.get_initial_primal_weight().value());
+      }
+    }
+    if (initial_k_.has_value()) {
+      pdhg_solver_.total_pdhg_iterations_ = initial_k_.value();
+      pdhg_solver_.get_d_total_pdhg_iterations().set_value_async(initial_k_.value(), stream_view_);
+    }
+    if (settings_.get_initial_pdlp_iteration().has_value()) {
+      total_pdlp_iterations_ = settings_.get_initial_pdlp_iteration().value();
+      // This is meaningless in batch mode since pdhg step is never used, set it just to avoid
+      // assertions
+      pdhg_solver_.get_d_total_pdhg_iterations().set_value_async(total_pdlp_iterations_,
+                                                                 stream_view_);
+      pdhg_solver_.total_pdhg_iterations_ = total_pdlp_iterations_;
+      // Reset the fixed point error since at this pdlp iteration it is expected to already be
+      // initialized to some value
+      std::fill(restart_strategy_.initial_fixed_point_error_.begin(),
+                restart_strategy_.initial_fixed_point_error_.end(),
+                f_t(0.0));
+      std::fill(restart_strategy_.fixed_point_error_.begin(),
+                restart_strategy_.fixed_point_error_.end(),
+                f_t(0.0));
     }
-  }
-  if (initial_k_.has_value()) {
-    pdhg_solver_.total_pdhg_iterations_ = initial_k_.value();
-    pdhg_solver_.get_d_total_pdhg_iterations().set_value_async(initial_k_.value(), stream_view_);
-  }
-  if (settings_.get_initial_pdlp_iteration().has_value()) {
-    total_pdlp_iterations_ = settings_.get_initial_pdlp_iteration().value();
-    // This is meaningless in batch mode since pdhg step is never used, set it just to avoid
-    // assertions
-    pdhg_solver_.get_d_total_pdhg_iterations().set_value_async(total_pdlp_iterations_,
-                                                               stream_view_);
-    pdhg_solver_.total_pdhg_iterations_ = total_pdlp_iterations_;
-    // Reset the fixed point error since at this pdlp iteration it is expected to already be
-    // initialized to some value
-    std::fill(restart_strategy_.initial_fixed_point_error_.begin(),
-              restart_strategy_.initial_fixed_point_error_.end(),
-              f_t(0.0));
-    std::fill(restart_strategy_.fixed_point_error_.begin(),
-              restart_strategy_.fixed_point_error_.end(),
-              f_t(0.0));
-  }
-
-  // Only the primal_weight_ and step_size_ variables are initialized during the initial phase
-  // The associated primal/dual step_size (computed using the two firstly mentionned) are not
-  // initialized. This calls ensures the latter
-  // In the event of a given primal and dual solutions and if the option is toggled, calling the
-  // update primal_weight and step_size will also update the associated primal_step_size_,
-  // dual_step_size_.
-  // In summary: the below call is only mandatory at the beginning when
-  // computing/setting the initial primal weight and step size and if they are not recomputed later.
-  step_size_strategy_.get_primal_and_dual_stepsizes(primal_step_size_, dual_step_size_);
+
+    // Only the primal_weight_ and step_size_ variables are initialized during the initial phase
+    // The associated primal/dual step_size (computed using the two firstly mentionned) are not
+    // initialized. This calls ensures the latter
+    // In the event of a given primal and dual solutions and if the option is toggled, calling the
+    // update primal_weight and step_size will also update the associated primal_step_size_,
+    // dual_step_size_.
+    // In summary: the below call is only mandatory at the beginning when
+    // computing/setting the initial primal weight and step size and if they are not recomputed
+    // later.
+    step_size_strategy_.get_primal_and_dual_stepsizes(primal_step_size_, dual_step_size_);
 
 #ifdef CUPDLP_DEBUG_MODE
-  if (initial_primal_.size() != 0 || initial_dual_.size() != 0) {
-    std::cout << "Initial primal and dual solution before scaling" << std::endl;
-    if (initial_primal_.size() != 0) { print("initial_primal_", initial_primal_); }
-    if (initial_dual_.size() != 0) { print("initial_dual_", initial_dual_); }
-  }
+    if (initial_primal_.size() != 0 || initial_dual_.size() != 0) {
+      std::cout << "Initial primal and dual solution before scaling" << std::endl;
+      if (initial_primal_.size() != 0) { print("initial_primal_", initial_primal_); }
+      if (initial_dual_.size() != 0) { print("initial_dual_", initial_dual_); }
+    }
 #endif
 
-  // If there is an initial primal or dual we should update the restart info as if there was a step
-  // that has happend
-  if (initial_primal_.size() != 0 || initial_dual_.size() != 0) {
-    update_primal_dual_solutions(
-      (initial_primal_.size() != 0) ? std::make_optional(&initial_primal_) : std::nullopt,
-      (initial_dual_.size() != 0) ? std::make_optional(&initial_dual_) : std::nullopt);
-  }
+    // If there is an initial primal or dual we should update the restart info as if there was a
+    // step that has happend
+    if (initial_primal_.size() != 0 || initial_dual_.size() != 0) {
+      update_primal_dual_solutions(
+        (initial_primal_.size() != 0) ? std::make_optional(&initial_primal_) : std::nullopt,
+        (initial_dual_.size() != 0) ? std::make_optional(&initial_dual_) : std::nullopt);
+    }
 
 #ifdef CUPDLP_DEBUG_MODE
-  std::cout << "Solution before projection" << std::endl;
-  print("pdhg_solver_.get_primal_solution()", pdhg_solver_.get_primal_solution());
-  print("pdhg_solver_.get_dual_solution()", pdhg_solver_.get_dual_solution());
-  print("pdhg_solver_.get_potential_next_primal_solution()",
-        pdhg_solver_.get_potential_next_primal_solution());
-  print("pdhg_solver_.get_potential_next_dual_solution()",
-        pdhg_solver_.get_potential_next_dual_solution());
-  print("restart_strategy_.last_restart_duality_gap_.primal_solution_",
-        restart_strategy_.last_restart_duality_gap_.primal_solution_);
-  print("restart_strategy_.last_restart_duality_gap_.dual_solution_",
-        restart_strategy_.last_restart_duality_gap_.dual_solution_);
+    std::cout << "Solution before projection" << std::endl;
+    print("pdhg_solver_.get_primal_solution()", pdhg_solver_.get_primal_solution());
+    print("pdhg_solver_.get_dual_solution()", pdhg_solver_.get_dual_solution());
+    print("pdhg_solver_.get_potential_next_primal_solution()",
+          pdhg_solver_.get_potential_next_primal_solution());
+    print("pdhg_solver_.get_potential_next_dual_solution()",
+          pdhg_solver_.get_potential_next_dual_solution());
+    print("restart_strategy_.last_restart_duality_gap_.primal_solution_",
+          restart_strategy_.last_restart_duality_gap_.primal_solution_);
+    print("restart_strategy_.last_restart_duality_gap_.dual_solution_",
+          restart_strategy_.last_restart_duality_gap_.dual_solution_);
 #endif
 
-  // Project initial primal solution
-  if (settings_.hyper_params.project_initial_primal) {
-    using f_t2 = typename type_2<f_t>::type;
-    if (batch_mode_) {
-      // In batch mode variable_bounds are shared and only the bound rescaling is per climber.
-      // Apply it here too so the initial point is projected into the correct saacled space
-      cub::DeviceTransform::Transform(
-        cuda::std::make_tuple(
+    // Project initial primal solution
+    if (settings_.hyper_params.project_initial_primal) {
+      using f_t2 = typename type_2<f_t>::type;
+      if (batch_mode_) {
+        // In batch mode variable_bounds are shared and only the bound rescaling is per climber.
+        // Apply it here too so the initial point is projected into the correct scaled space.
+        cub::DeviceTransform::Transform(
+          cuda::std::make_tuple(
+            pdhg_solver_.get_primal_solution().data(),
+            thrust::make_transform_iterator(
+              thrust::make_zip_iterator(
+                problem_wrap_container(op_problem_scaled_.variable_bounds),
+                batch_wrapped_container(initial_scaling_strategy_.get_bound_rescaling_vector(),
+                                        primal_size_h_)),
+              scale_bounds_by_scalar_op<f_t>{})),
           pdhg_solver_.get_primal_solution().data(),
-          thrust::make_transform_iterator(
-            thrust::make_zip_iterator(
-              problem_wrap_container(op_problem_scaled_.variable_bounds),
-              batch_wrapped_container(initial_scaling_strategy_.get_bound_rescaling_vector(),
-                                      primal_size_h_)),
-            scale_bounds_by_scalar_op<f_t>{})),
-        pdhg_solver_.get_primal_solution().data(),
-        pdhg_solver_.get_primal_solution().size(),
-        clamp<f_t, f_t2>(),
-        stream_view_.value());
-    } else {
-      cub::DeviceTransform::Transform(
-        cuda::std::make_tuple(pdhg_solver_.get_primal_solution().data(),
-                              problem_wrap_container(op_problem_scaled_.variable_bounds)),
-        pdhg_solver_.get_primal_solution().data(),
-        pdhg_solver_.get_primal_solution().size(),
-        clamp<f_t, f_t2>(),
-        stream_view_.value());
-    }
+          pdhg_solver_.get_primal_solution().size(),
+          clamp<f_t, f_t2>(),
+          stream_view_.value());
+      } else {
+        cub::DeviceTransform::Transform(
+          cuda::std::make_tuple(pdhg_solver_.get_primal_solution().data(),
+                                problem_wrap_container(op_problem_scaled_.variable_bounds)),
+          pdhg_solver_.get_primal_solution().data(),
+          pdhg_solver_.get_primal_solution().size(),
+          clamp<f_t, f_t2>(),
+          stream_view_.value());
+      }
 
-    pdhg_solver_.refine_initial_primal_projection(
-      initial_scaling_strategy_.get_bound_rescaling_vector());
+      pdhg_solver_.refine_initial_primal_projection(
+        initial_scaling_strategy_.get_bound_rescaling_vector());
 
-    if (!settings_.hyper_params.never_restart_to_average) {
-      cuopt_expects(!batch_mode_,
-                    cuopt::error_type_t::ValidationError,
-                    "Restart to average not supported in batch mode");
-      cub::DeviceTransform::Transform(
-        cuda::std::make_tuple(unscaled_primal_avg_solution_.data(),
-                              op_problem_scaled_.variable_bounds.data()),
-        unscaled_primal_avg_solution_.data(),
-        primal_size_h_,
-        clamp<f_t, f_t2>(),
-        stream_view_.value());
+      if (!settings_.hyper_params.never_restart_to_average) {
+        cuopt_expects(!batch_mode_,
+                      cuopt::error_type_t::ValidationError,
+                      "Restart to average not supported in batch mode");
+        cub::DeviceTransform::Transform(
+          cuda::std::make_tuple(unscaled_primal_avg_solution_.data(),
+                                op_problem_scaled_.variable_bounds.data()),
+          unscaled_primal_avg_solution_.data(),
+          primal_size_h_,
+          clamp<f_t, f_t2>(),
+          stream_view_.value());
+      }
     }
-  }
 
 #ifdef CUPDLP_DEBUG_MODE
-  std::cout << "Solution after projection" << std::endl;
-  print("pdhg_solver_.get_primal_solution()", pdhg_solver_.get_primal_solution());
-  print("pdhg_solver_.get_dual_solution()", pdhg_solver_.get_dual_solution());
-  print("pdhg_solver_.get_potential_next_primal_solution()",
-        pdhg_solver_.get_potential_next_primal_solution());
-  print("pdhg_solver_.get_potential_next_dual_solution()",
-        pdhg_solver_.get_potential_next_dual_solution());
-  print("restart_strategy_.last_restart_duality_gap_.primal_solution_",
-        restart_strategy_.last_restart_duality_gap_.primal_solution_);
-  print("restart_strategy_.last_restart_duality_gap_.dual_solution_",
-        restart_strategy_.last_restart_duality_gap_.dual_solution_);
+    std::cout << "Solution after projection" << std::endl;
+    print("pdhg_solver_.get_primal_solution()", pdhg_solver_.get_primal_solution());
+    print("pdhg_solver_.get_dual_solution()", pdhg_solver_.get_dual_solution());
+    print("pdhg_solver_.get_potential_next_primal_solution()",
+          pdhg_solver_.get_potential_next_primal_solution());
+    print("pdhg_solver_.get_potential_next_dual_solution()",
+          pdhg_solver_.get_potential_next_dual_solution());
+    print("restart_strategy_.last_restart_duality_gap_.primal_solution_",
+          restart_strategy_.last_restart_duality_gap_.primal_solution_);
+    print("restart_strategy_.last_restart_duality_gap_.dual_solution_",
+          restart_strategy_.last_restart_duality_gap_.dual_solution_);
 #endif
 
-  // Need to to tranpose primal solution to row format as there might be initial values or clamping
-  // Value may not be all 0
-  if (batch_mode_) {
-    rmm::device_uvector<f_t> dummy(0, stream_view_);
-    transpose_primal_dual_to_row(
-      pdhg_solver_.get_primal_solution(), pdhg_solver_.get_dual_solution(), dummy);
-    if (settings_.hyper_params.use_reflected_primal_dual) {
-      transpose_primal_dual_to_row(pdhg_solver_.get_potential_next_primal_solution(),
-                                   pdhg_solver_.get_potential_next_dual_solution(),
-                                   dummy);
-      transpose_primal_dual_to_row(restart_strategy_.last_restart_duality_gap_.primal_solution_,
-                                   restart_strategy_.last_restart_duality_gap_.dual_solution_,
-                                   dummy);
+    // Need to to tranpose primal solution to row format as there might be initial values or
+    // clamping Value may not be all 0
+    if (batch_mode_) {
+      rmm::device_uvector<f_t> dummy(0, stream_view_);
+      transpose_primal_dual_to_row(
+        pdhg_solver_.get_primal_solution(), pdhg_solver_.get_dual_solution(), dummy);
+      if (settings_.hyper_params.use_reflected_primal_dual) {
+        transpose_primal_dual_to_row(pdhg_solver_.get_potential_next_primal_solution(),
+                                     pdhg_solver_.get_potential_next_dual_solution(),
+                                     dummy);
+        transpose_primal_dual_to_row(restart_strategy_.last_restart_duality_gap_.primal_solution_,
+                                     restart_strategy_.last_restart_duality_gap_.dual_solution_,
+                                     dummy);
+      }
+      transpose_problem_fields(/*to_row=*/true);
+    }
+
+    if (verbose) {
+      std::cout << "primal_size_h_ " << primal_size_h_ << " dual_size_h_ " << dual_size_h_
+                << " nnz " << problem_ptr->nnz << std::endl;
+      std::cout << "Problem before scaling" << std::endl;
+      print_problem_info<f_t>(problem_ptr->coefficients,
+                              problem_ptr->objective_coefficients,
+                              problem_ptr->combined_bounds);
+      std::cout << "Problem after scaling" << std::endl;
+      print_problem_info<f_t>(op_problem_scaled_.coefficients,
+                              op_problem_scaled_.objective_coefficients,
+                              op_problem_scaled_.combined_bounds);
+      raft::print_device_vector("Initial step_size", step_size_.data(), 1, std::cout);
+      raft::print_device_vector("Initial primal_weight", primal_weight_.data(), 1, std::cout);
+      raft::print_device_vector("Initial primal_step_size", primal_step_size_.data(), 1, std::cout);
+      raft::print_device_vector("Initial dual_step_size", dual_step_size_.data(), 1, std::cout);
     }
-    transpose_problem_fields(/*to_row=*/true);
-  }
-
-  if (verbose) {
-    std::cout << "primal_size_h_ " << primal_size_h_ << " dual_size_h_ " << dual_size_h_ << " nnz "
-              << problem_ptr->nnz << std::endl;
-    std::cout << "Problem before scaling" << std::endl;
-    print_problem_info<f_t>(
-      problem_ptr->coefficients, problem_ptr->objective_coefficients, problem_ptr->combined_bounds);
-    std::cout << "Problem after scaling" << std::endl;
-    print_problem_info<f_t>(op_problem_scaled_.coefficients,
-                            op_problem_scaled_.objective_coefficients,
-                            op_problem_scaled_.combined_bounds);
-    raft::print_device_vector("Initial step_size", step_size_.data(), 1, std::cout);
-    raft::print_device_vector("Initial primal_weight", primal_weight_.data(), 1, std::cout);
-    raft::print_device_vector("Initial primal_step_size", primal_step_size_.data(), 1, std::cout);
-    raft::print_device_vector("Initial dual_step_size", dual_step_size_.data(), 1, std::cout);
-  }
 #ifdef CUPDLP_DEBUG_MODE
-  raft::print_device_vector("Initial step_size", step_size_.data(), step_size_.size(), std::cout);
-  raft::print_device_vector(
-    "Initial primal_weight", primal_weight_.data(), primal_weight_.size(), std::cout);
+    raft::print_device_vector("Initial step_size", step_size_.data(), step_size_.size(), std::cout);
+    raft::print_device_vector(
+      "Initial primal_weight", primal_weight_.data(), primal_weight_.size(), std::cout);
 #endif
 
-  bool warm_start_was_given = settings_.get_pdlp_warm_start_data().is_populated();
-
-  if (!inside_mip_) {
-    CUOPT_LOG_INFO(
-      "   Iter    Primal Obj.      Dual Obj.    Gap        Primal Res.  Dual Res.   Time");
+    if (!inside_mip_) {
+      CUOPT_LOG_INFO(
+        "   Iter    Primal Obj.      Dual Obj.    Gap        Primal Res.  Dual Res.   Time");
+    }
   }
   while (true) {
 #ifdef CUPDLP_DEBUG_MODE
@@ -2593,6 +3068,7 @@ optimization_problem_solution_t<i_t, f_t> pdlp_solver_t<i_t, f_t>::run_solver(co
         // 1. At the very beginning of the solver, when no steps have been taken yet
         // 2. After a single step, since average of one step is the same step
         if (internal_solver_iterations_ <= 1) {
+          if (multi_gpu_engine) { assert(false && "Not implemented"); }
           raft::copy(unscaled_primal_avg_solution_.data(),
                      pdhg_solver_.get_primal_solution().data(),
                      primal_size_h_,
@@ -2646,10 +3122,21 @@ optimization_problem_solution_t<i_t, f_t> pdlp_solver_t<i_t, f_t>::run_solver(co
         initial_scaling_strategy_.unscale_solutions(pdhg_solver_.get_primal_solution(),
                                                     pdhg_solver_.get_dual_solution());
       } else {
-        initial_scaling_strategy_.unscale_solutions(
-          pdhg_solver_.get_potential_next_primal_solution(),
-          pdhg_solver_.get_potential_next_dual_solution(),
-          pdhg_solver_.get_dual_slack());
+        if (multi_gpu_engine) {
+          // The only branch in cuPDLPx (Stable3)
+          multi_gpu_engine->for_each_shard([&](auto& shard) {
+            auto& sub = *shard.sub_pdlp;
+            sub.get_initial_scaling_strategy().unscale_solutions(
+              sub.pdhg_solver_.get_potential_next_primal_solution(),
+              sub.pdhg_solver_.get_potential_next_dual_solution(),
+              sub.pdhg_solver_.get_dual_slack());
+          });
+        } else {
+          initial_scaling_strategy_.unscale_solutions(
+            pdhg_solver_.get_potential_next_primal_solution(),
+            pdhg_solver_.get_potential_next_dual_solution(),
+            pdhg_solver_.get_dual_slack());
+        }
       }
 
 #ifdef CUPDLP_DEBUG_MODE
@@ -2681,10 +3168,21 @@ optimization_problem_solution_t<i_t, f_t> pdlp_solver_t<i_t, f_t>::run_solver(co
           initial_scaling_strategy_.scale_solutions(pdhg_solver_.get_primal_solution(),
                                                     pdhg_solver_.get_dual_solution());
         } else {
-          initial_scaling_strategy_.scale_solutions(
-            pdhg_solver_.get_potential_next_primal_solution(),
-            pdhg_solver_.get_potential_next_dual_solution(),
-            pdhg_solver_.get_dual_slack());
+          if (multi_gpu_engine) {
+            // The only branch in cuPDLPx (Stable3)
+            multi_gpu_engine->for_each_shard([&](auto& shard) {
+              auto& sub = *shard.sub_pdlp;
+              sub.get_initial_scaling_strategy().scale_solutions(
+                sub.pdhg_solver_.get_potential_next_primal_solution(),
+                sub.pdhg_solver_.get_potential_next_dual_solution(),
+                sub.pdhg_solver_.get_dual_slack());
+            });
+          } else {
+            initial_scaling_strategy_.scale_solutions(
+              pdhg_solver_.get_potential_next_primal_solution(),
+              pdhg_solver_.get_potential_next_dual_solution(),
+              pdhg_solver_.get_dual_slack());
+          }
         }
       }
 
@@ -2782,13 +3280,23 @@ optimization_problem_solution_t<i_t, f_t> pdlp_solver_t<i_t, f_t>::run_solver(co
           transpose_problem_fields(/*to_row=*/true);
         }
       }
-      halpern_update();
+      if (multi_gpu_engine) {
+        multi_gpu_engine->for_each_shard([&](auto& shard) { shard.sub_pdlp->halpern_update(); });
+      } else {
+        halpern_update();
+      }
     }
 
     ++total_pdlp_iterations_;
     ++internal_solver_iterations_;
-    if (settings_.hyper_params.never_restart_to_average)
+    if (settings_.hyper_params.never_restart_to_average) {
       restart_strategy_.increment_iteration_since_last_restart();
+      if (multi_gpu_engine) {
+        multi_gpu_engine->for_each_shard([&](auto& shard) {
+          shard.sub_pdlp->restart_strategy_.increment_iteration_since_last_restart();
+        });
+      }
+    }
   }
   return optimization_problem_solution_t<i_t, f_t>{pdlp_termination_status_t::NumericalError,
                                                    stream_view_};
diff --git a/cpp/src/pdlp/pdlp.cuh b/cpp/src/pdlp/pdlp.cuh
index 9447eaeaf3..3544de89fa 100644
--- a/cpp/src/pdlp/pdlp.cuh
+++ b/cpp/src/pdlp/pdlp.cuh
@@ -8,10 +8,12 @@
 #pragma once
 
 #include <branch_and_bound/shared_strong_branching_context.hpp>
+#include <cuopt/linear_programming/io/mps_data_model.hpp>
 #include <cuopt/linear_programming/pdlp/solver_settings.hpp>
 #include <cuopt/linear_programming/pdlp/solver_solution.hpp>
 
 #include <pdlp/cusparse_view.hpp>
+#include <pdlp/distributed_pdlp/multi_gpu_engine.hpp>
 #include <pdlp/initial_scaling_strategy/initial_scaling.cuh>
 #include <pdlp/pdhg.hpp>
 #include <pdlp/pdlp_climber_strategy.hpp>
@@ -62,6 +64,11 @@ class pdlp_solver_t {
                 pdlp_solver_settings_t<i_t, f_t> const& settings,
                 bool is_batch_mode = false);
 
+  // Distributed Solver Constructor
+  pdlp_solver_t(problem_t<i_t, f_t>& placeholder_problem,
+                cuopt::linear_programming::io::mps_data_model_t<i_t, f_t> const& mps,
+                pdlp_solver_settings_t<i_t, f_t> const& settings);
+
   optimization_problem_solution_t<i_t, f_t> run_solver(const timer_t& timer);
 
   f_t get_primal_weight_h(i_t id) const;
@@ -98,6 +105,25 @@ class pdlp_solver_t {
   void compute_initial_step_size();
   void compute_initial_primal_weight();
 
+  // Needed by multi-GPU to mutate them
+  problem_t<i_t, f_t>& get_op_problem_scaled() { return op_problem_scaled_; }
+  detail::pdlp_initial_scaling_strategy_t<i_t, f_t>& get_initial_scaling_strategy()
+  {
+    return initial_scaling_strategy_;
+  }
+  detail::pdlp_restart_strategy_t<i_t, f_t>& get_restart_strategy() { return restart_strategy_; }
+
+  // Per-shard primal/dual step sizes are private state on pdlp_solver_t but
+  // are needed inside the multi-GPU dispatch paths that fan out a master cub
+  // call across all shards' pdhg_solver_t::*_transform methods.
+  rmm::device_uvector<f_t>& get_primal_step_size() { return primal_step_size_; }
+  rmm::device_uvector<f_t>& get_dual_step_size() { return dual_step_size_; }
+  // Multi-GPU restart broadcast needs to mirror master's primal_weight /
+  // best_primal_weight onto every shard after each cuPDLPx restart so that
+  // downstream shard-side restart machinery stays in sync with master.
+  rmm::device_uvector<f_t>& get_primal_weight() { return primal_weight_; }
+  rmm::device_uvector<f_t>& get_best_primal_weight() { return best_primal_weight_; }
+
  private:
   void print_termination_criteria(const timer_t& timer, bool is_average = false);
   void print_final_termination_criteria(
@@ -247,6 +273,11 @@ class pdlp_solver_t {
   primal_quality_adapter_t best_primal_quality_so_far_;
   // Flag to indicate if solver is being called from MIP. No logging is done in this case.
   bool inside_mip_{false};
+
+  // std::optional because multi_gpu_engine_t is non-default-constructible
+  // (collectively bootstraps NCCL, owns RMM resources). Stays nullopt in
+  // single-GPU mode; emplaced by the multi-GPU ctor.
+  std::optional<multi_gpu_engine_t<i_t, f_t>> multi_gpu_engine;
 };
 
 }  // namespace cuopt::linear_programming::detail
diff --git a/cpp/src/pdlp/restart_strategy/pdlp_restart_strategy.cu b/cpp/src/pdlp/restart_strategy/pdlp_restart_strategy.cu
index 17c7abcac5..ee1d19b96b 100644
--- a/cpp/src/pdlp/restart_strategy/pdlp_restart_strategy.cu
+++ b/cpp/src/pdlp/restart_strategy/pdlp_restart_strategy.cu
@@ -8,6 +8,8 @@
 #include <cuopt/error.hpp>
 
 #include <cuopt/linear_programming/pdlp/pdlp_hyper_params.cuh>
+#include <pdlp/distributed_pdlp/multi_gpu_engine.hpp>
+#include <pdlp/pdlp.cuh>
 #include <pdlp/pdlp_constants.hpp>
 #include <pdlp/restart_strategy/pdlp_restart_strategy.cuh>
 #include <pdlp/swap_and_resize_helper.cuh>
@@ -892,20 +894,64 @@ void pdlp_restart_strategy_t<i_t, f_t>::cupdlpx_restart(
     "If any, all should be true");
 
   // Computing the deltas
-  distance_squared_moved_from_last_restart_period(
-    pdhg_solver.get_potential_next_primal_solution(),
-    last_restart_duality_gap_.primal_solution_,
-    pdhg_solver.get_primal_tmp_resource(),
-    primal_size_h_,
-    1,
-    last_restart_duality_gap_.primal_distance_traveled_);
-  distance_squared_moved_from_last_restart_period(
-    pdhg_solver.get_potential_next_dual_solution(),
-    last_restart_duality_gap_.dual_solution_,
-    pdhg_solver.get_dual_tmp_resource(),
-    dual_size_h_,
-    1,
-    last_restart_duality_gap_.dual_distance_traveled_);
+  if (auto* engine = pdhg_solver.get_mgpu_engine()) {
+    engine->for_each_shard([&](auto& shard) {
+      auto& sub      = *shard.sub_pdlp;
+      auto& sub_rest = sub.get_restart_strategy();
+      sub_rest.distance_squared_moved_from_last_restart_period(
+        sub.pdhg_solver_.get_potential_next_primal_solution(),
+        sub_rest.last_restart_duality_gap_.primal_solution_,
+        sub.pdhg_solver_.get_primal_tmp_resource(),
+        shard.rank_data.owned_var_size,
+        1,
+        sub_rest.last_restart_duality_gap_.primal_distance_traveled_);
+      sub_rest.distance_squared_moved_from_last_restart_period(
+        sub.pdhg_solver_.get_potential_next_dual_solution(),
+        sub_rest.last_restart_duality_gap_.dual_solution_,
+        sub.pdhg_solver_.get_dual_tmp_resource(),
+        shard.rank_data.owned_cstr_size,
+        1,
+        sub_rest.last_restart_duality_gap_.dual_distance_traveled_);
+    });
+
+    engine->allreduce_sum_inplace([](pdlp_solver_t<i_t, f_t>& sp) -> f_t* {
+      return sp.get_restart_strategy().last_restart_duality_gap_.primal_distance_traveled_.data();
+    });
+    engine->allreduce_sum_inplace([](pdlp_solver_t<i_t, f_t>& sp) -> f_t* {
+      return sp.get_restart_strategy().last_restart_duality_gap_.dual_distance_traveled_.data();
+    });
+
+    auto& s0 = *engine->shards[0];
+    {
+      raft::device_setter guard(s0.device_id);
+      RAFT_CUDA_TRY(cudaStreamSynchronize(s0.stream.view().value()));
+    }
+    raft::copy(last_restart_duality_gap_.primal_distance_traveled_.data(),
+               s0.sub_pdlp->get_restart_strategy()
+                 .last_restart_duality_gap_.primal_distance_traveled_.data(),
+               1,
+               stream_view_);
+    raft::copy(
+      last_restart_duality_gap_.dual_distance_traveled_.data(),
+      s0.sub_pdlp->get_restart_strategy().last_restart_duality_gap_.dual_distance_traveled_.data(),
+      1,
+      stream_view_);
+  } else {
+    distance_squared_moved_from_last_restart_period(
+      pdhg_solver.get_potential_next_primal_solution(),
+      last_restart_duality_gap_.primal_solution_,
+      pdhg_solver.get_primal_tmp_resource(),
+      primal_size_h_,
+      1,
+      last_restart_duality_gap_.primal_distance_traveled_);
+    distance_squared_moved_from_last_restart_period(
+      pdhg_solver.get_potential_next_dual_solution(),
+      last_restart_duality_gap_.dual_solution_,
+      pdhg_solver.get_dual_tmp_resource(),
+      dual_size_h_,
+      1,
+      last_restart_duality_gap_.dual_distance_traveled_);
+  }
 
   auto view = make_cupdlpx_restart_view(last_restart_duality_gap_.primal_distance_traveled_,
                                         last_restart_duality_gap_.dual_distance_traveled_,
@@ -958,24 +1004,71 @@ void pdlp_restart_strategy_t<i_t, f_t>::cupdlpx_restart(
     best_primal_weight.set_element_async(0, best_primal_weight_value, stream_view_);
   }
 
+  // mGPU: Broadcast all primal-weight / step-size scalars updated by the cuPDLPx
+  // restart on the master to every shard so the restart-state on
+  // each shard stays in sync with master.
+  if (auto* engine = pdhg_solver.get_mgpu_engine()) {
+    RAFT_CUDA_TRY(cudaStreamSynchronize(stream_view_));
+
+    f_t h_primal_step_size{}, h_dual_step_size{};
+    f_t h_primal_weight{}, h_best_primal_weight{};
+
+    raft::copy(&h_primal_step_size, primal_step_size.data(), 1, stream_view_);
+    raft::copy(&h_dual_step_size, dual_step_size.data(), 1, stream_view_);
+    raft::copy(&h_primal_weight, primal_weight.data(), 1, stream_view_);
+    raft::copy(&h_best_primal_weight, best_primal_weight.data(), 1, stream_view_);
+    RAFT_CUDA_TRY(cudaStreamSynchronize(stream_view_));
+
+    engine->for_each_shard([&](auto& shard) {
+      auto& sub = *shard.sub_pdlp;
+      raft::copy(sub.get_primal_step_size().data(), &h_primal_step_size, 1, shard.stream.view());
+      raft::copy(sub.get_dual_step_size().data(), &h_dual_step_size, 1, shard.stream.view());
+      raft::copy(sub.get_primal_weight().data(), &h_primal_weight, 1, shard.stream.view());
+      raft::copy(
+        sub.get_best_primal_weight().data(), &h_best_primal_weight, 1, shard.stream.view());
+    });
+  }
   // TODO later batch mode: remove if you have per climber restart
 
-  raft::copy(last_restart_duality_gap_.primal_solution_.data(),
-             pdhg_solver.get_potential_next_primal_solution().data(),
-             last_restart_duality_gap_.primal_solution_.size(),
-             stream_view_);
-  raft::copy(pdhg_solver.get_primal_solution().data(),
-             pdhg_solver.get_potential_next_primal_solution().data(),
-             last_restart_duality_gap_.primal_solution_.size(),
-             stream_view_);
-  raft::copy(last_restart_duality_gap_.dual_solution_.data(),
-             pdhg_solver.get_potential_next_dual_solution().data(),
-             last_restart_duality_gap_.dual_solution_.size(),
-             stream_view_);
-  raft::copy(pdhg_solver.get_dual_solution().data(),
-             pdhg_solver.get_potential_next_dual_solution().data(),
-             last_restart_duality_gap_.dual_solution_.size(),
-             stream_view_);
+  if (auto* engine = pdhg_solver.get_mgpu_engine()) {
+    engine->for_each_shard([&](auto& shard) {
+      auto& sub      = *shard.sub_pdlp;
+      auto& sub_rest = sub.get_restart_strategy();
+      raft::copy(sub_rest.last_restart_duality_gap_.primal_solution_.data(),
+                 sub.pdhg_solver_.get_potential_next_primal_solution().data(),
+                 sub_rest.last_restart_duality_gap_.primal_solution_.size(),
+                 shard.stream.view());
+      raft::copy(sub.pdhg_solver_.get_primal_solution().data(),
+                 sub.pdhg_solver_.get_potential_next_primal_solution().data(),
+                 sub.pdhg_solver_.get_primal_solution().size(),
+                 shard.stream.view());
+      raft::copy(sub_rest.last_restart_duality_gap_.dual_solution_.data(),
+                 sub.pdhg_solver_.get_potential_next_dual_solution().data(),
+                 sub_rest.last_restart_duality_gap_.dual_solution_.size(),
+                 shard.stream.view());
+      raft::copy(sub.pdhg_solver_.get_dual_solution().data(),
+                 sub.pdhg_solver_.get_potential_next_dual_solution().data(),
+                 sub.pdhg_solver_.get_dual_solution().size(),
+                 shard.stream.view());
+    });
+  } else {
+    raft::copy(last_restart_duality_gap_.primal_solution_.data(),
+               pdhg_solver.get_potential_next_primal_solution().data(),
+               last_restart_duality_gap_.primal_solution_.size(),
+               stream_view_);
+    raft::copy(pdhg_solver.get_primal_solution().data(),
+               pdhg_solver.get_potential_next_primal_solution().data(),
+               last_restart_duality_gap_.primal_solution_.size(),
+               stream_view_);
+    raft::copy(last_restart_duality_gap_.dual_solution_.data(),
+               pdhg_solver.get_potential_next_dual_solution().data(),
+               last_restart_duality_gap_.dual_solution_.size(),
+               stream_view_);
+    raft::copy(pdhg_solver.get_dual_solution().data(),
+               pdhg_solver.get_potential_next_dual_solution().data(),
+               last_restart_duality_gap_.dual_solution_.size(),
+               stream_view_);
+  }
 
 #ifdef CUPDLP_DEBUG_MODE
   print("New last_restart_duality_gap_.primal_solution_",
@@ -990,6 +1083,13 @@ void pdlp_restart_strategy_t<i_t, f_t>::cupdlpx_restart(
     weighted_average_solution_.iterations_since_last_restart_ = 0;
     last_trial_fixed_point_error_[i] = std::numeric_limits<f_t>::infinity();
   }
+
+  if (auto* engine = pdhg_solver.get_mgpu_engine()) {
+    engine->for_each_shard([&](auto& shard) {
+      shard.sub_pdlp->get_restart_strategy()
+        .weighted_average_solution_.iterations_since_last_restart_ = 0;
+    });
+  }
 }
 
 template <typename i_t, typename f_t>
diff --git a/cpp/src/pdlp/saddle_point.cu b/cpp/src/pdlp/saddle_point.cu
index f740176a3c..07a5d0146e 100644
--- a/cpp/src/pdlp/saddle_point.cu
+++ b/cpp/src/pdlp/saddle_point.cu
@@ -38,8 +38,11 @@ saddle_point_state_t<i_t, f_t>::saddle_point_state_t(
     current_AtY_{batch_size * primal_size, handle_ptr->get_stream()},
     next_AtY_{batch_size * primal_size, handle_ptr->get_stream()}
 {
-  EXE_CUOPT_EXPECTS(primal_size > 0, "Size of the primal problem must be larger than 0");
-  EXE_CUOPT_EXPECTS(dual_size > 0, "Size of the dual problem must be larger than 0");
+  // >= 0 (not > 0): distributed PDLP builds the master pdlp_solver_t from a
+  // shape-0 placeholder problem so the master never materializes per-variable
+  // / per-constraint vectors; size-0 device_uvectors are valid throughout.
+  EXE_CUOPT_EXPECTS(primal_size >= 0, "Size of the primal problem must be non-negative");
+  EXE_CUOPT_EXPECTS(dual_size >= 0, "Size of the dual problem must be non-negative");
 
   // Starting from all 0
   thrust::fill(
diff --git a/cpp/src/pdlp/solve.cu b/cpp/src/pdlp/solve.cu
index 43d507b2f9..595c06b20a 100644
--- a/cpp/src/pdlp/solve.cu
+++ b/cpp/src/pdlp/solve.cu
@@ -756,6 +756,9 @@ static optimization_problem_solution_t<i_t, f_t> run_pdlp_solver(
   const timer_t& timer,
   bool is_batch_mode)
 {
+  detail::pdlp_graph_disabled_flag().store(settings.hyper_params.pdlp_disable_graph,
+                                           std::memory_order_relaxed);
+
   if (problem.n_constraints == 0) {
     CUOPT_LOG_CONDITIONAL_INFO(
       !settings.inside_mip,
@@ -770,6 +773,15 @@ static optimization_problem_solution_t<i_t, f_t> run_pdlp_solver(
     }
   }
 #endif
+  // Distributed PDLP cannot enter through this path: by the time we have a
+  // problem_t, the full problem already lives on the master GPU, which defeats
+  // the purpose of distributed mode. Callers must route to
+  // solve_lp_distributed_from_mps via solve_lp(mps_data_model, ...).
+  cuopt_expects(!settings.hyper_params.use_distributed_pdlp,
+                error_type_t::ValidationError,
+                "Distributed PDLP must be entered via solve_lp(mps_data_model, ...) "
+                "so the master GPU never materializes the full problem. Call sites "
+                "with a problem_t cannot dispatch to distributed mode.");
   detail::pdlp_solver_t<i_t, f_t> solver(problem, settings, is_batch_mode);
   if (settings.inside_mip) { solver.set_inside_mip(true); }
   return solver.run_solver(timer);
@@ -2114,8 +2126,110 @@ optimization_problem_solution_t<i_t, f_t> solve_lp(
   bool problem_checking,
   bool use_pdlp_solver_mode)
 {
-  auto op_problem = mps_data_model_to_optimization_problem(handle_ptr, mps_data_model);
-  return solve_lp(op_problem, settings, problem_checking, use_pdlp_solver_mode);
+  cuopt_expects(
+    settings.hyper_params.use_distributed_pdlp,
+    error_type_t::ValidationError,
+    "solve_lp from mps_data_model: settings.hyper_params.use_distributed_pdlp must be true");
+  return solve_lp_distributed_from_mps(
+    handle_ptr, mps_data_model, settings, problem_checking, use_pdlp_solver_mode);
+}
+
+template <typename i_t, typename f_t>
+optimization_problem_solution_t<i_t, f_t> solve_lp_distributed_from_mps(
+  raft::handle_t const* handle_ptr,
+  const cuopt::linear_programming::io::mps_data_model_t<i_t, f_t>& mps_data_model,
+  pdlp_solver_settings_t<i_t, f_t> const& settings,
+  bool problem_checking,
+  bool use_pdlp_solver_mode)
+{
+  cuopt_expects(handle_ptr != nullptr,
+                error_type_t::ValidationError,
+                "solve_lp_distributed_from_mps: handle_ptr must not be null");
+  cuopt_expects(settings.hyper_params.use_distributed_pdlp,
+                error_type_t::ValidationError,
+                "solve_lp_distributed_from_mps: settings.hyper_params.use_distributed_pdlp "
+                "must be true");
+  cuopt_expects(settings.presolver == cuopt::linear_programming::presolver_t::None,
+                error_type_t::ValidationError,
+                "solve_lp_distributed_from_mps: presolve is not yet supported with "
+                "use_distributed_pdlp; please set settings.presolver = presolver_t::None");
+
+  pdlp_solver_settings_t<i_t, f_t> settings_resolved = settings;
+
+  detail::pdlp_graph_disabled_flag().store(settings_resolved.hyper_params.pdlp_disable_graph,
+                                           std::memory_order_relaxed);
+
+  if (settings_resolved.distributed_pdlp_num_gpus == -1) {
+    settings_resolved.distributed_pdlp_num_gpus = raft::device_setter::get_device_count();
+    CUOPT_LOG_INFO(
+      "solve_lp_distributed_from_mps: distributed_pdlp_num_gpus == -1, auto-detected "
+      "%d visible CUDA device(s)",
+      settings_resolved.distributed_pdlp_num_gpus);
+  }
+  // PDLP precision validations (mirror the checks in run_pdlp; distributed
+  // path only supports the default-precision, non-batch double config).
+  cuopt_expects(settings_resolved.pdlp_precision == pdlp_precision_t::DefaultPrecision,
+                error_type_t::ValidationError,
+                "Distributed PDLP only supports DefaultPrecision (double).");
+  cuopt_expects(!settings_resolved.inside_mip,
+                error_type_t::ValidationError,
+                "Distributed PDLP is not yet supported from inside MIP.");
+
+  init_logger_t log(settings_resolved.log_file, settings_resolved.log_to_console);
+  print_version_info();
+  init_handler(handle_ptr);
+
+  const i_t n_vars = static_cast<i_t>(mps_data_model.get_objective_coefficients().size());
+  const i_t n_cstr = static_cast<i_t>(mps_data_model.get_constraint_lower_bounds().size());
+  const i_t nnz    = static_cast<i_t>(mps_data_model.get_constraint_matrix_values().size());
+  CUOPT_LOG_INFO(
+    "Solving a problem with %d constraints, %d variables (%d integers), and %d "
+    "nonzeros (distributed mps-direct path)",
+    n_cstr,
+    n_vars,
+    0,
+    nnz);
+
+  auto lp_timer = cuopt::timer_t(settings_resolved.time_limit);
+
+  // Shape-0 placeholder: needed to build an empty pdlp_solver
+  cuopt::linear_programming::optimization_problem_t<i_t, f_t> placeholder_op(handle_ptr);
+  {
+    std::vector<i_t> empty_offsets = {0};
+    placeholder_op.set_csr_constraint_matrix(
+      nullptr, 0, nullptr, 0, empty_offsets.data(), static_cast<i_t>(empty_offsets.size()));
+  }
+  detail::problem_t<i_t, f_t> placeholder_problem(placeholder_op);
+
+  detail::pdlp_solver_t<i_t, f_t> solver(placeholder_problem, mps_data_model, settings_resolved);
+
+  auto sol = solver.run_solver(lp_timer);
+
+  // Maximization post-processing (matches run_pdlp at solve.cu:835-839):
+  // PDLP internally solves the negated objective, so flip dual / reduced
+  // cost signs on the gathered solution before returning.
+  if (mps_data_model.get_sense()) {
+    adjust_dual_solution_and_reduced_cost(
+      sol.get_dual_solution(), sol.get_reduced_cost(), handle_ptr->get_stream());
+    handle_ptr->sync_stream();
+  }
+
+  sol.set_solve_time(lp_timer.elapsed_time());
+  CUOPT_LOG_INFO("PDLP finished");
+  if (sol.get_termination_status() != pdlp_termination_status_t::ConcurrentLimit) {
+    CUOPT_LOG_INFO("Status: %s   Objective: %.8e  Iterations: %d  Time: %.3fs",
+                   sol.get_termination_status_string().c_str(),
+                   sol.get_objective_value(),
+                   sol.get_additional_termination_information().number_of_steps_taken,
+                   sol.get_solve_time());
+  }
+
+  if (settings_resolved.sol_file != "") {
+    CUOPT_LOG_INFO("Writing solution to file %s", settings_resolved.sol_file.c_str());
+    sol.write_to_sol_file(settings_resolved.sol_file, handle_ptr->get_stream());
+  }
+
+  return sol;
 }
 
 // ============================================================================
@@ -2258,6 +2372,14 @@ std::unique_ptr<lp_solution_interface_t<i_t, f_t>> solve_lp(
   template optimization_problem_t<int, F_TYPE> mps_data_model_to_optimization_problem(           \
     raft::handle_t const* handle_ptr,                                                            \
     const cuopt::linear_programming::io::mps_data_model_t<int, F_TYPE>& data_model);             \
+                                                                                                 \
+  template optimization_problem_solution_t<int, F_TYPE> solve_lp_distributed_from_mps(           \
+    raft::handle_t const* handle_ptr,                                                            \
+    const cuopt::linear_programming::io::mps_data_model_t<int, F_TYPE>& mps_data_model,          \
+    pdlp_solver_settings_t<int, F_TYPE> const& settings,                                         \
+    bool problem_checking,                                                                       \
+    bool use_pdlp_solver_mode);                                                                  \
+                                                                                                 \
   template void set_pdlp_solver_mode(pdlp_solver_settings_t<int, F_TYPE>& settings);
 
 #if MIP_INSTANTIATE_FLOAT
diff --git a/cpp/src/pdlp/solve.cuh b/cpp/src/pdlp/solve.cuh
index 90e5e4fe95..160f4602ba 100644
--- a/cpp/src/pdlp/solve.cuh
+++ b/cpp/src/pdlp/solve.cuh
@@ -32,6 +32,45 @@ cuopt::linear_programming::optimization_problem_solution_t<i_t, f_t> solve_lp_wi
   const timer_t& timer,
   bool is_batch_mode = false);
 
+/**
+ * @brief Distributed-PDLP entry point that consumes the host MPS data model
+ *        directly, without ever materializing the full problem on a single
+ *        (master) GPU.
+ *
+ * This is the entry point intended for problems whose `nnz` exceeds the memory
+ * of a single device. Today (Step 1 of the mGPU memory refactor) it is a thin
+ * routing shim: it resolves `distributed_pdlp_num_gpus == -1` against the
+ * visible-device count and delegates to the legacy
+ * `mps_data_model_to_optimization_problem(...)` + device-side `solve_lp(...)`
+ * pipeline, which still allocates the full problem on master. The shim exists
+ * so the public-facing call site is already in place; subsequent commits will
+ * replace the body with:
+ *   1. host-side METIS partitioning straight off the MPS CSR
+ *   2. per-shard host CSR slicing
+ *   3. construction of an mGPU-native pdlp_solver_t whose master only holds
+ *      scalar metadata + gather buffers (no full A / A^T / scaled copies).
+ *
+ * Until then, behaviour and memory footprint are identical to the legacy path.
+ *
+ * @param handle_ptr  Master raft handle (its stream owns the gather buffers
+ *                    and any master-side aggregator allocations).
+ * @param mps_data_model  Host-resident MPS data (CPU vectors only).
+ * @param settings    User-supplied PDLP solver settings; the
+ *                    `distributed_pdlp_num_gpus == -1` sentinel is resolved
+ *                    here against the visible-device count.
+ * @param problem_checking      Forwarded to the eventual solver.
+ * @param use_pdlp_solver_mode  Forwarded to the eventual solver.
+ *
+ * @pre `settings.hyper_params.use_distributed_pdlp == true`.
+ */
+template <typename i_t, typename f_t>
+cuopt::linear_programming::optimization_problem_solution_t<i_t, f_t> solve_lp_distributed_from_mps(
+  raft::handle_t const* handle_ptr,
+  const cuopt::linear_programming::io::mps_data_model_t<i_t, f_t>& mps_data_model,
+  pdlp_solver_settings_t<i_t, f_t> const& settings,
+  bool problem_checking,
+  bool use_pdlp_solver_mode);
+
 /**
  * @brief Entry point for batch PDLP. Solves multiple LPs sharing the same constraint
  *        matrix structure in a single batched GPU run.
diff --git a/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.cu b/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.cu
index 1f137dc9ea..aac777a44e 100644
--- a/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.cu
+++ b/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.cu
@@ -309,6 +309,24 @@ adaptive_step_size_strategy_t<i_t, f_t>::get_norm_squared_delta_dual() const
   return norm_squared_delta_dual_;
 }
 
+template <typename i_t, typename f_t>
+rmm::device_uvector<f_t>& adaptive_step_size_strategy_t<i_t, f_t>::get_interaction()
+{
+  return interaction_;
+}
+
+template <typename i_t, typename f_t>
+rmm::device_uvector<f_t>& adaptive_step_size_strategy_t<i_t, f_t>::get_norm_squared_delta_primal()
+{
+  return norm_squared_delta_primal_;
+}
+
+template <typename i_t, typename f_t>
+rmm::device_uvector<f_t>& adaptive_step_size_strategy_t<i_t, f_t>::get_norm_squared_delta_dual()
+{
+  return norm_squared_delta_dual_;
+}
+
 template <typename i_t, typename f_t>
 void adaptive_step_size_strategy_t<i_t, f_t>::set_valid_step_size(i_t valid)
 {
@@ -346,8 +364,16 @@ template <typename i_t, typename f_t>
 void adaptive_step_size_strategy_t<i_t, f_t>::compute_interaction_and_movement(
   rmm::device_uvector<f_t>& tmp_primal,
   cusparse_view_t<i_t, f_t>& cusparse_view,
-  saddle_point_state_t<i_t, f_t>& current_saddle_point_state)
+  saddle_point_state_t<i_t, f_t>& current_saddle_point_state,
+  i_t owned_primal_size,
+  i_t owned_cstr_size)
 {
+  // mGPU needs to know owned size to restrict the reductions to the owned prefix
+  const i_t reduce_primal_size =
+    (owned_primal_size >= 0) ? owned_primal_size : current_saddle_point_state.get_primal_size();
+  const i_t reduce_dual_size =
+    (owned_cstr_size >= 0) ? owned_cstr_size : current_saddle_point_state.get_dual_size();
+
   // QP would need this:
   // if iszero(problem.objective_matrix)
   //   primal_objective_interaction = 0.0
@@ -426,7 +452,7 @@ void adaptive_step_size_strategy_t<i_t, f_t>::compute_interaction_and_movement(
     // compute interaction (x'-x) . (A(y'-y))
     RAFT_CUBLAS_TRY(
       raft::linalg::detail::cublasdot(handle_ptr_->get_cublas_handle(),
-                                      current_saddle_point_state.get_primal_size(),
+                                      reduce_primal_size,
                                       tmp_primal.data(),
                                       primal_stride,
                                       current_saddle_point_state.get_delta_primal().data(),
@@ -444,7 +470,7 @@ void adaptive_step_size_strategy_t<i_t, f_t>::compute_interaction_and_movement(
     //               norm(delta_dual) ^ 2;
     RAFT_CUBLAS_TRY(
       raft::linalg::detail::cublasdot(handle_ptr_->get_cublas_handle(),
-                                      current_saddle_point_state.get_primal_size(),
+                                      reduce_primal_size,
                                       current_saddle_point_state.get_delta_primal().data(),
                                       primal_stride,
                                       current_saddle_point_state.get_delta_primal().data(),
@@ -454,7 +480,7 @@ void adaptive_step_size_strategy_t<i_t, f_t>::compute_interaction_and_movement(
 
     RAFT_CUBLAS_TRY(
       raft::linalg::detail::cublasdot(handle_ptr_->get_cublas_handle(),
-                                      current_saddle_point_state.get_dual_size(),
+                                      reduce_dual_size,
                                       current_saddle_point_state.get_delta_dual().data(),
                                       dual_stride,
                                       current_saddle_point_state.get_delta_dual().data(),
diff --git a/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.hpp b/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.hpp
index 1e969150e7..238735e8ff 100644
--- a/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.hpp
+++ b/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.hpp
@@ -81,9 +81,20 @@ class adaptive_step_size_strategy_t {
   const rmm::device_uvector<f_t>& get_norm_squared_delta_primal() const;
   const rmm::device_uvector<f_t>& get_norm_squared_delta_dual() const;
 
+  // Mutable overloads — used by the multi-GPU path to NCCL-allreduce the
+  // per-shard scalar contributions in place and to mirror them back to the
+  // master step_size_strategy_.
+  rmm::device_uvector<f_t>& get_interaction();
+  rmm::device_uvector<f_t>& get_norm_squared_delta_primal();
+  rmm::device_uvector<f_t>& get_norm_squared_delta_dual();
+
+  // owned_primal_size / owned_cstr_size are mGPU overrides.
+  // mGPU needs to know owned size to restrict the reductions to the owned prefix
   void compute_interaction_and_movement(rmm::device_uvector<f_t>& tmp_primal,
                                         cusparse_view_t<i_t, f_t>& cusparse_view,
-                                        saddle_point_state_t<i_t, f_t>& current_saddle_point_state);
+                                        saddle_point_state_t<i_t, f_t>& current_saddle_point_state,
+                                        i_t owned_primal_size = -1,
+                                        i_t owned_cstr_size   = -1);
 
   void swap_context(const thrust::universal_host_pinned_vector<swap_pair_t<i_t>>& swap_pairs);
   void resize_context(i_t new_size);
diff --git a/cpp/src/pdlp/termination_strategy/convergence_information.cu b/cpp/src/pdlp/termination_strategy/convergence_information.cu
index a6d6d14d96..1dfc8229da 100644
--- a/cpp/src/pdlp/termination_strategy/convergence_information.cu
+++ b/cpp/src/pdlp/termination_strategy/convergence_information.cu
@@ -5,12 +5,16 @@
  */
 /* clang-format on */
 
+#include <pdlp/distributed_pdlp/multi_gpu_engine.hpp>
+#include <pdlp/pdlp.cuh>
 #include <pdlp/pdlp_climber_strategy.hpp>
 #include <pdlp/pdlp_constants.hpp>
 #include <pdlp/swap_and_resize_helper.cuh>
 #include <pdlp/termination_strategy/convergence_information.hpp>
 #include <pdlp/utils.cuh>
 
+#include <raft/core/device_setter.hpp>
+
 #include <mip_heuristics/mip_constants.hpp>
 
 #include <cuopt/error.hpp>
@@ -208,6 +212,77 @@ void convergence_information_t<i_t, f_t>::init_l2_norms()
   }
 }
 
+template <typename i_t, typename f_t>
+void convergence_information_t<i_t, f_t>::compute_owned_reference_norm_partials(i_t owned_var_size,
+                                                                                i_t owned_cstr_size)
+{
+  cuopt_assert(!batch_mode_, "owned reference-norm partials only used in non-batch mGPU mode");
+  cuopt_assert(owned_var_size <= primal_size_h_, "owned_var_size must be <= primal_size_h_");
+  cuopt_assert(owned_cstr_size <= dual_size_h_, "owned_cstr_size must be <= dual_size_h_");
+
+  // Σ objective[0:owned_var]²
+  RAFT_CUBLAS_TRY(raft::linalg::detail::cublasdot(handle_ptr_->get_cublas_handle(),
+                                                  static_cast<int>(owned_var_size),
+                                                  problem_ptr->objective_coefficients.data(),
+                                                  1,
+                                                  problem_ptr->objective_coefficients.data(),
+                                                  1,
+                                                  l2_norm_primal_linear_objective_.data(),
+                                                  stream_view_));
+
+  // rhs_sum_of_squares(lower[0:owned_cstr], upper[0:owned_cstr])  (no sqrt)
+  {
+    rmm::device_buffer d_temp_storage;
+    size_t bytes   = 0;
+    auto zip_begin = thrust::make_zip_iterator(problem_ptr->constraint_lower_bounds.data(),
+                                               problem_ptr->constraint_upper_bounds.data());
+    cub::DeviceReduce::TransformReduce(nullptr,
+                                       bytes,
+                                       zip_begin,
+                                       l2_norm_primal_right_hand_side_.data(),
+                                       static_cast<int>(owned_cstr_size),
+                                       cuda::std::plus<>{},
+                                       rhs_sum_of_squares_t<f_t>{},
+                                       f_t(0),
+                                       stream_view_);
+    d_temp_storage.resize(bytes, stream_view_);
+    cub::DeviceReduce::TransformReduce(d_temp_storage.data(),
+                                       bytes,
+                                       zip_begin,
+                                       l2_norm_primal_right_hand_side_.data(),
+                                       static_cast<int>(owned_cstr_size),
+                                       cuda::std::plus<>{},
+                                       rhs_sum_of_squares_t<f_t>{},
+                                       f_t(0),
+                                       stream_view_);
+  }
+  RAFT_CUDA_TRY(cudaStreamSynchronize(stream_view_));
+}
+
+template <typename i_t, typename f_t>
+void convergence_information_t<i_t, f_t>::sqrt_reference_norms_inplace()
+{
+  cub::DeviceTransform::Transform(l2_norm_primal_linear_objective_.data(),
+                                  l2_norm_primal_linear_objective_.data(),
+                                  1,
+                                  sqrt_func_t<f_t>{},
+                                  stream_view_);
+  cub::DeviceTransform::Transform(l2_norm_primal_right_hand_side_.data(),
+                                  l2_norm_primal_right_hand_side_.data(),
+                                  1,
+                                  sqrt_func_t<f_t>{},
+                                  stream_view_);
+  // Broadcast slot [0] to all climbers (no-op outside batch mode).
+  thrust::fill(handle_ptr_->get_thrust_policy(),
+               l2_norm_primal_linear_objective_.begin(),
+               l2_norm_primal_linear_objective_.end(),
+               l2_norm_primal_linear_objective_.element(0, stream_view_));
+  thrust::fill(handle_ptr_->get_thrust_policy(),
+               l2_norm_primal_right_hand_side_.begin(),
+               l2_norm_primal_right_hand_side_.end(),
+               l2_norm_primal_right_hand_side_.element(0, stream_view_));
+}
+
 // ---------------------------------------------------------------------------
 // init_reduction_storage: allocate and size the temporary buffers used by
 // cub::DeviceReduce and cub::DeviceSegmentedReduce throughout solving.
@@ -416,17 +491,88 @@ void convergence_information_t<i_t, f_t>::compute_convergence_information(
   print("dual_slack", dual_slack);
 #endif
 
-  compute_primal_residual(
-    op_problem_cusparse_view_, current_pdhg_solver.get_dual_tmp_resource(), dual_iterate);
-  compute_primal_objective(primal_iterate);
+  if (current_pdhg_solver.is_multi_gpu()) {
+    auto* engine = current_pdhg_solver.get_mgpu_engine();
+    cuopt_assert(engine != nullptr,
+                 "mGPU branch reached but current_pdhg_solver has no engine (shard pdhg?)");
+    cuopt_expects(!settings.per_constraint_residual,
+                  error_type_t::ValidationError,
+                  "per_constraint_residual is not yet supported in multi-GPU mode");
+
+    // Prepares halo values in potential_next_primal_solution
+
+    engine->halo_exchange_var([](pdhg_solver_t<i_t, f_t>& pdhg) -> rmm::device_uvector<f_t>& {
+      return pdhg.get_potential_next_primal_solution();
+    });
+
+    for (auto& shard : engine->shards) {
+      raft::device_setter guard(shard->device_id);
+      auto& sub_pdlp = *shard->sub_pdlp;
+      auto& sub_conv = sub_pdlp.get_current_termination_strategy().get_convergence_information();
+      sub_conv.compute_primal_residual(sub_conv.op_problem_cusparse_view_,
+                                       sub_pdlp.pdhg_solver_.get_dual_tmp_resource(),
+                                       sub_pdlp.pdhg_solver_.get_potential_next_dual_solution());
+      sub_conv.compute_primal_objective_owned_partial(
+        sub_pdlp.pdhg_solver_.get_potential_next_primal_solution(),
+        shard->rank_data.owned_var_size);
+    }
+
+    // Reduce all primal objectives across shards
+    cuopt_assert(!batch_mode_, "multi-GPU PDLP is not supported in batch mode");
+    engine->allreduce_sum_inplace([](pdlp_solver_t<i_t, f_t>& sp) -> f_t* {
+      return sp.get_current_termination_strategy()
+        .get_convergence_information()
+        .get_primal_objective()
+        .data();
+    });
+
+    // Get the reduced primal objective from the shard[0] (arbitrary)
+    // Sync shards with master stream to avoid race conditions
+    engine->sync_await_shards(stream_view_);
+    {
+      auto& s0 = *engine->shards[0];
+      raft::device_setter guard(s0.device_id);
+      auto& s0_conv = s0.sub_pdlp->get_current_termination_strategy().get_convergence_information();
+      raft::copy(primal_objective_.data(), s0_conv.get_primal_objective().data(), 1, stream_view_);
+    }
+    apply_primal_objective_scaling_and_offset();
+  } else {
+    compute_primal_residual(
+      op_problem_cusparse_view_, current_pdhg_solver.get_dual_tmp_resource(), dual_iterate);
+    compute_primal_objective(primal_iterate);
+  }
 
 #ifdef CUPDLP_DEBUG_MODE
   print("Primal Residual", primal_residual_);
 #endif
 
-  if (!batch_mode_)
+  // L2 Norm
+  if (current_pdhg_solver.is_multi_gpu()) {
+    auto* engine = current_pdhg_solver.get_mgpu_engine();
+    engine->distributed_l2_norm(
+      [](pdlp_solver_t<i_t, f_t>& sp) -> rmm::device_uvector<f_t>& {
+        return sp.get_current_termination_strategy().get_convergence_information().primal_residual_;
+      },
+      [](pdlp_solver_t<i_t, f_t>& sp) -> f_t* {
+        return sp.get_current_termination_strategy()
+          .get_convergence_information()
+          .l2_primal_residual_.data();
+      },
+      [](pdlp_shard_t<i_t, f_t>& shard) -> i_t { return shard.rank_data.owned_cstr_size; });
+
+    // distributed L2 norm before copying scalar data out of shard 0.
+    engine->sync_await_shards(stream_view_);
+    auto& s0 = *engine->shards[0];
+    raft::device_setter guard(s0.device_id);
+    raft::copy(l2_primal_residual_.data(),
+               s0.sub_pdlp->get_current_termination_strategy()
+                 .get_convergence_information()
+                 .l2_primal_residual_.data(),
+               1,
+               stream_view_);
+  } else if (!batch_mode_) {
     my_l2_norm<i_t, f_t>(primal_residual_, l2_primal_residual_, handle_ptr_);
-  else {
+  } else {
     segmented_sum_handler_.segmented_sum_helper(
       thrust::make_transform_iterator(primal_residual_.data(), power_two_func_t<f_t>{}),
       l2_primal_residual_.data(),
@@ -444,6 +590,7 @@ void convergence_information_t<i_t, f_t>::compute_convergence_information(
   print("Absolute Primal Residual", l2_primal_residual_);
 #endif
   // If per_constraint_residual is false we still need to perform the l2 since it's used in kkt
+  // Not suported in mGPU
   if (settings.per_constraint_residual) {
     // Compute the linf of (residual_i - rel * b_i)
     if (settings.save_best_primal_so_far) {
@@ -466,19 +613,108 @@ void convergence_information_t<i_t, f_t>::compute_convergence_information(
                                                    std::numeric_limits<f_t>::lowest());
   }
 
-  compute_dual_residual(op_problem_cusparse_view_,
-                        current_pdhg_solver.get_primal_tmp_resource(),
-                        primal_iterate,
-                        dual_slack);
-  compute_dual_objective(dual_iterate, primal_iterate, dual_slack);
+  if (current_pdhg_solver.is_multi_gpu()) {
+    auto* engine = current_pdhg_solver.get_mgpu_engine();
+
+    // 1) Halo-exchange potential_next_dual_solution on every shard so the
+    //    A_T_shard @ y SpMV inside compute_dual_residual reads correct values
+    //    in the cstr halo region. The SpMV is driven through the eval view's
+    //    cv.dual_solution descriptor, which (cuPDLPx, see
+    //    cusparse_view.cu:931-937) is bound to _potential_next_dual -- not to
+    //    current.dual_solution. So we must halo-exchange the same buffer.
+    engine->halo_exchange_cstr([](pdhg_solver_t<i_t, f_t>& pdhg) -> rmm::device_uvector<f_t>& {
+      return pdhg.get_potential_next_dual_solution();
+    });
+
+    // 2-3) Per-shard:
+    //      - compute_dual_residual: shard.dual_residual_ has owned-var entries
+    //        correct, halo var entries garbage (their A_T row isn't on this
+    //        shard).
+    //      - compute_dual_objective_owned_partial: writes a *partial*
+    //        dot(slack[0:nv], x[0:nv]) + Σ primal_slack[0:nc] into
+    //        shard.dual_objective_, with NO scaling/offset. Relies on
+    //        primal_slack_ already populated by the per-shard
+    //        compute_primal_residual above.
+    //
+    // Same primal_iterate fix as the primal block above: use the shard's
+    // (fresh, unscaled) potential_next_primal_solution, matching single-GPU
+    // cuPDLPx (pdlp.cu:1190-1203). The previous code's get_primal_solution()
+    // would mix scaled x with unscaled dual_slack in the dual_objective
+    // cublasdot.
+    for (auto& shard : engine->shards) {
+      raft::device_setter guard(shard->device_id);
+      auto& sub_pdlp = *shard->sub_pdlp;
+      auto& sub_conv = sub_pdlp.get_current_termination_strategy().get_convergence_information();
+      sub_conv.compute_dual_residual(sub_conv.op_problem_cusparse_view_,
+                                     sub_pdlp.pdhg_solver_.get_primal_tmp_resource(),
+                                     sub_pdlp.pdhg_solver_.get_potential_next_primal_solution(),
+                                     sub_pdlp.pdhg_solver_.get_dual_slack());
+      sub_conv.compute_dual_objective_owned_partial(
+        sub_pdlp.pdhg_solver_.get_potential_next_primal_solution(),
+        sub_pdlp.pdhg_solver_.get_dual_slack(),
+        shard->rank_data.owned_var_size,
+        shard->rank_data.owned_cstr_size);
+    }
+
+    // 4) Allreduce dual_objective_ across shards (sum, in place). Same
+    //    offset/scaling-after-allreduce reasoning as primal: applying offset
+    //    per-shard would over-count it Nshards times.
+    engine->allreduce_sum_inplace([](pdlp_solver_t<i_t, f_t>& sp) -> f_t* {
+      return sp.get_current_termination_strategy()
+        .get_convergence_information()
+        .get_dual_objective()
+        .data();
+    });
+
+    // Sync shards with master stream to avoid race conditions
+    engine->sync_await_shards(stream_view_);
+    {
+      auto& s0 = *engine->shards[0];
+      raft::device_setter guard(s0.device_id);
+      auto& s0_conv = s0.sub_pdlp->get_current_termination_strategy().get_convergence_information();
+      raft::copy(dual_objective_.data(), s0_conv.get_dual_objective().data(), 1, stream_view_);
+    }
+    apply_dual_objective_scaling_and_offset();
+  } else {
+    compute_dual_residual(op_problem_cusparse_view_,
+                          current_pdhg_solver.get_primal_tmp_resource(),
+                          primal_iterate,
+                          dual_slack);
+    compute_dual_objective(dual_iterate, primal_iterate, dual_slack);
+  }
 
 #ifdef CUPDLP_DEBUG_MODE
   print("Dual Residual", dual_residual_);
 #endif
 
-  if (!batch_mode_)
+  if (current_pdhg_solver.is_multi_gpu()) {
+    // Multi-GPU dual residual L2 norm: same pattern as the primal L2 above,
+    // but the dual residual is var-shaped so we clip to owned_var_size.
+    auto* engine = current_pdhg_solver.get_mgpu_engine();
+    engine->distributed_l2_norm(
+      [](pdlp_solver_t<i_t, f_t>& sp) -> rmm::device_uvector<f_t>& {
+        return sp.get_current_termination_strategy().get_convergence_information().dual_residual_;
+      },
+      [](pdlp_solver_t<i_t, f_t>& sp) -> f_t* {
+        return sp.get_current_termination_strategy()
+          .get_convergence_information()
+          .l2_dual_residual_.data();
+      },
+      [](pdlp_shard_t<i_t, f_t>& shard) -> i_t { return shard.rank_data.owned_var_size; });
+
+    // distributed L2 norm before copying scalar data out of shard 0.
+    engine->sync_await_shards(stream_view_);
+    auto& s0 = *engine->shards[0];
+    raft::device_setter guard(s0.device_id);
+    raft::copy(l2_dual_residual_.data(),
+               s0.sub_pdlp->get_current_termination_strategy()
+                 .get_convergence_information()
+                 .l2_dual_residual_.data(),
+               1,
+               stream_view_);
+  } else if (!batch_mode_) {
     my_l2_norm<i_t, f_t>(dual_residual_, l2_dual_residual_, handle_ptr_);
-  else {
+  } else {
     segmented_sum_handler_.segmented_sum_helper(
       thrust::make_transform_iterator(dual_residual_.data(), power_two_func_t<f_t>{}),
       l2_dual_residual_.data(),
@@ -509,6 +745,8 @@ void convergence_information_t<i_t, f_t>::compute_convergence_information(
                                                    std::numeric_limits<f_t>::lowest());
   }
 
+  // In mGPU, full primal_objective and dual_objective already mirrored to master so no special
+  // behaviour
   const auto [grid_size, block_size] = kernel_config_from_batch_size(climber_strategies_.size());
   compute_remaining_stats_kernel<i_t, f_t>
     <<<grid_size, block_size, 0, stream_view_>>>(this->view(), climber_strategies_.size());
@@ -615,6 +853,24 @@ __global__ void apply_objective_scaling_and_offset(raft::device_span<f_t> object
   objective[idx] = objective_scaling_factor * (objective[idx] + objective_offsets[idx]);
 }
 
+template <typename i_t, typename f_t>
+void convergence_information_t<i_t, f_t>::compute_primal_objective_owned_partial(
+  rmm::device_uvector<f_t>& primal_solution, i_t n_owned)
+{
+  raft::common::nvtx::range fun_scope("compute_primal_objective_owned_partial");
+  cuopt_assert(!batch_mode_, "owned-partial primal objective is only used in non-batch mGPU mode");
+  cuopt_assert(n_owned <= primal_size_h_,
+               "n_owned must be <= primal_size_h_ (owned slice is a prefix)");
+  RAFT_CUBLAS_TRY(raft::linalg::detail::cublasdot(handle_ptr_->get_cublas_handle(),
+                                                  static_cast<int>(n_owned),
+                                                  primal_solution.data(),
+                                                  primal_stride,
+                                                  problem_ptr->objective_coefficients.data(),
+                                                  primal_stride,
+                                                  primal_objective_.data(),
+                                                  stream_view_));
+}
+
 template <typename i_t, typename f_t>
 void convergence_information_t<i_t, f_t>::compute_primal_objective(
   rmm::device_uvector<f_t>& primal_solution)
@@ -643,21 +899,25 @@ void convergence_information_t<i_t, f_t>::compute_primal_objective(
 
   // Apply per-climber objective scaling and offset. objective_offsets_ is always populated
   // (defaults to the scalar problem offset replicated, or user-specified per-climber offsets).
-  {
-    const auto [grid_size, block_size] = kernel_config_from_batch_size(climber_strategies_.size());
-    apply_objective_scaling_and_offset<i_t, f_t><<<grid_size, block_size, 0, stream_view_>>>(
-      make_span(primal_objective_),
-      problem_ptr->presolve_data.objective_scaling_factor,
-      make_span(objective_offsets_),
-      climber_strategies_.size());
-    RAFT_CUDA_TRY(cudaPeekAtLastError());
-  }
+  apply_primal_objective_scaling_and_offset();
 
 #ifdef CUPDLP_DEBUG_MODE
   print("Primal objective", primal_objective_);
 #endif
 }
 
+template <typename i_t, typename f_t>
+void convergence_information_t<i_t, f_t>::apply_primal_objective_scaling_and_offset()
+{
+  const auto [grid_size, block_size] = kernel_config_from_batch_size(climber_strategies_.size());
+  apply_objective_scaling_and_offset<i_t, f_t><<<grid_size, block_size, 0, stream_view_>>>(
+    make_span(primal_objective_),
+    problem_ptr->presolve_data.objective_scaling_factor,
+    make_span(objective_offsets_),
+    climber_strategies_.size());
+  RAFT_CUDA_TRY(cudaPeekAtLastError());
+}
+
 template <typename i_t, typename f_t>
 void convergence_information_t<i_t, f_t>::compute_dual_residual(
   cusparse_view_t<i_t, f_t>& cusparse_view,
@@ -740,6 +1000,50 @@ void convergence_information_t<i_t, f_t>::compute_dual_residual(
   }
 }
 
+template <typename i_t, typename f_t>
+void convergence_information_t<i_t, f_t>::compute_dual_objective_owned_partial(
+  rmm::device_uvector<f_t>& primal_solution,
+  rmm::device_uvector<f_t>& dual_slack,
+  i_t n_owned_var,
+  i_t n_owned_cstr)
+{
+  raft::common::nvtx::range fun_scope("compute_dual_objective_owned_partial");
+  cuopt_assert(!batch_mode_, "owned-partial dual objective is only used in non-batch mGPU mode");
+  cuopt_assert(hyper_params_.use_reflected_primal_dual,
+               "owned-partial dual objective requires use_reflected_primal_dual");
+  cuopt_assert(n_owned_var <= primal_size_h_,
+               "n_owned_var must be <= primal_size_h_ (owned slice is a prefix)");
+  cuopt_assert(n_owned_cstr <= dual_size_h_,
+               "n_owned_cstr must be <= dual_size_h_ (owned slice is a prefix)");
+
+  // dual_dot_ = dot(dual_slack[0:n_owned_var], primal_solution[0:n_owned_var])
+  RAFT_CUBLAS_TRY(raft::linalg::detail::cublasdot(handle_ptr_->get_cublas_handle(),
+                                                  static_cast<int>(n_owned_var),
+                                                  dual_slack.data(),
+                                                  primal_stride,
+                                                  primal_solution.data(),
+                                                  primal_stride,
+                                                  dual_dot_.data(),
+                                                  stream_view_));
+
+  // sum_primal_slack_ = Σ primal_slack_[0:n_owned_cstr]
+  // primal_slack_ is assumed populated for owned cstrs by a prior
+  // compute_primal_residual call on this same shard.
+  cub::DeviceReduce::Sum(rmm_tmp_buffer_.data(),
+                         size_of_buffer_,
+                         primal_slack_.data(),
+                         sum_primal_slack_.data(),
+                         static_cast<int>(n_owned_cstr),
+                         stream_view_);
+
+  // dual_objective_ = dual_dot_ + sum_primal_slack_ (still a partial sum).
+  cub::DeviceTransform::Transform(cuda::std::make_tuple(dual_dot_.data(), sum_primal_slack_.data()),
+                                  dual_objective_.data(),
+                                  1,
+                                  cuda::std::plus<>{},
+                                  stream_view_);
+}
+
 template <typename i_t, typename f_t>
 void convergence_information_t<i_t, f_t>::compute_dual_objective(
   rmm::device_uvector<f_t>& dual_solution,
@@ -821,21 +1125,25 @@ void convergence_information_t<i_t, f_t>::compute_dual_objective(
   }
 
   // Apply per-climber objective scaling and offset.
-  {
-    const auto [grid_size, block_size] = kernel_config_from_batch_size(climber_strategies_.size());
-    apply_objective_scaling_and_offset<i_t, f_t><<<grid_size, block_size, 0, stream_view_>>>(
-      make_span(dual_objective_),
-      problem_ptr->presolve_data.objective_scaling_factor,
-      make_span(objective_offsets_),
-      climber_strategies_.size());
-    RAFT_CUDA_TRY(cudaPeekAtLastError());
-  }
+  apply_dual_objective_scaling_and_offset();
 
 #ifdef CUPDLP_DEBUG_MODE
   print("Dual objective", dual_objective_);
 #endif
 }
 
+template <typename i_t, typename f_t>
+void convergence_information_t<i_t, f_t>::apply_dual_objective_scaling_and_offset()
+{
+  const auto [grid_size, block_size] = kernel_config_from_batch_size(climber_strategies_.size());
+  apply_objective_scaling_and_offset<i_t, f_t><<<grid_size, block_size, 0, stream_view_>>>(
+    make_span(dual_objective_),
+    problem_ptr->presolve_data.objective_scaling_factor,
+    make_span(objective_offsets_),
+    climber_strategies_.size());
+  RAFT_CUDA_TRY(cudaPeekAtLastError());
+}
+
 template <typename i_t, typename f_t>
 void convergence_information_t<i_t, f_t>::compute_reduced_cost_from_primal_gradient(
   const rmm::device_uvector<f_t>& primal_gradient, const rmm::device_uvector<f_t>& primal_solution)
@@ -916,12 +1224,24 @@ const rmm::device_uvector<f_t>& convergence_information_t<i_t, f_t>::get_primal_
   return primal_objective_;
 }
 
+template <typename i_t, typename f_t>
+rmm::device_uvector<f_t>& convergence_information_t<i_t, f_t>::get_primal_objective()
+{
+  return primal_objective_;
+}
+
 template <typename i_t, typename f_t>
 const rmm::device_uvector<f_t>& convergence_information_t<i_t, f_t>::get_dual_objective() const
 {
   return dual_objective_;
 }
 
+template <typename i_t, typename f_t>
+rmm::device_uvector<f_t>& convergence_information_t<i_t, f_t>::get_dual_objective()
+{
+  return dual_objective_;
+}
+
 template <typename i_t, typename f_t>
 const rmm::device_uvector<f_t>& convergence_information_t<i_t, f_t>::get_l2_dual_residual() const
 {
diff --git a/cpp/src/pdlp/termination_strategy/convergence_information.hpp b/cpp/src/pdlp/termination_strategy/convergence_information.hpp
index 2389a60fae..7ff45e46f0 100644
--- a/cpp/src/pdlp/termination_strategy/convergence_information.hpp
+++ b/cpp/src/pdlp/termination_strategy/convergence_information.hpp
@@ -52,7 +52,10 @@ class convergence_information_t {
 
   // Needed for kkt restart & debug prints
   const rmm::device_uvector<f_t>& get_primal_objective() const;
+  // Non-const overload used by the multi-GPU branch to mirror / allreduce.
+  rmm::device_uvector<f_t>& get_primal_objective();
   const rmm::device_uvector<f_t>& get_dual_objective() const;
+  rmm::device_uvector<f_t>& get_dual_objective();
   const rmm::device_uvector<f_t>& get_l2_primal_residual() const;
   const rmm::device_uvector<f_t>& get_l2_dual_residual() const;
   const rmm::device_uvector<f_t>& get_relative_linf_primal_residual() const;
@@ -66,6 +69,11 @@ class convergence_information_t {
   const rmm::device_uvector<f_t>& get_l2_norm_primal_linear_objective() const;
   const rmm::device_uvector<f_t>& get_l2_norm_primal_right_hand_side() const;
 
+  void compute_owned_reference_norm_partials(i_t owned_var_size, i_t owned_cstr_size);
+  void sqrt_reference_norms_inplace();
+  f_t* l2_norm_primal_right_hand_side_data() { return l2_norm_primal_right_hand_side_.data(); }
+  f_t* l2_norm_primal_linear_objective_data() { return l2_norm_primal_linear_objective_.data(); }
+
   struct view_t {
     i_t primal_size;
     i_t dual_size;
@@ -123,12 +131,40 @@ class convergence_information_t {
                                rmm::device_uvector<f_t>& tmp_dual,
                                [[maybe_unused]] const rmm::device_uvector<f_t>& dual_iterate);
 
+  // Multi-GPU shard helper: writes a partial dot(c[0:n_owned], x[0:n_owned])
+  // into primal_objective_ (no scaling, no offset). Master is responsible for
+  // allreduce SUM across shards and then applying scaling + offset once on the
+  // reduced value. n_owned must be <= primal_size_h_; pass owned_var_size on
+  // each shard.
+  void compute_primal_objective_owned_partial(rmm::device_uvector<f_t>& primal_solution,
+                                              i_t n_owned);
+
+  // Multi-GPU shard helper: writes a partial dual objective into
+  // dual_objective_ (no scaling, no offset). Computes
+  //   dual_dot_           = dot(dual_slack[0:n_owned_var], primal_solution[0:n_owned_var])
+  //   sum_primal_slack_   = Σ primal_slack_[0:n_owned_cstr]
+  //   dual_objective_     = dual_dot_ + sum_primal_slack_
+  // primal_slack_ is assumed already populated by a prior per-shard
+  // compute_primal_residual call. Use only in the use_reflected_primal_dual
+  // path (the multi-GPU mode).
+  void compute_dual_objective_owned_partial(rmm::device_uvector<f_t>& primal_solution,
+                                            rmm::device_uvector<f_t>& dual_slack,
+                                            i_t n_owned_var,
+                                            i_t n_owned_cstr);
+
   void swap_context(const thrust::universal_host_pinned_vector<swap_pair_t<i_t>>& swap_pairs);
   void resize_context(i_t new_size);
 
  private:
   void compute_primal_objective(rmm::device_uvector<f_t>& primal_solution);
 
+  // Applies per-climber objective scaling + offset to primal_objective_.
+  // Single-GPU path: called from compute_primal_objective right after the dot.
+  // Multi-GPU path: called on master once after allreduce of partial sums.
+  void apply_primal_objective_scaling_and_offset();
+  // Same as above but for dual_objective_.
+  void apply_dual_objective_scaling_and_offset();
+
   void compute_dual_residual(cusparse_view_t<i_t, f_t>& cusparse_view,
                              rmm::device_uvector<f_t>& tmp_primal,
                              rmm::device_uvector<f_t>& primal_solution,
diff --git a/cpp/src/pdlp/termination_strategy/termination_strategy.cu b/cpp/src/pdlp/termination_strategy/termination_strategy.cu
index d1a88799d6..0320b420a8 100644
--- a/cpp/src/pdlp/termination_strategy/termination_strategy.cu
+++ b/cpp/src/pdlp/termination_strategy/termination_strategy.cu
@@ -195,6 +195,13 @@ pdlp_termination_strategy_t<i_t, f_t>::get_convergence_information() const
   return convergence_information_;
 }
 
+template <typename i_t, typename f_t>
+convergence_information_t<i_t, f_t>&
+pdlp_termination_strategy_t<i_t, f_t>::get_convergence_information()
+{
+  return convergence_information_;
+}
+
 template <typename i_t, typename f_t>
 const infeasibility_information_t<i_t, f_t>&
 pdlp_termination_strategy_t<i_t, f_t>::get_infeasibility_information() const
diff --git a/cpp/src/pdlp/termination_strategy/termination_strategy.hpp b/cpp/src/pdlp/termination_strategy/termination_strategy.hpp
index 5cd43d7be7..63b2e81ff4 100644
--- a/cpp/src/pdlp/termination_strategy/termination_strategy.hpp
+++ b/cpp/src/pdlp/termination_strategy/termination_strategy.hpp
@@ -187,6 +187,7 @@ class pdlp_termination_strategy_t {
   i_t get_optimal_solution_id() const;
 
   const convergence_information_t<i_t, f_t>& get_convergence_information() const;
+  convergence_information_t<i_t, f_t>& get_convergence_information();
   const infeasibility_information_t<i_t, f_t>& get_infeasibility_information() const;
 
   // Deep copy is used when save best primal so far is toggled
diff --git a/cpp/src/pdlp/utilities/mgpu_trace.cuh b/cpp/src/pdlp/utilities/mgpu_trace.cuh
new file mode 100644
index 0000000000..d9975d3202
--- /dev/null
+++ b/cpp/src/pdlp/utilities/mgpu_trace.cuh
@@ -0,0 +1,52 @@
+/* clang-format off */
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+/* clang-format on */
+#pragma once
+
+// Lightweight env-gated tracing for multi-GPU PDLP diagnosis.
+//
+// Enable by setting CUOPT_MGPU_TRACE=1 in the environment.
+// All prints go to stderr (line-buffered + explicit flush) so they survive
+// a CUDA hang and interleave with cuOpt's normal output.
+//
+// Usage:
+//   MGPU_TRACE("entering compute_At_y");
+//   MGPU_TRACE_FMT("shard %d nnz=%lld", r, (long long)nnz);
+//
+// The guard reads the env var once on first use (thread-safe via static
+// initialization) and the cost when disabled is a single load + branch.
+
+#include <cstdio>
+#include <cstdlib>
+
+namespace cuopt::linear_programming::detail {
+
+inline bool mgpu_trace_enabled()
+{
+  static const bool enabled = []() {
+    const char* v = std::getenv("CUOPT_MGPU_TRACE");
+    return v != nullptr && v[0] != '\0' && v[0] != '0';
+  }();
+  return enabled;
+}
+
+}  // namespace cuopt::linear_programming::detail
+
+#define MGPU_TRACE(msg)                                                     \
+  do {                                                                      \
+    if (::cuopt::linear_programming::detail::mgpu_trace_enabled()) {        \
+      std::fprintf(stderr, "[mgpu %s:%d] %s\n", __func__, __LINE__, (msg)); \
+      std::fflush(stderr);                                                  \
+    }                                                                       \
+  } while (0)
+
+#define MGPU_TRACE_FMT(fmt, ...)                                                       \
+  do {                                                                                 \
+    if (::cuopt::linear_programming::detail::mgpu_trace_enabled()) {                   \
+      std::fprintf(stderr, "[mgpu %s:%d] " fmt "\n", __func__, __LINE__, __VA_ARGS__); \
+      std::fflush(stderr);                                                             \
+    }                                                                                  \
+  } while (0)
diff --git a/cpp/src/pdlp/utilities/ping_pong_graph.cuh b/cpp/src/pdlp/utilities/ping_pong_graph.cuh
index dbc8fe5828..6b527f81b2 100644
--- a/cpp/src/pdlp/utilities/ping_pong_graph.cuh
+++ b/cpp/src/pdlp/utilities/ping_pong_graph.cuh
@@ -12,10 +12,25 @@
 
 #include <rmm/cuda_stream_view.hpp>
 
+#include <atomic>
 #include <utility>
 
 namespace cuopt::linear_programming::detail {
 
+// Debug/diagnostic toggle: when set, ping_pong_graph_t::run() bypasses CUDA
+// graph capture and executes its work eagerly on every iteration. Useful for
+// for debugging
+inline std::atomic<bool>& pdlp_graph_disabled_flag()
+{
+  static std::atomic<bool> s_flag{false};
+  return s_flag;
+}
+
+inline bool pdlp_graph_disabled()
+{
+  return pdlp_graph_disabled_flag().load(std::memory_order_relaxed);
+}
+
 // Two-slot CUDA-graph cache for PDLP. PDLP swaps pointers (rather than
 // copying vectors) at the end of adaptive pdhg step, so the captured graph
 // topology alternates between two layouts depending on iteration parity.
@@ -49,7 +64,7 @@ class ping_pong_graph_t {
 #ifdef CUPDLP_DEBUG_MODE
     work();
 #else
-    if (is_legacy_batch_mode_) {
+    if (is_legacy_batch_mode_ || pdlp_graph_disabled()) {
       work();
       return;
     }
diff --git a/cpp/tests/linear_programming/pdlp_test.cu b/cpp/tests/linear_programming/pdlp_test.cu
index d29995efc5..d17cf2af6f 100644
--- a/cpp/tests/linear_programming/pdlp_test.cu
+++ b/cpp/tests/linear_programming/pdlp_test.cu
@@ -8,12 +8,16 @@
 #include <branch_and_bound/shared_strong_branching_context.hpp>
 #include <mps_parser_internal.hpp>
 #include <pdlp/cusparse_view.hpp>
+#include <pdlp/distributed_pdlp/partition_loader.hpp>
+#include <pdlp/distributed_pdlp/partitioner.hpp>
 #include <pdlp/initial_scaling_strategy/initial_scaling.cuh>
 #include <pdlp/pdlp.cuh>
 #include <pdlp/pdlp_constants.hpp>
 #include <pdlp/solve.cuh>
 #include <pdlp/utils.cuh>
 
+#include <dual_simplex/sparse_matrix.hpp>
+
 #include "utilities/pdlp_test_utilities.cuh"
 
 #include "../mip/mip_utils.cuh"
@@ -45,11 +49,13 @@
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
 
+#include <algorithm>
 #include <chrono>
 #include <cmath>
 #include <cstdint>
 #include <limits>
 #include <sstream>
+#include <string>
 #include <thread>
 #include <tuple>
 #include <utility>
@@ -91,6 +97,166 @@ TEST(pdlp_class, run_double)
     afiro_primal_objective, solution.get_additional_termination_information().primal_objective));
 }
 
+// Distributed-PDLP partition round-trip: partition the afiro constraint/variable
+// bipartite graph with METIS, write it out, read it back, and confirm the parsed
+// vector is identical to what the partitioner produced.
+TEST(pdlp_class, distributed_partition_metis_export_import_roundtrip)
+{
+  using namespace cuopt::linear_programming::detail;
+  namespace ds = cuopt::linear_programming::dual_simplex;
+
+  auto path = make_path_absolute("linear_programming/afiro_original.mps");
+  cuopt::linear_programming::io::mps_data_model_t<int, double> mps =
+    cuopt::linear_programming::io::parse_mps<int, double>(path, true);
+
+  const int n_vars = static_cast<int>(mps.get_objective_coefficients().size());
+  const int n_cstr = static_cast<int>(mps.get_constraint_lower_bounds().size());
+  const int nnz    = static_cast<int>(mps.get_constraint_matrix_values().size());
+
+  std::vector<int> h_A_row_offsets = mps.get_constraint_matrix_offsets();
+  std::vector<int> h_A_col_indices = mps.get_constraint_matrix_indices();
+  std::vector<double> h_A_values   = mps.get_constraint_matrix_values();
+
+  // Transpose A -> A^T (CSR of A^T == CSC of A), mirroring solve_lp_distributed_from_mps.
+  ds::csr_matrix_t<int, double> A_csr(n_cstr, n_vars, nnz);
+  A_csr.row_start = h_A_row_offsets;
+  A_csr.j         = h_A_col_indices;
+  A_csr.x         = h_A_values;
+  ds::csc_matrix_t<int, double> AT_as_csc(n_vars, n_cstr, nnz);
+  A_csr.to_compressed_col(AT_as_csc);
+  std::vector<int> h_A_t_row_offsets = AT_as_csc.col_start;
+  std::vector<int> h_A_t_col_indices = AT_as_csc.i;
+
+  partitioner_input_t<int, double> input;
+  input.nb_cstr         = n_cstr;
+  input.nb_vars         = n_vars;
+  input.nb_parts        = 2;
+  input.A.row_offsets   = &h_A_row_offsets;
+  input.A.col_indices   = &h_A_col_indices;
+  input.A.num_rows      = n_cstr;
+  input.A.num_cols      = n_vars;
+  input.A_t.row_offsets = &h_A_t_row_offsets;
+  input.A_t.col_indices = &h_A_t_col_indices;
+  input.A_t.num_rows    = n_vars;
+  input.A_t.num_cols    = n_cstr;
+
+  auto partitioner       = make_partitioner<int, double>(partitioner_kind_t::Metis);
+  std::vector<int> parts = partitioner->partition(input);
+  ASSERT_EQ(parts.size(), static_cast<std::size_t>(n_cstr + n_vars));
+
+  std::string dir = ::testing::TempDir();
+  if (!dir.empty() && dir.back() != '/') { dir.push_back('/'); }
+  const std::string out_path = dir + "afiro_metis_roundtrip.parts";
+
+  partition_loader_t<int, double>::export_distributed_pdlp_partition_file(out_path, parts);
+  std::vector<int> reloaded =
+    partition_loader_t<int, double>::parse_distributed_pdlp_partition_file(out_path);
+
+  EXPECT_EQ(parts, reloaded);
+}
+
+namespace {
+
+// Solve `mps_rel_path` with the single-GPU PDLP ("base") and with distributed PDLP
+// (num_gpus = -1 => auto-detect; 1 GPU is fine), then assert the distributed run
+// matches the base run on everything meaningful: termination status, step count
+// (within 15%), primal/dual objective, and the full primal/dual solution vectors.
+// All value comparisons use a loose relative tolerance.
+void expect_distributed_matches_base(raft::handle_t const& handle,
+                                     std::string const& mps_rel_path,
+                                     bool fixed_mps_format = false)
+{
+  constexpr double loose_rel = 1e-3;
+  auto near_rel              = [](double a, double b, double rel) {
+    return std::fabs(a - b) <= rel * (1.0 + std::fabs(a));
+  };
+
+  auto path                                 = make_path_absolute(mps_rel_path);
+  io::mps_data_model_t<int, double> problem = io::parse_mps<int, double>(path, fixed_mps_format);
+
+  // Shared settings: PDLP, no presolve (distributed requires presolver == None, so the
+  // base run must match to keep the two problems identical).
+  pdlp_solver_settings_t<int, double> base_settings{};
+  base_settings.method    = method_t::PDLP;
+  base_settings.presolver = presolver_t::None;
+
+  // ----- base: single-GPU PDLP (materialize the full problem on one GPU) -----
+  auto base_op = mps_data_model_to_optimization_problem<int, double>(&handle, problem);
+  auto base    = solve_lp(base_op, base_settings);
+
+  // ----- distributed PDLP (identical settings, only the distributed flags flipped) -----
+  pdlp_solver_settings_t<int, double> dist_settings = base_settings;
+  dist_settings.hyper_params.use_distributed_pdlp   = true;
+  dist_settings.distributed_pdlp_num_gpus           = -1;
+  auto dist                                         = solve_lp(&handle, problem, dist_settings);
+
+  // ----- termination status -----
+  ASSERT_EQ(static_cast<int>(base.get_termination_status()), CUOPT_TERMINATION_STATUS_OPTIMAL)
+    << mps_rel_path << ": base did not reach optimal";
+  EXPECT_EQ(static_cast<int>(dist.get_termination_status()),
+            static_cast<int>(base.get_termination_status()))
+    << mps_rel_path << ": distributed termination status differs from base";
+
+  const auto& base_info = base.get_additional_termination_information();
+  const auto& dist_info = dist.get_additional_termination_information();
+
+  // ----- objectives -----
+  EXPECT_TRUE(near_rel(base_info.primal_objective, dist_info.primal_objective, loose_rel))
+    << mps_rel_path << ": primal objective base=" << base_info.primal_objective
+    << " distributed=" << dist_info.primal_objective;
+  EXPECT_TRUE(near_rel(base_info.dual_objective, dist_info.dual_objective, loose_rel))
+    << mps_rel_path << ": dual objective base=" << base_info.dual_objective
+    << " distributed=" << dist_info.dual_objective;
+
+  // ----- step count: within 15% of the larger of the two -----
+  const int base_steps = base_info.number_of_steps_taken;
+  const int dist_steps = dist_info.number_of_steps_taken;
+  const int max_steps  = std::max(base_steps, dist_steps);
+  const int step_diff  = std::max(base_steps, dist_steps) - std::min(base_steps, dist_steps);
+  EXPECT_LE(static_cast<double>(step_diff), 0.15 * max_steps)
+    << mps_rel_path << ": step counts differ by >15% (base=" << base_steps
+    << ", distributed=" << dist_steps << ")";
+
+  // ----- primal / dual solution vectors -----
+  auto base_primal = cuopt::host_copy(base.get_primal_solution(), handle.get_stream());
+  auto dist_primal = cuopt::host_copy(dist.get_primal_solution(), handle.get_stream());
+  ASSERT_EQ(base_primal.size(), dist_primal.size()) << mps_rel_path << ": primal size mismatch";
+  for (std::size_t i = 0; i < base_primal.size(); ++i) {
+    EXPECT_TRUE(near_rel(base_primal[i], dist_primal[i], loose_rel))
+      << mps_rel_path << ": primal[" << i << "] base=" << base_primal[i]
+      << " distributed=" << dist_primal[i];
+  }
+
+  auto base_dual = cuopt::host_copy(base.get_dual_solution(), handle.get_stream());
+  auto dist_dual = cuopt::host_copy(dist.get_dual_solution(), handle.get_stream());
+  ASSERT_EQ(base_dual.size(), dist_dual.size()) << mps_rel_path << ": dual size mismatch";
+  for (std::size_t i = 0; i < base_dual.size(); ++i) {
+    EXPECT_TRUE(near_rel(base_dual[i], dist_dual[i], loose_rel))
+      << mps_rel_path << ": dual[" << i << "] base=" << base_dual[i]
+      << " distributed=" << dist_dual[i];
+  }
+}
+
+}  // namespace
+
+TEST(pdlp_class, distributed_parity_afiro)
+{
+  const raft::handle_t handle{};
+  expect_distributed_matches_base(handle, "linear_programming/afiro_original.mps", true);
+}
+
+TEST(pdlp_class, distributed_parity_square41)
+{
+  const raft::handle_t handle{};
+  expect_distributed_matches_base(handle, "linear_programming/neos3/neos3.mps");
+}
+
+TEST(pdlp_class, distributed_parity_a2864)
+{
+  const raft::handle_t handle{};
+  expect_distributed_matches_base(handle, "linear_programming/a2864/a2864.mps");
+}
+
 TEST(pdlp_class, precision_mixed)
 {
   using namespace cuopt::linear_programming::detail;