From 1e0bd53da23fd9e4c093603d41c3fa6a06e899e4 Mon Sep 17 00:00:00 2001
From: Bulle Mostovoi <vmostovoi@nvidia.com>
Date: Thu, 7 May 2026 15:07:26 +0200
Subject: [PATCH 01/67] first commit !! added multi_gpu_partition file to
 solver settings

---
 cpp/include/cuopt/linear_programming/constants.h             | 1 +
 .../cuopt/linear_programming/pdlp/solver_settings.hpp        | 1 +
 cpp/src/math_optimization/solver_settings.cu                 | 5 +++--
 3 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/cpp/include/cuopt/linear_programming/constants.h b/cpp/include/cuopt/linear_programming/constants.h
index b251b3eaba..7e2682b997 100644
--- a/cpp/include/cuopt/linear_programming/constants.h
+++ b/cpp/include/cuopt/linear_programming/constants.h
@@ -77,6 +77,7 @@
 #define CUOPT_SOLUTION_FILE            "solution_file"
 #define CUOPT_NUM_CPU_THREADS          "num_cpu_threads"
 #define CUOPT_NUM_GPUS                 "num_gpus"
+#define CUOPT_MULTI_GPU_PARTITION_FILE "multi_gpu_partition_file"
 #define CUOPT_USER_PROBLEM_FILE        "user_problem_file"
 #define CUOPT_PRESOLVE_FILE            "presolve_file"
 #define CUOPT_RANDOM_SEED              "random_seed"
diff --git a/cpp/include/cuopt/linear_programming/pdlp/solver_settings.hpp b/cpp/include/cuopt/linear_programming/pdlp/solver_settings.hpp
index bcf5a736f0..4585b9d1cf 100644
--- a/cpp/include/cuopt/linear_programming/pdlp/solver_settings.hpp
+++ b/cpp/include/cuopt/linear_programming/pdlp/solver_settings.hpp
@@ -286,6 +286,7 @@ class pdlp_solver_settings_t {
   presolver_t presolver{presolver_t::Default};
   bool dual_postsolve{true};
   int num_gpus{1};
+  std::string multi_gpu_partition_file{""};
   method_t method{method_t::Concurrent};
   bool inside_mip{false};
   // For concurrent termination
diff --git a/cpp/src/math_optimization/solver_settings.cu b/cpp/src/math_optimization/solver_settings.cu
index b968ad18ea..42ea533152 100644
--- a/cpp/src/math_optimization/solver_settings.cu
+++ b/cpp/src/math_optimization/solver_settings.cu
@@ -136,7 +136,7 @@ solver_settings_t<i_t, f_t>::solver_settings_t() : pdlp_settings(), mip_settings
     {CUOPT_MIP_IMPLIED_BOUND_CUTS, &mip_settings.implied_bound_cuts, -1, 1, -1},
     {CUOPT_MIP_STRONG_CHVATAL_GOMORY_CUTS, &mip_settings.strong_chvatal_gomory_cuts, -1, 1, -1},
     {CUOPT_MIP_REDUCED_COST_STRENGTHENING, &mip_settings.reduced_cost_strengthening, -1, std::numeric_limits<i_t>::max(), -1},
-    {CUOPT_NUM_GPUS, &pdlp_settings.num_gpus, 1, 2, 1},
+    {CUOPT_NUM_GPUS, &pdlp_settings.num_gpus, 1, 576, 1},
     {CUOPT_NUM_GPUS, &mip_settings.num_gpus, 1, 2, 1},
     {CUOPT_MIP_BATCH_PDLP_STRONG_BRANCHING, &mip_settings.mip_batch_pdlp_strong_branching, 0, 2, 0},
     {CUOPT_MIP_BATCH_PDLP_RELIABILITY_BRANCHING, &mip_settings.mip_batch_pdlp_reliability_branching, 0, 2, 0},
@@ -182,7 +182,8 @@ solver_settings_t<i_t, f_t>::solver_settings_t() : pdlp_settings(), mip_settings
     {CUOPT_USER_PROBLEM_FILE, &mip_settings.user_problem_file, ""},
     {CUOPT_USER_PROBLEM_FILE, &pdlp_settings.user_problem_file, ""},
     {CUOPT_PRESOLVE_FILE, &mip_settings.presolve_file, ""},
-    {CUOPT_PRESOLVE_FILE, &pdlp_settings.presolve_file, ""}
+    {CUOPT_PRESOLVE_FILE, &pdlp_settings.presolve_file, ""},
+    {CUOPT_MULTI_GPU_PARTITION_FILE, &pdlp_settings.multi_gpu_partition_file, ""},
   };
   // clang-format on
 }

From 978d17bc5e81f10bb0f4305e5886b777251b4ad4 Mon Sep 17 00:00:00 2001
From: Bulle Mostovoi <vmostovoi@nvidia.com>
Date: Thu, 7 May 2026 17:51:27 +0200
Subject: [PATCH 02/67] slowly building the skeleton

---
 .../pdlp/distributed_pdlp/communicator.cuh    |  0
 cpp/src/pdlp/distributed_pdlp/partition.cuh   | 33 +++++++++++++++
 cpp/src/pdlp/distributed_pdlp/shard.cu        | 41 +++++++++++++++++++
 cpp/src/pdlp/distributed_pdlp/shard.cuh       | 24 +++++++++++
 4 files changed, 98 insertions(+)
 create mode 100644 cpp/src/pdlp/distributed_pdlp/communicator.cuh
 create mode 100644 cpp/src/pdlp/distributed_pdlp/partition.cuh
 create mode 100644 cpp/src/pdlp/distributed_pdlp/shard.cu
 create mode 100644 cpp/src/pdlp/distributed_pdlp/shard.cuh

diff --git a/cpp/src/pdlp/distributed_pdlp/communicator.cuh b/cpp/src/pdlp/distributed_pdlp/communicator.cuh
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/cpp/src/pdlp/distributed_pdlp/partition.cuh b/cpp/src/pdlp/distributed_pdlp/partition.cuh
new file mode 100644
index 0000000000..38457029be
--- /dev/null
+++ b/cpp/src/pdlp/distributed_pdlp/partition.cuh
@@ -0,0 +1,33 @@
+
+
+
+namespace cuopt::linear_programming::detail {
+
+
+template <typename i_t, typename f_t>
+class partition_t {
+    public:
+    partition_t(const std::string& partition_file);
+    partition_t(const problem_t<i_t, f_t>& op_problem);
+
+
+  size_t nb_parts;
+  
+  std::vector<i_t> raw_parts;
+  std::vector<i_t> cstr_parts;
+  std::vector<i_t> var_parts;
+  std::vector<std::vector<i_t>> owned_cstr_per_part;
+  std::vector<std::vector<i_t>> owned_var_per_part;
+  std::vector<std::unordered_set<i_t>> needed_cstr_per_part;
+  std::vector<std::unordered_set<i_t>> needed_var_per_part;
+  std::vector<std::vector<std::vector<i_t>>> sent_cstr_per_part;
+  std::vector<std::vector<std::vector<i_t>>> sent_var_per_part;
+  std::vector<std::vector<std::vector<i_t>>> received_cstr_per_part;
+  std::vector<std::vector<std::vector<i_t>>> received_var_per_part;
+
+  private:
+  void fill_data();
+  void validate() const;
+
+};
+} // namespace cuopt::linear_programming::detail
\ No newline at end of file
diff --git a/cpp/src/pdlp/distributed_pdlp/shard.cu b/cpp/src/pdlp/distributed_pdlp/shard.cu
new file mode 100644
index 0000000000..43a4526c29
--- /dev/null
+++ b/cpp/src/pdlp/distributed_pdlp/shard.cu
@@ -0,0 +1,41 @@
+
+
+
+void pre_SpMV_communication(bool is_A_x){
+    // Prepare the send_buffers
+    for (auto& shard: shards){
+        comm_planner_t<i_t, f_t>& plan = is_A_x ? shard.x_plan : shard.y_plan;
+        raft::device_setter guard(shard.device_id);
+        for (size_t peer = 0; peer < partition.nb_parts; peer++){
+            if (peer == shard.rank) continue;
+            thrust::gather(
+                shard.handle.get_thrust_policy(), // TODO what exactly do we put here
+                plan.send_indices_per_peer[peer].begin(),
+                plan.send_indices_per_peer[peer].end(),
+                plan.full_local.begin(),
+                plan.send_buf_per_peer[peer].begin());
+        }
+    }
+    // Will merge them if it works
+    ncclgroupstart()
+    // Send all the data current shard has to send
+    for (auto& shard: shards){
+        comm_planner_t<i_t, f_t>& plan = is_A_x ? shard.x_plan : shard.y_plan;
+        raft::device_setter guard(shard.device_id);
+        for (size_t peer = 0; peer < partition.nb_parts; peer++){
+            if (peer == shard.rank) continue;
+            ncclSend(plan.send_buf_per_peer[peer].data(), plan.nb_elt_send_per_peer[peer], peer)
+        }
+    }
+    // Receive all the data current shard has to receive
+    for (auto& shard: shards){
+        comm_planner_t<i_t, f_t>& plan = is_A_x ? shard.x_plan : shard.y_plan;
+        raft::device_setter guard(shard.device_id);
+        for (size_t peer = 0; peer < partition.nb_parts; peer++){
+            if (peer == shard.rank) continue;
+        f_t* recv_buff = &plan.full_local[offset_per_peer[peer]];
+        ncclRecv(recv_buff, plan.nb_elt_recv_per_peer[peer], peer);
+        }
+    }
+    ncclgroupend()
+}
\ No newline at end of file
diff --git a/cpp/src/pdlp/distributed_pdlp/shard.cuh b/cpp/src/pdlp/distributed_pdlp/shard.cuh
new file mode 100644
index 0000000000..30449e04a0
--- /dev/null
+++ b/cpp/src/pdlp/distributed_pdlp/shard.cuh
@@ -0,0 +1,24 @@
+
+
+template <typename i_t, typename f_t>
+struct pdlp_shard_t {
+  size_t rank;
+  comm_planner_t<i_t, f_t> x_plan;
+  comm_planner_t<i_t, f_t> y_plan;
+};
+
+
+template <typename i_t, f_t>
+struct comm_planner_t {
+
+    // The indices of the data we have to send to the others
+    // Maybe could merge evrything if it gives a speedup but a bit harder to read
+    std::vector<std::vector<int>> send_indices_per_peer;
+    std::vector<int> nb_elt_send_per_peer;
+    std::vector<rmm::device_uvector<f_t>> send_buf_per_peer;
+
+    // Where to start writing in full_local for each peer    
+    std::vector<i_t> offset_per_peer;
+    std::vector<i_t> nb_elt_recv_per_peer;
+    rmm::device_uvector<f_t> full_local; // The full var/cstr vector containing all local data then all remote data
+};
\ No newline at end of file

From dd0c0eff2de119511065cb1e40a726c6443fb102 Mon Sep 17 00:00:00 2001
From: Bulle Mostovoi <vmostovoi@nvidia.com>
Date: Thu, 7 May 2026 18:02:02 +0200
Subject: [PATCH 03/67] better shard.cuh

---
 cpp/src/pdlp/distributed_pdlp/shard.cuh | 30 ++++++++++++++++---------
 1 file changed, 20 insertions(+), 10 deletions(-)

diff --git a/cpp/src/pdlp/distributed_pdlp/shard.cuh b/cpp/src/pdlp/distributed_pdlp/shard.cuh
index 30449e04a0..6e4f7eabae 100644
--- a/cpp/src/pdlp/distributed_pdlp/shard.cuh
+++ b/cpp/src/pdlp/distributed_pdlp/shard.cuh
@@ -1,19 +1,12 @@
 
 
-template <typename i_t, typename f_t>
-struct pdlp_shard_t {
-  size_t rank;
-  comm_planner_t<i_t, f_t> x_plan;
-  comm_planner_t<i_t, f_t> y_plan;
-};
-
 
-template <typename i_t, f_t>
+template <typename i_t, typename f_t>
 struct comm_planner_t {
 
     // The indices of the data we have to send to the others
     // Maybe could merge evrything if it gives a speedup but a bit harder to read
-    std::vector<std::vector<int>> send_indices_per_peer;
+    std::vector<rmm::device_uvector<i_t>> send_indices_per_peer;
     std::vector<int> nb_elt_send_per_peer;
     std::vector<rmm::device_uvector<f_t>> send_buf_per_peer;
 
@@ -21,4 +14,21 @@ struct comm_planner_t {
     std::vector<i_t> offset_per_peer;
     std::vector<i_t> nb_elt_recv_per_peer;
     rmm::device_uvector<f_t> full_local; // The full var/cstr vector containing all local data then all remote data
-};
\ No newline at end of file
+};
+
+template <typename i_t, typename f_t>
+struct pdlp_shard_t {
+
+  // Local per-rank PDLP data
+  raft::handle_t                   handle;          // owned: the actual handle for this shard's device/stream
+  problem_t<i_t, f_t>              local_problem;   // owned: holds handle_ptr = &handle (back-ref)
+  saddle_point_state_t<i_t, f_t>   saddle_point;    // owned: per-iter state, sized to local
+  cusparse_view_t<i_t, f_t>        cusparse_view;   // owned: descriptors bound to local_problem + saddle_point
+
+  // Specific multi-GPU data
+  int device_id;
+  ncclComm_t                comm;
+  comm_planner_t<i_t, f_t> x_plan, y_plan;
+};
+
+

From 2037eca41a05ac925d36bd1482c3a1e29b525b49 Mon Sep 17 00:00:00 2001
From: Bulle Mostovoi <vmostovoi@nvidia.com>
Date: Sun, 10 May 2026 18:39:30 +0200
Subject: [PATCH 04/67] wip

---
 cpp/src/pdlp/distributed_pdlp/partition.cu  | 24 ++++++++++++++++
 cpp/src/pdlp/distributed_pdlp/partition.cuh | 32 ++++++++++++++-------
 cpp/src/pdlp/distributed_pdlp/shard.cu      |  2 +-
 cpp/src/pdlp/distributed_pdlp/shard.cuh     | 32 +++++++++++++++++----
 4 files changed, 73 insertions(+), 17 deletions(-)
 create mode 100644 cpp/src/pdlp/distributed_pdlp/partition.cu

diff --git a/cpp/src/pdlp/distributed_pdlp/partition.cu b/cpp/src/pdlp/distributed_pdlp/partition.cu
new file mode 100644
index 0000000000..3410b74fd1
--- /dev/null
+++ b/cpp/src/pdlp/distributed_pdlp/partition.cu
@@ -0,0 +1,24 @@
+
+namespace cuopt::linear_programming::detail {
+
+template <typename i_t, typename f_t>
+partition_t<i_t, f_t>::partition_t(const std::string& partition_file){
+    
+}
+
+template <typename i_t, typename f_t>
+partition_t<i_t, f_t>::partition_t(const problem_t<i_t, f_t>& op_problem)
+{
+  std::cout << "NOT IMPLEMENTED" << std::endl;
+  return; // TODO: Implement
+}
+
+template <typename i_t, typename f_t>
+void export_to_file(const std::string& partition_file) const{
+    std::cout << "NOT IMPLEMENTED" << std::endl;
+    return; // TODO: Implement
+}
+
+
+
+}
\ No newline at end of file
diff --git a/cpp/src/pdlp/distributed_pdlp/partition.cuh b/cpp/src/pdlp/distributed_pdlp/partition.cuh
index 38457029be..a5b5175105 100644
--- a/cpp/src/pdlp/distributed_pdlp/partition.cuh
+++ b/cpp/src/pdlp/distributed_pdlp/partition.cuh
@@ -4,26 +4,36 @@
 namespace cuopt::linear_programming::detail {
 
 
+template <typename i_t>
+struct rank_data_t {
+  // === Ownership ===
+  std::vector<i_t> owned_var_indices;       // global indices of variables in S_r
+  std::vector<i_t> owned_constr_indices;    // global indices of constraints in T_r
+  // === Send plan: per peer, LOCAL positions to gather + send ===
+  std::vector<std::vector<i_t>> y_send_per_peer;     // [peer] -> local positions in T_r to send
+  std::vector<std::vector<i_t>> x_send_per_peer;   // [peer] -> local positions in S_r to send
+  // === Recv plan: per peer, contiguous slot in halo region ===
+  std::vector<int> y_recv_counts;        // [peer] -> count
+  std::vector<int> y_recv_offsets;       // [peer] -> offset in dual halo region
+  std::vector<int> x_recv_counts;
+  std::vector<int> x_recv_offsets;
+};
+
+
 template <typename i_t, typename f_t>
 class partition_t {
-    public:
-    partition_t(const std::string& partition_file);
+  public:
+    // not sure, good luck hihi
+    partition_t(std::vector<i_t> parts, std::vector<i_t> A_row_offsets, std::vector<i_t> A_indices, std::vector<i_t> A_t_row_offsets, std::vector<i_t> A_t_indices, );
     partition_t(const problem_t<i_t, f_t>& op_problem);
-
+    void export_to_file(const std::string& partition_file) const;
 
   size_t nb_parts;
   
   std::vector<i_t> raw_parts;
   std::vector<i_t> cstr_parts;
   std::vector<i_t> var_parts;
-  std::vector<std::vector<i_t>> owned_cstr_per_part;
-  std::vector<std::vector<i_t>> owned_var_per_part;
-  std::vector<std::unordered_set<i_t>> needed_cstr_per_part;
-  std::vector<std::unordered_set<i_t>> needed_var_per_part;
-  std::vector<std::vector<std::vector<i_t>>> sent_cstr_per_part;
-  std::vector<std::vector<std::vector<i_t>>> sent_var_per_part;
-  std::vector<std::vector<std::vector<i_t>>> received_cstr_per_part;
-  std::vector<std::vector<std::vector<i_t>>> received_var_per_part;
+  std::vector<rank_data_t<i_t>> rank_data; // [rank] -> partition data for this rank
 
   private:
   void fill_data();
diff --git a/cpp/src/pdlp/distributed_pdlp/shard.cu b/cpp/src/pdlp/distributed_pdlp/shard.cu
index 43a4526c29..6de93ad3b8 100644
--- a/cpp/src/pdlp/distributed_pdlp/shard.cu
+++ b/cpp/src/pdlp/distributed_pdlp/shard.cu
@@ -17,7 +17,7 @@ void pre_SpMV_communication(bool is_A_x){
         }
     }
     // Will merge them if it works
-    ncclgroupstart()
+    ncclgroupstart();
     // Send all the data current shard has to send
     for (auto& shard: shards){
         comm_planner_t<i_t, f_t>& plan = is_A_x ? shard.x_plan : shard.y_plan;
diff --git a/cpp/src/pdlp/distributed_pdlp/shard.cuh b/cpp/src/pdlp/distributed_pdlp/shard.cuh
index 6e4f7eabae..127cc496f1 100644
--- a/cpp/src/pdlp/distributed_pdlp/shard.cuh
+++ b/cpp/src/pdlp/distributed_pdlp/shard.cuh
@@ -3,7 +3,6 @@
 
 template <typename i_t, typename f_t>
 struct comm_planner_t {
-
     // The indices of the data we have to send to the others
     // Maybe could merge evrything if it gives a speedup but a bit harder to read
     std::vector<rmm::device_uvector<i_t>> send_indices_per_peer;
@@ -19,16 +18,39 @@ struct comm_planner_t {
 template <typename i_t, typename f_t>
 struct pdlp_shard_t {
 
+  // Specific multi-GPU data
+  int device_id;
+  ncclComm_t                comm;
+  comm_planner_t<i_t, f_t> x_plan, y_plan;
+
   // Local per-rank PDLP data
   raft::handle_t                   handle;          // owned: the actual handle for this shard's device/stream
   problem_t<i_t, f_t>              local_problem;   // owned: holds handle_ptr = &handle (back-ref)
   saddle_point_state_t<i_t, f_t>   saddle_point;    // owned: per-iter state, sized to local
   cusparse_view_t<i_t, f_t>        cusparse_view;   // owned: descriptors bound to local_problem + saddle_point
 
-  // Specific multi-GPU data
-  int device_id;
-  ncclComm_t                comm;
-  comm_planner_t<i_t, f_t> x_plan, y_plan;
+  rmm::device_uvector<f_t>         tmp_primal;
+  rmm::device_uvector<f_t>         tmp_dual;
+  rmm::device_uvector<f_t>         potential_next_primal;
+  rmm::device_uvector<f_t>         potential_next_dual;
+  rmm::device_uvector<f_t>         dual_slack;
+  rmm::device_uvector<f_t>         reflected_primal; // x, so it has primal_size + halo
+  rmm::device_uvector<f_t>         reflected_dual; // y, so it has dual_size + halo
+
+  rmm::device_scalar<f_t>          reusable_one;        // = 1.0
+  rmm::device_scalar<f_t>          reusable_zero;       // = 0.0
+  rmm::device_scalar<f_t>          reusable_neg_one;    // = -1.0
+
+  // ===== Missing for cuPDLP+ Halpern update =====
+  rmm::device_uvector<f_t>         initial_primal;      // snapshot at start of restart epoch
+  rmm::device_uvector<f_t>         initial_dual;
+
+  i_t                              primal_size_h;
+  i_t                              dual_size_h;
+  i_t                              primal_halo_size;
+  i_t                              dual_halo_size;
+  i_t                              full_primal_size_h;// = primal_size_h + primal_halo_size
+  i_t                              full_dual_size_h;  // = dual_size_h + dual_halo_size
 };
 
 

From 0f62eff269ce7ab5c7f5b2141c5178abeb61ec2c Mon Sep 17 00:00:00 2001
From: Bulle Mostovoi <vmostovoi@nvidia.com>
Date: Mon, 18 May 2026 18:06:27 +0200
Subject: [PATCH 05/67] added a bit of skeleton. Forward declared pdlp_solver
 in shard.hpp, the cycle seems to be fixed, cuopt compiles

---
 .../pdlp/distributed_pdlp/communicator.cuh    |  0
 .../distributed_pdlp/multi_gpu_engine.hpp     | 14 +++++
 cpp/src/pdlp/distributed_pdlp/partition.cu    | 24 --------
 cpp/src/pdlp/distributed_pdlp/partition.cuh   | 43 --------------
 .../distributed_pdlp/partition_loader.hpp     |  2 +
 cpp/src/pdlp/distributed_pdlp/rank_data.hpp   | 52 +++++++++++++++++
 cpp/src/pdlp/distributed_pdlp/shard.cu        | 54 ++++++------------
 cpp/src/pdlp/distributed_pdlp/shard.cuh       | 56 -------------------
 cpp/src/pdlp/distributed_pdlp/shard.hpp       | 31 ++++++++++
 cpp/src/pdlp/pdlp.cuh                         |  4 ++
 10 files changed, 119 insertions(+), 161 deletions(-)
 delete mode 100644 cpp/src/pdlp/distributed_pdlp/communicator.cuh
 create mode 100644 cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.hpp
 delete mode 100644 cpp/src/pdlp/distributed_pdlp/partition.cu
 delete mode 100644 cpp/src/pdlp/distributed_pdlp/partition.cuh
 create mode 100644 cpp/src/pdlp/distributed_pdlp/partition_loader.hpp
 create mode 100644 cpp/src/pdlp/distributed_pdlp/rank_data.hpp
 delete mode 100644 cpp/src/pdlp/distributed_pdlp/shard.cuh
 create mode 100644 cpp/src/pdlp/distributed_pdlp/shard.hpp

diff --git a/cpp/src/pdlp/distributed_pdlp/communicator.cuh b/cpp/src/pdlp/distributed_pdlp/communicator.cuh
deleted file mode 100644
index e69de29bb2..0000000000
diff --git a/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.hpp b/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.hpp
new file mode 100644
index 0000000000..13ded70009
--- /dev/null
+++ b/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.hpp
@@ -0,0 +1,14 @@
+#pragma once
+
+#include <pdlp/distributed_pdlp/shard.hpp>
+
+#include <vector>
+
+namespace cuopt::linear_programming::detail {
+
+template <typename i_t, typename f_t>
+struct multi_gpu_engine_t {
+  std::vector<pdlp_shard_t<i_t, f_t>> shards;
+};
+
+}  // namespace cuopt::linear_programming::detail
diff --git a/cpp/src/pdlp/distributed_pdlp/partition.cu b/cpp/src/pdlp/distributed_pdlp/partition.cu
deleted file mode 100644
index 3410b74fd1..0000000000
--- a/cpp/src/pdlp/distributed_pdlp/partition.cu
+++ /dev/null
@@ -1,24 +0,0 @@
-
-namespace cuopt::linear_programming::detail {
-
-template <typename i_t, typename f_t>
-partition_t<i_t, f_t>::partition_t(const std::string& partition_file){
-    
-}
-
-template <typename i_t, typename f_t>
-partition_t<i_t, f_t>::partition_t(const problem_t<i_t, f_t>& op_problem)
-{
-  std::cout << "NOT IMPLEMENTED" << std::endl;
-  return; // TODO: Implement
-}
-
-template <typename i_t, typename f_t>
-void export_to_file(const std::string& partition_file) const{
-    std::cout << "NOT IMPLEMENTED" << std::endl;
-    return; // TODO: Implement
-}
-
-
-
-}
\ No newline at end of file
diff --git a/cpp/src/pdlp/distributed_pdlp/partition.cuh b/cpp/src/pdlp/distributed_pdlp/partition.cuh
deleted file mode 100644
index a5b5175105..0000000000
--- a/cpp/src/pdlp/distributed_pdlp/partition.cuh
+++ /dev/null
@@ -1,43 +0,0 @@
-
-
-
-namespace cuopt::linear_programming::detail {
-
-
-template <typename i_t>
-struct rank_data_t {
-  // === Ownership ===
-  std::vector<i_t> owned_var_indices;       // global indices of variables in S_r
-  std::vector<i_t> owned_constr_indices;    // global indices of constraints in T_r
-  // === Send plan: per peer, LOCAL positions to gather + send ===
-  std::vector<std::vector<i_t>> y_send_per_peer;     // [peer] -> local positions in T_r to send
-  std::vector<std::vector<i_t>> x_send_per_peer;   // [peer] -> local positions in S_r to send
-  // === Recv plan: per peer, contiguous slot in halo region ===
-  std::vector<int> y_recv_counts;        // [peer] -> count
-  std::vector<int> y_recv_offsets;       // [peer] -> offset in dual halo region
-  std::vector<int> x_recv_counts;
-  std::vector<int> x_recv_offsets;
-};
-
-
-template <typename i_t, typename f_t>
-class partition_t {
-  public:
-    // not sure, good luck hihi
-    partition_t(std::vector<i_t> parts, std::vector<i_t> A_row_offsets, std::vector<i_t> A_indices, std::vector<i_t> A_t_row_offsets, std::vector<i_t> A_t_indices, );
-    partition_t(const problem_t<i_t, f_t>& op_problem);
-    void export_to_file(const std::string& partition_file) const;
-
-  size_t nb_parts;
-  
-  std::vector<i_t> raw_parts;
-  std::vector<i_t> cstr_parts;
-  std::vector<i_t> var_parts;
-  std::vector<rank_data_t<i_t>> rank_data; // [rank] -> partition data for this rank
-
-  private:
-  void fill_data();
-  void validate() const;
-
-};
-} // namespace cuopt::linear_programming::detail
\ No newline at end of file
diff --git a/cpp/src/pdlp/distributed_pdlp/partition_loader.hpp b/cpp/src/pdlp/distributed_pdlp/partition_loader.hpp
new file mode 100644
index 0000000000..139597f9cb
--- /dev/null
+++ b/cpp/src/pdlp/distributed_pdlp/partition_loader.hpp
@@ -0,0 +1,2 @@
+
+
diff --git a/cpp/src/pdlp/distributed_pdlp/rank_data.hpp b/cpp/src/pdlp/distributed_pdlp/rank_data.hpp
new file mode 100644
index 0000000000..ee107f5cf1
--- /dev/null
+++ b/cpp/src/pdlp/distributed_pdlp/rank_data.hpp
@@ -0,0 +1,52 @@
+#pragma once
+
+#include <vector>
+#include <unordered_map>
+
+namespace cuopt::linear_programming::detail {
+template <typename i_t, typename f_t>
+struct rank_data_t {
+    rank_data_t(std::size_t nb_parts)
+      : var_send_per_peer(nb_parts),
+        cstr_send_per_peer(nb_parts),
+        var_recv_counts(nb_parts, 0),
+        var_recv_offsets(nb_parts, 0),
+        cstr_recv_counts(nb_parts, 0),
+        cstr_recv_offsets(nb_parts, 0) {}
+  
+    i_t owned_var_size{0};
+    i_t total_var_size{0};
+    i_t owned_cstr_size{0};
+    i_t total_cstr_size{0};
+  
+    // === Ownership ===
+    std::vector<i_t> owned_var_indices;
+    std::vector<i_t> owned_cstr_indices;
+  
+    // === Send plan: per peer, indices to gather + send ===
+    std::vector<std::vector<i_t>> var_send_per_peer;
+    std::vector<std::vector<i_t>> cstr_send_per_peer;
+  
+    // === Recv plan: per peer, contiguous slot in halo region ===
+    std::vector<i_t> var_recv_counts;
+    std::vector<i_t> var_recv_offsets;
+    std::vector<i_t> cstr_recv_counts;
+    std::vector<i_t> cstr_recv_offsets;
+  
+    // === Mappings ===
+    std::unordered_map<i_t, i_t> global_to_local_var;
+    std::unordered_map<i_t, i_t> global_to_local_cstr;
+    std::vector<i_t> local_to_global_var;
+    std::vector<i_t> local_to_global_cstr;
+  
+    // === Local host CSR matrices ===
+    // A
+    std::vector<i_t> h_A_row_offsets;
+    std::vector<i_t> h_A_col_indices;
+    std::vector<f_t> h_A_values;
+    // A_t
+    std::vector<i_t> h_A_t_row_offsets;
+    std::vector<i_t> h_A_t_col_indices;
+    std::vector<f_t> h_A_t_values;
+  };
+} // namespace cuopt::linear_programming::detail
\ No newline at end of file
diff --git a/cpp/src/pdlp/distributed_pdlp/shard.cu b/cpp/src/pdlp/distributed_pdlp/shard.cu
index 6de93ad3b8..b7e176c3ee 100644
--- a/cpp/src/pdlp/distributed_pdlp/shard.cu
+++ b/cpp/src/pdlp/distributed_pdlp/shard.cu
@@ -1,41 +1,19 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ */
 
+#include <pdlp/distributed_pdlp/shard.hpp>
+#include <pdlp/pdlp.cuh>
+namespace cuopt::linear_programming::detail {
 
+// This must be done in .cu file because the pdlp_solver_t is not already complete in the hpp file
+template <typename i_t, typename f_t>
+pdlp_shard_t<i_t, f_t>::~pdlp_shard_t() = default;
 
-void pre_SpMV_communication(bool is_A_x){
-    // Prepare the send_buffers
-    for (auto& shard: shards){
-        comm_planner_t<i_t, f_t>& plan = is_A_x ? shard.x_plan : shard.y_plan;
-        raft::device_setter guard(shard.device_id);
-        for (size_t peer = 0; peer < partition.nb_parts; peer++){
-            if (peer == shard.rank) continue;
-            thrust::gather(
-                shard.handle.get_thrust_policy(), // TODO what exactly do we put here
-                plan.send_indices_per_peer[peer].begin(),
-                plan.send_indices_per_peer[peer].end(),
-                plan.full_local.begin(),
-                plan.send_buf_per_peer[peer].begin());
-        }
-    }
-    // Will merge them if it works
-    ncclgroupstart();
-    // Send all the data current shard has to send
-    for (auto& shard: shards){
-        comm_planner_t<i_t, f_t>& plan = is_A_x ? shard.x_plan : shard.y_plan;
-        raft::device_setter guard(shard.device_id);
-        for (size_t peer = 0; peer < partition.nb_parts; peer++){
-            if (peer == shard.rank) continue;
-            ncclSend(plan.send_buf_per_peer[peer].data(), plan.nb_elt_send_per_peer[peer], peer)
-        }
-    }
-    // Receive all the data current shard has to receive
-    for (auto& shard: shards){
-        comm_planner_t<i_t, f_t>& plan = is_A_x ? shard.x_plan : shard.y_plan;
-        raft::device_setter guard(shard.device_id);
-        for (size_t peer = 0; peer < partition.nb_parts; peer++){
-            if (peer == shard.rank) continue;
-        f_t* recv_buff = &plan.full_local[offset_per_peer[peer]];
-        ncclRecv(recv_buff, plan.nb_elt_recv_per_peer[peer], peer);
-        }
-    }
-    ncclgroupend()
-}
\ No newline at end of file
+
+
+
+template struct pdlp_shard_t<int, double>;
+//template struct pdlp_shard_t<int, float>;
+}  // namespace cuopt::linear_programming::detail
diff --git a/cpp/src/pdlp/distributed_pdlp/shard.cuh b/cpp/src/pdlp/distributed_pdlp/shard.cuh
deleted file mode 100644
index 127cc496f1..0000000000
--- a/cpp/src/pdlp/distributed_pdlp/shard.cuh
+++ /dev/null
@@ -1,56 +0,0 @@
-
-
-
-template <typename i_t, typename f_t>
-struct comm_planner_t {
-    // The indices of the data we have to send to the others
-    // Maybe could merge evrything if it gives a speedup but a bit harder to read
-    std::vector<rmm::device_uvector<i_t>> send_indices_per_peer;
-    std::vector<int> nb_elt_send_per_peer;
-    std::vector<rmm::device_uvector<f_t>> send_buf_per_peer;
-
-    // Where to start writing in full_local for each peer    
-    std::vector<i_t> offset_per_peer;
-    std::vector<i_t> nb_elt_recv_per_peer;
-    rmm::device_uvector<f_t> full_local; // The full var/cstr vector containing all local data then all remote data
-};
-
-template <typename i_t, typename f_t>
-struct pdlp_shard_t {
-
-  // Specific multi-GPU data
-  int device_id;
-  ncclComm_t                comm;
-  comm_planner_t<i_t, f_t> x_plan, y_plan;
-
-  // Local per-rank PDLP data
-  raft::handle_t                   handle;          // owned: the actual handle for this shard's device/stream
-  problem_t<i_t, f_t>              local_problem;   // owned: holds handle_ptr = &handle (back-ref)
-  saddle_point_state_t<i_t, f_t>   saddle_point;    // owned: per-iter state, sized to local
-  cusparse_view_t<i_t, f_t>        cusparse_view;   // owned: descriptors bound to local_problem + saddle_point
-
-  rmm::device_uvector<f_t>         tmp_primal;
-  rmm::device_uvector<f_t>         tmp_dual;
-  rmm::device_uvector<f_t>         potential_next_primal;
-  rmm::device_uvector<f_t>         potential_next_dual;
-  rmm::device_uvector<f_t>         dual_slack;
-  rmm::device_uvector<f_t>         reflected_primal; // x, so it has primal_size + halo
-  rmm::device_uvector<f_t>         reflected_dual; // y, so it has dual_size + halo
-
-  rmm::device_scalar<f_t>          reusable_one;        // = 1.0
-  rmm::device_scalar<f_t>          reusable_zero;       // = 0.0
-  rmm::device_scalar<f_t>          reusable_neg_one;    // = -1.0
-
-  // ===== Missing for cuPDLP+ Halpern update =====
-  rmm::device_uvector<f_t>         initial_primal;      // snapshot at start of restart epoch
-  rmm::device_uvector<f_t>         initial_dual;
-
-  i_t                              primal_size_h;
-  i_t                              dual_size_h;
-  i_t                              primal_halo_size;
-  i_t                              dual_halo_size;
-  i_t                              full_primal_size_h;// = primal_size_h + primal_halo_size
-  i_t                              full_dual_size_h;  // = dual_size_h + dual_halo_size
-};
-
-
diff --git a/cpp/src/pdlp/distributed_pdlp/shard.hpp b/cpp/src/pdlp/distributed_pdlp/shard.hpp
new file mode 100644
index 0000000000..0fe57be974
--- /dev/null
+++ b/cpp/src/pdlp/distributed_pdlp/shard.hpp
@@ -0,0 +1,31 @@
+#pragma once
+#include <pdlp/distributed_pdlp/rank_data.hpp>
+#include <raft/core/handle.hpp>
+#include <nccl.h>
+#include <memory>
+namespace cuopt::linear_programming::detail {
+
+template <typename i_t, typename f_t>
+class pdlp_solver_t;
+
+template <typename i_t, typename f_t>
+class pdlp_shard_t {
+  // Declaration only, will be set as default in shard.cu . Needed to manage cyclic include of pdlp_solver_t.
+  public: 
+    ~pdlp_shard_t();
+  pdlp_shard_t(int device_id,
+    rank_data_t<i_t, f_t>&& rd,
+    ncclComm_t comm
+    /* ???????? */);
+
+  pdlp_shard_t(const pdlp_shard_t&)            = delete;
+  pdlp_shard_t& operator=(const pdlp_shard_t&) = delete;  // Specific multi-GPU data
+  int device_id;
+  raft::handle_t                            handle; 
+  ncclComm_t                comm;
+  rank_data_t<i_t, f_t>     rank_data;
+
+  std::unique_ptr<pdlp_solver_t<i_t, f_t>> sub_pdlp;
+};
+
+}
diff --git a/cpp/src/pdlp/pdlp.cuh b/cpp/src/pdlp/pdlp.cuh
index d03430f150..5cb267730f 100644
--- a/cpp/src/pdlp/pdlp.cuh
+++ b/cpp/src/pdlp/pdlp.cuh
@@ -12,6 +12,7 @@
 #include <cuopt/linear_programming/pdlp/solver_solution.hpp>
 
 #include <pdlp/cusparse_view.hpp>
+#include <pdlp/distributed_pdlp/multi_gpu_engine.hpp>
 #include <pdlp/initial_scaling_strategy/initial_scaling.cuh>
 #include <pdlp/pdhg.hpp>
 #include <pdlp/pdlp_climber_strategy.hpp>
@@ -32,6 +33,7 @@
 
 #include <optional>
 #include <unordered_set>
+#include "distributed_pdlp/multi_gpu_engine.hpp"
 
 namespace cuopt::linear_programming::detail {
 /**
@@ -237,6 +239,8 @@ class pdlp_solver_t {
   primal_quality_adapter_t best_primal_quality_so_far_;
   // Flag to indicate if solver is being called from MIP. No logging is done in this case.
   bool inside_mip_{false};
+
+  multi_gpu_engine_t<i_t, f_t> multi_gpu_engine;
 };
 
 }  // namespace cuopt::linear_programming::detail

From d89c85a9af1303ae12641a868a9cb83d64c32aee Mon Sep 17 00:00:00 2001
From: Bulle Mostovoi <vmostovoi@nvidia.com>
Date: Tue, 19 May 2026 13:49:37 +0200
Subject: [PATCH 06/67] still wip but going well

---
 .../pdlp/pdlp_hyper_params.cuh                |   1 +
 cpp/src/pdlp/CMakeLists.txt                   |   3 +
 .../pdlp/distributed_pdlp/multi_gpu_engine.cu |  73 +++++++
 .../distributed_pdlp/multi_gpu_engine.hpp     |  61 ++++--
 .../pdlp/distributed_pdlp/partition_loader.cu | 178 ++++++++++++++++++
 .../distributed_pdlp/partition_loader.hpp     |  14 ++
 cpp/src/pdlp/distributed_pdlp/shard.cu        | 101 +++++++++-
 cpp/src/pdlp/distributed_pdlp/shard.hpp       |  21 ++-
 cpp/src/pdlp/pdlp.cu                          |  97 +++++++++-
 cpp/src/pdlp/pdlp.cuh                         |  10 +-
 cpp/src/pdlp/solve.cu                         |  11 ++
 11 files changed, 551 insertions(+), 19 deletions(-)
 create mode 100644 cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.cu
 create mode 100644 cpp/src/pdlp/distributed_pdlp/partition_loader.cu

diff --git a/cpp/include/cuopt/linear_programming/pdlp/pdlp_hyper_params.cuh b/cpp/include/cuopt/linear_programming/pdlp/pdlp_hyper_params.cuh
index 282e91d7ef..962f06ee4a 100644
--- a/cpp/include/cuopt/linear_programming/pdlp/pdlp_hyper_params.cuh
+++ b/cpp/include/cuopt/linear_programming/pdlp/pdlp_hyper_params.cuh
@@ -47,6 +47,7 @@ struct pdlp_hyper_params_t {
   bool bound_objective_rescaling                                  = true;
   bool use_reflected_primal_dual                                  = true;
   bool use_fixed_point_error                                      = true;
+  bool use_distributed_pdlp                                       = false;
   double reflection_coefficient                                   = 1.0;
   double restart_k_p                                              = 0.99;
   double restart_k_i                                              = 0.01;
diff --git a/cpp/src/pdlp/CMakeLists.txt b/cpp/src/pdlp/CMakeLists.txt
index f5f26837b6..2bc2771c91 100644
--- a/cpp/src/pdlp/CMakeLists.txt
+++ b/cpp/src/pdlp/CMakeLists.txt
@@ -29,6 +29,9 @@ set(LP_CORE_FILES
   ${CMAKE_CURRENT_SOURCE_DIR}/termination_strategy/convergence_information.cu
   ${CMAKE_CURRENT_SOURCE_DIR}/optimal_batch_size_handler/optimal_batch_size_handler.cu
   ${CMAKE_CURRENT_SOURCE_DIR}/utilities/ping_pong_graph.cu
+  ${CMAKE_CURRENT_SOURCE_DIR}/distributed_pdlp/shard.cu
+  ${CMAKE_CURRENT_SOURCE_DIR}/distributed_pdlp/multi_gpu_engine.cu
+  ${CMAKE_CURRENT_SOURCE_DIR}/distributed_pdlp/partition_loader.cu
 )
 
 # C and Python adapter files
diff --git a/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.cu b/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.cu
new file mode 100644
index 0000000000..c7307c46ee
--- /dev/null
+++ b/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.cu
@@ -0,0 +1,73 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+ #include <pdlp/distributed_pdlp/multi_gpu_engine.hpp>
+
+ #include <cuopt/error.hpp>
+ 
+ #include <raft/core/device_setter.hpp>
+ 
+ #include <nccl.h>
+ 
+ #include <numeric>
+ 
+ namespace cuopt::linear_programming::detail {
+ 
+ template <typename i_t, typename f_t>
+ multi_gpu_engine_t<i_t, f_t>::multi_gpu_engine_t(
+   std::vector<rank_data_t<i_t, f_t>>&&      rank_data,
+   std::vector<f_t> const&                   h_global_obj,
+   std::vector<f_t> const&                   h_global_var_lower,
+   std::vector<f_t> const&                   h_global_var_upper,
+   std::vector<f_t> const&                   h_global_cstr_lower,
+   std::vector<f_t> const&                   h_global_cstr_upper,
+   bool                                      maximize,
+   f_t                                       objective_offset,
+   f_t                                       objective_scaling_factor,
+   pdlp_solver_settings_t<i_t, f_t> const&   sub_solver_settings)
+   : stream()
+ {
+   const int nb_parts = static_cast<int>(rank_data.size());
+   cuopt_expects(nb_parts > 0,
+                 error_type_t::ValidationError,
+                 "multi_gpu_engine_t: rank_data must be non-empty");
+ 
+   shards.reserve(nb_parts);
+ 
+   // 1:1 rank -> device mapping. (Matches metis_tests; refine later if needed.)
+   std::vector<int> devices(nb_parts);
+   std::iota(devices.begin(), devices.end(), 0);
+ 
+   // 2. Collectively bootstrap NCCL communicators across all devices.
+   //    Must be done together; each comm is then handed to one shard,
+   //    which wraps it in a unique_ptr with the device-aware deleter.
+   std::vector<ncclComm_t> raw_comms(nb_parts);
+   cuopt_expects(ncclCommInitAll(raw_comms.data(), nb_parts, devices.data()) == ncclSuccess,
+                 error_type_t::RuntimeError,
+                 "ncclCommInitAll failed");
+ 
+   // 3. Construct one shard per rank, pinned to its device.
+   for (int r = 0; r < nb_parts; ++r) {
+     raft::device_setter guard(devices[r]);  // shard ctor asserts current device
+     shards.emplace_back(std::make_unique<pdlp_shard_t<i_t, f_t>>(
+       devices[r],
+       std::move(rank_data[r]),
+       raw_comms[r],
+       h_global_obj,
+       h_global_var_lower,
+       h_global_var_upper,
+       h_global_cstr_lower,
+       h_global_cstr_upper,
+       maximize,
+       objective_offset,
+       objective_scaling_factor,
+       sub_solver_settings));
+   }
+ }
+ 
+ template struct multi_gpu_engine_t<int, double>;
+ // template struct multi_gpu_engine_t<int, float>;
+ 
+ }  // namespace cuopt::linear_programming::detail
\ No newline at end of file
diff --git a/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.hpp b/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.hpp
index 13ded70009..6142c938e3 100644
--- a/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.hpp
+++ b/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.hpp
@@ -1,14 +1,49 @@
-#pragma once
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+ #pragma once
 
-#include <pdlp/distributed_pdlp/shard.hpp>
-
-#include <vector>
-
-namespace cuopt::linear_programming::detail {
-
-template <typename i_t, typename f_t>
-struct multi_gpu_engine_t {
-  std::vector<pdlp_shard_t<i_t, f_t>> shards;
-};
-
-}  // namespace cuopt::linear_programming::detail
+ #include <pdlp/distributed_pdlp/rank_data.hpp>
+ #include <pdlp/distributed_pdlp/shard.hpp>
+ 
+ #include <cuopt/linear_programming/pdlp/solver_settings.hpp>
+ 
+ #include <rmm/cuda_stream.hpp>
+ 
+ #include <memory>
+ #include <vector>
+ 
+ namespace cuopt::linear_programming::detail {
+ 
+ template <typename i_t, typename f_t>
+ struct multi_gpu_engine_t {
+   // Constructs one shard per partition. Caller is responsible for:
+   //   - rank_data[i] being correctly populated for rank i
+   //   - the host vectors holding the (already scaled) global problem data
+   //   - sub_solver_settings being the per-shard PDLP config (num_gpus=1,
+   //     multi_gpu_partition_file="", scaling disabled).
+   multi_gpu_engine_t(
+     std::vector<rank_data_t<i_t, f_t>>&&      rank_data,
+     std::vector<f_t> const&                   h_global_obj,
+     std::vector<f_t> const&                   h_global_var_lower,
+     std::vector<f_t> const&                   h_global_var_upper,
+     std::vector<f_t> const&                   h_global_cstr_lower,
+     std::vector<f_t> const&                   h_global_cstr_upper,
+     bool                                      maximize,
+     f_t                                       objective_offset,
+     f_t                                       objective_scaling_factor,
+     pdlp_solver_settings_t<i_t, f_t> const&   sub_solver_settings);
+ 
+   multi_gpu_engine_t(const multi_gpu_engine_t&)            = delete;
+   multi_gpu_engine_t& operator=(const multi_gpu_engine_t&) = delete;
+ 
+   // Engine-level stream for fork/join orchestration (master side).
+   rmm::cuda_stream stream;
+ 
+   // Shards stored by unique_ptr because pdlp_shard_t is immovable
+   // (owns device-affine resources: handle, NCCL comm, RMM buffers).
+   std::vector<std::unique_ptr<pdlp_shard_t<i_t, f_t>>> shards;
+ };
+ 
+ }  // namespace cuopt::linear_programming::detail
\ No newline at end of file
diff --git a/cpp/src/pdlp/distributed_pdlp/partition_loader.cu b/cpp/src/pdlp/distributed_pdlp/partition_loader.cu
new file mode 100644
index 0000000000..449e8640ab
--- /dev/null
+++ b/cpp/src/pdlp/distributed_pdlp/partition_loader.cu
@@ -0,0 +1,178 @@
+static std::vector<int> parse_distributed_pdlp_partition_file(std::string file){
+    //returns a vector with all the values separated by a \n
+}
+
+std::vector<rank_data_t> create_rank_data_from_parts(const std::vector<i_t>& parts,
+    const std::vector<i_t>& A_row_offsets,
+    const std::vector<i_t>& A_col_indices,
+    const std::vector<f_t>& A_values,
+    const std::vector<i_t>& A_t_row_offsets,
+    const std::vector<i_t>& A_t_col_indices,
+    const std::vector<f_t>& A_t_values,
+    i_t nb_parts,
+    i_t nb_cstr,
+    i_t nb_vars,
+    i_t nnz)
+{
+std::vector<rank_data_t> rank_data(nb_parts, rank_data_t(nb_parts));
+std::vector<i_t> cstr_parts(parts.begin(), parts.begin() + nb_cstr);
+std::vector<i_t> var_parts(parts.begin() + nb_cstr, parts.begin() + nb_cstr + nb_vars);
+
+// 1. Compute ownership
+for (i_t i = 0; i < nb_cstr; i++) {
+rank_data[cstr_parts[i]].owned_cstr_indices.push_back(i);
+}
+for (i_t i = 0; i < nb_vars; i++) {
+rank_data[var_parts[i]].owned_var_indices.push_back(i);
+}
+
+// 2. Compute local matrices and rank_data
+for (i_t rank = 0; rank < nb_parts; rank++) {
+auto& rd = rank_data[rank];
+rd.owned_var_size  = rd.owned_var_indices.size();
+rd.owned_cstr_size = rd.owned_cstr_indices.size();
+// ---- A side ----
+std::vector<i_t> local_A_row_offsets;
+std::vector<i_t> local_A_col_indices;
+std::vector<f_t> local_A_values;
+
+i_t local_A_nnz = 0;
+local_A_row_offsets.push_back(local_A_nnz);
+
+// For each owned constraint, build local matrix A
+for (auto owned_cstr : rd.owned_cstr_indices) {
+i_t cstr_len = A_row_offsets[owned_cstr + 1] - A_row_offsets[owned_cstr];
+i_t row_start = A_row_offsets[owned_cstr];
+for (i_t v = 0; v < cstr_len; v++) {
+local_A_col_indices.push_back(A_col_indices[row_start + v]);
+local_A_values.push_back(A_values[row_start + v]);
+}
+local_A_nnz += cstr_len;
+local_A_row_offsets.push_back(local_A_nnz);
+}
+
+std::set<i_t> needed_vars;
+for (auto indice : local_A_col_indices) {
+if (var_parts[indice] != rank)
+needed_vars.insert(indice);
+}
+
+for (i_t peer = 0; peer < nb_parts; peer++) {
+std::vector<i_t> needed_var_from_peer;
+for (auto needed_var : needed_vars) {
+if (var_parts[needed_var] == peer)
+needed_var_from_peer.push_back(needed_var);
+}
+i_t nb_recv_from_peer = needed_var_from_peer.size();
+rd.var_recv_counts[peer] = nb_recv_from_peer;
+rd.var_recv_offsets[peer] =
+peer == 0
+? 0
+: rd.var_recv_offsets[peer - 1] + rd.var_recv_counts[peer - 1];
+rank_data[peer].var_send_per_peer[rank] = std::move(needed_var_from_peer);
+}
+
+rd.h_A_row_offsets = std::move(local_A_row_offsets);
+rd.h_A_col_indices = std::move(local_A_col_indices);
+rd.h_A_values = std::move(local_A_values);
+
+// ---- A_t side ----
+std::vector<i_t> local_A_t_row_offsets;
+std::vector<i_t> local_A_t_col_indices;
+std::vector<f_t> local_A_t_values;
+i_t local_A_t_nnz = 0;
+local_A_t_row_offsets.push_back(local_A_t_nnz);
+
+for (auto owned_var : rd.owned_var_indices) {
+i_t var_len = A_t_row_offsets[owned_var + 1] - A_t_row_offsets[owned_var];
+i_t row_start = A_t_row_offsets[owned_var];
+for (i_t v = 0; v < var_len; v++) {
+local_A_t_col_indices.push_back(A_t_col_indices[row_start + v]);
+local_A_t_values.push_back(A_t_values[row_start + v]);
+}
+local_A_t_nnz += var_len;
+local_A_t_row_offsets.push_back(local_A_t_nnz);
+}
+
+std::set<i_t> needed_cstrs;
+for (auto indice : local_A_t_col_indices) {
+if (cstr_parts[indice] != rank)
+needed_cstrs.insert(indice);
+}
+
+for (i_t peer = 0; peer < nb_parts; peer++) {
+std::vector<i_t> needed_cstr_from_peer;
+for (auto needed_cstr : needed_cstrs) {
+if (cstr_parts[needed_cstr] == peer)
+needed_cstr_from_peer.push_back(needed_cstr);
+}
+i_t nb_recv_from_peer = needed_cstr_from_peer.size();
+rd.cstr_recv_counts[peer] = nb_recv_from_peer;
+rd.cstr_recv_offsets[peer] =
+peer == 0
+? 0
+: rd.cstr_recv_offsets[peer - 1] + rd.cstr_recv_counts[peer - 1];
+rank_data[peer].cstr_send_per_peer[rank] = std::move(needed_cstr_from_peer);
+}
+
+rd.h_A_t_row_offsets = std::move(local_A_t_row_offsets);
+rd.h_A_t_col_indices = std::move(local_A_t_col_indices);
+rd.h_A_t_values = std::move(local_A_t_values);
+
+rd.total_var_size  = rd.owned_var_size  + needed_vars.size();
+rd.total_cstr_size = rd.owned_cstr_size + needed_cstrs.size();
+}
+
+// 3. Generate local indices for contiguous [[self], [peer1], ..., [peer_k]]
+//    Build scatter_gather_maps
+for (i_t rank = 0; rank < nb_parts; rank++) {
+auto& rd = rank_data[rank];
+
+i_t curr_id = 0;
+for (auto owned_cstr : rd.owned_cstr_indices) {
+rd.global_to_local_cstr[owned_cstr] = curr_id;
+rd.local_to_global_cstr.push_back(owned_cstr);
+curr_id++;
+}
+for (i_t peer = 0; peer < nb_parts; peer++) {
+if (peer == rank) continue;
+for (auto recv_cstr : rank_data[peer].cstr_send_per_peer[rank]) {
+rd.global_to_local_cstr[recv_cstr] = curr_id;
+// rd.local_to_global_cstr.push_back(recv_cstr); // Not needed, we only do local_to_global on owned side
+curr_id++;
+}
+}
+
+curr_id = 0;
+for (auto owned_var : rd.owned_var_indices) {
+rd.global_to_local_var[owned_var] = curr_id;
+rd.local_to_global_var.push_back(owned_var);
+curr_id++;
+}
+for (i_t peer = 0; peer < nb_parts; peer++) {
+if (peer == rank) continue;
+for (auto recv_var : rank_data[peer].var_send_per_peer[rank]) {
+rd.global_to_local_var[recv_var] = curr_id;
+// rd.local_to_global_var.push_back(recv_var); // same as over
+curr_id++;
+}
+}
+}
+
+// 4. Remap global -> local everywhere
+for (i_t rank = 0; rank < nb_parts; rank++) {
+auto& rd = rank_data[rank];
+
+for (auto& send_vec : rd.var_send_per_peer) {
+for (auto& v : send_vec) v = rd.global_to_local_var.at(v);
+}
+for (auto& send_vec : rd.cstr_send_per_peer) {
+for (auto& v : send_vec) v = rd.global_to_local_cstr.at(v);
+}
+
+for (auto& v : rd.h_A_col_indices) v = rd.global_to_local_var.at(v);
+for (auto& v : rd.h_A_t_col_indices) v = rd.global_to_local_cstr.at(v);
+}
+
+return rank_data;
+}
diff --git a/cpp/src/pdlp/distributed_pdlp/partition_loader.hpp b/cpp/src/pdlp/distributed_pdlp/partition_loader.hpp
index 139597f9cb..4d66d4445c 100644
--- a/cpp/src/pdlp/distributed_pdlp/partition_loader.hpp
+++ b/cpp/src/pdlp/distributed_pdlp/partition_loader.hpp
@@ -1,2 +1,16 @@
 
 
+partition_loader_t {
+    static std::vector<int> parse_distributed_pdlp_partition_file(std::string file);
+    std::vector<rank_data_t> create_rank_data_from_parts(const std::vector<i_t>& parts,
+        const std::vector<i_t>& A_row_offsets,
+        const std::vector<i_t>& A_col_indices,
+        const std::vector<f_t>& A_values,
+        const std::vector<i_t>& A_t_row_offsets,
+        const std::vector<i_t>& A_t_col_indices,
+        const std::vector<f_t>& A_t_values,
+        i_t nb_parts,
+        i_t nb_cstr,
+        i_t nb_vars,
+        i_t nnz);
+}
\ No newline at end of file
diff --git a/cpp/src/pdlp/distributed_pdlp/shard.cu b/cpp/src/pdlp/distributed_pdlp/shard.cu
index b7e176c3ee..d5e795bb61 100644
--- a/cpp/src/pdlp/distributed_pdlp/shard.cu
+++ b/cpp/src/pdlp/distributed_pdlp/shard.cu
@@ -5,15 +5,114 @@
 
 #include <pdlp/distributed_pdlp/shard.hpp>
 #include <pdlp/pdlp.cuh>
+
+#include <raft/core/copy.hpp>
+#include <raft/core/device_setter.hpp>
+
+#include <cassert>
+#include <limits>
+
 namespace cuopt::linear_programming::detail {
 
 // This must be done in .cu file because the pdlp_solver_t is not already complete in the hpp file
 template <typename i_t, typename f_t>
 pdlp_shard_t<i_t, f_t>::~pdlp_shard_t() = default;
 
+template <typename i_t, typename f_t>
+pdlp_shard_t<i_t, f_t>::pdlp_shard_t(
+  int device_id,
+  rank_data_t<i_t, f_t>&& rd,
+  ncclComm_t raw_comm,
+  std::vector<f_t> const& h_global_obj,
+  std::vector<f_t> const& h_global_var_lower,
+  std::vector<f_t> const& h_global_var_upper,
+  std::vector<f_t> const& h_global_cstr_lower,
+  std::vector<f_t> const& h_global_cstr_upper,
+  bool                                     maximize,
+  f_t                                      objective_offset,
+  f_t                                      objective_scaling_factor,
+  pdlp_solver_settings_t<i_t, f_t> const&  settings)
+  : device_id(device_id),
+    stream(),
+    handle(stream.view()),
+    comm(raw_comm, nccl_comm_deleter_t{device_id}),
+    rank_data(std::move(rd)),
+    opt_problem(std::nullopt),
+    sub_problem(std::nullopt),
+    sub_pdlp(nullptr)
+{
+  assert(raft::device_setter::get_current_device() == device_id && "Right device must be set before building the shard");
+
+  // ---- 1. Gather per-shard host slices using rank_data's index maps. ----
+  // All vectors are sized to TOTAL (owned + halo). Owned slots get real
+  // values; halo slots keep neutral defaults so they are no-ops even if
+  // accidentally touched before `owned_*_size_` plumbing is in place.
+  std::vector<f_t> h_obj       (rank_data.total_var_size,   f_t{0});
+  std::vector<f_t> h_var_lower (rank_data.total_var_size,  -std::numeric_limits<f_t>::infinity());
+  std::vector<f_t> h_var_upper (rank_data.total_var_size,   std::numeric_limits<f_t>::infinity());
+  std::vector<f_t> h_cstr_lower(rank_data.total_cstr_size, -std::numeric_limits<f_t>::infinity());
+  std::vector<f_t> h_cstr_upper(rank_data.total_cstr_size,  std::numeric_limits<f_t>::infinity());
+
+  for (i_t i = 0; i < rank_data.owned_var_size; ++i) {
+    const auto g  = rank_data.local_to_global_var[i];
+    h_obj[i]       = h_global_obj[g];
+    h_var_lower[i] = h_global_var_lower[g];
+    h_var_upper[i] = h_global_var_upper[g];
+  }
+  for (i_t i = 0; i < rank_data.owned_cstr_size; ++i) {
+    const auto g    = rank_data.local_to_global_cstr[i];
+    h_cstr_lower[i] = h_global_cstr_lower[g];
+    h_cstr_upper[i] = h_global_cstr_upper[g];
+  }
 
+  // ---- 2. Build optimization_problem_t on this shard's device. ----
+  opt_problem.emplace(&handle);
+  opt_problem->set_csr_constraint_matrix(
+    rank_data.h_A_values     .data(), static_cast<i_t>(rank_data.h_A_values     .size()),
+    rank_data.h_A_col_indices.data(), static_cast<i_t>(rank_data.h_A_col_indices.size()),
+    rank_data.h_A_row_offsets.data(), static_cast<i_t>(rank_data.h_A_row_offsets.size()));
 
+  // Primal axis: TOTAL (owned + halo). Halo slots have neutral defaults.
+  opt_problem->set_objective_coefficients(h_obj      .data(), rank_data.total_var_size);
+  opt_problem->set_variable_lower_bounds (h_var_lower.data(), rank_data.total_var_size);
+  opt_problem->set_variable_upper_bounds (h_var_upper.data(), rank_data.total_var_size);
+
+  // Dual axis: TOTAL (owned + halo). Halo slots have ±inf so trivially satisfied.
+  opt_problem->set_constraint_lower_bounds(h_cstr_lower.data(), rank_data.total_cstr_size);
+  opt_problem->set_constraint_upper_bounds(h_cstr_upper.data(), rank_data.total_cstr_size);
+
+  opt_problem->set_maximize(maximize);
+  opt_problem->set_objective_offset(objective_offset);
+  opt_problem->set_objective_scaling_factor(objective_scaling_factor);
+  opt_problem->set_problem_category(problem_category_t::LP);
+
+  // ---- 3. Build problem_t from opt_problem. ----
+  sub_problem.emplace(*opt_problem);
+
+  // ---- 4. Override reverse_* with the real local A_T from rank_data. ----
+  // problem_t's ctor computes the transpose of the LOCAL A, which is wrong
+  // in multi-GPU: A_local is owned_cstr x total_var, and A_t_local is the
+  // pre-sliced owned_var x total_cstr matrix we built during partitioning.
+  auto stream_view = handle.get_stream();
+  sub_problem->reverse_offsets     .resize(rank_data.h_A_t_row_offsets.size(), stream_view);
+  sub_problem->reverse_constraints .resize(rank_data.h_A_t_col_indices.size(), stream_view);
+  sub_problem->reverse_coefficients.resize(rank_data.h_A_t_values     .size(), stream_view);
+  raft::copy(sub_problem->reverse_offsets.data(),
+             rank_data.h_A_t_row_offsets.data(),
+             rank_data.h_A_t_row_offsets.size(), stream_view);
+  raft::copy(sub_problem->reverse_constraints.data(),
+             rank_data.h_A_t_col_indices.data(),
+             rank_data.h_A_t_col_indices.size(), stream_view);
+  raft::copy(sub_problem->reverse_coefficients.data(),
+             rank_data.h_A_t_values.data(),
+             rank_data.h_A_t_values.size(), stream_view);
+  handle.sync_stream(stream_view);
+
+  // ---- 5. Build sub_pdlp (single-GPU mode; multi_gpu flags cleared by caller). ----
+  sub_pdlp = std::make_unique<pdlp_solver_t<i_t, f_t>>(*sub_problem, settings, /*batch=*/false);
+}
 
 template struct pdlp_shard_t<int, double>;
-//template struct pdlp_shard_t<int, float>;
+// template struct pdlp_shard_t<int, float>;
+
 }  // namespace cuopt::linear_programming::detail
diff --git a/cpp/src/pdlp/distributed_pdlp/shard.hpp b/cpp/src/pdlp/distributed_pdlp/shard.hpp
index 0fe57be974..7528c35dec 100644
--- a/cpp/src/pdlp/distributed_pdlp/shard.hpp
+++ b/cpp/src/pdlp/distributed_pdlp/shard.hpp
@@ -8,6 +8,18 @@ namespace cuopt::linear_programming::detail {
 template <typename i_t, typename f_t>
 class pdlp_solver_t;
 
+struct nccl_comm_deleter_t {
+  int device_id{-1};
+  void operator()(ncclComm* comm) const noexcept
+  {
+    raft::device_setter guard(device_id);
+    if (comm != nullptr) {
+      ncclCommDestroy(comm);
+    }
+  }
+};
+using nccl_comm_unique_ptr_t = std::unique_ptr<ncclComm, nccl_comm_deleter_t>;
+
 template <typename i_t, typename f_t>
 class pdlp_shard_t {
   // Declaration only, will be set as default in shard.cu . Needed to manage cyclic include of pdlp_solver_t.
@@ -19,12 +31,15 @@ class pdlp_shard_t {
     /* ???????? */);
 
   pdlp_shard_t(const pdlp_shard_t&)            = delete;
-  pdlp_shard_t& operator=(const pdlp_shard_t&) = delete;  // Specific multi-GPU data
+  pdlp_shard_t& operator=(const pdlp_shard_t&) = delete;  
+  // Specific multi-GPU data
   int device_id;
+  rmm::cuda_stream stream;
   raft::handle_t                            handle; 
-  ncclComm_t                comm;
+  nccl_comm_unique_ptr_t comm; 
   rank_data_t<i_t, f_t>     rank_data;
-
+  optimization_problem_t opt_problem;
+  problem_t sub_problem;
   std::unique_ptr<pdlp_solver_t<i_t, f_t>> sub_pdlp;
 };
 
diff --git a/cpp/src/pdlp/pdlp.cu b/cpp/src/pdlp/pdlp.cu
index a759887fc5..a58ae4f210 100644
--- a/cpp/src/pdlp/pdlp.cu
+++ b/cpp/src/pdlp/pdlp.cu
@@ -11,12 +11,14 @@
 #include <cuopt/linear_programming/solver_settings.hpp>
 
 #include <pdlp/cusparse_view.hpp>
+#include <pdlp/distributed_pdlp/partition_loader.hpp>
 #include <pdlp/pdlp.cuh>
 #include <pdlp/swap_and_resize_helper.cuh>
 #include <pdlp/utils.cuh>
 
 #include <mip_heuristics/mip_constants.hpp>
 #include "cuopt/linear_programming/pdlp/solver_solution.hpp"
+#include "distributed_pdlp/multi_gpu_engine.hpp"
 
 #include <utilities/copy_helpers.hpp>
 #include <utilities/macros.cuh>
@@ -314,6 +316,95 @@ pdlp_solver_t<i_t, f_t>::pdlp_solver_t(problem_t<i_t, f_t>& op_problem,
   }
 }
 
+template <typename i_t, typename f_t>
+pdlp_solver_t<i_t, f_t>::pdlp_solver_t(problem_t<i_t, f_t>& op_problem,
+                                       pdlp_solver_settings_t<i_t, f_t> const& settings,
+                                       int num_gpus)
+  // 1. Delegate to single-GPU ctor to bring up all the per-master state
+  //    (problem_ptr, op_problem_scaled_, pdhg_solver_, strategies, etc.).
+  : pdlp_solver_t(op_problem, settings, false)
+{
+  cuopt_expects(num_gpus == settings.num_gpus && settings.num_gpus > 1,
+                error_type_t::ValidationError,
+                "This constructor should only be used for distributed PDLP (num_gpus > 1)");
+  // 2. Load partition
+  std::vector<i_t> parts;
+  if (!settings.multi_gpu_partition_file.empty()) {
+    parts = partition_loader_t<i_t, f_t>::parse_distributed_pdlp_partition_file(
+      settings.multi_gpu_partition_file);
+  } else {
+    cuopt_expects(false, error_type_t::NotImplemented,
+                  "Metis partitioning inside cuopt not implemented yet; "
+                  "provide a --parts file via settings.multi_gpu_partition_file");
+  }
+  // 3. Scale now before copying to children
+  initial_scaling_strategy_.scale_problem();
+
+  // 4. Copy the scaled global problem from device -> host.
+  auto const stream = op_problem_scaled_.handle_ptr->get_stream();
+  i_t const n_cstr  = op_problem_scaled_.n_constraints;
+  i_t const n_vars  = op_problem_scaled_.n_variables;
+  i_t const nnz     = op_problem_scaled_.nnz;
+  // CSRs (A and A_t).
+  std::vector<i_t> h_A_row_offsets  (n_cstr + 1);
+  std::vector<i_t> h_A_col_indices  (nnz);
+  std::vector<f_t> h_A_values       (nnz);
+  std::vector<i_t> h_A_t_row_offsets(n_vars + 1);
+  std::vector<i_t> h_A_t_col_indices(nnz);
+  std::vector<f_t> h_A_t_values     (nnz);
+  raft::copy(h_A_row_offsets  .data(), op_problem_scaled_.offsets             .data(), n_cstr + 1, stream);
+  raft::copy(h_A_col_indices  .data(), op_problem_scaled_.variables           .data(), nnz,        stream);
+  raft::copy(h_A_values       .data(), op_problem_scaled_.coefficients        .data(), nnz,        stream);
+  raft::copy(h_A_t_row_offsets.data(), op_problem_scaled_.reverse_offsets     .data(), n_vars + 1, stream);
+  raft::copy(h_A_t_col_indices.data(), op_problem_scaled_.reverse_constraints .data(), nnz,        stream);
+  raft::copy(h_A_t_values     .data(), op_problem_scaled_.reverse_coefficients.data(), nnz,        stream);
+  // Objective coefficients.
+  std::vector<f_t> h_obj(n_vars);
+  raft::copy(h_obj.data(), op_problem_scaled_.objective_coefficients.data(), n_vars, stream);
+  // Variable bounds: stored interleaved as f_t2 {lower, upper}. Unpack into two host vectors.
+  using f_t2 = typename type_2<f_t>::type;
+  std::vector<f_t2> h_var_bounds_packed(n_vars);
+  raft::copy(h_var_bounds_packed.data(),
+             op_problem_scaled_.variable_bounds.data(), n_vars, stream);
+  // Constraint bounds.
+  std::vector<f_t> h_cstr_lower(n_cstr);
+  std::vector<f_t> h_cstr_upper(n_cstr);
+  raft::copy(h_cstr_lower.data(), op_problem_scaled_.constraint_lower_bounds.data(), n_cstr, stream);
+  raft::copy(h_cstr_upper.data(), op_problem_scaled_.constraint_upper_bounds.data(), n_cstr, stream);
+  op_problem_scaled_.handle_ptr->sync_stream(stream);
+  
+  std::vector<f_t> h_var_lower(n_vars), h_var_upper(n_vars);
+  for (i_t i = 0; i < n_vars; ++i) {
+    h_var_lower[i] = h_var_bounds_packed[i].x;
+    h_var_upper[i] = h_var_bounds_packed[i].y;
+  }
+  // 5. Build per-rank data and meta-data
+  std::vector<rank_data_t<i_t, f_t>> sub_pdlp_rank_data =
+    partition_loader_t<i_t, f_t>::create_rank_data_from_parts(
+      parts,
+      h_A_row_offsets,   h_A_col_indices,   h_A_values,
+      h_A_t_row_offsets, h_A_t_col_indices, h_A_t_values,
+      settings.num_gpus, n_cstr, n_vars, nnz);
+  // 6. Build the per-shard PDLP settings:
+  //    - single-GPU mode (num_gpus=1, no partition file) so sub-solvers don't recurse;
+  //    - disable scaling (master already scaled the data we're handing out).
+  pdlp_solver_settings_t<i_t, f_t> sub_pdlp_settings        = settings;
+  sub_pdlp_settings.num_gpus                                = 1;
+  sub_pdlp_settings.multi_gpu_partition_file                = "";
+  sub_pdlp_settings.hyper_params.default_l_inf_ruiz_iterations          = 0;
+  sub_pdlp_settings.hyper_params.default_alpha_pock_chambolle_rescaling = 0.0;
+
+  // 7. Construct the engine — this collectively bootstraps NCCL across all GPUs
+  //    and constructs one shard per partition with the right slice of host data.
+  multi_gpu_engine.emplace(
+    std::move(sub_pdlp_rank_data),
+    h_obj, h_var_lower, h_var_upper, h_cstr_lower, h_cstr_upper,
+    op_problem_scaled_.maximize,
+    op_problem_scaled_.objective_offset,
+    op_problem_scaled_.presolve_data.objective_scaling_factor,
+    sub_pdlp_settings);
+}
+
 template <typename i_t, typename f_t>
 void pdlp_solver_t<i_t, f_t>::set_initial_primal_weight(f_t initial_primal_weight)
 {
@@ -2258,7 +2349,11 @@ optimization_problem_solution_t<i_t, f_t> pdlp_solver_t<i_t, f_t>::run_solver(co
       !settings_.get_initial_primal_weight().has_value())
     compute_initial_primal_weight();
 
-  initial_scaling_strategy_.scale_problem();
+  // In multi-GPU mode the master scaled op_problem_scaled_ in its ctor before
+  // distributing data to the shards, so skip the second scaling pass here.
+  if (!multi_gpu_engine.has_value()) {
+    initial_scaling_strategy_.scale_problem();
+  }
 
   // Update FP32 matrix copies for mixed precision SpMV after scaling
   pdhg_solver_.get_cusparse_view().update_mixed_precision_matrices();
diff --git a/cpp/src/pdlp/pdlp.cuh b/cpp/src/pdlp/pdlp.cuh
index 5cb267730f..ef992d2a9e 100644
--- a/cpp/src/pdlp/pdlp.cuh
+++ b/cpp/src/pdlp/pdlp.cuh
@@ -63,6 +63,11 @@ class pdlp_solver_t {
   pdlp_solver_t(problem_t<i_t, f_t>& op_problem,
                 pdlp_solver_settings_t<i_t, f_t> const& settings,
                 bool is_batch_mode = false);
+  
+  // Distributed Solver Constructor
+  pdlp_solver_t(problem_t<i_t, f_t>& op_problem,
+    pdlp_solver_settings_t<i_t, f_t> const& settings,
+    int num_gpus);
 
   optimization_problem_solution_t<i_t, f_t> run_solver(const timer_t& timer);
 
@@ -240,7 +245,10 @@ class pdlp_solver_t {
   // Flag to indicate if solver is being called from MIP. No logging is done in this case.
   bool inside_mip_{false};
 
-  multi_gpu_engine_t<i_t, f_t> multi_gpu_engine;
+  // std::optional because multi_gpu_engine_t is non-default-constructible
+  // (collectively bootstraps NCCL, owns RMM resources). Stays nullopt in
+  // single-GPU mode; emplaced by the multi-GPU ctor.
+  std::optional<multi_gpu_engine_t<i_t, f_t>> multi_gpu_engine;
 };
 
 }  // namespace cuopt::linear_programming::detail
diff --git a/cpp/src/pdlp/solve.cu b/cpp/src/pdlp/solve.cu
index 59f1a4517f..6057f1cb83 100644
--- a/cpp/src/pdlp/solve.cu
+++ b/cpp/src/pdlp/solve.cu
@@ -709,6 +709,17 @@ static optimization_problem_solution_t<i_t, f_t> run_pdlp_solver(
     }
   }
 #endif
+  if (settings.hyper_params.use_distributed_pdlp) {
+    cuopt_expects(settings.num_gpus > 1,
+                  error_type_t::ValidationError,
+                  "use_distributed_pdlp requires settings.num_gpus > 1");
+    cuopt_expects(!is_batch_mode,
+                  error_type_t::ValidationError,
+                  "Distributed PDLP does not support batch mode");
+    // Multi-GPU ctor; dispatched by 3rd-arg TYPE (int num_gpus, not bool batch).
+    detail::pdlp_solver_t<i_t, f_t> solver(problem, settings, settings.num_gpus);
+    return solver.run_solver(timer);
+  }
   detail::pdlp_solver_t<i_t, f_t> solver(problem, settings, is_batch_mode);
   if (settings.inside_mip) { solver.set_inside_mip(true); }
   return solver.run_solver(timer);

From 5534ff049bca7c32da24fd0dc755f5c17c5a0611 Mon Sep 17 00:00:00 2001
From: Bulle Mostovoi <vmostovoi@nvidia.com>
Date: Tue, 19 May 2026 13:54:58 +0200
Subject: [PATCH 07/67] cursor broke everything grrr

---
 .../pdlp/distributed_pdlp/partition_loader.cu | 371 ++++++++++--------
 .../distributed_pdlp/partition_loader.hpp     |  45 ++-
 cpp/src/pdlp/distributed_pdlp/shard.hpp       | 122 +++---
 3 files changed, 305 insertions(+), 233 deletions(-)

diff --git a/cpp/src/pdlp/distributed_pdlp/partition_loader.cu b/cpp/src/pdlp/distributed_pdlp/partition_loader.cu
index 449e8640ab..a9df158601 100644
--- a/cpp/src/pdlp/distributed_pdlp/partition_loader.cu
+++ b/cpp/src/pdlp/distributed_pdlp/partition_loader.cu
@@ -1,178 +1,201 @@
-static std::vector<int> parse_distributed_pdlp_partition_file(std::string file){
-    //returns a vector with all the values separated by a \n
-}
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ */
 
-std::vector<rank_data_t> create_rank_data_from_parts(const std::vector<i_t>& parts,
-    const std::vector<i_t>& A_row_offsets,
-    const std::vector<i_t>& A_col_indices,
-    const std::vector<f_t>& A_values,
-    const std::vector<i_t>& A_t_row_offsets,
-    const std::vector<i_t>& A_t_col_indices,
-    const std::vector<f_t>& A_t_values,
-    i_t nb_parts,
-    i_t nb_cstr,
-    i_t nb_vars,
-    i_t nnz)
-{
-std::vector<rank_data_t> rank_data(nb_parts, rank_data_t(nb_parts));
-std::vector<i_t> cstr_parts(parts.begin(), parts.begin() + nb_cstr);
-std::vector<i_t> var_parts(parts.begin() + nb_cstr, parts.begin() + nb_cstr + nb_vars);
+#include <pdlp/distributed_pdlp/partition_loader.hpp>
 
-// 1. Compute ownership
-for (i_t i = 0; i < nb_cstr; i++) {
-rank_data[cstr_parts[i]].owned_cstr_indices.push_back(i);
-}
-for (i_t i = 0; i < nb_vars; i++) {
-rank_data[var_parts[i]].owned_var_indices.push_back(i);
-}
+#include <set>
+#include <utility>
 
-// 2. Compute local matrices and rank_data
-for (i_t rank = 0; rank < nb_parts; rank++) {
-auto& rd = rank_data[rank];
-rd.owned_var_size  = rd.owned_var_indices.size();
-rd.owned_cstr_size = rd.owned_cstr_indices.size();
-// ---- A side ----
-std::vector<i_t> local_A_row_offsets;
-std::vector<i_t> local_A_col_indices;
-std::vector<f_t> local_A_values;
-
-i_t local_A_nnz = 0;
-local_A_row_offsets.push_back(local_A_nnz);
-
-// For each owned constraint, build local matrix A
-for (auto owned_cstr : rd.owned_cstr_indices) {
-i_t cstr_len = A_row_offsets[owned_cstr + 1] - A_row_offsets[owned_cstr];
-i_t row_start = A_row_offsets[owned_cstr];
-for (i_t v = 0; v < cstr_len; v++) {
-local_A_col_indices.push_back(A_col_indices[row_start + v]);
-local_A_values.push_back(A_values[row_start + v]);
-}
-local_A_nnz += cstr_len;
-local_A_row_offsets.push_back(local_A_nnz);
-}
-
-std::set<i_t> needed_vars;
-for (auto indice : local_A_col_indices) {
-if (var_parts[indice] != rank)
-needed_vars.insert(indice);
-}
-
-for (i_t peer = 0; peer < nb_parts; peer++) {
-std::vector<i_t> needed_var_from_peer;
-for (auto needed_var : needed_vars) {
-if (var_parts[needed_var] == peer)
-needed_var_from_peer.push_back(needed_var);
-}
-i_t nb_recv_from_peer = needed_var_from_peer.size();
-rd.var_recv_counts[peer] = nb_recv_from_peer;
-rd.var_recv_offsets[peer] =
-peer == 0
-? 0
-: rd.var_recv_offsets[peer - 1] + rd.var_recv_counts[peer - 1];
-rank_data[peer].var_send_per_peer[rank] = std::move(needed_var_from_peer);
-}
+namespace cuopt::linear_programming::detail {
 
-rd.h_A_row_offsets = std::move(local_A_row_offsets);
-rd.h_A_col_indices = std::move(local_A_col_indices);
-rd.h_A_values = std::move(local_A_values);
-
-// ---- A_t side ----
-std::vector<i_t> local_A_t_row_offsets;
-std::vector<i_t> local_A_t_col_indices;
-std::vector<f_t> local_A_t_values;
-i_t local_A_t_nnz = 0;
-local_A_t_row_offsets.push_back(local_A_t_nnz);
-
-for (auto owned_var : rd.owned_var_indices) {
-i_t var_len = A_t_row_offsets[owned_var + 1] - A_t_row_offsets[owned_var];
-i_t row_start = A_t_row_offsets[owned_var];
-for (i_t v = 0; v < var_len; v++) {
-local_A_t_col_indices.push_back(A_t_col_indices[row_start + v]);
-local_A_t_values.push_back(A_t_values[row_start + v]);
-}
-local_A_t_nnz += var_len;
-local_A_t_row_offsets.push_back(local_A_t_nnz);
-}
-
-std::set<i_t> needed_cstrs;
-for (auto indice : local_A_t_col_indices) {
-if (cstr_parts[indice] != rank)
-needed_cstrs.insert(indice);
-}
-
-for (i_t peer = 0; peer < nb_parts; peer++) {
-std::vector<i_t> needed_cstr_from_peer;
-for (auto needed_cstr : needed_cstrs) {
-if (cstr_parts[needed_cstr] == peer)
-needed_cstr_from_peer.push_back(needed_cstr);
-}
-i_t nb_recv_from_peer = needed_cstr_from_peer.size();
-rd.cstr_recv_counts[peer] = nb_recv_from_peer;
-rd.cstr_recv_offsets[peer] =
-peer == 0
-? 0
-: rd.cstr_recv_offsets[peer - 1] + rd.cstr_recv_counts[peer - 1];
-rank_data[peer].cstr_send_per_peer[rank] = std::move(needed_cstr_from_peer);
-}
-
-rd.h_A_t_row_offsets = std::move(local_A_t_row_offsets);
-rd.h_A_t_col_indices = std::move(local_A_t_col_indices);
-rd.h_A_t_values = std::move(local_A_t_values);
-
-rd.total_var_size  = rd.owned_var_size  + needed_vars.size();
-rd.total_cstr_size = rd.owned_cstr_size + needed_cstrs.size();
-}
-
-// 3. Generate local indices for contiguous [[self], [peer1], ..., [peer_k]]
-//    Build scatter_gather_maps
-for (i_t rank = 0; rank < nb_parts; rank++) {
-auto& rd = rank_data[rank];
-
-i_t curr_id = 0;
-for (auto owned_cstr : rd.owned_cstr_indices) {
-rd.global_to_local_cstr[owned_cstr] = curr_id;
-rd.local_to_global_cstr.push_back(owned_cstr);
-curr_id++;
-}
-for (i_t peer = 0; peer < nb_parts; peer++) {
-if (peer == rank) continue;
-for (auto recv_cstr : rank_data[peer].cstr_send_per_peer[rank]) {
-rd.global_to_local_cstr[recv_cstr] = curr_id;
-// rd.local_to_global_cstr.push_back(recv_cstr); // Not needed, we only do local_to_global on owned side
-curr_id++;
-}
-}
-
-curr_id = 0;
-for (auto owned_var : rd.owned_var_indices) {
-rd.global_to_local_var[owned_var] = curr_id;
-rd.local_to_global_var.push_back(owned_var);
-curr_id++;
-}
-for (i_t peer = 0; peer < nb_parts; peer++) {
-if (peer == rank) continue;
-for (auto recv_var : rank_data[peer].var_send_per_peer[rank]) {
-rd.global_to_local_var[recv_var] = curr_id;
-// rd.local_to_global_var.push_back(recv_var); // same as over
-curr_id++;
-}
-}
-}
-
-// 4. Remap global -> local everywhere
-for (i_t rank = 0; rank < nb_parts; rank++) {
-auto& rd = rank_data[rank];
-
-for (auto& send_vec : rd.var_send_per_peer) {
-for (auto& v : send_vec) v = rd.global_to_local_var.at(v);
-}
-for (auto& send_vec : rd.cstr_send_per_peer) {
-for (auto& v : send_vec) v = rd.global_to_local_cstr.at(v);
-}
-
-for (auto& v : rd.h_A_col_indices) v = rd.global_to_local_var.at(v);
-for (auto& v : rd.h_A_t_col_indices) v = rd.global_to_local_cstr.at(v);
-}
-
-return rank_data;
-}
+template <typename i_t, typename f_t>
+std::vector<int> partition_loader_t<i_t, f_t>::parse_distributed_pdlp_partition_file(
+  std::string file)
+{
+  // returns a vector with all the values separated by a \n
+  return {};  // TODO: implement
+}
+
+template <typename i_t, typename f_t>
+std::vector<rank_data_t<i_t, f_t>>
+partition_loader_t<i_t, f_t>::create_rank_data_from_parts(
+  const std::vector<i_t>& parts,
+  const std::vector<i_t>& A_row_offsets,
+  const std::vector<i_t>& A_col_indices,
+  const std::vector<f_t>& A_values,
+  const std::vector<i_t>& A_t_row_offsets,
+  const std::vector<i_t>& A_t_col_indices,
+  const std::vector<f_t>& A_t_values,
+  i_t nb_parts,
+  i_t nb_cstr,
+  i_t nb_vars,
+  i_t nnz)
+{
+  std::vector<rank_data_t<i_t, f_t>> rank_data(nb_parts, rank_data_t<i_t, f_t>(nb_parts));
+  std::vector<i_t> cstr_parts(parts.begin(), parts.begin() + nb_cstr);
+  std::vector<i_t> var_parts(parts.begin() + nb_cstr, parts.begin() + nb_cstr + nb_vars);
+
+  // 1. Compute ownership
+  for (i_t i = 0; i < nb_cstr; i++) {
+    rank_data[cstr_parts[i]].owned_cstr_indices.push_back(i);
+  }
+  for (i_t i = 0; i < nb_vars; i++) {
+    rank_data[var_parts[i]].owned_var_indices.push_back(i);
+  }
+
+  // 2. Compute local matrices and rank_data
+  for (i_t rank = 0; rank < nb_parts; rank++) {
+    auto& rd = rank_data[rank];
+    rd.owned_var_size  = rd.owned_var_indices.size();
+    rd.owned_cstr_size = rd.owned_cstr_indices.size();
+    // ---- A side ----
+    std::vector<i_t> local_A_row_offsets;
+    std::vector<i_t> local_A_col_indices;
+    std::vector<f_t> local_A_values;
+
+    i_t local_A_nnz = 0;
+    local_A_row_offsets.push_back(local_A_nnz);
+
+    // For each owned constraint, build local matrix A
+    for (auto owned_cstr : rd.owned_cstr_indices) {
+      i_t cstr_len = A_row_offsets[owned_cstr + 1] - A_row_offsets[owned_cstr];
+      i_t row_start = A_row_offsets[owned_cstr];
+      for (i_t v = 0; v < cstr_len; v++) {
+        local_A_col_indices.push_back(A_col_indices[row_start + v]);
+        local_A_values.push_back(A_values[row_start + v]);
+      }
+      local_A_nnz += cstr_len;
+      local_A_row_offsets.push_back(local_A_nnz);
+    }
+
+    std::set<i_t> needed_vars;
+    for (auto indice : local_A_col_indices) {
+      if (var_parts[indice] != rank)
+        needed_vars.insert(indice);
+    }
+
+    for (i_t peer = 0; peer < nb_parts; peer++) {
+      std::vector<i_t> needed_var_from_peer;
+      for (auto needed_var : needed_vars) {
+        if (var_parts[needed_var] == peer)
+          needed_var_from_peer.push_back(needed_var);
+      }
+      i_t nb_recv_from_peer = needed_var_from_peer.size();
+      rd.var_recv_counts[peer] = nb_recv_from_peer;
+      rd.var_recv_offsets[peer] =
+        peer == 0
+          ? 0
+          : rd.var_recv_offsets[peer - 1] + rd.var_recv_counts[peer - 1];
+      rank_data[peer].var_send_per_peer[rank] = std::move(needed_var_from_peer);
+    }
+
+    rd.h_A_row_offsets = std::move(local_A_row_offsets);
+    rd.h_A_col_indices = std::move(local_A_col_indices);
+    rd.h_A_values = std::move(local_A_values);
+
+    // ---- A_t side ----
+    std::vector<i_t> local_A_t_row_offsets;
+    std::vector<i_t> local_A_t_col_indices;
+    std::vector<f_t> local_A_t_values;
+    i_t local_A_t_nnz = 0;
+    local_A_t_row_offsets.push_back(local_A_t_nnz);
+
+    for (auto owned_var : rd.owned_var_indices) {
+      i_t var_len = A_t_row_offsets[owned_var + 1] - A_t_row_offsets[owned_var];
+      i_t row_start = A_t_row_offsets[owned_var];
+      for (i_t v = 0; v < var_len; v++) {
+        local_A_t_col_indices.push_back(A_t_col_indices[row_start + v]);
+        local_A_t_values.push_back(A_t_values[row_start + v]);
+      }
+      local_A_t_nnz += var_len;
+      local_A_t_row_offsets.push_back(local_A_t_nnz);
+    }
+
+    std::set<i_t> needed_cstrs;
+    for (auto indice : local_A_t_col_indices) {
+      if (cstr_parts[indice] != rank)
+        needed_cstrs.insert(indice);
+    }
+
+    for (i_t peer = 0; peer < nb_parts; peer++) {
+      std::vector<i_t> needed_cstr_from_peer;
+      for (auto needed_cstr : needed_cstrs) {
+        if (cstr_parts[needed_cstr] == peer)
+          needed_cstr_from_peer.push_back(needed_cstr);
+      }
+      i_t nb_recv_from_peer = needed_cstr_from_peer.size();
+      rd.cstr_recv_counts[peer] = nb_recv_from_peer;
+      rd.cstr_recv_offsets[peer] =
+        peer == 0
+          ? 0
+          : rd.cstr_recv_offsets[peer - 1] + rd.cstr_recv_counts[peer - 1];
+      rank_data[peer].cstr_send_per_peer[rank] = std::move(needed_cstr_from_peer);
+    }
+
+    rd.h_A_t_row_offsets = std::move(local_A_t_row_offsets);
+    rd.h_A_t_col_indices = std::move(local_A_t_col_indices);
+    rd.h_A_t_values = std::move(local_A_t_values);
+
+    rd.total_var_size  = rd.owned_var_size  + needed_vars.size();
+    rd.total_cstr_size = rd.owned_cstr_size + needed_cstrs.size();
+  }
+
+  // 3. Generate local indices for contiguous [[self], [peer1], ..., [peer_k]]
+  //    Build scatter_gather_maps
+  for (i_t rank = 0; rank < nb_parts; rank++) {
+    auto& rd = rank_data[rank];
+
+    i_t curr_id = 0;
+    for (auto owned_cstr : rd.owned_cstr_indices) {
+      rd.global_to_local_cstr[owned_cstr] = curr_id;
+      rd.local_to_global_cstr.push_back(owned_cstr);
+      curr_id++;
+    }
+    for (i_t peer = 0; peer < nb_parts; peer++) {
+      if (peer == rank) continue;
+      for (auto recv_cstr : rank_data[peer].cstr_send_per_peer[rank]) {
+        rd.global_to_local_cstr[recv_cstr] = curr_id;
+        // rd.local_to_global_cstr.push_back(recv_cstr); // Not needed, we only do local_to_global on owned side
+        curr_id++;
+      }
+    }
+
+    curr_id = 0;
+    for (auto owned_var : rd.owned_var_indices) {
+      rd.global_to_local_var[owned_var] = curr_id;
+      rd.local_to_global_var.push_back(owned_var);
+      curr_id++;
+    }
+    for (i_t peer = 0; peer < nb_parts; peer++) {
+      if (peer == rank) continue;
+      for (auto recv_var : rank_data[peer].var_send_per_peer[rank]) {
+        rd.global_to_local_var[recv_var] = curr_id;
+        // rd.local_to_global_var.push_back(recv_var); // same as over
+        curr_id++;
+      }
+    }
+  }
+
+  // 4. Remap global -> local everywhere
+  for (i_t rank = 0; rank < nb_parts; rank++) {
+    auto& rd = rank_data[rank];
+
+    for (auto& send_vec : rd.var_send_per_peer) {
+      for (auto& v : send_vec) v = rd.global_to_local_var.at(v);
+    }
+    for (auto& send_vec : rd.cstr_send_per_peer) {
+      for (auto& v : send_vec) v = rd.global_to_local_cstr.at(v);
+    }
+
+    for (auto& v : rd.h_A_col_indices) v = rd.global_to_local_var.at(v);
+    for (auto& v : rd.h_A_t_col_indices) v = rd.global_to_local_cstr.at(v);
+  }
+
+  return rank_data;
+}
+
+template struct partition_loader_t<int, double>;
+
+}  // namespace cuopt::linear_programming::detail
diff --git a/cpp/src/pdlp/distributed_pdlp/partition_loader.hpp b/cpp/src/pdlp/distributed_pdlp/partition_loader.hpp
index 4d66d4445c..efdfd0ba0e 100644
--- a/cpp/src/pdlp/distributed_pdlp/partition_loader.hpp
+++ b/cpp/src/pdlp/distributed_pdlp/partition_loader.hpp
@@ -1,16 +1,33 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ */
 
+#pragma once
 
-partition_loader_t {
-    static std::vector<int> parse_distributed_pdlp_partition_file(std::string file);
-    std::vector<rank_data_t> create_rank_data_from_parts(const std::vector<i_t>& parts,
-        const std::vector<i_t>& A_row_offsets,
-        const std::vector<i_t>& A_col_indices,
-        const std::vector<f_t>& A_values,
-        const std::vector<i_t>& A_t_row_offsets,
-        const std::vector<i_t>& A_t_col_indices,
-        const std::vector<f_t>& A_t_values,
-        i_t nb_parts,
-        i_t nb_cstr,
-        i_t nb_vars,
-        i_t nnz);
-}
\ No newline at end of file
+#include <pdlp/distributed_pdlp/rank_data.hpp>
+
+#include <string>
+#include <vector>
+
+namespace cuopt::linear_programming::detail {
+
+template <typename i_t, typename f_t>
+struct partition_loader_t {
+  static std::vector<int> parse_distributed_pdlp_partition_file(std::string file);
+
+  static std::vector<rank_data_t<i_t, f_t>> create_rank_data_from_parts(
+    const std::vector<i_t>& parts,
+    const std::vector<i_t>& A_row_offsets,
+    const std::vector<i_t>& A_col_indices,
+    const std::vector<f_t>& A_values,
+    const std::vector<i_t>& A_t_row_offsets,
+    const std::vector<i_t>& A_t_col_indices,
+    const std::vector<f_t>& A_t_values,
+    i_t nb_parts,
+    i_t nb_cstr,
+    i_t nb_vars,
+    i_t nnz);
+};
+
+}  // namespace cuopt::linear_programming::detail
diff --git a/cpp/src/pdlp/distributed_pdlp/shard.hpp b/cpp/src/pdlp/distributed_pdlp/shard.hpp
index 7528c35dec..a33477edf1 100644
--- a/cpp/src/pdlp/distributed_pdlp/shard.hpp
+++ b/cpp/src/pdlp/distributed_pdlp/shard.hpp
@@ -1,46 +1,78 @@
-#pragma once
-#include <pdlp/distributed_pdlp/rank_data.hpp>
-#include <raft/core/handle.hpp>
-#include <nccl.h>
-#include <memory>
-namespace cuopt::linear_programming::detail {
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+ #pragma once
 
-template <typename i_t, typename f_t>
-class pdlp_solver_t;
-
-struct nccl_comm_deleter_t {
-  int device_id{-1};
-  void operator()(ncclComm* comm) const noexcept
-  {
-    raft::device_setter guard(device_id);
-    if (comm != nullptr) {
-      ncclCommDestroy(comm);
-    }
-  }
-};
-using nccl_comm_unique_ptr_t = std::unique_ptr<ncclComm, nccl_comm_deleter_t>;
-
-template <typename i_t, typename f_t>
-class pdlp_shard_t {
-  // Declaration only, will be set as default in shard.cu . Needed to manage cyclic include of pdlp_solver_t.
-  public: 
-    ~pdlp_shard_t();
-  pdlp_shard_t(int device_id,
-    rank_data_t<i_t, f_t>&& rd,
-    ncclComm_t comm
-    /* ???????? */);
-
-  pdlp_shard_t(const pdlp_shard_t&)            = delete;
-  pdlp_shard_t& operator=(const pdlp_shard_t&) = delete;  
-  // Specific multi-GPU data
-  int device_id;
-  rmm::cuda_stream stream;
-  raft::handle_t                            handle; 
-  nccl_comm_unique_ptr_t comm; 
-  rank_data_t<i_t, f_t>     rank_data;
-  optimization_problem_t opt_problem;
-  problem_t sub_problem;
-  std::unique_ptr<pdlp_solver_t<i_t, f_t>> sub_pdlp;
-};
-
-}
+ #include <pdlp/distributed_pdlp/rank_data.hpp>
+ 
+ #include <cuopt/linear_programming/optimization_problem.hpp>
+ #include <cuopt/linear_programming/pdlp/solver_settings.hpp>
+ #include <mip_heuristics/problem/problem.cuh>
+ 
+ #include <raft/core/device_setter.hpp>
+ #include <raft/core/handle.hpp>
+ #include <rmm/cuda_stream.hpp>
+ 
+ #include <nccl.h>
+ 
+ #include <memory>
+ #include <optional>
+ #include <vector>
+ 
+ namespace cuopt::linear_programming::detail {
+ 
+ // Forward-declare to break the cyclic include with pdlp.cuh
+ // (pdlp.cuh -> multi_gpu_engine.hpp -> shard.hpp -> pdlp.cuh).
+ // Definitions of out-of-line members live in shard.cu, which includes pdlp.cuh.
+ template <typename i_t, typename f_t>
+ class pdlp_solver_t;
+ 
+ // RAII deleter for ncclComm_t; sets the right device before destroy.
+ struct nccl_comm_deleter_t {
+   int device_id{-1};
+   void operator()(ncclComm* comm) const noexcept
+   {
+     if (comm == nullptr) return;
+     raft::device_setter guard(device_id);
+     ncclCommDestroy(comm);
+   }
+ };
+ using nccl_comm_unique_ptr_t = std::unique_ptr<ncclComm, nccl_comm_deleter_t>;
+ 
+ template <typename i_t, typename f_t>
+ struct pdlp_shard_t {
+   // Out-of-line (in shard.cu) because pdlp_solver_t is incomplete here.
+   ~pdlp_shard_t();
+ 
+   pdlp_shard_t(int device_id,
+                rank_data_t<i_t, f_t>&& rd,
+                ncclComm_t raw_comm,
+                std::vector<f_t> const& h_global_obj,
+                std::vector<f_t> const& h_global_var_lower,
+                std::vector<f_t> const& h_global_var_upper,
+                std::vector<f_t> const& h_global_cstr_lower,
+                std::vector<f_t> const& h_global_cstr_upper,
+                bool maximize,
+                f_t  objective_offset,
+                f_t  objective_scaling_factor,
+                pdlp_solver_settings_t<i_t, f_t> const& settings);
+ 
+   pdlp_shard_t(const pdlp_shard_t&)            = delete;
+   pdlp_shard_t& operator=(const pdlp_shard_t&) = delete;
+   // Move ops are implicitly deleted (user-declared dtor + deleted copy).
+   // Intentional: shard owns device-affine resources and must never move.
+   // Store as std::unique_ptr in any container.
+ 
+   int                                              device_id;
+   rmm::cuda_stream                                 stream;
+   raft::handle_t                                   handle;
+   nccl_comm_unique_ptr_t                           comm;
+   rank_data_t<i_t, f_t>                            rank_data;
+   std::optional<optimization_problem_t<i_t, f_t>>  opt_problem;
+   std::optional<problem_t<i_t, f_t>>               sub_problem;
+   std::unique_ptr<pdlp_solver_t<i_t, f_t>>         sub_pdlp;
+ };
+ 
+ }  // namespace cuopt::linear_programming::detail
+ 
\ No newline at end of file

From dd935c5307a312918121b53a27674bb4656fd291 Mon Sep 17 00:00:00 2001
From: Bulle Mostovoi <vmostovoi@nvidia.com>
Date: Tue, 19 May 2026 14:26:31 +0200
Subject: [PATCH 08/67] partition loader now partition loads

---
 .../pdlp/distributed_pdlp/partition_loader.cu | 28 ++++++++++++++++---
 .../distributed_pdlp/partition_loader.hpp     |  5 +++-
 2 files changed, 28 insertions(+), 5 deletions(-)

diff --git a/cpp/src/pdlp/distributed_pdlp/partition_loader.cu b/cpp/src/pdlp/distributed_pdlp/partition_loader.cu
index a9df158601..0e122cefc0 100644
--- a/cpp/src/pdlp/distributed_pdlp/partition_loader.cu
+++ b/cpp/src/pdlp/distributed_pdlp/partition_loader.cu
@@ -5,17 +5,37 @@
 
 #include <pdlp/distributed_pdlp/partition_loader.hpp>
 
+#include <cuopt/error.hpp>
+
+#include <fstream>
 #include <set>
 #include <utility>
 
 namespace cuopt::linear_programming::detail {
 
 template <typename i_t, typename f_t>
-std::vector<int> partition_loader_t<i_t, f_t>::parse_distributed_pdlp_partition_file(
-  std::string file)
+std::vector<i_t> partition_loader_t<i_t, f_t>::parse_distributed_pdlp_partition_file(
+  std::string const& file)
 {
-  // returns a vector with all the values separated by a \n
-  return {};  // TODO: implement
+  std::ifstream part_file(file);
+  cuopt_expects(part_file.is_open(),
+                error_type_t::ValidationError,
+                "Failed to open partition file: " + file);
+
+  // One integer per line; operator>> skips whitespace so blank lines and
+  // trailing newlines are tolerated.
+  std::vector<i_t> parts;
+  i_t part = 0;
+  while (part_file >> part) {
+    parts.push_back(part);
+  }
+
+  // We must have hit EOF cleanly; any other state means a malformed token.
+  cuopt_expects(part_file.eof(),
+                error_type_t::ValidationError,
+                "Malformed partition file (expected one integer per line): " + file);
+
+  return parts;
 }
 
 template <typename i_t, typename f_t>
diff --git a/cpp/src/pdlp/distributed_pdlp/partition_loader.hpp b/cpp/src/pdlp/distributed_pdlp/partition_loader.hpp
index efdfd0ba0e..25560cdbfd 100644
--- a/cpp/src/pdlp/distributed_pdlp/partition_loader.hpp
+++ b/cpp/src/pdlp/distributed_pdlp/partition_loader.hpp
@@ -14,7 +14,10 @@ namespace cuopt::linear_programming::detail {
 
 template <typename i_t, typename f_t>
 struct partition_loader_t {
-  static std::vector<int> parse_distributed_pdlp_partition_file(std::string file);
+  // Read a Metis-style partition file: one part-id per line (whitespace-tolerant),
+  // ASCII integers in [0, nb_parts). Returns a flat vector of length
+  // nb_cstr + nb_vars, indexed as in create_rank_data_from_parts (cstrs first, then vars).
+  static std::vector<i_t> parse_distributed_pdlp_partition_file(std::string const& file);
 
   static std::vector<rank_data_t<i_t, f_t>> create_rank_data_from_parts(
     const std::vector<i_t>& parts,

From 09eb20b7701df0079309ab6932a5a03a9fd6595e Mon Sep 17 00:00:00 2001
From: Bulle Mostovoi <vmostovoi@nvidia.com>
Date: Tue, 19 May 2026 19:44:15 +0200
Subject: [PATCH 09/67] big advancements ayo ! We can soon start working on
 implementing the solver !!!

---
 .../pdlp/solver_settings.hpp                  |   2 +
 .../pdlp/distributed_pdlp/multi_gpu_engine.cu | 116 ++++++++------
 .../distributed_pdlp/multi_gpu_engine.hpp     |  41 ++---
 .../pdlp/distributed_pdlp/partition_loader.cu |  38 +++--
 .../distributed_pdlp/partition_loader.hpp     |   3 +
 cpp/src/pdlp/distributed_pdlp/rank_data.hpp   |   2 +
 cpp/src/pdlp/distributed_pdlp/shard.cu        | 102 ++++++++++--
 cpp/src/pdlp/distributed_pdlp/shard.hpp       |  35 +++--
 .../initial_scaling.cu                        |  36 +++++
 .../initial_scaling.cuh                       |   7 +
 cpp/src/pdlp/pdlp.cu                          | 145 ++++++++++++------
 cpp/src/pdlp/pdlp.cuh                         |   8 +-
 12 files changed, 382 insertions(+), 153 deletions(-)

diff --git a/cpp/include/cuopt/linear_programming/pdlp/solver_settings.hpp b/cpp/include/cuopt/linear_programming/pdlp/solver_settings.hpp
index 4585b9d1cf..2a18b8060f 100644
--- a/cpp/include/cuopt/linear_programming/pdlp/solver_settings.hpp
+++ b/cpp/include/cuopt/linear_programming/pdlp/solver_settings.hpp
@@ -287,6 +287,8 @@ class pdlp_solver_settings_t {
   bool dual_postsolve{true};
   int num_gpus{1};
   std::string multi_gpu_partition_file{""};
+  // Set to true inside the shards
+  bool is_distributed_sub_pdlp{false};
   method_t method{method_t::Concurrent};
   bool inside_mip{false};
   // For concurrent termination
diff --git a/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.cu b/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.cu
index c7307c46ee..9b404bbd53 100644
--- a/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.cu
+++ b/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.cu
@@ -15,57 +15,71 @@
  
  namespace cuopt::linear_programming::detail {
  
- template <typename i_t, typename f_t>
- multi_gpu_engine_t<i_t, f_t>::multi_gpu_engine_t(
-   std::vector<rank_data_t<i_t, f_t>>&&      rank_data,
-   std::vector<f_t> const&                   h_global_obj,
-   std::vector<f_t> const&                   h_global_var_lower,
-   std::vector<f_t> const&                   h_global_var_upper,
-   std::vector<f_t> const&                   h_global_cstr_lower,
-   std::vector<f_t> const&                   h_global_cstr_upper,
-   bool                                      maximize,
-   f_t                                       objective_offset,
-   f_t                                       objective_scaling_factor,
-   pdlp_solver_settings_t<i_t, f_t> const&   sub_solver_settings)
-   : stream()
- {
-   const int nb_parts = static_cast<int>(rank_data.size());
-   cuopt_expects(nb_parts > 0,
-                 error_type_t::ValidationError,
-                 "multi_gpu_engine_t: rank_data must be non-empty");
- 
-   shards.reserve(nb_parts);
- 
-   // 1:1 rank -> device mapping. (Matches metis_tests; refine later if needed.)
-   std::vector<int> devices(nb_parts);
-   std::iota(devices.begin(), devices.end(), 0);
- 
-   // 2. Collectively bootstrap NCCL communicators across all devices.
-   //    Must be done together; each comm is then handed to one shard,
-   //    which wraps it in a unique_ptr with the device-aware deleter.
-   std::vector<ncclComm_t> raw_comms(nb_parts);
-   cuopt_expects(ncclCommInitAll(raw_comms.data(), nb_parts, devices.data()) == ncclSuccess,
-                 error_type_t::RuntimeError,
-                 "ncclCommInitAll failed");
- 
-   // 3. Construct one shard per rank, pinned to its device.
-   for (int r = 0; r < nb_parts; ++r) {
-     raft::device_setter guard(devices[r]);  // shard ctor asserts current device
-     shards.emplace_back(std::make_unique<pdlp_shard_t<i_t, f_t>>(
-       devices[r],
-       std::move(rank_data[r]),
-       raw_comms[r],
-       h_global_obj,
-       h_global_var_lower,
-       h_global_var_upper,
-       h_global_cstr_lower,
-       h_global_cstr_upper,
-       maximize,
-       objective_offset,
-       objective_scaling_factor,
-       sub_solver_settings));
-   }
- }
+template <typename i_t, typename f_t>
+multi_gpu_engine_t<i_t, f_t>::multi_gpu_engine_t(
+  std::vector<rank_data_t<i_t, f_t>>&&      rank_data,
+  std::vector<f_t> const&                   h_global_obj,
+  std::vector<f_t> const&                   h_global_var_lower,
+  std::vector<f_t> const&                   h_global_var_upper,
+  std::vector<f_t> const&                   h_global_cstr_lower,
+  std::vector<f_t> const&                   h_global_cstr_upper,
+  std::vector<f_t> const&                   h_global_obj_scaled,
+  std::vector<f_t> const&                   h_global_var_lower_scaled,
+  std::vector<f_t> const&                   h_global_var_upper_scaled,
+  std::vector<f_t> const&                   h_global_cstr_lower_scaled,
+  std::vector<f_t> const&                   h_global_cstr_upper_scaled,
+  std::vector<f_t> const&                   h_global_cummulative_cstr_scaling,
+  std::vector<f_t> const&                   h_global_cummulative_var_scaling,
+  f_t                                       h_bound_rescaling,
+  f_t                                       h_objective_rescaling,
+  bool                                      maximize,
+  f_t                                       objective_offset,
+  f_t                                       objective_scaling_factor,
+  pdlp_solver_settings_t<i_t, f_t> const&   sub_solver_settings)
+  : stream()
+{
+  const int nb_parts = static_cast<int>(rank_data.size());
+  cuopt_expects(nb_parts > 0,
+                error_type_t::ValidationError,
+                "multi_gpu_engine_t: rank_data must be non-empty");
+
+  shards.reserve(nb_parts);
+  std::vector<int> devices(nb_parts);
+  std::iota(devices.begin(), devices.end(), 0);
+
+  // Create NCCL Comms then let shards own them
+  std::vector<ncclComm_t> raw_comms(nb_parts);
+  cuopt_expects(ncclCommInitAll(raw_comms.data(), nb_parts, devices.data()) == ncclSuccess,
+                error_type_t::RuntimeError,
+                "ncclCommInitAll failed");
+
+  // 3. Construct one shard per rank, pinned to its device.
+  for (int r = 0; r < nb_parts; ++r) {
+    raft::device_setter guard(devices[r]);  // shard ctor needs device set
+    shards.emplace_back(std::make_unique<pdlp_shard_t<i_t, f_t>>(
+      devices[r],
+      std::move(rank_data[r]),
+      raw_comms[r],
+      h_global_obj,
+      h_global_var_lower,
+      h_global_var_upper,
+      h_global_cstr_lower,
+      h_global_cstr_upper,
+      h_global_obj_scaled,
+      h_global_var_lower_scaled,
+      h_global_var_upper_scaled,
+      h_global_cstr_lower_scaled,
+      h_global_cstr_upper_scaled,
+      h_global_cummulative_cstr_scaling,
+      h_global_cummulative_var_scaling,
+      h_bound_rescaling,
+      h_objective_rescaling,
+      maximize,
+      objective_offset,
+      objective_scaling_factor,
+      sub_solver_settings));
+  }
+}
  
  template struct multi_gpu_engine_t<int, double>;
  // template struct multi_gpu_engine_t<int, float>;
diff --git a/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.hpp b/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.hpp
index 6142c938e3..d672e18197 100644
--- a/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.hpp
+++ b/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.hpp
@@ -16,24 +16,29 @@
  
  namespace cuopt::linear_programming::detail {
  
- template <typename i_t, typename f_t>
- struct multi_gpu_engine_t {
-   // Constructs one shard per partition. Caller is responsible for:
-   //   - rank_data[i] being correctly populated for rank i
-   //   - the host vectors holding the (already scaled) global problem data
-   //   - sub_solver_settings being the per-shard PDLP config (num_gpus=1,
-   //     multi_gpu_partition_file="", scaling disabled).
-   multi_gpu_engine_t(
-     std::vector<rank_data_t<i_t, f_t>>&&      rank_data,
-     std::vector<f_t> const&                   h_global_obj,
-     std::vector<f_t> const&                   h_global_var_lower,
-     std::vector<f_t> const&                   h_global_var_upper,
-     std::vector<f_t> const&                   h_global_cstr_lower,
-     std::vector<f_t> const&                   h_global_cstr_upper,
-     bool                                      maximize,
-     f_t                                       objective_offset,
-     f_t                                       objective_scaling_factor,
-     pdlp_solver_settings_t<i_t, f_t> const&   sub_solver_settings);
+template <typename i_t, typename f_t>
+struct multi_gpu_engine_t {
+  // Constructs shards from rank_data
+  multi_gpu_engine_t(
+    std::vector<rank_data_t<i_t, f_t>>&&      rank_data,
+    std::vector<f_t> const&                   h_global_obj,
+    std::vector<f_t> const&                   h_global_var_lower,
+    std::vector<f_t> const&                   h_global_var_upper,
+    std::vector<f_t> const&                   h_global_cstr_lower,
+    std::vector<f_t> const&                   h_global_cstr_upper,
+    std::vector<f_t> const&                   h_global_obj_scaled,
+    std::vector<f_t> const&                   h_global_var_lower_scaled,
+    std::vector<f_t> const&                   h_global_var_upper_scaled,
+    std::vector<f_t> const&                   h_global_cstr_lower_scaled,
+    std::vector<f_t> const&                   h_global_cstr_upper_scaled,
+    std::vector<f_t> const&                   h_global_cummulative_cstr_scaling,
+    std::vector<f_t> const&                   h_global_cummulative_var_scaling,
+    f_t                                       h_bound_rescaling,
+    f_t                                       h_objective_rescaling,
+    bool                                      maximize,
+    f_t                                       objective_offset,
+    f_t                                       objective_scaling_factor,
+    pdlp_solver_settings_t<i_t, f_t> const&   sub_solver_settings);
  
    multi_gpu_engine_t(const multi_gpu_engine_t&)            = delete;
    multi_gpu_engine_t& operator=(const multi_gpu_engine_t&) = delete;
diff --git a/cpp/src/pdlp/distributed_pdlp/partition_loader.cu b/cpp/src/pdlp/distributed_pdlp/partition_loader.cu
index 0e122cefc0..047fb536d5 100644
--- a/cpp/src/pdlp/distributed_pdlp/partition_loader.cu
+++ b/cpp/src/pdlp/distributed_pdlp/partition_loader.cu
@@ -45,14 +45,23 @@ partition_loader_t<i_t, f_t>::create_rank_data_from_parts(
   const std::vector<i_t>& A_row_offsets,
   const std::vector<i_t>& A_col_indices,
   const std::vector<f_t>& A_values,
+  const std::vector<f_t>& A_values_scaled,
   const std::vector<i_t>& A_t_row_offsets,
   const std::vector<i_t>& A_t_col_indices,
   const std::vector<f_t>& A_t_values,
+  const std::vector<f_t>& A_t_values_scaled,
   i_t nb_parts,
   i_t nb_cstr,
   i_t nb_vars,
   i_t nnz)
 {
+  cuopt_expects(A_values.size() == A_values_scaled.size(),
+                error_type_t::ValidationError,
+                "A_values and A_values_scaled must have the same length");
+  cuopt_expects(A_t_values.size() == A_t_values_scaled.size(),
+                error_type_t::ValidationError,
+                "A_t_values and A_t_values_scaled must have the same length");
+
   std::vector<rank_data_t<i_t, f_t>> rank_data(nb_parts, rank_data_t<i_t, f_t>(nb_parts));
   std::vector<i_t> cstr_parts(parts.begin(), parts.begin() + nb_cstr);
   std::vector<i_t> var_parts(parts.begin() + nb_cstr, parts.begin() + nb_cstr + nb_vars);
@@ -74,17 +83,22 @@ partition_loader_t<i_t, f_t>::create_rank_data_from_parts(
     std::vector<i_t> local_A_row_offsets;
     std::vector<i_t> local_A_col_indices;
     std::vector<f_t> local_A_values;
+    std::vector<f_t> local_A_values_scaled;
 
     i_t local_A_nnz = 0;
     local_A_row_offsets.push_back(local_A_nnz);
 
-    // For each owned constraint, build local matrix A
+    // For each owned constraint, build local matrix A. We walk both the
+    // unscaled and scaled global value arrays in lockstep so the produced
+    // local arrays share identical (offsets, col_indices) and differ only
+    // in values.
     for (auto owned_cstr : rd.owned_cstr_indices) {
       i_t cstr_len = A_row_offsets[owned_cstr + 1] - A_row_offsets[owned_cstr];
       i_t row_start = A_row_offsets[owned_cstr];
       for (i_t v = 0; v < cstr_len; v++) {
         local_A_col_indices.push_back(A_col_indices[row_start + v]);
-        local_A_values.push_back(A_values[row_start + v]);
+        local_A_values       .push_back(A_values       [row_start + v]);
+        local_A_values_scaled.push_back(A_values_scaled[row_start + v]);
       }
       local_A_nnz += cstr_len;
       local_A_row_offsets.push_back(local_A_nnz);
@@ -111,14 +125,16 @@ partition_loader_t<i_t, f_t>::create_rank_data_from_parts(
       rank_data[peer].var_send_per_peer[rank] = std::move(needed_var_from_peer);
     }
 
-    rd.h_A_row_offsets = std::move(local_A_row_offsets);
-    rd.h_A_col_indices = std::move(local_A_col_indices);
-    rd.h_A_values = std::move(local_A_values);
+    rd.h_A_row_offsets    = std::move(local_A_row_offsets);
+    rd.h_A_col_indices    = std::move(local_A_col_indices);
+    rd.h_A_values         = std::move(local_A_values);
+    rd.h_A_values_scaled  = std::move(local_A_values_scaled);
 
     // ---- A_t side ----
     std::vector<i_t> local_A_t_row_offsets;
     std::vector<i_t> local_A_t_col_indices;
     std::vector<f_t> local_A_t_values;
+    std::vector<f_t> local_A_t_values_scaled;
     i_t local_A_t_nnz = 0;
     local_A_t_row_offsets.push_back(local_A_t_nnz);
 
@@ -126,8 +142,9 @@ partition_loader_t<i_t, f_t>::create_rank_data_from_parts(
       i_t var_len = A_t_row_offsets[owned_var + 1] - A_t_row_offsets[owned_var];
       i_t row_start = A_t_row_offsets[owned_var];
       for (i_t v = 0; v < var_len; v++) {
-        local_A_t_col_indices.push_back(A_t_col_indices[row_start + v]);
-        local_A_t_values.push_back(A_t_values[row_start + v]);
+        local_A_t_col_indices  .push_back(A_t_col_indices [row_start + v]);
+        local_A_t_values       .push_back(A_t_values      [row_start + v]);
+        local_A_t_values_scaled.push_back(A_t_values_scaled[row_start + v]);
       }
       local_A_t_nnz += var_len;
       local_A_t_row_offsets.push_back(local_A_t_nnz);
@@ -154,9 +171,10 @@ partition_loader_t<i_t, f_t>::create_rank_data_from_parts(
       rank_data[peer].cstr_send_per_peer[rank] = std::move(needed_cstr_from_peer);
     }
 
-    rd.h_A_t_row_offsets = std::move(local_A_t_row_offsets);
-    rd.h_A_t_col_indices = std::move(local_A_t_col_indices);
-    rd.h_A_t_values = std::move(local_A_t_values);
+    rd.h_A_t_row_offsets    = std::move(local_A_t_row_offsets);
+    rd.h_A_t_col_indices    = std::move(local_A_t_col_indices);
+    rd.h_A_t_values         = std::move(local_A_t_values);
+    rd.h_A_t_values_scaled  = std::move(local_A_t_values_scaled);
 
     rd.total_var_size  = rd.owned_var_size  + needed_vars.size();
     rd.total_cstr_size = rd.owned_cstr_size + needed_cstrs.size();
diff --git a/cpp/src/pdlp/distributed_pdlp/partition_loader.hpp b/cpp/src/pdlp/distributed_pdlp/partition_loader.hpp
index 25560cdbfd..915c24a828 100644
--- a/cpp/src/pdlp/distributed_pdlp/partition_loader.hpp
+++ b/cpp/src/pdlp/distributed_pdlp/partition_loader.hpp
@@ -19,14 +19,17 @@ struct partition_loader_t {
   // nb_cstr + nb_vars, indexed as in create_rank_data_from_parts (cstrs first, then vars).
   static std::vector<i_t> parse_distributed_pdlp_partition_file(std::string const& file);
 
+  // Slices the data to prepare a split from a METIS partitioning with halo communication
   static std::vector<rank_data_t<i_t, f_t>> create_rank_data_from_parts(
     const std::vector<i_t>& parts,
     const std::vector<i_t>& A_row_offsets,
     const std::vector<i_t>& A_col_indices,
     const std::vector<f_t>& A_values,
+    const std::vector<f_t>& A_values_scaled,
     const std::vector<i_t>& A_t_row_offsets,
     const std::vector<i_t>& A_t_col_indices,
     const std::vector<f_t>& A_t_values,
+    const std::vector<f_t>& A_t_values_scaled,
     i_t nb_parts,
     i_t nb_cstr,
     i_t nb_vars,
diff --git a/cpp/src/pdlp/distributed_pdlp/rank_data.hpp b/cpp/src/pdlp/distributed_pdlp/rank_data.hpp
index ee107f5cf1..29d76ae110 100644
--- a/cpp/src/pdlp/distributed_pdlp/rank_data.hpp
+++ b/cpp/src/pdlp/distributed_pdlp/rank_data.hpp
@@ -44,9 +44,11 @@ struct rank_data_t {
     std::vector<i_t> h_A_row_offsets;
     std::vector<i_t> h_A_col_indices;
     std::vector<f_t> h_A_values;
+    std::vector<f_t> h_A_values_scaled;
     // A_t
     std::vector<i_t> h_A_t_row_offsets;
     std::vector<i_t> h_A_t_col_indices;
     std::vector<f_t> h_A_t_values;
+    std::vector<f_t> h_A_t_values_scaled;
   };
 } // namespace cuopt::linear_programming::detail
\ No newline at end of file
diff --git a/cpp/src/pdlp/distributed_pdlp/shard.cu b/cpp/src/pdlp/distributed_pdlp/shard.cu
index d5e795bb61..41f74086ab 100644
--- a/cpp/src/pdlp/distributed_pdlp/shard.cu
+++ b/cpp/src/pdlp/distributed_pdlp/shard.cu
@@ -5,6 +5,9 @@
 
 #include <pdlp/distributed_pdlp/shard.hpp>
 #include <pdlp/pdlp.cuh>
+#include <pdlp/utils.cuh>
+
+#include <utilities/copy_helpers.hpp>
 
 #include <raft/core/copy.hpp>
 #include <raft/core/device_setter.hpp>
@@ -28,6 +31,15 @@ pdlp_shard_t<i_t, f_t>::pdlp_shard_t(
   std::vector<f_t> const& h_global_var_upper,
   std::vector<f_t> const& h_global_cstr_lower,
   std::vector<f_t> const& h_global_cstr_upper,
+  std::vector<f_t> const& h_global_obj_scaled,
+  std::vector<f_t> const& h_global_var_lower_scaled,
+  std::vector<f_t> const& h_global_var_upper_scaled,
+  std::vector<f_t> const& h_global_cstr_lower_scaled,
+  std::vector<f_t> const& h_global_cstr_upper_scaled,
+  std::vector<f_t> const& h_global_cummulative_cstr_scaling,
+  std::vector<f_t> const& h_global_cummulative_var_scaling,
+  f_t                                      h_bound_rescaling,
+  f_t                                      h_objective_rescaling,
   bool                                     maximize,
   f_t                                      objective_offset,
   f_t                                      objective_scaling_factor,
@@ -45,27 +57,47 @@ pdlp_shard_t<i_t, f_t>::pdlp_shard_t(
 
   // ---- 1. Gather per-shard host slices using rank_data's index maps. ----
   // All vectors are sized to TOTAL (owned + halo). Owned slots get real
-  // values; halo slots keep neutral defaults so they are no-ops even if
-  // accidentally touched before `owned_*_size_` plumbing is in place.
-  std::vector<f_t> h_obj       (rank_data.total_var_size,   f_t{0});
-  std::vector<f_t> h_var_lower (rank_data.total_var_size,  -std::numeric_limits<f_t>::infinity());
-  std::vector<f_t> h_var_upper (rank_data.total_var_size,   std::numeric_limits<f_t>::infinity());
-  std::vector<f_t> h_cstr_lower(rank_data.total_cstr_size, -std::numeric_limits<f_t>::infinity());
-  std::vector<f_t> h_cstr_upper(rank_data.total_cstr_size,  std::numeric_limits<f_t>::infinity());
+  // values; halo slots keep defaults because they should not be accessed
+  std::vector<f_t> h_obj              (rank_data.total_var_size,   f_t{0});
+  std::vector<f_t> h_var_lower        (rank_data.total_var_size,  -std::numeric_limits<f_t>::infinity());
+  std::vector<f_t> h_var_upper        (rank_data.total_var_size,   std::numeric_limits<f_t>::infinity());
+  std::vector<f_t> h_cstr_lower       (rank_data.total_cstr_size, -std::numeric_limits<f_t>::infinity());
+  std::vector<f_t> h_cstr_upper       (rank_data.total_cstr_size,  std::numeric_limits<f_t>::infinity());
+
+  std::vector<f_t> h_obj_scaled       (rank_data.total_var_size,   f_t{0});
+  std::vector<f_t> h_var_lower_scaled (rank_data.total_var_size,  -std::numeric_limits<f_t>::infinity());
+  std::vector<f_t> h_var_upper_scaled (rank_data.total_var_size,   std::numeric_limits<f_t>::infinity());
+  std::vector<f_t> h_cstr_lower_scaled(rank_data.total_cstr_size, -std::numeric_limits<f_t>::infinity());
+  std::vector<f_t> h_cstr_upper_scaled(rank_data.total_cstr_size,  std::numeric_limits<f_t>::infinity());
 
   for (i_t i = 0; i < rank_data.owned_var_size; ++i) {
-    const auto g  = rank_data.local_to_global_var[i];
-    h_obj[i]       = h_global_obj[g];
-    h_var_lower[i] = h_global_var_lower[g];
-    h_var_upper[i] = h_global_var_upper[g];
+    const auto g          = rank_data.local_to_global_var[i];
+    h_obj[i]              = h_global_obj[g];
+    h_var_lower[i]        = h_global_var_lower[g];
+    h_var_upper[i]        = h_global_var_upper[g];
+    h_obj_scaled[i]       = h_global_obj_scaled[g];
+    h_var_lower_scaled[i] = h_global_var_lower_scaled[g];
+    h_var_upper_scaled[i] = h_global_var_upper_scaled[g];
+  }
+  for (i_t i = 0; i < rank_data.owned_cstr_size; ++i) {
+    const auto g           = rank_data.local_to_global_cstr[i];
+    h_cstr_lower[i]        = h_global_cstr_lower[g];
+    h_cstr_upper[i]        = h_global_cstr_upper[g];
+    h_cstr_lower_scaled[i] = h_global_cstr_lower_scaled[g];
+    h_cstr_upper_scaled[i] = h_global_cstr_upper_scaled[g];
   }
+
+  // Get local scaling factors
+  std::vector<f_t> h_cstr_scaling_local(rank_data.total_cstr_size, f_t{1});
+  std::vector<f_t> h_var_scaling_local (rank_data.total_var_size,  f_t{1});
   for (i_t i = 0; i < rank_data.owned_cstr_size; ++i) {
-    const auto g    = rank_data.local_to_global_cstr[i];
-    h_cstr_lower[i] = h_global_cstr_lower[g];
-    h_cstr_upper[i] = h_global_cstr_upper[g];
+    h_cstr_scaling_local[i] = h_global_cummulative_cstr_scaling[rank_data.local_to_global_cstr[i]];
+  }
+  for (i_t i = 0; i < rank_data.owned_var_size; ++i) {
+    h_var_scaling_local[i] = h_global_cummulative_var_scaling[rank_data.local_to_global_var[i]];
   }
 
-  // ---- 2. Build optimization_problem_t on this shard's device. ----
+  // ---- 2. Build optimization_problem_t on this shard's device (UNSCALED). ----
   opt_problem.emplace(&handle);
   opt_problem->set_csr_constraint_matrix(
     rank_data.h_A_values     .data(), static_cast<i_t>(rank_data.h_A_values     .size()),
@@ -86,7 +118,7 @@ pdlp_shard_t<i_t, f_t>::pdlp_shard_t(
   opt_problem->set_objective_scaling_factor(objective_scaling_factor);
   opt_problem->set_problem_category(problem_category_t::LP);
 
-  // ---- 3. Build problem_t from opt_problem. ----
+  // ---- 3. Build problem_t from opt_problem (still UNSCALED). ----
   sub_problem.emplace(*opt_problem);
 
   // ---- 4. Override reverse_* with the real local A_T from rank_data. ----
@@ -109,7 +141,45 @@ pdlp_shard_t<i_t, f_t>::pdlp_shard_t(
   handle.sync_stream(stream_view);
 
   // ---- 5. Build sub_pdlp (single-GPU mode; multi_gpu flags cleared by caller). ----
+  //         At this point sub_pdlp.op_problem_scaled_ is an unscaled copy
+  //         of sub_problem and sub_pdlp.initial_scaling_strategy_ has
+  //         unit cumulative factors (sub-settings disable Ruiz / PC iters).
   sub_pdlp = std::make_unique<pdlp_solver_t<i_t, f_t>>(*sub_problem, settings, /*batch=*/false);
+
+  // Inject master-scaled buffers inside sub_pdlp
+  auto& scaled = sub_pdlp->get_op_problem_scaled();
+  raft::copy(scaled.coefficients.data(),
+             rank_data.h_A_values_scaled.data(),
+             rank_data.h_A_values_scaled.size(), stream_view);
+  raft::copy(scaled.reverse_coefficients.data(),
+             rank_data.h_A_t_values_scaled.data(),
+             rank_data.h_A_t_values_scaled.size(), stream_view);
+  raft::copy(scaled.objective_coefficients.data(),
+             h_obj_scaled.data(), h_obj_scaled.size(), stream_view);
+  raft::copy(scaled.constraint_lower_bounds.data(),
+             h_cstr_lower_scaled.data(), h_cstr_lower_scaled.size(), stream_view);
+  raft::copy(scaled.constraint_upper_bounds.data(),
+             h_cstr_upper_scaled.data(), h_cstr_upper_scaled.size(), stream_view);
+
+  using f_t2 = typename type_2<f_t>::type;
+  std::vector<f_t2> h_var_bounds_scaled_packed(rank_data.total_var_size);
+  for (i_t i = 0; i < rank_data.total_var_size; ++i) {
+    h_var_bounds_scaled_packed[i].x = h_var_lower_scaled[i];
+    h_var_bounds_scaled_packed[i].y = h_var_upper_scaled[i];
+  }
+  raft::copy(scaled.variable_bounds.data(),
+             h_var_bounds_scaled_packed.data(),
+             h_var_bounds_scaled_packed.size(), stream_view);
+
+  combine_constraint_bounds<i_t, f_t>(scaled, scaled.combined_bounds);
+
+  // Inject the master's scaling state into sub_pdlp's initial scaling strategy
+  auto& scaling = sub_pdlp->get_initial_scaling_strategy();
+  scaling.set_cummulative_scaling(h_cstr_scaling_local, h_var_scaling_local);
+  scaling.set_h_bound_rescaling   (h_bound_rescaling);
+  scaling.set_h_objective_rescaling(h_objective_rescaling);
+
+  handle.sync_stream(stream_view);
 }
 
 template struct pdlp_shard_t<int, double>;
diff --git a/cpp/src/pdlp/distributed_pdlp/shard.hpp b/cpp/src/pdlp/distributed_pdlp/shard.hpp
index a33477edf1..3c10a90f90 100644
--- a/cpp/src/pdlp/distributed_pdlp/shard.hpp
+++ b/cpp/src/pdlp/distributed_pdlp/shard.hpp
@@ -45,18 +45,29 @@
    // Out-of-line (in shard.cu) because pdlp_solver_t is incomplete here.
    ~pdlp_shard_t();
  
-   pdlp_shard_t(int device_id,
-                rank_data_t<i_t, f_t>&& rd,
-                ncclComm_t raw_comm,
-                std::vector<f_t> const& h_global_obj,
-                std::vector<f_t> const& h_global_var_lower,
-                std::vector<f_t> const& h_global_var_upper,
-                std::vector<f_t> const& h_global_cstr_lower,
-                std::vector<f_t> const& h_global_cstr_upper,
-                bool maximize,
-                f_t  objective_offset,
-                f_t  objective_scaling_factor,
-                pdlp_solver_settings_t<i_t, f_t> const& settings);
+  // sub worker for distributed pdlp. Owns its own view on scaled problem and unscaled problem
+  // Owns necessary multi-gpu data (rank_data, device_id, nccl_comm)
+  pdlp_shard_t(int device_id,
+               rank_data_t<i_t, f_t>&& rd,
+               ncclComm_t raw_comm,
+               std::vector<f_t> const& h_global_obj,
+               std::vector<f_t> const& h_global_var_lower,
+               std::vector<f_t> const& h_global_var_upper,
+               std::vector<f_t> const& h_global_cstr_lower,
+               std::vector<f_t> const& h_global_cstr_upper,
+               std::vector<f_t> const& h_global_obj_scaled,
+               std::vector<f_t> const& h_global_var_lower_scaled,
+               std::vector<f_t> const& h_global_var_upper_scaled,
+               std::vector<f_t> const& h_global_cstr_lower_scaled,
+               std::vector<f_t> const& h_global_cstr_upper_scaled,
+               std::vector<f_t> const& h_global_cummulative_cstr_scaling,
+               std::vector<f_t> const& h_global_cummulative_var_scaling,
+               f_t h_bound_rescaling,
+               f_t h_objective_rescaling,
+               bool maximize,
+               f_t  objective_offset,
+               f_t  objective_scaling_factor,
+               pdlp_solver_settings_t<i_t, f_t> const& settings);
  
    pdlp_shard_t(const pdlp_shard_t&)            = delete;
    pdlp_shard_t& operator=(const pdlp_shard_t&) = delete;
diff --git a/cpp/src/pdlp/initial_scaling_strategy/initial_scaling.cu b/cpp/src/pdlp/initial_scaling_strategy/initial_scaling.cu
index a76b1773f9..a94064d0af 100644
--- a/cpp/src/pdlp/initial_scaling_strategy/initial_scaling.cu
+++ b/cpp/src/pdlp/initial_scaling_strategy/initial_scaling.cu
@@ -809,6 +809,42 @@ pdlp_initial_scaling_strategy_t<i_t, f_t>::get_variable_scaling_vector() const
   return cummulative_variable_scaling_;
 }
 
+template <typename i_t, typename f_t>
+void pdlp_initial_scaling_strategy_t<i_t, f_t>::set_cummulative_scaling(
+  const std::vector<f_t>& h_cummulative_constraint_matrix_scaling,
+  const std::vector<f_t>& h_cummulative_variable_scaling)
+{
+  cuopt_expects(static_cast<i_t>(h_cummulative_constraint_matrix_scaling.size()) == dual_size_h_,
+                error_type_t::ValidationError,
+                "set_cummulative_scaling: host constraint scaling vector size mismatch");
+  cuopt_expects(static_cast<i_t>(h_cummulative_variable_scaling.size()) == primal_size_h_,
+                error_type_t::ValidationError,
+                "set_cummulative_scaling: host variable scaling vector size mismatch");
+
+  raft::copy(cummulative_constraint_matrix_scaling_.data(),
+             h_cummulative_constraint_matrix_scaling.data(),
+             h_cummulative_constraint_matrix_scaling.size(),
+             stream_view_);
+  raft::copy(cummulative_variable_scaling_.data(),
+             h_cummulative_variable_scaling.data(),
+             h_cummulative_variable_scaling.size(),
+             stream_view_);
+}
+
+template <typename i_t, typename f_t>
+void pdlp_initial_scaling_strategy_t<i_t, f_t>::set_h_bound_rescaling(f_t value)
+{
+  h_bound_rescaling = value;
+  bound_rescaling_.set_value_async(value, stream_view_);
+}
+
+template <typename i_t, typename f_t>
+void pdlp_initial_scaling_strategy_t<i_t, f_t>::set_h_objective_rescaling(f_t value)
+{
+  h_objective_rescaling = value;
+  objective_rescaling_.set_value_async(value, stream_view_);
+}
+
 template <typename i_t, typename f_t>
 typename pdlp_initial_scaling_strategy_t<i_t, f_t>::view_t
 pdlp_initial_scaling_strategy_t<i_t, f_t>::view()
diff --git a/cpp/src/pdlp/initial_scaling_strategy/initial_scaling.cuh b/cpp/src/pdlp/initial_scaling_strategy/initial_scaling.cuh
index 5a3dcfaca2..ed5f8b1851 100644
--- a/cpp/src/pdlp/initial_scaling_strategy/initial_scaling.cuh
+++ b/cpp/src/pdlp/initial_scaling_strategy/initial_scaling.cuh
@@ -76,6 +76,13 @@ class pdlp_initial_scaling_strategy_t {
   f_t get_h_bound_rescaling() const;
   f_t get_h_objective_rescaling() const;
 
+  // Inject scaling state computed by another pdlp_initial_scaling_strategy_t
+  // Needed by distributed PDLP
+  void set_cummulative_scaling(const std::vector<f_t>& h_cummulative_constraint_matrix_scaling,
+                               const std::vector<f_t>& h_cummulative_variable_scaling);
+  void set_h_bound_rescaling(f_t value);
+  void set_h_objective_rescaling(f_t value);
+
   void bound_objective_rescaling();
 
   /**
diff --git a/cpp/src/pdlp/pdlp.cu b/cpp/src/pdlp/pdlp.cu
index a58ae4f210..612eb676ec 100644
--- a/cpp/src/pdlp/pdlp.cu
+++ b/cpp/src/pdlp/pdlp.cu
@@ -337,68 +337,119 @@ pdlp_solver_t<i_t, f_t>::pdlp_solver_t(problem_t<i_t, f_t>& op_problem,
                   "Metis partitioning inside cuopt not implemented yet; "
                   "provide a --parts file via settings.multi_gpu_partition_file");
   }
-  // 3. Scale now before copying to children
+
+  // Always compute the initial primal weight before scaling and the initial step size after scaling, to match cuPDLPx
+  assert(settings_.hyper_params.compute_initial_primal_weight_before_scaling && "compute_initial_primal_weight_before_scaling must be true in distributed mode");
+  assert(!settings_.hyper_params.compute_initial_step_size_before_scaling && "compute_initial_step_size_before_scaling must be false in distributed mode");
+  
+  compute_initial_primal_weight();
+  
+  // scale globally before dispatching to shards
   initial_scaling_strategy_.scale_problem();
+  
+  compute_initial_step_size();
 
-  // 4. Copy the scaled global problem from device -> host.
+  const f_t initial_step_size_global = get_step_size_h(0);
+  const f_t initial_primal_weight_global = get_primal_weight_h(0);
+
+  // 4. Copy both the scaled and unscaled problems from device to host
   auto const stream = op_problem_scaled_.handle_ptr->get_stream();
   i_t const n_cstr  = op_problem_scaled_.n_constraints;
   i_t const n_vars  = op_problem_scaled_.n_variables;
   i_t const nnz     = op_problem_scaled_.nnz;
-  // CSRs (A and A_t).
+
+  // Shared topology (taken from the scaled problem, but identical on both).
   std::vector<i_t> h_A_row_offsets  (n_cstr + 1);
   std::vector<i_t> h_A_col_indices  (nnz);
-  std::vector<f_t> h_A_values       (nnz);
   std::vector<i_t> h_A_t_row_offsets(n_vars + 1);
   std::vector<i_t> h_A_t_col_indices(nnz);
-  std::vector<f_t> h_A_t_values     (nnz);
-  raft::copy(h_A_row_offsets  .data(), op_problem_scaled_.offsets             .data(), n_cstr + 1, stream);
-  raft::copy(h_A_col_indices  .data(), op_problem_scaled_.variables           .data(), nnz,        stream);
-  raft::copy(h_A_values       .data(), op_problem_scaled_.coefficients        .data(), nnz,        stream);
-  raft::copy(h_A_t_row_offsets.data(), op_problem_scaled_.reverse_offsets     .data(), n_vars + 1, stream);
-  raft::copy(h_A_t_col_indices.data(), op_problem_scaled_.reverse_constraints .data(), nnz,        stream);
-  raft::copy(h_A_t_values     .data(), op_problem_scaled_.reverse_coefficients.data(), nnz,        stream);
-  // Objective coefficients.
-  std::vector<f_t> h_obj(n_vars);
-  raft::copy(h_obj.data(), op_problem_scaled_.objective_coefficients.data(), n_vars, stream);
-  // Variable bounds: stored interleaved as f_t2 {lower, upper}. Unpack into two host vectors.
+  raft::copy(h_A_row_offsets  .data(), op_problem_scaled_.offsets            .data(), n_cstr + 1, stream);
+  raft::copy(h_A_col_indices  .data(), op_problem_scaled_.variables          .data(), nnz,        stream);
+  raft::copy(h_A_t_row_offsets.data(), op_problem_scaled_.reverse_offsets    .data(), n_vars + 1, stream);
+  raft::copy(h_A_t_col_indices.data(), op_problem_scaled_.reverse_constraints.data(), nnz,        stream);
+
+  // Paired value arrays for A and A_T.
+  std::vector<f_t> h_A_values        (nnz);
+  std::vector<f_t> h_A_values_scaled (nnz);
+  std::vector<f_t> h_A_t_values      (nnz);
+  std::vector<f_t> h_A_t_values_scaled(nnz);
+  raft::copy(h_A_values        .data(), problem_ptr->coefficients         .data(), nnz, stream);
+  raft::copy(h_A_t_values      .data(), problem_ptr->reverse_coefficients .data(), nnz, stream);
+  raft::copy(h_A_values_scaled .data(), op_problem_scaled_.coefficients        .data(), nnz, stream);
+  raft::copy(h_A_t_values_scaled.data(), op_problem_scaled_.reverse_coefficients.data(), nnz, stream);
+
   using f_t2 = typename type_2<f_t>::type;
-  std::vector<f_t2> h_var_bounds_packed(n_vars);
-  raft::copy(h_var_bounds_packed.data(),
-             op_problem_scaled_.variable_bounds.data(), n_vars, stream);
-  // Constraint bounds.
-  std::vector<f_t> h_cstr_lower(n_cstr);
-  std::vector<f_t> h_cstr_upper(n_cstr);
-  raft::copy(h_cstr_lower.data(), op_problem_scaled_.constraint_lower_bounds.data(), n_cstr, stream);
-  raft::copy(h_cstr_upper.data(), op_problem_scaled_.constraint_upper_bounds.data(), n_cstr, stream);
+
+  std::vector<f_t>  h_obj             (n_vars);
+  std::vector<f_t>  h_obj_scaled      (n_vars);
+  std::vector<f_t2> h_var_bounds_packed       (n_vars);
+  std::vector<f_t2> h_var_bounds_scaled_packed(n_vars);
+  std::vector<f_t>  h_cstr_lower      (n_cstr);
+  std::vector<f_t>  h_cstr_upper      (n_cstr);
+  std::vector<f_t>  h_cstr_lower_scaled(n_cstr);
+  std::vector<f_t>  h_cstr_upper_scaled(n_cstr);
+
+  raft::copy(h_obj                     .data(), problem_ptr->objective_coefficients.data(), n_vars, stream);
+  raft::copy(h_obj_scaled              .data(), op_problem_scaled_.objective_coefficients.data(), n_vars, stream);
+  raft::copy(h_var_bounds_packed       .data(), problem_ptr->variable_bounds.data(),       n_vars, stream);
+  raft::copy(h_var_bounds_scaled_packed.data(), op_problem_scaled_.variable_bounds.data(), n_vars, stream);
+  raft::copy(h_cstr_lower              .data(), problem_ptr->constraint_lower_bounds.data(),       n_cstr, stream);
+  raft::copy(h_cstr_upper              .data(), problem_ptr->constraint_upper_bounds.data(),       n_cstr, stream);
+  raft::copy(h_cstr_lower_scaled       .data(), op_problem_scaled_.constraint_lower_bounds.data(), n_cstr, stream);
+  raft::copy(h_cstr_upper_scaled       .data(), op_problem_scaled_.constraint_upper_bounds.data(), n_cstr, stream);
+
+  // 5. Get full scaling factors on host
+  std::vector<f_t> h_cummulative_cstr_scaling(n_cstr);
+  std::vector<f_t> h_cummulative_var_scaling (n_vars);
+  raft::copy(h_cummulative_cstr_scaling.data(),
+             initial_scaling_strategy_.get_constraint_matrix_scaling_vector().data(),
+             n_cstr, stream);
+  raft::copy(h_cummulative_var_scaling.data(),
+             initial_scaling_strategy_.get_variable_scaling_vector().data(),
+             n_vars, stream);
+  const f_t h_bound_rescaling     = initial_scaling_strategy_.get_h_bound_rescaling();
+  const f_t h_objective_rescaling = initial_scaling_strategy_.get_h_objective_rescaling();
+
   op_problem_scaled_.handle_ptr->sync_stream(stream);
-  
-  std::vector<f_t> h_var_lower(n_vars), h_var_upper(n_vars);
+
+  // Unpack interleaved {lower, upper} into separate vectors for both
+  // versions, so the shard ctor's slicing loop is uniform.
+  std::vector<f_t> h_var_lower (n_vars), h_var_upper (n_vars);
+  std::vector<f_t> h_var_lower_scaled(n_vars), h_var_upper_scaled(n_vars);
   for (i_t i = 0; i < n_vars; ++i) {
-    h_var_lower[i] = h_var_bounds_packed[i].x;
-    h_var_upper[i] = h_var_bounds_packed[i].y;
+    h_var_lower[i]        = h_var_bounds_packed[i].x;
+    h_var_upper[i]        = h_var_bounds_packed[i].y;
+    h_var_lower_scaled[i] = h_var_bounds_scaled_packed[i].x;
+    h_var_upper_scaled[i] = h_var_bounds_scaled_packed[i].y;
   }
-  // 5. Build per-rank data and meta-data
+
+  // 6. Build per-rank data and meta-data.
   std::vector<rank_data_t<i_t, f_t>> sub_pdlp_rank_data =
     partition_loader_t<i_t, f_t>::create_rank_data_from_parts(
       parts,
-      h_A_row_offsets,   h_A_col_indices,   h_A_values,
-      h_A_t_row_offsets, h_A_t_col_indices, h_A_t_values,
+      h_A_row_offsets,   h_A_col_indices,
+      h_A_values,        h_A_values_scaled,
+      h_A_t_row_offsets, h_A_t_col_indices,
+      h_A_t_values,      h_A_t_values_scaled,
       settings.num_gpus, n_cstr, n_vars, nnz);
-  // 6. Build the per-shard PDLP settings:
-  //    - single-GPU mode (num_gpus=1, no partition file) so sub-solvers don't recurse;
-  //    - disable scaling (master already scaled the data we're handing out).
-  pdlp_solver_settings_t<i_t, f_t> sub_pdlp_settings        = settings;
-  sub_pdlp_settings.num_gpus                                = 1;
-  sub_pdlp_settings.multi_gpu_partition_file                = "";
-  sub_pdlp_settings.hyper_params.default_l_inf_ruiz_iterations          = 0;
-  sub_pdlp_settings.hyper_params.default_alpha_pock_chambolle_rescaling = 0.0;
-
-  // 7. Construct the engine — this collectively bootstraps NCCL across all GPUs
-  //    and constructs one shard per partition with the right slice of host data.
+
+  // 7. Build the per-shard PDLP settings:
+  pdlp_solver_settings_t<i_t, f_t> sub_pdlp_settings = settings;
+  sub_pdlp_settings.num_gpus                                              = 1;
+  sub_pdlp_settings.multi_gpu_partition_file                              = "";
+  sub_pdlp_settings.is_distributed_sub_pdlp                               = true;
+  sub_pdlp_settings.hyper_params.default_l_inf_ruiz_iterations            = 0;
+  sub_pdlp_settings.hyper_params.default_alpha_pock_chambolle_rescaling   = 0.0;
+  sub_pdlp_settings.set_initial_step_size    (initial_step_size_global);
+  sub_pdlp_settings.set_initial_primal_weight(initial_primal_weight_global);
+
+  // 8. Construct the engine, creates NCCL comms and shards
   multi_gpu_engine.emplace(
     std::move(sub_pdlp_rank_data),
-    h_obj, h_var_lower, h_var_upper, h_cstr_lower, h_cstr_upper,
+    h_obj,        h_var_lower,        h_var_upper,        h_cstr_lower,        h_cstr_upper,
+    h_obj_scaled, h_var_lower_scaled, h_var_upper_scaled, h_cstr_lower_scaled, h_cstr_upper_scaled,
+    h_cummulative_cstr_scaling, h_cummulative_var_scaling,
+    h_bound_rescaling, h_objective_rescaling,
     op_problem_scaled_.maximize,
     op_problem_scaled_.objective_offset,
     op_problem_scaled_.presolve_data.objective_scaling_factor,
@@ -2349,9 +2400,13 @@ optimization_problem_solution_t<i_t, f_t> pdlp_solver_t<i_t, f_t>::run_solver(co
       !settings_.get_initial_primal_weight().has_value())
     compute_initial_primal_weight();
 
-  // In multi-GPU mode the master scaled op_problem_scaled_ in its ctor before
-  // distributing data to the shards, so skip the second scaling pass here.
-  if (!multi_gpu_engine.has_value()) {
+  // Skip the in-loop scaling pass in both distributed roles:
+  //   - The master pdlp_solver_t scaled op_problem_scaled_ in its multi-GPU
+  //     ctor before shipping data to the shards (multi_gpu_engine present).
+  //   - Each per-shard pdlp_solver_t received already-scaled
+  //     op_problem_scaled_ + injected scaling state from the master, so it
+  //     must not re-apply scale_problem() (is_distributed_sub_pdlp set).
+  if (!multi_gpu_engine.has_value() && !settings_.is_distributed_sub_pdlp) {
     initial_scaling_strategy_.scale_problem();
   }
 
diff --git a/cpp/src/pdlp/pdlp.cuh b/cpp/src/pdlp/pdlp.cuh
index ef992d2a9e..532f038fbf 100644
--- a/cpp/src/pdlp/pdlp.cuh
+++ b/cpp/src/pdlp/pdlp.cuh
@@ -33,7 +33,6 @@
 
 #include <optional>
 #include <unordered_set>
-#include "distributed_pdlp/multi_gpu_engine.hpp"
 
 namespace cuopt::linear_programming::detail {
 /**
@@ -108,6 +107,13 @@ class pdlp_solver_t {
   void compute_initial_step_size();
   void compute_initial_primal_weight();
 
+  // Needed by multi-GPU to mutate them
+  problem_t<i_t, f_t>& get_op_problem_scaled() { return op_problem_scaled_; }
+  detail::pdlp_initial_scaling_strategy_t<i_t, f_t>& get_initial_scaling_strategy()
+  {
+    return initial_scaling_strategy_;
+  }
+
  private:
   void print_termination_criteria(const timer_t& timer, bool is_average = false);
   void print_final_termination_criteria(

From b5ebfd2a757e1f35bcb70af97559b1d2082c3451 Mon Sep 17 00:00:00 2001
From: Bulle Mostovoi <vmostovoi@nvidia.com>
Date: Wed, 20 May 2026 15:41:59 +0200
Subject: [PATCH 10/67] added pre loop setup need to manage boxing + style too

---
 .../pdlp/distributed_pdlp/multi_gpu_engine.cu | 118 ++--
 .../distributed_pdlp/multi_gpu_engine.hpp     |  93 ++--
 .../pdlp/distributed_pdlp/partition_loader.cu |  77 ++-
 cpp/src/pdlp/distributed_pdlp/rank_data.hpp   | 101 ++--
 cpp/src/pdlp/distributed_pdlp/shard.cu        | 128 +++--
 cpp/src/pdlp/distributed_pdlp/shard.hpp       | 125 +++--
 cpp/src/pdlp/pdlp.cu                          | 521 ++++++++++--------
 cpp/src/pdlp/pdlp.cuh                         |   6 +-
 8 files changed, 607 insertions(+), 562 deletions(-)

diff --git a/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.cu b/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.cu
index 9b404bbd53..fe95b1e5ff 100644
--- a/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.cu
+++ b/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.cu
@@ -3,45 +3,44 @@
  * SPDX-License-Identifier: Apache-2.0
  */
 
- #include <pdlp/distributed_pdlp/multi_gpu_engine.hpp>
+#include <pdlp/distributed_pdlp/multi_gpu_engine.hpp>
+
+#include <cuopt/error.hpp>
+
+#include <raft/core/device_setter.hpp>
+
+#include <nccl.h>
+
+#include <numeric>
+
+namespace cuopt::linear_programming::detail {
 
- #include <cuopt/error.hpp>
- 
- #include <raft/core/device_setter.hpp>
- 
- #include <nccl.h>
- 
- #include <numeric>
- 
- namespace cuopt::linear_programming::detail {
- 
 template <typename i_t, typename f_t>
 multi_gpu_engine_t<i_t, f_t>::multi_gpu_engine_t(
-  std::vector<rank_data_t<i_t, f_t>>&&      rank_data,
-  std::vector<f_t> const&                   h_global_obj,
-  std::vector<f_t> const&                   h_global_var_lower,
-  std::vector<f_t> const&                   h_global_var_upper,
-  std::vector<f_t> const&                   h_global_cstr_lower,
-  std::vector<f_t> const&                   h_global_cstr_upper,
-  std::vector<f_t> const&                   h_global_obj_scaled,
-  std::vector<f_t> const&                   h_global_var_lower_scaled,
-  std::vector<f_t> const&                   h_global_var_upper_scaled,
-  std::vector<f_t> const&                   h_global_cstr_lower_scaled,
-  std::vector<f_t> const&                   h_global_cstr_upper_scaled,
-  std::vector<f_t> const&                   h_global_cummulative_cstr_scaling,
-  std::vector<f_t> const&                   h_global_cummulative_var_scaling,
-  f_t                                       h_bound_rescaling,
-  f_t                                       h_objective_rescaling,
-  bool                                      maximize,
-  f_t                                       objective_offset,
-  f_t                                       objective_scaling_factor,
-  pdlp_solver_settings_t<i_t, f_t> const&   sub_solver_settings)
+  std::vector<rank_data_t<i_t, f_t>>&& rank_data,
+  std::vector<f_t> const& h_global_obj,
+  std::vector<f_t> const& h_global_var_lower,
+  std::vector<f_t> const& h_global_var_upper,
+  std::vector<f_t> const& h_global_cstr_lower,
+  std::vector<f_t> const& h_global_cstr_upper,
+  std::vector<f_t> const& h_global_obj_scaled,
+  std::vector<f_t> const& h_global_var_lower_scaled,
+  std::vector<f_t> const& h_global_var_upper_scaled,
+  std::vector<f_t> const& h_global_cstr_lower_scaled,
+  std::vector<f_t> const& h_global_cstr_upper_scaled,
+  std::vector<f_t> const& h_global_cummulative_cstr_scaling,
+  std::vector<f_t> const& h_global_cummulative_var_scaling,
+  f_t h_bound_rescaling,
+  f_t h_objective_rescaling,
+  bool maximize,
+  f_t objective_offset,
+  f_t objective_scaling_factor,
+  pdlp_solver_settings_t<i_t, f_t> const& sub_solver_settings)
   : stream()
 {
   const int nb_parts = static_cast<int>(rank_data.size());
-  cuopt_expects(nb_parts > 0,
-                error_type_t::ValidationError,
-                "multi_gpu_engine_t: rank_data must be non-empty");
+  cuopt_expects(
+    nb_parts > 0, error_type_t::ValidationError, "multi_gpu_engine_t: rank_data must be non-empty");
 
   shards.reserve(nb_parts);
   std::vector<int> devices(nb_parts);
@@ -56,32 +55,31 @@ multi_gpu_engine_t<i_t, f_t>::multi_gpu_engine_t(
   // 3. Construct one shard per rank, pinned to its device.
   for (int r = 0; r < nb_parts; ++r) {
     raft::device_setter guard(devices[r]);  // shard ctor needs device set
-    shards.emplace_back(std::make_unique<pdlp_shard_t<i_t, f_t>>(
-      devices[r],
-      std::move(rank_data[r]),
-      raw_comms[r],
-      h_global_obj,
-      h_global_var_lower,
-      h_global_var_upper,
-      h_global_cstr_lower,
-      h_global_cstr_upper,
-      h_global_obj_scaled,
-      h_global_var_lower_scaled,
-      h_global_var_upper_scaled,
-      h_global_cstr_lower_scaled,
-      h_global_cstr_upper_scaled,
-      h_global_cummulative_cstr_scaling,
-      h_global_cummulative_var_scaling,
-      h_bound_rescaling,
-      h_objective_rescaling,
-      maximize,
-      objective_offset,
-      objective_scaling_factor,
-      sub_solver_settings));
+    shards.emplace_back(std::make_unique<pdlp_shard_t<i_t, f_t>>(devices[r],
+                                                                 std::move(rank_data[r]),
+                                                                 raw_comms[r],
+                                                                 h_global_obj,
+                                                                 h_global_var_lower,
+                                                                 h_global_var_upper,
+                                                                 h_global_cstr_lower,
+                                                                 h_global_cstr_upper,
+                                                                 h_global_obj_scaled,
+                                                                 h_global_var_lower_scaled,
+                                                                 h_global_var_upper_scaled,
+                                                                 h_global_cstr_lower_scaled,
+                                                                 h_global_cstr_upper_scaled,
+                                                                 h_global_cummulative_cstr_scaling,
+                                                                 h_global_cummulative_var_scaling,
+                                                                 h_bound_rescaling,
+                                                                 h_objective_rescaling,
+                                                                 maximize,
+                                                                 objective_offset,
+                                                                 objective_scaling_factor,
+                                                                 sub_solver_settings));
   }
 }
- 
- template struct multi_gpu_engine_t<int, double>;
- // template struct multi_gpu_engine_t<int, float>;
- 
- }  // namespace cuopt::linear_programming::detail
\ No newline at end of file
+
+template struct multi_gpu_engine_t<int, double>;
+// template struct multi_gpu_engine_t<int, float>;
+
+}  // namespace cuopt::linear_programming::detail
diff --git a/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.hpp b/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.hpp
index d672e18197..e191a89d60 100644
--- a/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.hpp
+++ b/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.hpp
@@ -2,53 +2,52 @@
  * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: Apache-2.0
  */
- #pragma once
-
- #include <pdlp/distributed_pdlp/rank_data.hpp>
- #include <pdlp/distributed_pdlp/shard.hpp>
- 
- #include <cuopt/linear_programming/pdlp/solver_settings.hpp>
- 
- #include <rmm/cuda_stream.hpp>
- 
- #include <memory>
- #include <vector>
- 
- namespace cuopt::linear_programming::detail {
- 
+#pragma once
+
+#include <pdlp/distributed_pdlp/rank_data.hpp>
+#include <pdlp/distributed_pdlp/shard.hpp>
+
+#include <cuopt/linear_programming/pdlp/solver_settings.hpp>
+
+#include <rmm/cuda_stream.hpp>
+
+#include <memory>
+#include <vector>
+
+namespace cuopt::linear_programming::detail {
+
 template <typename i_t, typename f_t>
 struct multi_gpu_engine_t {
   // Constructs shards from rank_data
-  multi_gpu_engine_t(
-    std::vector<rank_data_t<i_t, f_t>>&&      rank_data,
-    std::vector<f_t> const&                   h_global_obj,
-    std::vector<f_t> const&                   h_global_var_lower,
-    std::vector<f_t> const&                   h_global_var_upper,
-    std::vector<f_t> const&                   h_global_cstr_lower,
-    std::vector<f_t> const&                   h_global_cstr_upper,
-    std::vector<f_t> const&                   h_global_obj_scaled,
-    std::vector<f_t> const&                   h_global_var_lower_scaled,
-    std::vector<f_t> const&                   h_global_var_upper_scaled,
-    std::vector<f_t> const&                   h_global_cstr_lower_scaled,
-    std::vector<f_t> const&                   h_global_cstr_upper_scaled,
-    std::vector<f_t> const&                   h_global_cummulative_cstr_scaling,
-    std::vector<f_t> const&                   h_global_cummulative_var_scaling,
-    f_t                                       h_bound_rescaling,
-    f_t                                       h_objective_rescaling,
-    bool                                      maximize,
-    f_t                                       objective_offset,
-    f_t                                       objective_scaling_factor,
-    pdlp_solver_settings_t<i_t, f_t> const&   sub_solver_settings);
- 
-   multi_gpu_engine_t(const multi_gpu_engine_t&)            = delete;
-   multi_gpu_engine_t& operator=(const multi_gpu_engine_t&) = delete;
- 
-   // Engine-level stream for fork/join orchestration (master side).
-   rmm::cuda_stream stream;
- 
-   // Shards stored by unique_ptr because pdlp_shard_t is immovable
-   // (owns device-affine resources: handle, NCCL comm, RMM buffers).
-   std::vector<std::unique_ptr<pdlp_shard_t<i_t, f_t>>> shards;
- };
- 
- }  // namespace cuopt::linear_programming::detail
\ No newline at end of file
+  multi_gpu_engine_t(std::vector<rank_data_t<i_t, f_t>>&& rank_data,
+                     std::vector<f_t> const& h_global_obj,
+                     std::vector<f_t> const& h_global_var_lower,
+                     std::vector<f_t> const& h_global_var_upper,
+                     std::vector<f_t> const& h_global_cstr_lower,
+                     std::vector<f_t> const& h_global_cstr_upper,
+                     std::vector<f_t> const& h_global_obj_scaled,
+                     std::vector<f_t> const& h_global_var_lower_scaled,
+                     std::vector<f_t> const& h_global_var_upper_scaled,
+                     std::vector<f_t> const& h_global_cstr_lower_scaled,
+                     std::vector<f_t> const& h_global_cstr_upper_scaled,
+                     std::vector<f_t> const& h_global_cummulative_cstr_scaling,
+                     std::vector<f_t> const& h_global_cummulative_var_scaling,
+                     f_t h_bound_rescaling,
+                     f_t h_objective_rescaling,
+                     bool maximize,
+                     f_t objective_offset,
+                     f_t objective_scaling_factor,
+                     pdlp_solver_settings_t<i_t, f_t> const& sub_solver_settings);
+
+  multi_gpu_engine_t(const multi_gpu_engine_t&)            = delete;
+  multi_gpu_engine_t& operator=(const multi_gpu_engine_t&) = delete;
+
+  // Engine-level stream for fork/join orchestration (master side).
+  rmm::cuda_stream stream;
+
+  // Shards stored by unique_ptr because pdlp_shard_t is immovable
+  // (owns device-affine resources: handle, NCCL comm, RMM buffers).
+  std::vector<std::unique_ptr<pdlp_shard_t<i_t, f_t>>> shards;
+};
+
+}  // namespace cuopt::linear_programming::detail
diff --git a/cpp/src/pdlp/distributed_pdlp/partition_loader.cu b/cpp/src/pdlp/distributed_pdlp/partition_loader.cu
index 047fb536d5..6c96e0b63d 100644
--- a/cpp/src/pdlp/distributed_pdlp/partition_loader.cu
+++ b/cpp/src/pdlp/distributed_pdlp/partition_loader.cu
@@ -18,9 +18,8 @@ std::vector<i_t> partition_loader_t<i_t, f_t>::parse_distributed_pdlp_partition_
   std::string const& file)
 {
   std::ifstream part_file(file);
-  cuopt_expects(part_file.is_open(),
-                error_type_t::ValidationError,
-                "Failed to open partition file: " + file);
+  cuopt_expects(
+    part_file.is_open(), error_type_t::ValidationError, "Failed to open partition file: " + file);
 
   // One integer per line; operator>> skips whitespace so blank lines and
   // trailing newlines are tolerated.
@@ -39,8 +38,7 @@ std::vector<i_t> partition_loader_t<i_t, f_t>::parse_distributed_pdlp_partition_
 }
 
 template <typename i_t, typename f_t>
-std::vector<rank_data_t<i_t, f_t>>
-partition_loader_t<i_t, f_t>::create_rank_data_from_parts(
+std::vector<rank_data_t<i_t, f_t>> partition_loader_t<i_t, f_t>::create_rank_data_from_parts(
   const std::vector<i_t>& parts,
   const std::vector<i_t>& A_row_offsets,
   const std::vector<i_t>& A_col_indices,
@@ -76,7 +74,7 @@ partition_loader_t<i_t, f_t>::create_rank_data_from_parts(
 
   // 2. Compute local matrices and rank_data
   for (i_t rank = 0; rank < nb_parts; rank++) {
-    auto& rd = rank_data[rank];
+    auto& rd           = rank_data[rank];
     rd.owned_var_size  = rd.owned_var_indices.size();
     rd.owned_cstr_size = rd.owned_cstr_indices.size();
     // ---- A side ----
@@ -93,11 +91,11 @@ partition_loader_t<i_t, f_t>::create_rank_data_from_parts(
     // local arrays share identical (offsets, col_indices) and differ only
     // in values.
     for (auto owned_cstr : rd.owned_cstr_indices) {
-      i_t cstr_len = A_row_offsets[owned_cstr + 1] - A_row_offsets[owned_cstr];
+      i_t cstr_len  = A_row_offsets[owned_cstr + 1] - A_row_offsets[owned_cstr];
       i_t row_start = A_row_offsets[owned_cstr];
       for (i_t v = 0; v < cstr_len; v++) {
         local_A_col_indices.push_back(A_col_indices[row_start + v]);
-        local_A_values       .push_back(A_values       [row_start + v]);
+        local_A_values.push_back(A_values[row_start + v]);
         local_A_values_scaled.push_back(A_values_scaled[row_start + v]);
       }
       local_A_nnz += cstr_len;
@@ -106,29 +104,25 @@ partition_loader_t<i_t, f_t>::create_rank_data_from_parts(
 
     std::set<i_t> needed_vars;
     for (auto indice : local_A_col_indices) {
-      if (var_parts[indice] != rank)
-        needed_vars.insert(indice);
+      if (var_parts[indice] != rank) needed_vars.insert(indice);
     }
 
     for (i_t peer = 0; peer < nb_parts; peer++) {
       std::vector<i_t> needed_var_from_peer;
       for (auto needed_var : needed_vars) {
-        if (var_parts[needed_var] == peer)
-          needed_var_from_peer.push_back(needed_var);
+        if (var_parts[needed_var] == peer) needed_var_from_peer.push_back(needed_var);
       }
-      i_t nb_recv_from_peer = needed_var_from_peer.size();
+      i_t nb_recv_from_peer    = needed_var_from_peer.size();
       rd.var_recv_counts[peer] = nb_recv_from_peer;
       rd.var_recv_offsets[peer] =
-        peer == 0
-          ? 0
-          : rd.var_recv_offsets[peer - 1] + rd.var_recv_counts[peer - 1];
+        peer == 0 ? 0 : rd.var_recv_offsets[peer - 1] + rd.var_recv_counts[peer - 1];
       rank_data[peer].var_send_per_peer[rank] = std::move(needed_var_from_peer);
     }
 
-    rd.h_A_row_offsets    = std::move(local_A_row_offsets);
-    rd.h_A_col_indices    = std::move(local_A_col_indices);
-    rd.h_A_values         = std::move(local_A_values);
-    rd.h_A_values_scaled  = std::move(local_A_values_scaled);
+    rd.h_A_row_offsets   = std::move(local_A_row_offsets);
+    rd.h_A_col_indices   = std::move(local_A_col_indices);
+    rd.h_A_values        = std::move(local_A_values);
+    rd.h_A_values_scaled = std::move(local_A_values_scaled);
 
     // ---- A_t side ----
     std::vector<i_t> local_A_t_row_offsets;
@@ -139,11 +133,11 @@ partition_loader_t<i_t, f_t>::create_rank_data_from_parts(
     local_A_t_row_offsets.push_back(local_A_t_nnz);
 
     for (auto owned_var : rd.owned_var_indices) {
-      i_t var_len = A_t_row_offsets[owned_var + 1] - A_t_row_offsets[owned_var];
+      i_t var_len   = A_t_row_offsets[owned_var + 1] - A_t_row_offsets[owned_var];
       i_t row_start = A_t_row_offsets[owned_var];
       for (i_t v = 0; v < var_len; v++) {
-        local_A_t_col_indices  .push_back(A_t_col_indices [row_start + v]);
-        local_A_t_values       .push_back(A_t_values      [row_start + v]);
+        local_A_t_col_indices.push_back(A_t_col_indices[row_start + v]);
+        local_A_t_values.push_back(A_t_values[row_start + v]);
         local_A_t_values_scaled.push_back(A_t_values_scaled[row_start + v]);
       }
       local_A_t_nnz += var_len;
@@ -152,31 +146,27 @@ partition_loader_t<i_t, f_t>::create_rank_data_from_parts(
 
     std::set<i_t> needed_cstrs;
     for (auto indice : local_A_t_col_indices) {
-      if (cstr_parts[indice] != rank)
-        needed_cstrs.insert(indice);
+      if (cstr_parts[indice] != rank) needed_cstrs.insert(indice);
     }
 
     for (i_t peer = 0; peer < nb_parts; peer++) {
       std::vector<i_t> needed_cstr_from_peer;
       for (auto needed_cstr : needed_cstrs) {
-        if (cstr_parts[needed_cstr] == peer)
-          needed_cstr_from_peer.push_back(needed_cstr);
+        if (cstr_parts[needed_cstr] == peer) needed_cstr_from_peer.push_back(needed_cstr);
       }
-      i_t nb_recv_from_peer = needed_cstr_from_peer.size();
+      i_t nb_recv_from_peer     = needed_cstr_from_peer.size();
       rd.cstr_recv_counts[peer] = nb_recv_from_peer;
       rd.cstr_recv_offsets[peer] =
-        peer == 0
-          ? 0
-          : rd.cstr_recv_offsets[peer - 1] + rd.cstr_recv_counts[peer - 1];
+        peer == 0 ? 0 : rd.cstr_recv_offsets[peer - 1] + rd.cstr_recv_counts[peer - 1];
       rank_data[peer].cstr_send_per_peer[rank] = std::move(needed_cstr_from_peer);
     }
 
-    rd.h_A_t_row_offsets    = std::move(local_A_t_row_offsets);
-    rd.h_A_t_col_indices    = std::move(local_A_t_col_indices);
-    rd.h_A_t_values         = std::move(local_A_t_values);
-    rd.h_A_t_values_scaled  = std::move(local_A_t_values_scaled);
+    rd.h_A_t_row_offsets   = std::move(local_A_t_row_offsets);
+    rd.h_A_t_col_indices   = std::move(local_A_t_col_indices);
+    rd.h_A_t_values        = std::move(local_A_t_values);
+    rd.h_A_t_values_scaled = std::move(local_A_t_values_scaled);
 
-    rd.total_var_size  = rd.owned_var_size  + needed_vars.size();
+    rd.total_var_size  = rd.owned_var_size + needed_vars.size();
     rd.total_cstr_size = rd.owned_cstr_size + needed_cstrs.size();
   }
 
@@ -195,7 +185,8 @@ partition_loader_t<i_t, f_t>::create_rank_data_from_parts(
       if (peer == rank) continue;
       for (auto recv_cstr : rank_data[peer].cstr_send_per_peer[rank]) {
         rd.global_to_local_cstr[recv_cstr] = curr_id;
-        // rd.local_to_global_cstr.push_back(recv_cstr); // Not needed, we only do local_to_global on owned side
+        // rd.local_to_global_cstr.push_back(recv_cstr); // Not needed, we only do local_to_global
+        // on owned side
         curr_id++;
       }
     }
@@ -221,14 +212,18 @@ partition_loader_t<i_t, f_t>::create_rank_data_from_parts(
     auto& rd = rank_data[rank];
 
     for (auto& send_vec : rd.var_send_per_peer) {
-      for (auto& v : send_vec) v = rd.global_to_local_var.at(v);
+      for (auto& v : send_vec)
+        v = rd.global_to_local_var.at(v);
     }
     for (auto& send_vec : rd.cstr_send_per_peer) {
-      for (auto& v : send_vec) v = rd.global_to_local_cstr.at(v);
+      for (auto& v : send_vec)
+        v = rd.global_to_local_cstr.at(v);
     }
 
-    for (auto& v : rd.h_A_col_indices) v = rd.global_to_local_var.at(v);
-    for (auto& v : rd.h_A_t_col_indices) v = rd.global_to_local_cstr.at(v);
+    for (auto& v : rd.h_A_col_indices)
+      v = rd.global_to_local_var.at(v);
+    for (auto& v : rd.h_A_t_col_indices)
+      v = rd.global_to_local_cstr.at(v);
   }
 
   return rank_data;
diff --git a/cpp/src/pdlp/distributed_pdlp/rank_data.hpp b/cpp/src/pdlp/distributed_pdlp/rank_data.hpp
index 29d76ae110..d52d277116 100644
--- a/cpp/src/pdlp/distributed_pdlp/rank_data.hpp
+++ b/cpp/src/pdlp/distributed_pdlp/rank_data.hpp
@@ -1,54 +1,61 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
 #pragma once
 
-#include <vector>
 #include <unordered_map>
+#include <vector>
 
 namespace cuopt::linear_programming::detail {
 template <typename i_t, typename f_t>
 struct rank_data_t {
-    rank_data_t(std::size_t nb_parts)
-      : var_send_per_peer(nb_parts),
-        cstr_send_per_peer(nb_parts),
-        var_recv_counts(nb_parts, 0),
-        var_recv_offsets(nb_parts, 0),
-        cstr_recv_counts(nb_parts, 0),
-        cstr_recv_offsets(nb_parts, 0) {}
-  
-    i_t owned_var_size{0};
-    i_t total_var_size{0};
-    i_t owned_cstr_size{0};
-    i_t total_cstr_size{0};
-  
-    // === Ownership ===
-    std::vector<i_t> owned_var_indices;
-    std::vector<i_t> owned_cstr_indices;
-  
-    // === Send plan: per peer, indices to gather + send ===
-    std::vector<std::vector<i_t>> var_send_per_peer;
-    std::vector<std::vector<i_t>> cstr_send_per_peer;
-  
-    // === Recv plan: per peer, contiguous slot in halo region ===
-    std::vector<i_t> var_recv_counts;
-    std::vector<i_t> var_recv_offsets;
-    std::vector<i_t> cstr_recv_counts;
-    std::vector<i_t> cstr_recv_offsets;
-  
-    // === Mappings ===
-    std::unordered_map<i_t, i_t> global_to_local_var;
-    std::unordered_map<i_t, i_t> global_to_local_cstr;
-    std::vector<i_t> local_to_global_var;
-    std::vector<i_t> local_to_global_cstr;
-  
-    // === Local host CSR matrices ===
-    // A
-    std::vector<i_t> h_A_row_offsets;
-    std::vector<i_t> h_A_col_indices;
-    std::vector<f_t> h_A_values;
-    std::vector<f_t> h_A_values_scaled;
-    // A_t
-    std::vector<i_t> h_A_t_row_offsets;
-    std::vector<i_t> h_A_t_col_indices;
-    std::vector<f_t> h_A_t_values;
-    std::vector<f_t> h_A_t_values_scaled;
-  };
-} // namespace cuopt::linear_programming::detail
\ No newline at end of file
+  rank_data_t(std::size_t nb_parts)
+    : var_send_per_peer(nb_parts),
+      cstr_send_per_peer(nb_parts),
+      var_recv_counts(nb_parts, 0),
+      var_recv_offsets(nb_parts, 0),
+      cstr_recv_counts(nb_parts, 0),
+      cstr_recv_offsets(nb_parts, 0)
+  {
+  }
+
+  i_t owned_var_size{0};
+  i_t total_var_size{0};
+  i_t owned_cstr_size{0};
+  i_t total_cstr_size{0};
+
+  // === Ownership ===
+  std::vector<i_t> owned_var_indices;
+  std::vector<i_t> owned_cstr_indices;
+
+  // === Send plan: per peer, indices to gather + send ===
+  std::vector<std::vector<i_t>> var_send_per_peer;
+  std::vector<std::vector<i_t>> cstr_send_per_peer;
+
+  // === Recv plan: per peer, contiguous slot in halo region ===
+  std::vector<i_t> var_recv_counts;
+  std::vector<i_t> var_recv_offsets;
+  std::vector<i_t> cstr_recv_counts;
+  std::vector<i_t> cstr_recv_offsets;
+
+  // === Mappings ===
+  std::unordered_map<i_t, i_t> global_to_local_var;
+  std::unordered_map<i_t, i_t> global_to_local_cstr;
+  std::vector<i_t> local_to_global_var;
+  std::vector<i_t> local_to_global_cstr;
+
+  // === Local host CSR matrices ===
+  // A
+  std::vector<i_t> h_A_row_offsets;
+  std::vector<i_t> h_A_col_indices;
+  std::vector<f_t> h_A_values;
+  std::vector<f_t> h_A_values_scaled;
+  // A_t
+  std::vector<i_t> h_A_t_row_offsets;
+  std::vector<i_t> h_A_t_col_indices;
+  std::vector<f_t> h_A_t_values;
+  std::vector<f_t> h_A_t_values_scaled;
+};
+}  // namespace cuopt::linear_programming::detail
diff --git a/cpp/src/pdlp/distributed_pdlp/shard.cu b/cpp/src/pdlp/distributed_pdlp/shard.cu
index 41f74086ab..596a08a3dc 100644
--- a/cpp/src/pdlp/distributed_pdlp/shard.cu
+++ b/cpp/src/pdlp/distributed_pdlp/shard.cu
@@ -22,28 +22,27 @@ template <typename i_t, typename f_t>
 pdlp_shard_t<i_t, f_t>::~pdlp_shard_t() = default;
 
 template <typename i_t, typename f_t>
-pdlp_shard_t<i_t, f_t>::pdlp_shard_t(
-  int device_id,
-  rank_data_t<i_t, f_t>&& rd,
-  ncclComm_t raw_comm,
-  std::vector<f_t> const& h_global_obj,
-  std::vector<f_t> const& h_global_var_lower,
-  std::vector<f_t> const& h_global_var_upper,
-  std::vector<f_t> const& h_global_cstr_lower,
-  std::vector<f_t> const& h_global_cstr_upper,
-  std::vector<f_t> const& h_global_obj_scaled,
-  std::vector<f_t> const& h_global_var_lower_scaled,
-  std::vector<f_t> const& h_global_var_upper_scaled,
-  std::vector<f_t> const& h_global_cstr_lower_scaled,
-  std::vector<f_t> const& h_global_cstr_upper_scaled,
-  std::vector<f_t> const& h_global_cummulative_cstr_scaling,
-  std::vector<f_t> const& h_global_cummulative_var_scaling,
-  f_t                                      h_bound_rescaling,
-  f_t                                      h_objective_rescaling,
-  bool                                     maximize,
-  f_t                                      objective_offset,
-  f_t                                      objective_scaling_factor,
-  pdlp_solver_settings_t<i_t, f_t> const&  settings)
+pdlp_shard_t<i_t, f_t>::pdlp_shard_t(int device_id,
+                                     rank_data_t<i_t, f_t>&& rd,
+                                     ncclComm_t raw_comm,
+                                     std::vector<f_t> const& h_global_obj,
+                                     std::vector<f_t> const& h_global_var_lower,
+                                     std::vector<f_t> const& h_global_var_upper,
+                                     std::vector<f_t> const& h_global_cstr_lower,
+                                     std::vector<f_t> const& h_global_cstr_upper,
+                                     std::vector<f_t> const& h_global_obj_scaled,
+                                     std::vector<f_t> const& h_global_var_lower_scaled,
+                                     std::vector<f_t> const& h_global_var_upper_scaled,
+                                     std::vector<f_t> const& h_global_cstr_lower_scaled,
+                                     std::vector<f_t> const& h_global_cstr_upper_scaled,
+                                     std::vector<f_t> const& h_global_cummulative_cstr_scaling,
+                                     std::vector<f_t> const& h_global_cummulative_var_scaling,
+                                     f_t h_bound_rescaling,
+                                     f_t h_objective_rescaling,
+                                     bool maximize,
+                                     f_t objective_offset,
+                                     f_t objective_scaling_factor,
+                                     pdlp_solver_settings_t<i_t, f_t> const& settings)
   : device_id(device_id),
     stream(),
     handle(stream.view()),
@@ -53,22 +52,27 @@ pdlp_shard_t<i_t, f_t>::pdlp_shard_t(
     sub_problem(std::nullopt),
     sub_pdlp(nullptr)
 {
-  assert(raft::device_setter::get_current_device() == device_id && "Right device must be set before building the shard");
+  assert(raft::device_setter::get_current_device() == device_id &&
+         "Right device must be set before building the shard");
 
   // ---- 1. Gather per-shard host slices using rank_data's index maps. ----
   // All vectors are sized to TOTAL (owned + halo). Owned slots get real
   // values; halo slots keep defaults because they should not be accessed
-  std::vector<f_t> h_obj              (rank_data.total_var_size,   f_t{0});
-  std::vector<f_t> h_var_lower        (rank_data.total_var_size,  -std::numeric_limits<f_t>::infinity());
-  std::vector<f_t> h_var_upper        (rank_data.total_var_size,   std::numeric_limits<f_t>::infinity());
-  std::vector<f_t> h_cstr_lower       (rank_data.total_cstr_size, -std::numeric_limits<f_t>::infinity());
-  std::vector<f_t> h_cstr_upper       (rank_data.total_cstr_size,  std::numeric_limits<f_t>::infinity());
-
-  std::vector<f_t> h_obj_scaled       (rank_data.total_var_size,   f_t{0});
-  std::vector<f_t> h_var_lower_scaled (rank_data.total_var_size,  -std::numeric_limits<f_t>::infinity());
-  std::vector<f_t> h_var_upper_scaled (rank_data.total_var_size,   std::numeric_limits<f_t>::infinity());
-  std::vector<f_t> h_cstr_lower_scaled(rank_data.total_cstr_size, -std::numeric_limits<f_t>::infinity());
-  std::vector<f_t> h_cstr_upper_scaled(rank_data.total_cstr_size,  std::numeric_limits<f_t>::infinity());
+  std::vector<f_t> h_obj(rank_data.total_var_size, f_t{0});
+  std::vector<f_t> h_var_lower(rank_data.total_var_size, -std::numeric_limits<f_t>::infinity());
+  std::vector<f_t> h_var_upper(rank_data.total_var_size, std::numeric_limits<f_t>::infinity());
+  std::vector<f_t> h_cstr_lower(rank_data.total_cstr_size, -std::numeric_limits<f_t>::infinity());
+  std::vector<f_t> h_cstr_upper(rank_data.total_cstr_size, std::numeric_limits<f_t>::infinity());
+
+  std::vector<f_t> h_obj_scaled(rank_data.total_var_size, f_t{0});
+  std::vector<f_t> h_var_lower_scaled(rank_data.total_var_size,
+                                      -std::numeric_limits<f_t>::infinity());
+  std::vector<f_t> h_var_upper_scaled(rank_data.total_var_size,
+                                      std::numeric_limits<f_t>::infinity());
+  std::vector<f_t> h_cstr_lower_scaled(rank_data.total_cstr_size,
+                                       -std::numeric_limits<f_t>::infinity());
+  std::vector<f_t> h_cstr_upper_scaled(rank_data.total_cstr_size,
+                                       std::numeric_limits<f_t>::infinity());
 
   for (i_t i = 0; i < rank_data.owned_var_size; ++i) {
     const auto g          = rank_data.local_to_global_var[i];
@@ -89,7 +93,7 @@ pdlp_shard_t<i_t, f_t>::pdlp_shard_t(
 
   // Get local scaling factors
   std::vector<f_t> h_cstr_scaling_local(rank_data.total_cstr_size, f_t{1});
-  std::vector<f_t> h_var_scaling_local (rank_data.total_var_size,  f_t{1});
+  std::vector<f_t> h_var_scaling_local(rank_data.total_var_size, f_t{1});
   for (i_t i = 0; i < rank_data.owned_cstr_size; ++i) {
     h_cstr_scaling_local[i] = h_global_cummulative_cstr_scaling[rank_data.local_to_global_cstr[i]];
   }
@@ -99,15 +103,17 @@ pdlp_shard_t<i_t, f_t>::pdlp_shard_t(
 
   // ---- 2. Build optimization_problem_t on this shard's device (UNSCALED). ----
   opt_problem.emplace(&handle);
-  opt_problem->set_csr_constraint_matrix(
-    rank_data.h_A_values     .data(), static_cast<i_t>(rank_data.h_A_values     .size()),
-    rank_data.h_A_col_indices.data(), static_cast<i_t>(rank_data.h_A_col_indices.size()),
-    rank_data.h_A_row_offsets.data(), static_cast<i_t>(rank_data.h_A_row_offsets.size()));
+  opt_problem->set_csr_constraint_matrix(rank_data.h_A_values.data(),
+                                         static_cast<i_t>(rank_data.h_A_values.size()),
+                                         rank_data.h_A_col_indices.data(),
+                                         static_cast<i_t>(rank_data.h_A_col_indices.size()),
+                                         rank_data.h_A_row_offsets.data(),
+                                         static_cast<i_t>(rank_data.h_A_row_offsets.size()));
 
   // Primal axis: TOTAL (owned + halo). Halo slots have neutral defaults.
-  opt_problem->set_objective_coefficients(h_obj      .data(), rank_data.total_var_size);
-  opt_problem->set_variable_lower_bounds (h_var_lower.data(), rank_data.total_var_size);
-  opt_problem->set_variable_upper_bounds (h_var_upper.data(), rank_data.total_var_size);
+  opt_problem->set_objective_coefficients(h_obj.data(), rank_data.total_var_size);
+  opt_problem->set_variable_lower_bounds(h_var_lower.data(), rank_data.total_var_size);
+  opt_problem->set_variable_upper_bounds(h_var_upper.data(), rank_data.total_var_size);
 
   // Dual axis: TOTAL (owned + halo). Halo slots have ±inf so trivially satisfied.
   opt_problem->set_constraint_lower_bounds(h_cstr_lower.data(), rank_data.total_cstr_size);
@@ -126,18 +132,21 @@ pdlp_shard_t<i_t, f_t>::pdlp_shard_t(
   // in multi-GPU: A_local is owned_cstr x total_var, and A_t_local is the
   // pre-sliced owned_var x total_cstr matrix we built during partitioning.
   auto stream_view = handle.get_stream();
-  sub_problem->reverse_offsets     .resize(rank_data.h_A_t_row_offsets.size(), stream_view);
-  sub_problem->reverse_constraints .resize(rank_data.h_A_t_col_indices.size(), stream_view);
-  sub_problem->reverse_coefficients.resize(rank_data.h_A_t_values     .size(), stream_view);
+  sub_problem->reverse_offsets.resize(rank_data.h_A_t_row_offsets.size(), stream_view);
+  sub_problem->reverse_constraints.resize(rank_data.h_A_t_col_indices.size(), stream_view);
+  sub_problem->reverse_coefficients.resize(rank_data.h_A_t_values.size(), stream_view);
   raft::copy(sub_problem->reverse_offsets.data(),
              rank_data.h_A_t_row_offsets.data(),
-             rank_data.h_A_t_row_offsets.size(), stream_view);
+             rank_data.h_A_t_row_offsets.size(),
+             stream_view);
   raft::copy(sub_problem->reverse_constraints.data(),
              rank_data.h_A_t_col_indices.data(),
-             rank_data.h_A_t_col_indices.size(), stream_view);
+             rank_data.h_A_t_col_indices.size(),
+             stream_view);
   raft::copy(sub_problem->reverse_coefficients.data(),
              rank_data.h_A_t_values.data(),
-             rank_data.h_A_t_values.size(), stream_view);
+             rank_data.h_A_t_values.size(),
+             stream_view);
   handle.sync_stream(stream_view);
 
   // ---- 5. Build sub_pdlp (single-GPU mode; multi_gpu flags cleared by caller). ----
@@ -150,16 +159,22 @@ pdlp_shard_t<i_t, f_t>::pdlp_shard_t(
   auto& scaled = sub_pdlp->get_op_problem_scaled();
   raft::copy(scaled.coefficients.data(),
              rank_data.h_A_values_scaled.data(),
-             rank_data.h_A_values_scaled.size(), stream_view);
+             rank_data.h_A_values_scaled.size(),
+             stream_view);
   raft::copy(scaled.reverse_coefficients.data(),
              rank_data.h_A_t_values_scaled.data(),
-             rank_data.h_A_t_values_scaled.size(), stream_view);
-  raft::copy(scaled.objective_coefficients.data(),
-             h_obj_scaled.data(), h_obj_scaled.size(), stream_view);
+             rank_data.h_A_t_values_scaled.size(),
+             stream_view);
+  raft::copy(
+    scaled.objective_coefficients.data(), h_obj_scaled.data(), h_obj_scaled.size(), stream_view);
   raft::copy(scaled.constraint_lower_bounds.data(),
-             h_cstr_lower_scaled.data(), h_cstr_lower_scaled.size(), stream_view);
+             h_cstr_lower_scaled.data(),
+             h_cstr_lower_scaled.size(),
+             stream_view);
   raft::copy(scaled.constraint_upper_bounds.data(),
-             h_cstr_upper_scaled.data(), h_cstr_upper_scaled.size(), stream_view);
+             h_cstr_upper_scaled.data(),
+             h_cstr_upper_scaled.size(),
+             stream_view);
 
   using f_t2 = typename type_2<f_t>::type;
   std::vector<f_t2> h_var_bounds_scaled_packed(rank_data.total_var_size);
@@ -169,14 +184,15 @@ pdlp_shard_t<i_t, f_t>::pdlp_shard_t(
   }
   raft::copy(scaled.variable_bounds.data(),
              h_var_bounds_scaled_packed.data(),
-             h_var_bounds_scaled_packed.size(), stream_view);
+             h_var_bounds_scaled_packed.size(),
+             stream_view);
 
   combine_constraint_bounds<i_t, f_t>(scaled, scaled.combined_bounds);
 
   // Inject master-scaled buffers inside sub_pdlp.initil_strategy
   auto& scaling = sub_pdlp->get_initial_scaling_strategy();
   scaling.set_cummulative_scaling(h_cstr_scaling_local, h_var_scaling_local);
-  scaling.set_h_bound_rescaling   (h_bound_rescaling);
+  scaling.set_h_bound_rescaling(h_bound_rescaling);
   scaling.set_h_objective_rescaling(h_objective_rescaling);
 
   handle.sync_stream(stream_view);
diff --git a/cpp/src/pdlp/distributed_pdlp/shard.hpp b/cpp/src/pdlp/distributed_pdlp/shard.hpp
index 3c10a90f90..a5ff89c5c4 100644
--- a/cpp/src/pdlp/distributed_pdlp/shard.hpp
+++ b/cpp/src/pdlp/distributed_pdlp/shard.hpp
@@ -2,49 +2,49 @@
  * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: Apache-2.0
  */
- #pragma once
+#pragma once
+
+#include <pdlp/distributed_pdlp/rank_data.hpp>
+
+#include <cuopt/linear_programming/optimization_problem.hpp>
+#include <cuopt/linear_programming/pdlp/solver_settings.hpp>
+#include <mip_heuristics/problem/problem.cuh>
+
+#include <raft/core/device_setter.hpp>
+#include <raft/core/handle.hpp>
+#include <rmm/cuda_stream.hpp>
+
+#include <nccl.h>
+
+#include <memory>
+#include <optional>
+#include <vector>
+
+namespace cuopt::linear_programming::detail {
+
+// Forward-declare to break the cyclic include with pdlp.cuh
+// (pdlp.cuh -> multi_gpu_engine.hpp -> shard.hpp -> pdlp.cuh).
+// Definitions of out-of-line members live in shard.cu, which includes pdlp.cuh.
+template <typename i_t, typename f_t>
+class pdlp_solver_t;
+
+// RAII deleter for ncclComm_t; sets the right device before destroy.
+struct nccl_comm_deleter_t {
+  int device_id{-1};
+  void operator()(ncclComm* comm) const noexcept
+  {
+    if (comm == nullptr) return;
+    raft::device_setter guard(device_id);
+    ncclCommDestroy(comm);
+  }
+};
+using nccl_comm_unique_ptr_t = std::unique_ptr<ncclComm, nccl_comm_deleter_t>;
+
+template <typename i_t, typename f_t>
+struct pdlp_shard_t {
+  // Out-of-line (in shard.cu) because pdlp_solver_t is incomplete here.
+  ~pdlp_shard_t();
 
- #include <pdlp/distributed_pdlp/rank_data.hpp>
- 
- #include <cuopt/linear_programming/optimization_problem.hpp>
- #include <cuopt/linear_programming/pdlp/solver_settings.hpp>
- #include <mip_heuristics/problem/problem.cuh>
- 
- #include <raft/core/device_setter.hpp>
- #include <raft/core/handle.hpp>
- #include <rmm/cuda_stream.hpp>
- 
- #include <nccl.h>
- 
- #include <memory>
- #include <optional>
- #include <vector>
- 
- namespace cuopt::linear_programming::detail {
- 
- // Forward-declare to break the cyclic include with pdlp.cuh
- // (pdlp.cuh -> multi_gpu_engine.hpp -> shard.hpp -> pdlp.cuh).
- // Definitions of out-of-line members live in shard.cu, which includes pdlp.cuh.
- template <typename i_t, typename f_t>
- class pdlp_solver_t;
- 
- // RAII deleter for ncclComm_t; sets the right device before destroy.
- struct nccl_comm_deleter_t {
-   int device_id{-1};
-   void operator()(ncclComm* comm) const noexcept
-   {
-     if (comm == nullptr) return;
-     raft::device_setter guard(device_id);
-     ncclCommDestroy(comm);
-   }
- };
- using nccl_comm_unique_ptr_t = std::unique_ptr<ncclComm, nccl_comm_deleter_t>;
- 
- template <typename i_t, typename f_t>
- struct pdlp_shard_t {
-   // Out-of-line (in shard.cu) because pdlp_solver_t is incomplete here.
-   ~pdlp_shard_t();
- 
   // sub worker for distributed pdlp. Owns its own view on scaled problem and unscaled problem
   // Owns necessary multi-gpu data (rank_data, device_id, nccl_comm)
   pdlp_shard_t(int device_id,
@@ -65,25 +65,24 @@
                f_t h_bound_rescaling,
                f_t h_objective_rescaling,
                bool maximize,
-               f_t  objective_offset,
-               f_t  objective_scaling_factor,
+               f_t objective_offset,
+               f_t objective_scaling_factor,
                pdlp_solver_settings_t<i_t, f_t> const& settings);
- 
-   pdlp_shard_t(const pdlp_shard_t&)            = delete;
-   pdlp_shard_t& operator=(const pdlp_shard_t&) = delete;
-   // Move ops are implicitly deleted (user-declared dtor + deleted copy).
-   // Intentional: shard owns device-affine resources and must never move.
-   // Store as std::unique_ptr in any container.
- 
-   int                                              device_id;
-   rmm::cuda_stream                                 stream;
-   raft::handle_t                                   handle;
-   nccl_comm_unique_ptr_t                           comm;
-   rank_data_t<i_t, f_t>                            rank_data;
-   std::optional<optimization_problem_t<i_t, f_t>>  opt_problem;
-   std::optional<problem_t<i_t, f_t>>               sub_problem;
-   std::unique_ptr<pdlp_solver_t<i_t, f_t>>         sub_pdlp;
- };
- 
- }  // namespace cuopt::linear_programming::detail
- 
\ No newline at end of file
+
+  pdlp_shard_t(const pdlp_shard_t&)            = delete;
+  pdlp_shard_t& operator=(const pdlp_shard_t&) = delete;
+  // Move ops are implicitly deleted (user-declared dtor + deleted copy).
+  // Intentional: shard owns device-affine resources and must never move.
+  // Store as std::unique_ptr in any container.
+
+  int device_id;
+  rmm::cuda_stream stream;
+  raft::handle_t handle;
+  nccl_comm_unique_ptr_t comm;
+  rank_data_t<i_t, f_t> rank_data;
+  std::optional<optimization_problem_t<i_t, f_t>> opt_problem;
+  std::optional<problem_t<i_t, f_t>> sub_problem;
+  std::unique_ptr<pdlp_solver_t<i_t, f_t>> sub_pdlp;
+};
+
+}  // namespace cuopt::linear_programming::detail
diff --git a/cpp/src/pdlp/pdlp.cu b/cpp/src/pdlp/pdlp.cu
index 612eb676ec..2a36c160fd 100644
--- a/cpp/src/pdlp/pdlp.cu
+++ b/cpp/src/pdlp/pdlp.cu
@@ -333,23 +333,28 @@ pdlp_solver_t<i_t, f_t>::pdlp_solver_t(problem_t<i_t, f_t>& op_problem,
     parts = partition_loader_t<i_t, f_t>::parse_distributed_pdlp_partition_file(
       settings.multi_gpu_partition_file);
   } else {
-    cuopt_expects(false, error_type_t::NotImplemented,
+    cuopt_expects(false,
+                  error_type_t::NotImplemented,
                   "Metis partitioning inside cuopt not implemented yet; "
                   "provide a --parts file via settings.multi_gpu_partition_file");
   }
 
-  // always compute initial step size before scaling and primal_weight after scaling to do like cuPDLPx
-  assert(settings_.hyper_params.compute_initial_primal_weight_before_scaling && "compute_initial_primal_weight_before_scaling must be true in distributed mode");
-  assert(!settings_.hyper_params.compute_initial_step_size_before_scaling && "compute_initial_step_size_before_scaling must be false in distributed mode");
-  
+  // Always compute the initial primal weight before scaling and the initial step size after
+  // scaling, as cuPDLPx does.
+  assert(settings_.hyper_params.compute_initial_primal_weight_before_scaling &&
+         "compute_initial_primal_weight_before_scaling must be true in distributed mode");
+  assert(!settings_.hyper_params.compute_initial_step_size_before_scaling &&
+         "compute_initial_step_size_before_scaling must be false in distributed mode");
+
   compute_initial_primal_weight();
-  
+
   // scale globally before dispatching to shards
   initial_scaling_strategy_.scale_problem();
-  
+
   compute_initial_step_size();
+  step_size_strategy_.get_primal_and_dual_stepsizes(primal_step_size_, dual_step_size_);
 
-  const f_t initial_step_size_global = get_step_size_h(0);
+  const f_t initial_step_size_global     = get_step_size_h(0);
   const f_t initial_primal_weight_global = get_primal_weight_h(0);
 
   // 4. Copy both scaled and unscaled pb
@@ -359,54 +364,61 @@ pdlp_solver_t<i_t, f_t>::pdlp_solver_t(problem_t<i_t, f_t>& op_problem,
   i_t const nnz     = op_problem_scaled_.nnz;
 
   // Shared topology (taken from the scaled problem, but identical on both).
-  std::vector<i_t> h_A_row_offsets  (n_cstr + 1);
-  std::vector<i_t> h_A_col_indices  (nnz);
+  std::vector<i_t> h_A_row_offsets(n_cstr + 1);
+  std::vector<i_t> h_A_col_indices(nnz);
   std::vector<i_t> h_A_t_row_offsets(n_vars + 1);
   std::vector<i_t> h_A_t_col_indices(nnz);
-  raft::copy(h_A_row_offsets  .data(), op_problem_scaled_.offsets            .data(), n_cstr + 1, stream);
-  raft::copy(h_A_col_indices  .data(), op_problem_scaled_.variables          .data(), nnz,        stream);
-  raft::copy(h_A_t_row_offsets.data(), op_problem_scaled_.reverse_offsets    .data(), n_vars + 1, stream);
-  raft::copy(h_A_t_col_indices.data(), op_problem_scaled_.reverse_constraints.data(), nnz,        stream);
+  raft::copy(h_A_row_offsets.data(), op_problem_scaled_.offsets.data(), n_cstr + 1, stream);
+  raft::copy(h_A_col_indices.data(), op_problem_scaled_.variables.data(), nnz, stream);
+  raft::copy(
+    h_A_t_row_offsets.data(), op_problem_scaled_.reverse_offsets.data(), n_vars + 1, stream);
+  raft::copy(h_A_t_col_indices.data(), op_problem_scaled_.reverse_constraints.data(), nnz, stream);
 
   // Paired value arrays for A and A_T.
-  std::vector<f_t> h_A_values        (nnz);
-  std::vector<f_t> h_A_values_scaled (nnz);
-  std::vector<f_t> h_A_t_values      (nnz);
+  std::vector<f_t> h_A_values(nnz);
+  std::vector<f_t> h_A_values_scaled(nnz);
+  std::vector<f_t> h_A_t_values(nnz);
   std::vector<f_t> h_A_t_values_scaled(nnz);
-  raft::copy(h_A_values        .data(), problem_ptr->coefficients         .data(), nnz, stream);
-  raft::copy(h_A_t_values      .data(), problem_ptr->reverse_coefficients .data(), nnz, stream);
-  raft::copy(h_A_values_scaled .data(), op_problem_scaled_.coefficients        .data(), nnz, stream);
-  raft::copy(h_A_t_values_scaled.data(), op_problem_scaled_.reverse_coefficients.data(), nnz, stream);
+  raft::copy(h_A_values.data(), problem_ptr->coefficients.data(), nnz, stream);
+  raft::copy(h_A_t_values.data(), problem_ptr->reverse_coefficients.data(), nnz, stream);
+  raft::copy(h_A_values_scaled.data(), op_problem_scaled_.coefficients.data(), nnz, stream);
+  raft::copy(
+    h_A_t_values_scaled.data(), op_problem_scaled_.reverse_coefficients.data(), nnz, stream);
 
   using f_t2 = typename type_2<f_t>::type;
 
-  std::vector<f_t>  h_obj             (n_vars);
-  std::vector<f_t>  h_obj_scaled      (n_vars);
-  std::vector<f_t2> h_var_bounds_packed       (n_vars);
+  std::vector<f_t> h_obj(n_vars);
+  std::vector<f_t> h_obj_scaled(n_vars);
+  std::vector<f_t2> h_var_bounds_packed(n_vars);
   std::vector<f_t2> h_var_bounds_scaled_packed(n_vars);
-  std::vector<f_t>  h_cstr_lower      (n_cstr);
-  std::vector<f_t>  h_cstr_upper      (n_cstr);
-  std::vector<f_t>  h_cstr_lower_scaled(n_cstr);
-  std::vector<f_t>  h_cstr_upper_scaled(n_cstr);
-
-  raft::copy(h_obj                     .data(), problem_ptr->objective_coefficients.data(), n_vars, stream);
-  raft::copy(h_obj_scaled              .data(), op_problem_scaled_.objective_coefficients.data(), n_vars, stream);
-  raft::copy(h_var_bounds_packed       .data(), problem_ptr->variable_bounds.data(),       n_vars, stream);
-  raft::copy(h_var_bounds_scaled_packed.data(), op_problem_scaled_.variable_bounds.data(), n_vars, stream);
-  raft::copy(h_cstr_lower              .data(), problem_ptr->constraint_lower_bounds.data(),       n_cstr, stream);
-  raft::copy(h_cstr_upper              .data(), problem_ptr->constraint_upper_bounds.data(),       n_cstr, stream);
-  raft::copy(h_cstr_lower_scaled       .data(), op_problem_scaled_.constraint_lower_bounds.data(), n_cstr, stream);
-  raft::copy(h_cstr_upper_scaled       .data(), op_problem_scaled_.constraint_upper_bounds.data(), n_cstr, stream);
+  std::vector<f_t> h_cstr_lower(n_cstr);
+  std::vector<f_t> h_cstr_upper(n_cstr);
+  std::vector<f_t> h_cstr_lower_scaled(n_cstr);
+  std::vector<f_t> h_cstr_upper_scaled(n_cstr);
+
+  raft::copy(h_obj.data(), problem_ptr->objective_coefficients.data(), n_vars, stream);
+  raft::copy(h_obj_scaled.data(), op_problem_scaled_.objective_coefficients.data(), n_vars, stream);
+  raft::copy(h_var_bounds_packed.data(), problem_ptr->variable_bounds.data(), n_vars, stream);
+  raft::copy(
+    h_var_bounds_scaled_packed.data(), op_problem_scaled_.variable_bounds.data(), n_vars, stream);
+  raft::copy(h_cstr_lower.data(), problem_ptr->constraint_lower_bounds.data(), n_cstr, stream);
+  raft::copy(h_cstr_upper.data(), problem_ptr->constraint_upper_bounds.data(), n_cstr, stream);
+  raft::copy(
+    h_cstr_lower_scaled.data(), op_problem_scaled_.constraint_lower_bounds.data(), n_cstr, stream);
+  raft::copy(
+    h_cstr_upper_scaled.data(), op_problem_scaled_.constraint_upper_bounds.data(), n_cstr, stream);
 
   // 5. Get full scaling factors on host
   std::vector<f_t> h_cummulative_cstr_scaling(n_cstr);
-  std::vector<f_t> h_cummulative_var_scaling (n_vars);
+  std::vector<f_t> h_cummulative_var_scaling(n_vars);
   raft::copy(h_cummulative_cstr_scaling.data(),
              initial_scaling_strategy_.get_constraint_matrix_scaling_vector().data(),
-             n_cstr, stream);
+             n_cstr,
+             stream);
   raft::copy(h_cummulative_var_scaling.data(),
              initial_scaling_strategy_.get_variable_scaling_vector().data(),
-             n_vars, stream);
+             n_vars,
+             stream);
   const f_t h_bound_rescaling     = initial_scaling_strategy_.get_h_bound_rescaling();
   const f_t h_objective_rescaling = initial_scaling_strategy_.get_h_objective_rescaling();
 
@@ -414,7 +426,7 @@ pdlp_solver_t<i_t, f_t>::pdlp_solver_t(problem_t<i_t, f_t>& op_problem,
 
   // Unpack interleaved {lower, upper} into separate vectors for both
   // versions, so the shard ctor's slicing loop is uniform.
-  std::vector<f_t> h_var_lower (n_vars), h_var_upper (n_vars);
+  std::vector<f_t> h_var_lower(n_vars), h_var_upper(n_vars);
   std::vector<f_t> h_var_lower_scaled(n_vars), h_var_upper_scaled(n_vars);
   for (i_t i = 0; i < n_vars; ++i) {
     h_var_lower[i]        = h_var_bounds_packed[i].x;
@@ -425,35 +437,58 @@ pdlp_solver_t<i_t, f_t>::pdlp_solver_t(problem_t<i_t, f_t>& op_problem,
 
   // 6. Build per-rank data and meta-data.
   std::vector<rank_data_t<i_t, f_t>> sub_pdlp_rank_data =
-    partition_loader_t<i_t, f_t>::create_rank_data_from_parts(
-      parts,
-      h_A_row_offsets,   h_A_col_indices,
-      h_A_values,        h_A_values_scaled,
-      h_A_t_row_offsets, h_A_t_col_indices,
-      h_A_t_values,      h_A_t_values_scaled,
-      settings.num_gpus, n_cstr, n_vars, nnz);
+    partition_loader_t<i_t, f_t>::create_rank_data_from_parts(parts,
+                                                              h_A_row_offsets,
+                                                              h_A_col_indices,
+                                                              h_A_values,
+                                                              h_A_values_scaled,
+                                                              h_A_t_row_offsets,
+                                                              h_A_t_col_indices,
+                                                              h_A_t_values,
+                                                              h_A_t_values_scaled,
+                                                              settings.num_gpus,
+                                                              n_cstr,
+                                                              n_vars,
+                                                              nnz);
 
   // 7. Build the per-shard PDLP settings:
-  pdlp_solver_settings_t<i_t, f_t> sub_pdlp_settings = settings;
-  sub_pdlp_settings.num_gpus                                              = 1;
-  sub_pdlp_settings.multi_gpu_partition_file                              = "";
-  sub_pdlp_settings.is_distributed_sub_pdlp                               = true;
-  sub_pdlp_settings.hyper_params.default_l_inf_ruiz_iterations            = 0;
-  sub_pdlp_settings.hyper_params.default_alpha_pock_chambolle_rescaling   = 0.0;
-  sub_pdlp_settings.set_initial_step_size    (initial_step_size_global);
-  sub_pdlp_settings.set_initial_primal_weight(initial_primal_weight_global);
+  pdlp_solver_settings_t<i_t, f_t> sub_pdlp_settings                    = settings;
+  sub_pdlp_settings.num_gpus                                            = 1;
+  sub_pdlp_settings.multi_gpu_partition_file                            = "";
+  sub_pdlp_settings.is_distributed_sub_pdlp                             = true;
+  sub_pdlp_settings.hyper_params.default_l_inf_ruiz_iterations          = 0;
+  sub_pdlp_settings.hyper_params.default_alpha_pock_chambolle_rescaling = 0.0;
 
   // 8. Construct the engine, creates NCCL comms and shards
-  multi_gpu_engine.emplace(
-    std::move(sub_pdlp_rank_data),
-    h_obj,        h_var_lower,        h_var_upper,        h_cstr_lower,        h_cstr_upper,
-    h_obj_scaled, h_var_lower_scaled, h_var_upper_scaled, h_cstr_lower_scaled, h_cstr_upper_scaled,
-    h_cummulative_cstr_scaling, h_cummulative_var_scaling,
-    h_bound_rescaling, h_objective_rescaling,
-    op_problem_scaled_.maximize,
-    op_problem_scaled_.objective_offset,
-    op_problem_scaled_.presolve_data.objective_scaling_factor,
-    sub_pdlp_settings);
+  multi_gpu_engine.emplace(std::move(sub_pdlp_rank_data),
+                           h_obj,
+                           h_var_lower,
+                           h_var_upper,
+                           h_cstr_lower,
+                           h_cstr_upper,
+                           h_obj_scaled,
+                           h_var_lower_scaled,
+                           h_var_upper_scaled,
+                           h_cstr_lower_scaled,
+                           h_cstr_upper_scaled,
+                           h_cummulative_cstr_scaling,
+                           h_cummulative_var_scaling,
+                           h_bound_rescaling,
+                           h_objective_rescaling,
+                           op_problem_scaled_.maximize,
+                           op_problem_scaled_.objective_offset,
+                           op_problem_scaled_.presolve_data.objective_scaling_factor,
+                           sub_pdlp_settings);
+
+  for (auto& shard : multi_gpu_engine.shards) {
+    raft::device_setter guard(shard->device_id);
+    auto& sub = *shard->sub_pdlp;
+    raft::copy(sub.step_size_.data(), step_size_.data(), 1, shard->stream);
+    raft::copy(sub.primal_weight_.data(), primal_weight_.data(), 1, shard->stream);
+    raft::copy(sub.best_primal_weight_.data(), best_primal_weight_.data(), 1, shard->stream);
+    raft::copy(sub.primal_step_size_.data(), primal_step_size_.data(), 1, shard->stream);
+    raft::copy(sub.dual_step_size_.data(), dual_step_size_.data(), 1, shard->stream);
+  }
 }
 
 template <typename i_t, typename f_t>
@@ -2392,219 +2427,215 @@ optimization_problem_solution_t<i_t, f_t> pdlp_solver_t<i_t, f_t>::run_solver(co
   std::cout << "Starting PDLP loop:" << std::endl;
 #endif
 
-  // TODO handle that properly
-  if (settings_.hyper_params.compute_initial_step_size_before_scaling &&
-      !settings_.get_initial_step_size().has_value())
-    compute_initial_step_size();
-  if (settings_.hyper_params.compute_initial_primal_weight_before_scaling &&
-      !settings_.get_initial_primal_weight().has_value())
-    compute_initial_primal_weight();
-
-  // Skip the in-loop scaling pass in both distributed roles:
-  //   - The master pdlp_solver_t scaled op_problem_scaled_ in its multi-GPU
-  //     ctor before shipping data to the shards (multi_gpu_engine present).
-  //   - Each per-shard pdlp_solver_t received already-scaled
-  //     op_problem_scaled_ + injected scaling state from the master, so it
-  //     must not re-apply scale_problem() (is_distributed_sub_pdlp set).
-  if (!multi_gpu_engine.has_value() && !settings_.is_distributed_sub_pdlp) {
+  // In distributed mode, skip all setup, it is done before
+  if (!settings_.hyper_params.use_distributed_pdlp) {
+    // TODO handle that properly
+    if (settings_.hyper_params.compute_initial_step_size_before_scaling &&
+        !settings_.get_initial_step_size().has_value())
+      compute_initial_step_size();
+    if (settings_.hyper_params.compute_initial_primal_weight_before_scaling &&
+        !settings_.get_initial_primal_weight().has_value())
+      compute_initial_primal_weight();
+
     initial_scaling_strategy_.scale_problem();
-  }
 
-  // Update FP32 matrix copies for mixed precision SpMV after scaling
-  pdhg_solver_.get_cusparse_view().update_mixed_precision_matrices();
+    // Update FP32 matrix copies for mixed precision SpMV after scaling
+    pdhg_solver_.get_cusparse_view().update_mixed_precision_matrices();
 
-  if (!settings_.hyper_params.compute_initial_step_size_before_scaling &&
-      !settings_.get_initial_step_size().has_value())
-    compute_initial_step_size();
-  if (!settings_.hyper_params.compute_initial_primal_weight_before_scaling &&
-      !settings_.get_initial_primal_weight().has_value())
-    compute_initial_primal_weight();
+    if (!settings_.hyper_params.compute_initial_step_size_before_scaling &&
+        !settings_.get_initial_step_size().has_value())
+      compute_initial_step_size();
+    if (!settings_.hyper_params.compute_initial_primal_weight_before_scaling &&
+        !settings_.get_initial_primal_weight().has_value())
+      compute_initial_primal_weight();
 
 #ifdef PDLP_DEBUG_MODE
-  std::cout << "Initial Scaling done" << std::endl;
+    std::cout << "Initial Scaling done" << std::endl;
 #endif
-
-  // Needs to be performed here before the below line to make sure the initial primal_weight / step
-  // size are used as previous point when potentially updating them in this next call
-  if (settings_.get_initial_step_size().has_value() || initial_step_size_.has_value()) {
-    if (initial_step_size_.has_value())
-      thrust::uninitialized_fill(handle_ptr_->get_thrust_policy(),
-                                 step_size_.begin(),
-                                 step_size_.end(),
-                                 initial_step_size_.value());
-    else
-      thrust::uninitialized_fill(handle_ptr_->get_thrust_policy(),
-                                 step_size_.begin(),
-                                 step_size_.end(),
-                                 settings_.get_initial_step_size().value());
-  }
-  if (settings_.get_initial_primal_weight().has_value() || initial_primal_weight_.has_value()) {
-    if (initial_primal_weight_.has_value()) {
-      thrust::uninitialized_fill(handle_ptr_->get_thrust_policy(),
-                                 primal_weight_.begin(),
-                                 primal_weight_.end(),
-                                 initial_primal_weight_.value());
-      if (is_cupdlpx_restart<i_t, f_t>(settings_.hyper_params))
+    // Needs to be performed here before the below line to make sure the initial primal_weight /
+    // step size are used as previous point when potentially updating them in this next call
+    if (settings_.get_initial_step_size().has_value() || initial_step_size_.has_value()) {
+      if (initial_step_size_.has_value())
+        thrust::uninitialized_fill(handle_ptr_->get_thrust_policy(),
+                                   step_size_.begin(),
+                                   step_size_.end(),
+                                   initial_step_size_.value());
+      else
+        thrust::uninitialized_fill(handle_ptr_->get_thrust_policy(),
+                                   step_size_.begin(),
+                                   step_size_.end(),
+                                   settings_.get_initial_step_size().value());
+    }
+    if (settings_.get_initial_primal_weight().has_value() || initial_primal_weight_.has_value()) {
+      if (initial_primal_weight_.has_value()) {
         thrust::uninitialized_fill(handle_ptr_->get_thrust_policy(),
-                                   best_primal_weight_.begin(),
-                                   best_primal_weight_.end(),
+                                   primal_weight_.begin(),
+                                   primal_weight_.end(),
                                    initial_primal_weight_.value());
-    } else {
-      thrust::uninitialized_fill(handle_ptr_->get_thrust_policy(),
-                                 primal_weight_.begin(),
-                                 primal_weight_.end(),
-                                 settings_.get_initial_primal_weight().value());
-      if (is_cupdlpx_restart<i_t, f_t>(settings_.hyper_params))
+        if (is_cupdlpx_restart<i_t, f_t>(settings_.hyper_params))
+          thrust::uninitialized_fill(handle_ptr_->get_thrust_policy(),
+                                     best_primal_weight_.begin(),
+                                     best_primal_weight_.end(),
+                                     initial_primal_weight_.value());
+      } else {
         thrust::uninitialized_fill(handle_ptr_->get_thrust_policy(),
-                                   best_primal_weight_.begin(),
-                                   best_primal_weight_.end(),
+                                   primal_weight_.begin(),
+                                   primal_weight_.end(),
                                    settings_.get_initial_primal_weight().value());
+        if (is_cupdlpx_restart<i_t, f_t>(settings_.hyper_params))
+          thrust::uninitialized_fill(handle_ptr_->get_thrust_policy(),
+                                     best_primal_weight_.begin(),
+                                     best_primal_weight_.end(),
+                                     settings_.get_initial_primal_weight().value());
+      }
+    }
+    if (initial_k_.has_value()) {
+      pdhg_solver_.total_pdhg_iterations_ = initial_k_.value();
+      pdhg_solver_.get_d_total_pdhg_iterations().set_value_async(initial_k_.value(), stream_view_);
+    }
+    if (settings_.get_initial_pdlp_iteration().has_value()) {
+      total_pdlp_iterations_ = settings_.get_initial_pdlp_iteration().value();
+      // This is meaningless in batch mode since pdhg step is never used, set it just to avoid
+      // assertions
+      pdhg_solver_.get_d_total_pdhg_iterations().set_value_async(total_pdlp_iterations_,
+                                                                 stream_view_);
+      pdhg_solver_.total_pdhg_iterations_ = total_pdlp_iterations_;
+      // Reset the fixed point error since at this pdlp iteration it is expected to already be
+      // initialized to some value
+      std::fill(restart_strategy_.initial_fixed_point_error_.begin(),
+                restart_strategy_.initial_fixed_point_error_.end(),
+                f_t(0.0));
+      std::fill(restart_strategy_.fixed_point_error_.begin(),
+                restart_strategy_.fixed_point_error_.end(),
+                f_t(0.0));
     }
-  }
-  if (initial_k_.has_value()) {
-    pdhg_solver_.total_pdhg_iterations_ = initial_k_.value();
-    pdhg_solver_.get_d_total_pdhg_iterations().set_value_async(initial_k_.value(), stream_view_);
-  }
-  if (settings_.get_initial_pdlp_iteration().has_value()) {
-    total_pdlp_iterations_ = settings_.get_initial_pdlp_iteration().value();
-    // This is meaningless in batch mode since pdhg step is never used, set it just to avoid
-    // assertions
-    pdhg_solver_.get_d_total_pdhg_iterations().set_value_async(total_pdlp_iterations_,
-                                                               stream_view_);
-    pdhg_solver_.total_pdhg_iterations_ = total_pdlp_iterations_;
-    // Reset the fixed point error since at this pdlp iteration it is expected to already be
-    // initialized to some value
-    std::fill(restart_strategy_.initial_fixed_point_error_.begin(),
-              restart_strategy_.initial_fixed_point_error_.end(),
-              f_t(0.0));
-    std::fill(restart_strategy_.fixed_point_error_.begin(),
-              restart_strategy_.fixed_point_error_.end(),
-              f_t(0.0));
-  }
 
-  // Only the primal_weight_ and step_size_ variables are initialized during the initial phase
-  // The associated primal/dual step_size (computed using the two firstly mentionned) are not
-  // initialized. This calls ensures the latter
-  // In the event of a given primal and dual solutions and if the option is toggled, calling the
-  // update primal_weight and step_size will also update the associated primal_step_size_,
-  // dual_step_size_.
-  // In summary: the below call is only mandatory at the beginning when
-  // computing/setting the initial primal weight and step size and if they are not recomputed later.
-  step_size_strategy_.get_primal_and_dual_stepsizes(primal_step_size_, dual_step_size_);
+    // Only the primal_weight_ and step_size_ variables are initialized during the initial phase
+    // The associated primal/dual step_size (computed using the two firstly mentionned) are not
+    // initialized. This calls ensures the latter
+    // In the event of a given primal and dual solutions and if the option is toggled, calling the
+    // update primal_weight and step_size will also update the associated primal_step_size_,
+    // dual_step_size_.
+    // In summary: the below call is only mandatory at the beginning when
+    // computing/setting the initial primal weight and step size and if they are not recomputed
+    // later.
+    step_size_strategy_.get_primal_and_dual_stepsizes(primal_step_size_, dual_step_size_);
 
 #ifdef CUPDLP_DEBUG_MODE
-  if (initial_primal_.size() != 0 || initial_dual_.size() != 0) {
-    std::cout << "Initial primal and dual solution before scaling" << std::endl;
-    if (initial_primal_.size() != 0) { print("initial_primal_", initial_primal_); }
-    if (initial_dual_.size() != 0) { print("initial_dual_", initial_dual_); }
-  }
+    if (initial_primal_.size() != 0 || initial_dual_.size() != 0) {
+      std::cout << "Initial primal and dual solution before scaling" << std::endl;
+      if (initial_primal_.size() != 0) { print("initial_primal_", initial_primal_); }
+      if (initial_dual_.size() != 0) { print("initial_dual_", initial_dual_); }
+    }
 #endif
 
-  // If there is an initial primal or dual we should update the restart info as if there was a step
-  // that has happend
-  if (initial_primal_.size() != 0 || initial_dual_.size() != 0) {
-    update_primal_dual_solutions(
-      (initial_primal_.size() != 0) ? std::make_optional(&initial_primal_) : std::nullopt,
-      (initial_dual_.size() != 0) ? std::make_optional(&initial_dual_) : std::nullopt);
-  }
+    // If there is an initial primal or dual we should update the restart info as if there was a
+    // step that has happend
+    if (initial_primal_.size() != 0 || initial_dual_.size() != 0) {
+      update_primal_dual_solutions(
+        (initial_primal_.size() != 0) ? std::make_optional(&initial_primal_) : std::nullopt,
+        (initial_dual_.size() != 0) ? std::make_optional(&initial_dual_) : std::nullopt);
+    }
 
 #ifdef CUPDLP_DEBUG_MODE
-  std::cout << "Solution before projection" << std::endl;
-  print("pdhg_solver_.get_primal_solution()", pdhg_solver_.get_primal_solution());
-  print("pdhg_solver_.get_dual_solution()", pdhg_solver_.get_dual_solution());
-  print("pdhg_solver_.get_potential_next_primal_solution()",
-        pdhg_solver_.get_potential_next_primal_solution());
-  print("pdhg_solver_.get_potential_next_dual_solution()",
-        pdhg_solver_.get_potential_next_dual_solution());
-  print("restart_strategy_.last_restart_duality_gap_.primal_solution_",
-        restart_strategy_.last_restart_duality_gap_.primal_solution_);
-  print("restart_strategy_.last_restart_duality_gap_.dual_solution_",
-        restart_strategy_.last_restart_duality_gap_.dual_solution_);
+    std::cout << "Solution before projection" << std::endl;
+    print("pdhg_solver_.get_primal_solution()", pdhg_solver_.get_primal_solution());
+    print("pdhg_solver_.get_dual_solution()", pdhg_solver_.get_dual_solution());
+    print("pdhg_solver_.get_potential_next_primal_solution()",
+          pdhg_solver_.get_potential_next_primal_solution());
+    print("pdhg_solver_.get_potential_next_dual_solution()",
+          pdhg_solver_.get_potential_next_dual_solution());
+    print("restart_strategy_.last_restart_duality_gap_.primal_solution_",
+          restart_strategy_.last_restart_duality_gap_.primal_solution_);
+    print("restart_strategy_.last_restart_duality_gap_.dual_solution_",
+          restart_strategy_.last_restart_duality_gap_.dual_solution_);
 #endif
 
-  // Project initial primal solution
-  if (settings_.hyper_params.project_initial_primal) {
-    using f_t2 = typename type_2<f_t>::type;
-    cub::DeviceTransform::Transform(
-      cuda::std::make_tuple(pdhg_solver_.get_primal_solution().data(),
-                            problem_wrap_container(op_problem_scaled_.variable_bounds)),
-      pdhg_solver_.get_primal_solution().data(),
-      pdhg_solver_.get_primal_solution().size(),
-      clamp<f_t, f_t2>(),
-      stream_view_.value());
-
-    pdhg_solver_.refine_initial_primal_projection();
-
-    if (!settings_.hyper_params.never_restart_to_average) {
-      cuopt_expects(!batch_mode_,
-                    cuopt::error_type_t::ValidationError,
-                    "Restart to average not supported in batch mode");
+    // Project initial primal solution
+    if (settings_.hyper_params.project_initial_primal) {
+      using f_t2 = typename type_2<f_t>::type;
       cub::DeviceTransform::Transform(
-        cuda::std::make_tuple(unscaled_primal_avg_solution_.data(),
-                              op_problem_scaled_.variable_bounds.data()),
-        unscaled_primal_avg_solution_.data(),
-        primal_size_h_,
+        cuda::std::make_tuple(pdhg_solver_.get_primal_solution().data(),
+                              problem_wrap_container(op_problem_scaled_.variable_bounds)),
+        pdhg_solver_.get_primal_solution().data(),
+        pdhg_solver_.get_primal_solution().size(),
         clamp<f_t, f_t2>(),
         stream_view_.value());
+
+      pdhg_solver_.refine_initial_primal_projection();
+
+      if (!settings_.hyper_params.never_restart_to_average) {
+        cuopt_expects(!batch_mode_,
+                      cuopt::error_type_t::ValidationError,
+                      "Restart to average not supported in batch mode");
+        cub::DeviceTransform::Transform(
+          cuda::std::make_tuple(unscaled_primal_avg_solution_.data(),
+                                op_problem_scaled_.variable_bounds.data()),
+          unscaled_primal_avg_solution_.data(),
+          primal_size_h_,
+          clamp<f_t, f_t2>(),
+          stream_view_.value());
+      }
     }
-  }
 
 #ifdef CUPDLP_DEBUG_MODE
-  std::cout << "Solution after projection" << std::endl;
-  print("pdhg_solver_.get_primal_solution()", pdhg_solver_.get_primal_solution());
-  print("pdhg_solver_.get_dual_solution()", pdhg_solver_.get_dual_solution());
-  print("pdhg_solver_.get_potential_next_primal_solution()",
-        pdhg_solver_.get_potential_next_primal_solution());
-  print("pdhg_solver_.get_potential_next_dual_solution()",
-        pdhg_solver_.get_potential_next_dual_solution());
-  print("restart_strategy_.last_restart_duality_gap_.primal_solution_",
-        restart_strategy_.last_restart_duality_gap_.primal_solution_);
-  print("restart_strategy_.last_restart_duality_gap_.dual_solution_",
-        restart_strategy_.last_restart_duality_gap_.dual_solution_);
+    std::cout << "Solution after projection" << std::endl;
+    print("pdhg_solver_.get_primal_solution()", pdhg_solver_.get_primal_solution());
+    print("pdhg_solver_.get_dual_solution()", pdhg_solver_.get_dual_solution());
+    print("pdhg_solver_.get_potential_next_primal_solution()",
+          pdhg_solver_.get_potential_next_primal_solution());
+    print("pdhg_solver_.get_potential_next_dual_solution()",
+          pdhg_solver_.get_potential_next_dual_solution());
+    print("restart_strategy_.last_restart_duality_gap_.primal_solution_",
+          restart_strategy_.last_restart_duality_gap_.primal_solution_);
+    print("restart_strategy_.last_restart_duality_gap_.dual_solution_",
+          restart_strategy_.last_restart_duality_gap_.dual_solution_);
 #endif
 
-  // Need to to tranpose primal solution to row format as there might be initial values or clamping
-  // Value may not be all 0
-  if (batch_mode_) {
-    rmm::device_uvector<f_t> dummy(0, stream_view_);
-    transpose_primal_dual_to_row(
-      pdhg_solver_.get_primal_solution(), pdhg_solver_.get_dual_solution(), dummy);
-    if (settings_.hyper_params.use_reflected_primal_dual) {
-      transpose_primal_dual_to_row(pdhg_solver_.get_potential_next_primal_solution(),
-                                   pdhg_solver_.get_potential_next_dual_solution(),
-                                   dummy);
-      transpose_primal_dual_to_row(restart_strategy_.last_restart_duality_gap_.primal_solution_,
-                                   restart_strategy_.last_restart_duality_gap_.dual_solution_,
-                                   dummy);
+    // Need to to tranpose primal solution to row format as there might be initial values or
+    // clamping Value may not be all 0
+    if (batch_mode_) {
+      rmm::device_uvector<f_t> dummy(0, stream_view_);
+      transpose_primal_dual_to_row(
+        pdhg_solver_.get_primal_solution(), pdhg_solver_.get_dual_solution(), dummy);
+      if (settings_.hyper_params.use_reflected_primal_dual) {
+        transpose_primal_dual_to_row(pdhg_solver_.get_potential_next_primal_solution(),
+                                     pdhg_solver_.get_potential_next_dual_solution(),
+                                     dummy);
+        transpose_primal_dual_to_row(restart_strategy_.last_restart_duality_gap_.primal_solution_,
+                                     restart_strategy_.last_restart_duality_gap_.dual_solution_,
+                                     dummy);
+      }
     }
-  }
 
-  if (verbose) {
-    std::cout << "primal_size_h_ " << primal_size_h_ << " dual_size_h_ " << dual_size_h_ << " nnz "
-              << problem_ptr->nnz << std::endl;
-    std::cout << "Problem before scaling" << std::endl;
-    print_problem_info<f_t>(
-      problem_ptr->coefficients, problem_ptr->objective_coefficients, problem_ptr->combined_bounds);
-    std::cout << "Problem after scaling" << std::endl;
-    print_problem_info<f_t>(op_problem_scaled_.coefficients,
-                            op_problem_scaled_.objective_coefficients,
-                            op_problem_scaled_.combined_bounds);
-    raft::print_device_vector("Initial step_size", step_size_.data(), 1, std::cout);
-    raft::print_device_vector("Initial primal_weight", primal_weight_.data(), 1, std::cout);
-    raft::print_device_vector("Initial primal_step_size", primal_step_size_.data(), 1, std::cout);
-    raft::print_device_vector("Initial dual_step_size", dual_step_size_.data(), 1, std::cout);
-  }
+    if (verbose) {
+      std::cout << "primal_size_h_ " << primal_size_h_ << " dual_size_h_ " << dual_size_h_
+                << " nnz " << problem_ptr->nnz << std::endl;
+      std::cout << "Problem before scaling" << std::endl;
+      print_problem_info<f_t>(problem_ptr->coefficients,
+                              problem_ptr->objective_coefficients,
+                              problem_ptr->combined_bounds);
+      std::cout << "Problem after scaling" << std::endl;
+      print_problem_info<f_t>(op_problem_scaled_.coefficients,
+                              op_problem_scaled_.objective_coefficients,
+                              op_problem_scaled_.combined_bounds);
+      raft::print_device_vector("Initial step_size", step_size_.data(), 1, std::cout);
+      raft::print_device_vector("Initial primal_weight", primal_weight_.data(), 1, std::cout);
+      raft::print_device_vector("Initial primal_step_size", primal_step_size_.data(), 1, std::cout);
+      raft::print_device_vector("Initial dual_step_size", dual_step_size_.data(), 1, std::cout);
+    }
 #ifdef CUPDLP_DEBUG_MODE
-  raft::print_device_vector("Initial step_size", step_size_.data(), step_size_.size(), std::cout);
-  raft::print_device_vector(
-    "Initial primal_weight", primal_weight_.data(), primal_weight_.size(), std::cout);
+    raft::print_device_vector("Initial step_size", step_size_.data(), step_size_.size(), std::cout);
+    raft::print_device_vector(
+      "Initial primal_weight", primal_weight_.data(), primal_weight_.size(), std::cout);
 #endif
 
-  bool warm_start_was_given = settings_.get_pdlp_warm_start_data().is_populated();
+    bool warm_start_was_given = settings_.get_pdlp_warm_start_data().is_populated();
 
-  if (!inside_mip_) {
-    CUOPT_LOG_INFO(
-      "   Iter    Primal Obj.      Dual Obj.    Gap        Primal Res.  Dual Res.   Time");
+    if (!inside_mip_) {
+      CUOPT_LOG_INFO(
+        "   Iter    Primal Obj.      Dual Obj.    Gap        Primal Res.  Dual Res.   Time");
+    }
   }
   while (true) {
 #ifdef CUPDLP_DEBUG_MODE
diff --git a/cpp/src/pdlp/pdlp.cuh b/cpp/src/pdlp/pdlp.cuh
index 532f038fbf..598d93ec33 100644
--- a/cpp/src/pdlp/pdlp.cuh
+++ b/cpp/src/pdlp/pdlp.cuh
@@ -62,11 +62,11 @@ class pdlp_solver_t {
   pdlp_solver_t(problem_t<i_t, f_t>& op_problem,
                 pdlp_solver_settings_t<i_t, f_t> const& settings,
                 bool is_batch_mode = false);
-  
+
   // Distributed Solver Constructor
   pdlp_solver_t(problem_t<i_t, f_t>& op_problem,
-    pdlp_solver_settings_t<i_t, f_t> const& settings,
-    int num_gpus);
+                pdlp_solver_settings_t<i_t, f_t> const& settings,
+                int num_gpus);
 
   optimization_problem_solution_t<i_t, f_t> run_solver(const timer_t& timer);
 

From 0965a60dd6300638174761e686936303aef97030 Mon Sep 17 00:00:00 2001
From: Bulle Mostovoi <vmostovoi@nvidia.com>
Date: Wed, 20 May 2026 16:41:07 +0200
Subject: [PATCH 11/67] added distributed transform

---
 .../distributed_pdlp/multi_gpu_engine.hpp     | 47 +++++++++++++++++++
 cpp/src/pdlp/pdlp.cu                          | 13 +++++
 2 files changed, 60 insertions(+)

diff --git a/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.hpp b/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.hpp
index e191a89d60..94f6b8584a 100644
--- a/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.hpp
+++ b/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.hpp
@@ -42,6 +42,53 @@ struct multi_gpu_engine_t {
   multi_gpu_engine_t(const multi_gpu_engine_t&)            = delete;
   multi_gpu_engine_t& operator=(const multi_gpu_engine_t&) = delete;
 
+
+
+  template <typename Fn>
+  void for_each_shard(Fn&& fn)
+  {
+    for (auto& s : shards) {
+      raft::device_setter guard(s->device_id);   
+      fn(*s);                                     
+    }
+  }
+
+  template <typename... InAccess,
+          typename OutAccess,
+          typename SizeAccess,
+          typename Op>
+  void distributed_transform(std::tuple<InAccess...> in_accessors,
+                            OutAccess                out,
+                            SizeAccess               sz,
+                            Op                       op)
+  {
+    for_each_shard([&](auto& shard) {
+      auto& sub = *shard.sub_pdlp;
+      // turns the Tuple of lambdas into a tuple of rmm::device_uvector
+      auto cub_inputs = std::apply(
+        [&sub](auto&... acc) { return cuda::std::make_tuple(acc(sub)...); },
+        in_accessors);
+
+      cub::DeviceTransform::Transform(cub_inputs,
+                                      out(sub),
+                                      sz(sub),
+                                      op,
+                                      shard.stream.view());
+    });
+  }
+  // --- 2) convenience: single input accessor (delegates) ---
+  template <typename InAccess,
+  typename OutAccess,
+  typename SizeAccess,
+  typename Op>
+  void distributed_transform(InAccess   in,
+                  OutAccess  out,
+                  SizeAccess sz,
+                  Op         op)
+  {
+  distributed_transform(std::make_tuple(in), out, sz, op);
+  }
+
   // Engine-level stream for fork/join orchestration (master side).
   rmm::cuda_stream stream;
 
diff --git a/cpp/src/pdlp/pdlp.cu b/cpp/src/pdlp/pdlp.cu
index 2a36c160fd..12717ce45b 100644
--- a/cpp/src/pdlp/pdlp.cu
+++ b/cpp/src/pdlp/pdlp.cu
@@ -489,6 +489,19 @@ pdlp_solver_t<i_t, f_t>::pdlp_solver_t(problem_t<i_t, f_t>& op_problem,
     raft::copy(sub.primal_step_size_.data(), primal_step_size_.data(), 1, shard->stream);
     raft::copy(sub.dual_step_size_.data(), dual_step_size_.data(), 1, shard->stream);
   }
+
+  // Project initial primal solution
+  if (settings_.hyper_params.project_initial_primal) {
+    using f_t2 = typename type_2<f_t>::type;
+    
+    multi_gpu_engine->distributed_transform(
+      std::make_tuple(
+        [](auto& s) -> auto& { return s.pdhg_solver_.get_primal_solution().data();},
+        [](auto& s) -> auto& { return s.get_op_problem_scaled().variable_bounds.data();}),
+      [](auto& s) -> auto& { return s.pdhg_solver_.get_primal_solution().data(); },  
+      [](auto& s) -> auto { return s.pdhg_solver_.get_primal_solution().size(); },  
+      clamp<f_t, f_t2>()
+    )
 }
 
 template <typename i_t, typename f_t>

From d4d1cab460a8b06163a749d7496099c185baa2de Mon Sep 17 00:00:00 2001
From: Bulle Mostovoi <vmostovoi@nvidia.com>
Date: Wed, 20 May 2026 16:45:59 +0200
Subject: [PATCH 12/67] added semicolon and existing runtime error enum

---
 cpp/src/pdlp/pdlp.cu | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/cpp/src/pdlp/pdlp.cu b/cpp/src/pdlp/pdlp.cu
index 12717ce45b..6ef586a8b8 100644
--- a/cpp/src/pdlp/pdlp.cu
+++ b/cpp/src/pdlp/pdlp.cu
@@ -334,9 +334,9 @@ pdlp_solver_t<i_t, f_t>::pdlp_solver_t(problem_t<i_t, f_t>& op_problem,
       settings.multi_gpu_partition_file);
   } else {
     cuopt_expects(false,
-                  error_type_t::NotImplemented,
-                  "Metis partitioning inside cuopt not implemented yet; "
-                  "provide a --parts file via settings.multi_gpu_partition_file");
+      error_type_t::RuntimeError,
+      "Metis partitioning inside cuopt not implemented yet; "
+      "provide a --parts file via settings.multi_gpu_partition_file");
   }
 
   // always compute initial step size before scaling and primal_weight after scaling to do like
@@ -501,7 +501,7 @@ pdlp_solver_t<i_t, f_t>::pdlp_solver_t(problem_t<i_t, f_t>& op_problem,
       [](auto& s) -> auto& { return s.pdhg_solver_.get_primal_solution().data(); },  
       [](auto& s) -> auto { return s.pdhg_solver_.get_primal_solution().size(); },  
       clamp<f_t, f_t2>()
-    )
+    );
 }
 
 template <typename i_t, typename f_t>

From 6659dd9768a9db5504c3ef480bacf148f66f8f33 Mon Sep 17 00:00:00 2001
From: Bulle Mostovoi <vmostovoi@nvidia.com>
Date: Wed, 20 May 2026 16:49:05 +0200
Subject: [PATCH 13/67] added } and fixed cuopt_expects in partition loader

---
 cpp/src/pdlp/distributed_pdlp/partition_loader.cu | 11 +++++++----
 cpp/src/pdlp/pdlp.cu                              |  1 +
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/cpp/src/pdlp/distributed_pdlp/partition_loader.cu b/cpp/src/pdlp/distributed_pdlp/partition_loader.cu
index 6c96e0b63d..007df4ce1c 100644
--- a/cpp/src/pdlp/distributed_pdlp/partition_loader.cu
+++ b/cpp/src/pdlp/distributed_pdlp/partition_loader.cu
@@ -18,8 +18,10 @@ std::vector<i_t> partition_loader_t<i_t, f_t>::parse_distributed_pdlp_partition_
   std::string const& file)
 {
   std::ifstream part_file(file);
-  cuopt_expects(
-    part_file.is_open(), error_type_t::ValidationError, "Failed to open partition file: " + file);
+  cuopt_expects(part_file.is_open(),
+  error_type_t::ValidationError,
+  "Failed to open partition file: %s",
+  file.c_str());
 
   // One integer per line; operator>> skips whitespace so blank lines and
   // trailing newlines are tolerated.
@@ -31,8 +33,9 @@ std::vector<i_t> partition_loader_t<i_t, f_t>::parse_distributed_pdlp_partition_
 
   // We must have hit EOF cleanly; any other state means a malformed token.
   cuopt_expects(part_file.eof(),
-                error_type_t::ValidationError,
-                "Malformed partition file (expected one integer per line): " + file);
+  error_type_t::ValidationError,
+  "Malformed partition file (expected one integer per line): %s",
+  file.c_str());
 
   return parts;
 }
diff --git a/cpp/src/pdlp/pdlp.cu b/cpp/src/pdlp/pdlp.cu
index 6ef586a8b8..ee91c874a4 100644
--- a/cpp/src/pdlp/pdlp.cu
+++ b/cpp/src/pdlp/pdlp.cu
@@ -502,6 +502,7 @@ pdlp_solver_t<i_t, f_t>::pdlp_solver_t(problem_t<i_t, f_t>& op_problem,
       [](auto& s) -> auto { return s.pdhg_solver_.get_primal_solution().size(); },  
       clamp<f_t, f_t2>()
     );
+  }
 }
 
 template <typename i_t, typename f_t>

From b2ed271234f8ad3ec9483aff4fdb9f3aa5d6b21f Mon Sep 17 00:00:00 2001
From: Bulle Mostovoi <vmostovoi@nvidia.com>
Date: Wed, 20 May 2026 16:58:37 +0200
Subject: [PATCH 14/67] small bug fixes

---
 cpp/src/pdlp/pdlp.cu | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/cpp/src/pdlp/pdlp.cu b/cpp/src/pdlp/pdlp.cu
index ee91c874a4..a2aa14a78a 100644
--- a/cpp/src/pdlp/pdlp.cu
+++ b/cpp/src/pdlp/pdlp.cu
@@ -480,7 +480,7 @@ pdlp_solver_t<i_t, f_t>::pdlp_solver_t(problem_t<i_t, f_t>& op_problem,
                            op_problem_scaled_.presolve_data.objective_scaling_factor,
                            sub_pdlp_settings);
 
-  for (auto& shard : multi_gpu_engine.shards) {
+  for (auto& shard : multi_gpu_engine->shards) {
     raft::device_setter guard(shard->device_id);
     auto& sub = *shard->sub_pdlp;
     raft::copy(sub.step_size_.data(), step_size_.data(), 1, shard->stream);
@@ -2441,6 +2441,8 @@ optimization_problem_solution_t<i_t, f_t> pdlp_solver_t<i_t, f_t>::run_solver(co
   std::cout << "Starting PDLP loop:" << std::endl;
 #endif
 
+  bool warm_start_was_given = settings_.get_pdlp_warm_start_data().is_populated();
+
   // In distributed mode, skip all setup, it is done before
   if (!settings_.hyper_params.use_distributed_pdlp) {
     // TODO handle that properly
@@ -2644,7 +2646,6 @@ optimization_problem_solution_t<i_t, f_t> pdlp_solver_t<i_t, f_t>::run_solver(co
       "Initial primal_weight", primal_weight_.data(), primal_weight_.size(), std::cout);
 #endif
 
-    bool warm_start_was_given = settings_.get_pdlp_warm_start_data().is_populated();
 
     if (!inside_mip_) {
       CUOPT_LOG_INFO(

From 50d16ce7d5abb8a06bf69f4be3e96b4bb19c3f0d Mon Sep 17 00:00:00 2001
From: Bulle Mostovoi <vmostovoi@nvidia.com>
Date: Wed, 20 May 2026 17:04:18 +0200
Subject: [PATCH 15/67] =?UTF-8?q?a=20version=20that=20compiles=20#heheha?=
 =?UTF-8?q?=20=F0=9F=98=8E=F0=9F=98=8E=F0=9F=98=8E=F0=9F=98=8E?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 cpp/src/pdlp/pdlp.cu | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/cpp/src/pdlp/pdlp.cu b/cpp/src/pdlp/pdlp.cu
index a2aa14a78a..1cc987291f 100644
--- a/cpp/src/pdlp/pdlp.cu
+++ b/cpp/src/pdlp/pdlp.cu
@@ -496,10 +496,10 @@ pdlp_solver_t<i_t, f_t>::pdlp_solver_t(problem_t<i_t, f_t>& op_problem,
     
     multi_gpu_engine->distributed_transform(
       std::make_tuple(
-        [](auto& s) -> auto& { return s.pdhg_solver_.get_primal_solution().data();},
-        [](auto& s) -> auto& { return s.get_op_problem_scaled().variable_bounds.data();}),
-      [](auto& s) -> auto& { return s.pdhg_solver_.get_primal_solution().data(); },  
-      [](auto& s) -> auto { return s.pdhg_solver_.get_primal_solution().size(); },  
+        [](auto& s) { return s.pdhg_solver_.get_primal_solution().data();},
+        [](auto& s) { return s.get_op_problem_scaled().variable_bounds.data();}),
+      [](auto& s) { return s.pdhg_solver_.get_primal_solution().data(); },  
+      [](auto& s) { return s.pdhg_solver_.get_primal_solution().size(); },  
       clamp<f_t, f_t2>()
     );
   }

From 359d9f49693afb5a22ea767114dd8e3b20414c9a Mon Sep 17 00:00:00 2001
From: Bulle Mostovoi <vmostovoi@nvidia.com>
Date: Thu, 21 May 2026 10:51:26 +0200
Subject: [PATCH 16/67] removed use of engine distributed_transform

---
 cpp/src/pdlp/pdlp.cu | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/cpp/src/pdlp/pdlp.cu b/cpp/src/pdlp/pdlp.cu
index 1cc987291f..d5422c9d5f 100644
--- a/cpp/src/pdlp/pdlp.cu
+++ b/cpp/src/pdlp/pdlp.cu
@@ -493,15 +493,16 @@ pdlp_solver_t<i_t, f_t>::pdlp_solver_t(problem_t<i_t, f_t>& op_problem,
   // Project initial primal solution
   if (settings_.hyper_params.project_initial_primal) {
     using f_t2 = typename type_2<f_t>::type;
-    
-    multi_gpu_engine->distributed_transform(
-      std::make_tuple(
-        [](auto& s) { return s.pdhg_solver_.get_primal_solution().data();},
-        [](auto& s) { return s.get_op_problem_scaled().variable_bounds.data();}),
-      [](auto& s) { return s.pdhg_solver_.get_primal_solution().data(); },  
-      [](auto& s) { return s.pdhg_solver_.get_primal_solution().size(); },  
-      clamp<f_t, f_t2>()
-    );
+    for (auto& shard : multi_gpu_engine->shards) {
+      raft::device_setter guard(shard->device_id);
+      auto& sub = *shard->sub_pdlp;
+      cub::DeviceTransform::Transform(
+        std::make_tuple(sub.pdhg_solver_.get_primal_solution().data(),
+                        sub.get_op_problem_scaled().variable_bounds.data()),
+        sub.pdhg_solver_.get_primal_solution().data(),
+        sub.pdhg_solver_.get_primal_solution().size(),
+        clamp<f_t, f_t2>(), shard->stream);
+    }
   }
 }
 

From 910a49ab4346d08f8c70f4fa3ee9523becf3d9a5 Mon Sep 17 00:00:00 2001
From: Bulle Mostovoi <vmostovoi@nvidia.com>
Date: Fri, 22 May 2026 11:08:52 +0200
Subject: [PATCH 17/67] added multi-gpu SpMV #heheha

---
 .../pdlp/distributed_pdlp/multi_gpu_engine.cu |   4 +
 .../distributed_pdlp/multi_gpu_engine.hpp     | 147 ++++++++++++++++++
 cpp/src/pdlp/distributed_pdlp/shard.cu        |  24 +++
 cpp/src/pdlp/distributed_pdlp/shard.hpp       |  11 ++
 cpp/src/pdlp/pdhg.cu                          |  23 +++
 cpp/src/pdlp/pdhg.hpp                         |  27 +++-
 cpp/src/pdlp/pdlp.cu                          |   4 +
 7 files changed, 238 insertions(+), 2 deletions(-)

diff --git a/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.cu b/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.cu
index fe95b1e5ff..a0b3f5dcc3 100644
--- a/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.cu
+++ b/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.cu
@@ -4,6 +4,10 @@
  */
 
 #include <pdlp/distributed_pdlp/multi_gpu_engine.hpp>
+// compute_A_x() / compute_At_y() (defined inline in the engine header) call
+// shard.sub_pdlp->pdhg_solver_.compute_* — pdlp_solver_t must be complete at
+// the explicit instantiation point below.
+#include <pdlp/pdlp.cuh>
 
 #include <cuopt/error.hpp>
 
diff --git a/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.hpp b/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.hpp
index 94f6b8584a..9ea007947e 100644
--- a/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.hpp
+++ b/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.hpp
@@ -6,12 +6,23 @@
 
 #include <pdlp/distributed_pdlp/rank_data.hpp>
 #include <pdlp/distributed_pdlp/shard.hpp>
+#include <pdlp/pdhg.hpp>
 
 #include <cuopt/linear_programming/pdlp/solver_settings.hpp>
 
+#include <raft/core/device_setter.hpp>
+
 #include <rmm/cuda_stream.hpp>
+#include <rmm/exec_policy.hpp>
+
+#include <cub/device/device_transform.cuh>
+#include <cuda/std/tuple>
+#include <thrust/gather.h>
+
+#include <nccl.h>
 
 #include <memory>
+#include <tuple>
 #include <vector>
 
 namespace cuopt::linear_programming::detail {
@@ -89,6 +100,142 @@ struct multi_gpu_engine_t {
   distributed_transform(std::make_tuple(in), out, sz, op);
   }
 
+  // -------- Halo exchange (variables / x) ---------------------------------
+  // Fills the halo slice [owned_var_size, total_var_size) of the per-shard
+  // reflected_primal vector (the buffer A @ x reads). Step 1: thrust::gather
+  // per-peer outgoing values into staging buffers. Step 2: a single NCCL
+  // group with matched ncclSend / ncclRecv across all (rank, peer) pairs.
+  void halo_exchange_var()
+  {
+    const int nb = static_cast<int>(shards.size());
+
+    // Step 1: gather owned values that each peer needs into per-peer staging.
+    for (int r = 0; r < nb; ++r) {
+      auto& s = *shards[r];
+      raft::device_setter guard(s.device_id);
+      auto& x = s.sub_pdlp->pdhg_solver_.get_reflected_primal();
+      for (int peer = 0; peer < nb; ++peer) {
+        if (peer == r) continue;
+        if (s.var_send_indices_d[peer].size() == 0) continue;
+        thrust::gather(rmm::exec_policy_nosync(s.stream.view()),
+                       s.var_send_indices_d[peer].begin(),
+                       s.var_send_indices_d[peer].end(),
+                       x.begin(),
+                       s.var_send_buf_d[peer].begin());
+      }
+    }
+
+    // Step 2: matched send / recv across the whole topology in one NCCL group.
+    ncclGroupStart();
+    for (int r = 0; r < nb; ++r) {
+      auto& s = *shards[r];
+      raft::device_setter guard(s.device_id);
+      for (int peer = 0; peer < nb; ++peer) {
+        if (peer == r) continue;
+        ncclSend(s.var_send_buf_d[peer].data(),
+                 s.var_send_buf_d[peer].size(),
+                 ncclFloat64,
+                 peer,
+                 s.comm.get(),
+                 s.stream.view().value());
+      }
+    }
+    for (int r = 0; r < nb; ++r) {
+      auto& s   = *shards[r];
+      auto& rd  = s.rank_data;
+      raft::device_setter guard(s.device_id);
+      auto& x   = s.sub_pdlp->pdhg_solver_.get_reflected_primal();
+      for (int peer = 0; peer < nb; ++peer) {
+        if (peer == r) continue;
+        f_t* recv_ptr = x.data() + rd.owned_var_size + rd.var_recv_offsets[peer];
+        ncclRecv(recv_ptr,
+                 static_cast<size_t>(rd.var_recv_counts[peer]),
+                 ncclFloat64,
+                 peer,
+                 s.comm.get(),
+                 s.stream.view().value());
+      }
+    }
+    ncclGroupEnd();
+  }
+
+  // -------- Halo exchange (constraints / y) -------------------------------
+  // Same as halo_exchange_var but for the per-shard dual solution (the buffer
+  // A_T @ y reads) and constraint halos.
+  void halo_exchange_cstr()
+  {
+    const int nb = static_cast<int>(shards.size());
+
+    for (int r = 0; r < nb; ++r) {
+      auto& s = *shards[r];
+      raft::device_setter guard(s.device_id);
+      auto& y = s.sub_pdlp->pdhg_solver_.get_dual_solution();
+      for (int peer = 0; peer < nb; ++peer) {
+        if (peer == r) continue;
+        if (s.cstr_send_indices_d[peer].size() == 0) continue;
+        thrust::gather(rmm::exec_policy_nosync(s.stream.view()),
+                       s.cstr_send_indices_d[peer].begin(),
+                       s.cstr_send_indices_d[peer].end(),
+                       y.begin(),
+                       s.cstr_send_buf_d[peer].begin());
+      }
+    }
+
+    ncclGroupStart();
+    for (int r = 0; r < nb; ++r) {
+      auto& s = *shards[r];
+      raft::device_setter guard(s.device_id);
+      for (int peer = 0; peer < nb; ++peer) {
+        if (peer == r) continue;
+        ncclSend(s.cstr_send_buf_d[peer].data(),
+                 s.cstr_send_buf_d[peer].size(),
+                 ncclFloat64,
+                 peer,
+                 s.comm.get(),
+                 s.stream.view().value());
+      }
+    }
+    for (int r = 0; r < nb; ++r) {
+      auto& s   = *shards[r];
+      auto& rd  = s.rank_data;
+      raft::device_setter guard(s.device_id);
+      auto& y   = s.sub_pdlp->pdhg_solver_.get_dual_solution();
+      for (int peer = 0; peer < nb; ++peer) {
+        if (peer == r) continue;
+        f_t* recv_ptr = y.data() + rd.owned_cstr_size + rd.cstr_recv_offsets[peer];
+        ncclRecv(recv_ptr,
+                 static_cast<size_t>(rd.cstr_recv_counts[peer]),
+                 ncclFloat64,
+                 peer,
+                 s.comm.get(),
+                 s.stream.view().value());
+      }
+    }
+    ncclGroupEnd();
+  }
+
+  // -------- High-level: A @ x and A_T @ y ---------------------------------
+  // A @ x: halo-update the reflected_primal vector, then per-shard SpMV.
+  // Named distributed_* (rather than compute_*) to make call sites in pdhg.cu
+  // self-documenting and to avoid name collision with pdhg_solver_t's own
+  // compute_A_x / compute_At_y, which the engine dispatches into per shard.
+  void distributed_compute_A_x()
+  {
+    halo_exchange_var();
+    for_each_shard([&](auto& shard) {
+      shard.sub_pdlp->pdhg_solver_.compute_A_x();
+    });
+  }
+
+  // A_T @ y: halo-update the dual solution vector, then per-shard SpMV.
+  void distributed_compute_At_y()
+  {
+    halo_exchange_cstr();
+    for_each_shard([&](auto& shard) {
+      shard.sub_pdlp->pdhg_solver_.compute_At_y();
+    });
+  }
+
   // Engine-level stream for fork/join orchestration (master side).
   rmm::cuda_stream stream;
 
diff --git a/cpp/src/pdlp/distributed_pdlp/shard.cu b/cpp/src/pdlp/distributed_pdlp/shard.cu
index 596a08a3dc..bbc02559cf 100644
--- a/cpp/src/pdlp/distributed_pdlp/shard.cu
+++ b/cpp/src/pdlp/distributed_pdlp/shard.cu
@@ -195,6 +195,30 @@ pdlp_shard_t<i_t, f_t>::pdlp_shard_t(int device_id,
   scaling.set_h_bound_rescaling(h_bound_rescaling);
   scaling.set_h_objective_rescaling(h_objective_rescaling);
 
+  // ---- 6. Build per-peer halo-exchange plans (ported from metis_tests). ----
+  // For each peer p, we precompute:
+  //   send_indices_d[p] : local indices to gather (uploaded from host send plan)
+  //   send_buf_d[p]     : f_t staging buffer sized to match
+  // Self-peer slot is present but empty (size 0). Used in engine halo exchange.
+  auto build_send_plan = [&](auto const& send_per_peer,
+                             auto& indices_d,
+                             auto& buf_d) {
+    const std::size_t n_peers = send_per_peer.size();
+    indices_d.reserve(n_peers);
+    buf_d.reserve(n_peers);
+    for (auto const& send_to_peer : send_per_peer) {
+      rmm::device_uvector<i_t> idx(send_to_peer.size(), stream_view);
+      rmm::device_uvector<f_t> buf(send_to_peer.size(), stream_view);
+      if (!send_to_peer.empty()) {
+        raft::copy(idx.data(), send_to_peer.data(), send_to_peer.size(), stream_view);
+      }
+      indices_d.emplace_back(std::move(idx));
+      buf_d.emplace_back(std::move(buf));
+    }
+  };
+  build_send_plan(rank_data.var_send_per_peer,  var_send_indices_d,  var_send_buf_d);
+  build_send_plan(rank_data.cstr_send_per_peer, cstr_send_indices_d, cstr_send_buf_d);
+
   handle.sync_stream(stream_view);
 }
 
diff --git a/cpp/src/pdlp/distributed_pdlp/shard.hpp b/cpp/src/pdlp/distributed_pdlp/shard.hpp
index a5ff89c5c4..35babc12db 100644
--- a/cpp/src/pdlp/distributed_pdlp/shard.hpp
+++ b/cpp/src/pdlp/distributed_pdlp/shard.hpp
@@ -13,6 +13,7 @@
 #include <raft/core/device_setter.hpp>
 #include <raft/core/handle.hpp>
 #include <rmm/cuda_stream.hpp>
+#include <rmm/device_uvector.hpp>
 
 #include <nccl.h>
 
@@ -83,6 +84,16 @@ struct pdlp_shard_t {
   std::optional<optimization_problem_t<i_t, f_t>> opt_problem;
   std::optional<problem_t<i_t, f_t>> sub_problem;
   std::unique_ptr<pdlp_solver_t<i_t, f_t>> sub_pdlp;
+
+  // Per-peer halo-exchange state. Inner index = peer rank.
+  // Slot for self (peer == this rank) is present but unused (size 0).
+  // var_send_indices_d[peer] : local indices into primal vector to gather and ncclSend
+  // var_send_buf_d    [peer] : staging buffer for outgoing variable values
+  // cstr_send_indices_d/cstr_send_buf_d : same, for dual vector
+  std::vector<rmm::device_uvector<i_t>> var_send_indices_d;
+  std::vector<rmm::device_uvector<f_t>> var_send_buf_d;
+  std::vector<rmm::device_uvector<i_t>> cstr_send_indices_d;
+  std::vector<rmm::device_uvector<f_t>> cstr_send_buf_d;
 };
 
 }  // namespace cuopt::linear_programming::detail
diff --git a/cpp/src/pdlp/pdhg.cu b/cpp/src/pdlp/pdhg.cu
index cb16c9d662..9cf2087c8b 100644
--- a/cpp/src/pdlp/pdhg.cu
+++ b/cpp/src/pdlp/pdhg.cu
@@ -5,6 +5,11 @@
  */
 /* clang-format on */
 #include <pdlp/pdhg.hpp>
+// pdlp.cuh defines pdlp_solver_t which the engine's compute_A_x/compute_At_y
+// template bodies dereference via shard.sub_pdlp->pdhg_solver_. Must be a
+// complete type at the point of template instantiation below.
+#include <pdlp/pdlp.cuh>
+#include <pdlp/distributed_pdlp/multi_gpu_engine.hpp>
 #include <pdlp/pdlp_climber_strategy.hpp>
 #include <pdlp/pdlp_constants.hpp>
 #include <pdlp/swap_and_resize_helper.cuh>
@@ -306,6 +311,15 @@ void pdhg_solver_t<i_t, f_t>::compute_At_y()
 {
   // A_t @ y
 
+  // Multi-GPU dispatch: when the master pdhg has an engine, drive halo
+  // exchange + per-shard SpMV via the engine. Shards' pdhg_solver_ have no
+  // engine pointer set, so their compute_At_y falls through to the cusparse
+  // path below on each shard's local A_t.
+  if (mgpu_engine_ != nullptr) {
+    mgpu_engine_->distributed_compute_At_y();
+    return;
+  }
+
   if (!batch_mode_) {
     if constexpr (std::is_same_v<f_t, double>) {
       if (cusparse_view_.mixed_precision_enabled_) {
@@ -354,6 +368,15 @@ template <typename i_t, typename f_t>
 void pdhg_solver_t<i_t, f_t>::compute_A_x()
 {
   // A @ x
+
+  // Multi-GPU dispatch: see compute_At_y. The engine halo-updates the
+  // reflected_primal vector (the buffer this SpMV reads) and then drives
+  // per-shard local cusparse SpMV.
+  if (mgpu_engine_ != nullptr) {
+    mgpu_engine_->distributed_compute_A_x();
+    return;
+  }
+
   if (!batch_mode_) {
     if constexpr (std::is_same_v<f_t, double>) {
       if (cusparse_view_.mixed_precision_enabled_) {
diff --git a/cpp/src/pdlp/pdhg.hpp b/cpp/src/pdlp/pdhg.hpp
index 0a64e49efb..d258afb091 100644
--- a/cpp/src/pdlp/pdhg.hpp
+++ b/cpp/src/pdlp/pdhg.hpp
@@ -21,6 +21,12 @@
 #include <rmm/device_uvector.hpp>
 
 namespace cuopt::linear_programming::detail {
+
+// Forward-declared to avoid include cycle: multi_gpu_engine.hpp itself includes pdhg.hpp
+// (engine calls per-shard pdhg compute_*). pdhg.cu does the full include.
+template <typename i_t, typename f_t>
+struct multi_gpu_engine_t;
+
 template <typename i_t, typename f_t>
 class pdhg_solver_t {
  public:
@@ -69,6 +75,21 @@ class pdhg_solver_t {
   void update_solution(cusparse_view_t<i_t, f_t>& current_op_problem_evaluation_cusparse_view_);
   void refine_initial_primal_projection();
 
+  // SpMV primitives. Public so the multi-GPU engine can drive them per-shard
+  // after halo-exchanging the relevant vector. Single-GPU PDLP still calls
+  // them internally via take_step / compute_next_*.
+  //
+  // If set_multi_gpu_engine() has been called, these dispatch to the engine
+  // (halo exchange + per-shard SpMV). Otherwise they run the single-GPU
+  // cusparse path on the local matrix.
+  void compute_At_y();
+  void compute_A_x();
+
+  // Master PDLP wires up the engine pointer here after the engine is built.
+  // Shards' pdhg_solver_ leaves this null so each shard runs single-GPU SpMV
+  // on its local matrix.
+  void set_multi_gpu_engine(multi_gpu_engine_t<i_t, f_t>* engine) { mgpu_engine_ = engine; }
+
   i_t total_pdhg_iterations_;
 
  private:
@@ -84,8 +105,6 @@ class pdhg_solver_t {
 
   void compute_primal_projection_with_gradient(rmm::device_uvector<f_t>& primal_step_size);
   void compute_primal_projection(rmm::device_uvector<f_t>& primal_step_size);
-  void compute_At_y();
-  void compute_A_x();
 
   bool batch_mode_{false};
   raft::handle_t const* handle_ptr_{nullptr};
@@ -132,6 +151,10 @@ class pdhg_solver_t {
   rmm::device_uvector<f_t> new_bounds_lower_;
   rmm::device_uvector<f_t> new_bounds_upper_;
   cuda::fast_mod_div<size_t> batch_size_divisor_;
+
+  // Non-owning. Set on the master pdhg_solver_ in distributed mode; null
+  // (default) means single-GPU path. See compute_At_y / compute_A_x.
+  multi_gpu_engine_t<i_t, f_t>* mgpu_engine_{nullptr};
 };
 
 }  // namespace cuopt::linear_programming::detail
diff --git a/cpp/src/pdlp/pdlp.cu b/cpp/src/pdlp/pdlp.cu
index d5422c9d5f..348d41a512 100644
--- a/cpp/src/pdlp/pdlp.cu
+++ b/cpp/src/pdlp/pdlp.cu
@@ -490,6 +490,10 @@ pdlp_solver_t<i_t, f_t>::pdlp_solver_t(problem_t<i_t, f_t>& op_problem,
     raft::copy(sub.dual_step_size_.data(), dual_step_size_.data(), 1, shard->stream);
   }
 
+  // Wire the engine into the master pdhg_solver_. Shards' pdhg_solver_ keep
+  // mgpu_engine_ == nullptr so they run plain single-GPU SpMV on local A.
+  pdhg_solver_.set_multi_gpu_engine(&*multi_gpu_engine);
+
   // Project initial primal solution
   if (settings_.hyper_params.project_initial_primal) {
     using f_t2 = typename type_2<f_t>::type;

From 76c0b3f50b96647d23534729a68c6b2f5702848d Mon Sep 17 00:00:00 2001
From: Bulle Mostovoi <vmostovoi@nvidia.com>
Date: Fri, 22 May 2026 11:48:51 +0200
Subject: [PATCH 18/67] transformed a transform. it compiles hehe

---
 cpp/src/pdlp/pdhg.cu  | 40 +++++++++++++++++++++++++++++-----------
 cpp/src/pdlp/pdhg.hpp |  7 +++++++
 cpp/src/pdlp/pdlp.cu  |  7 ++++---
 cpp/src/pdlp/pdlp.cuh |  6 ++++++
 4 files changed, 46 insertions(+), 14 deletions(-)

diff --git a/cpp/src/pdlp/pdhg.cu b/cpp/src/pdlp/pdhg.cu
index 9cf2087c8b..09d439cc0e 100644
--- a/cpp/src/pdlp/pdhg.cu
+++ b/cpp/src/pdlp/pdhg.cu
@@ -521,6 +521,26 @@ struct primal_reflected_major_projection {
   const f_t* scalar_;
 };
 
+// Pure cub-transform extract — body byte-identical to the non-batch inline
+// path in compute_next_primal_dual_solution_reflected. The platform dispatch
+// (single-GPU vs per-shard fan-out) lives at the call site, not here.
+// Placed after primal_reflected_major_projection so the functor is visible.
+template <typename i_t, typename f_t>
+void pdhg_solver_t<i_t, f_t>::primal_reflected_major_projection_transform(
+  rmm::device_uvector<f_t>& primal_step_size)
+{
+  cub::DeviceTransform::Transform(
+    cuda::std::make_tuple(current_saddle_point_state_.get_primal_solution().data(),
+                          problem_ptr->objective_coefficients.data(),
+                          current_saddle_point_state_.get_current_AtY().data(),
+                          problem_ptr->variable_bounds.data()),
+    thrust::make_zip_iterator(
+      potential_next_primal_solution_.data(), dual_slack_.data(), reflected_primal_.data()),
+    primal_size_h_,
+    primal_reflected_major_projection<f_t>(primal_step_size.data()),
+    stream_view_.value());
+}
+
 template <typename f_t>
 struct primal_reflected_major_projection_batch {
   using f_t2 = typename type_2<f_t>::type;
@@ -910,17 +930,15 @@ void pdhg_solver_t<i_t, f_t>::compute_next_primal_dual_solution_reflected(
       graph_all.start_capture(should_major);
 
       compute_At_y();
-      if (!batch_mode_) {
-        cub::DeviceTransform::Transform(
-          cuda::std::make_tuple(current_saddle_point_state_.get_primal_solution().data(),
-                                problem_ptr->objective_coefficients.data(),
-                                current_saddle_point_state_.get_current_AtY().data(),
-                                problem_ptr->variable_bounds.data()),
-          thrust::make_zip_iterator(
-            potential_next_primal_solution_.data(), dual_slack_.data(), reflected_primal_.data()),
-          primal_size_h_,
-          primal_reflected_major_projection<f_t>(primal_step_size.data()),
-          stream_view_.value());
+      if (mgpu_engine_ != nullptr) {
+        for (auto& shard : mgpu_engine_->shards) {
+          raft::device_setter guard(shard->device_id);
+          auto& sub_pdlp = *shard->sub_pdlp;
+          sub_pdlp.pdhg_solver_.primal_reflected_major_projection_transform(
+            sub_pdlp.get_primal_step_size());
+        }
+      } else if (!batch_mode_) {
+        primal_reflected_major_projection_transform(primal_step_size);
       } else {
         cub::DeviceFor::Bulk(potential_next_primal_solution_.size(),
                              primal_reflected_major_projection_bulk_op<f_t>{
diff --git a/cpp/src/pdlp/pdhg.hpp b/cpp/src/pdlp/pdhg.hpp
index d258afb091..3a1795ce6f 100644
--- a/cpp/src/pdlp/pdhg.hpp
+++ b/cpp/src/pdlp/pdhg.hpp
@@ -85,6 +85,13 @@ class pdhg_solver_t {
   void compute_At_y();
   void compute_A_x();
 
+  // Pure cub-transform extractions. Each one is byte-identical to the inline
+  // cub call it replaces — no platform dispatch inside. Callers handle the
+  // single-GPU vs per-shard branching at the call site (see the
+  // "if (mgpu_engine_) for shard..." blocks in compute_next_*).
+  void primal_reflected_major_projection_transform(
+    rmm::device_uvector<f_t>& primal_step_size);
+
   // Master PDLP wires up the engine pointer here after the engine is built.
   // Shards' pdhg_solver_ leaves this null so each shard runs single-GPU SpMV
   // on its local matrix.
diff --git a/cpp/src/pdlp/pdlp.cu b/cpp/src/pdlp/pdlp.cu
index 348d41a512..168f997724 100644
--- a/cpp/src/pdlp/pdlp.cu
+++ b/cpp/src/pdlp/pdlp.cu
@@ -501,11 +501,12 @@ pdlp_solver_t<i_t, f_t>::pdlp_solver_t(problem_t<i_t, f_t>& op_problem,
       raft::device_setter guard(shard->device_id);
       auto& sub = *shard->sub_pdlp;
       cub::DeviceTransform::Transform(
-        std::make_tuple(sub.pdhg_solver_.get_primal_solution().data(),
-                        sub.get_op_problem_scaled().variable_bounds.data()),
+        cuda::std::make_tuple(sub.pdhg_solver_.get_primal_solution().data(),
+                              sub.get_op_problem_scaled().variable_bounds.data()),
         sub.pdhg_solver_.get_primal_solution().data(),
         sub.pdhg_solver_.get_primal_solution().size(),
-        clamp<f_t, f_t2>(), shard->stream);
+        clamp<f_t, f_t2>(),
+        shard->stream.view());
     }
   }
 }
diff --git a/cpp/src/pdlp/pdlp.cuh b/cpp/src/pdlp/pdlp.cuh
index 598d93ec33..6b2bc35a24 100644
--- a/cpp/src/pdlp/pdlp.cuh
+++ b/cpp/src/pdlp/pdlp.cuh
@@ -114,6 +114,12 @@ class pdlp_solver_t {
     return initial_scaling_strategy_;
   }
 
+  // Per-shard primal/dual step sizes are private state on pdlp_solver_t but
+  // are needed inside the multi-GPU dispatch paths that fan out a master cub
+  // call across all shards' pdhg_solver_t::*_transform methods.
+  rmm::device_uvector<f_t>& get_primal_step_size() { return primal_step_size_; }
+  rmm::device_uvector<f_t>& get_dual_step_size() { return dual_step_size_; }
+
  private:
   void print_termination_criteria(const timer_t& timer, bool is_average = false);
   void print_final_termination_criteria(

From 5ec713842159df18170a2c6798f8c92344c789e6 Mon Sep 17 00:00:00 2001
From: Bulle Mostovoi <vmostovoi@nvidia.com>
Date: Fri, 22 May 2026 12:55:29 +0200
Subject: [PATCH 19/67] updated take step for distributed. compiles but doesn't
 run. will check on main

---
 cpp/CMakeLists.txt    |  29 ++++++++++++
 cpp/src/pdlp/pdhg.cu  | 102 +++++++++++++++++++++++++++++-------------
 cpp/src/pdlp/pdhg.hpp |   3 ++
 cpp/src/pdlp/pdlp.cu  |  15 +++++++
 4 files changed, 119 insertions(+), 30 deletions(-)

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index e7b4693547..627e086343 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -288,6 +288,34 @@ create_logger_macros(CUOPT "cuopt::default_logger()" include/cuopt)
 
 find_package(CUDSS REQUIRED)
 
+# ##################################################################################################
+# - NCCL (multi-GPU distributed PDLP) -------------------------------------------------------------
+# NCCL is shipped via the conda env; no canonical CMake config target, so look it
+# up by name in the standard lib paths (plus CONDA_PREFIX as a hint).
+set(NCCL_HINT_PREFIXES "")
+if (DEFINED ENV{CONDA_PREFIX} AND NOT "$ENV{CONDA_PREFIX}" STREQUAL "")
+    list(APPEND NCCL_HINT_PREFIXES "$ENV{CONDA_PREFIX}")
+endif ()
+find_path(NCCL_INCLUDE_DIR
+    NAMES nccl.h
+    HINTS ${NCCL_HINT_PREFIXES}
+    PATH_SUFFIXES include
+)
+find_library(NCCL_LIBRARY
+    NAMES nccl
+    HINTS ${NCCL_HINT_PREFIXES}
+    PATH_SUFFIXES lib lib64
+)
+if (NOT NCCL_INCLUDE_DIR OR NOT NCCL_LIBRARY)
+    message(FATAL_ERROR "NCCL not found. Looked in ${NCCL_HINT_PREFIXES}. Install nccl-dev / libnccl-dev in the active env.")
+endif ()
+add_library(nccl_external UNKNOWN IMPORTED GLOBAL)
+set_target_properties(nccl_external PROPERTIES
+    IMPORTED_LOCATION "${NCCL_LIBRARY}"
+    INTERFACE_INCLUDE_DIRECTORIES "${NCCL_INCLUDE_DIR}"
+)
+message(STATUS "Using NCCL: ${NCCL_LIBRARY}")
+
 # ##################################################################################################
 # - gRPC and Protobuf setup -----------------------------------------------------------------------
 
@@ -549,6 +577,7 @@ target_link_libraries(cuopt
         ${CUDSS_LIB_FILE}
         PRIVATE
         ${CUOPT_PRIVATE_CUDA_LIBS}
+        nccl_external
         $<$<BOOL:${CUOPT_ENABLE_GRPC}>:protobuf::libprotobuf>
         $<$<BOOL:${CUOPT_ENABLE_GRPC}>:gRPC::grpc++>
 )
diff --git a/cpp/src/pdlp/pdhg.cu b/cpp/src/pdlp/pdhg.cu
index 09d439cc0e..eb60a43603 100644
--- a/cpp/src/pdlp/pdhg.cu
+++ b/cpp/src/pdlp/pdhg.cu
@@ -569,6 +569,21 @@ struct primal_reflected_projection {
   const f_t* scalar_;
 };
 
+template <typename i_t, typename f_t>
+void pdhg_solver_t<i_t, f_t>::primal_reflected_projection_transform(
+  rmm::device_uvector<f_t>& primal_step_size)
+{
+  cub::DeviceTransform::Transform(
+    cuda::std::make_tuple(current_saddle_point_state_.get_primal_solution().data(),
+                          problem_ptr->objective_coefficients.data(),
+                          current_saddle_point_state_.get_current_AtY().data(),
+                          problem_ptr->variable_bounds.data()),
+    reflected_primal_.data(),
+    primal_size_h_,
+    primal_reflected_projection<f_t>(primal_step_size.data()),
+    stream_view_.value());
+}
+
 template <typename f_t>
 struct primal_reflected_projection_batch {
   using f_t2 = typename type_2<f_t>::type;
@@ -598,6 +613,21 @@ struct dual_reflected_major_projection {
   const f_t* scalar_;
 };
 
+template <typename i_t, typename f_t>
+void pdhg_solver_t<i_t, f_t>::dual_reflected_major_projection_transform(
+  rmm::device_uvector<f_t>& dual_step_size)
+{
+  cub::DeviceTransform::Transform(
+    cuda::std::make_tuple(current_saddle_point_state_.get_dual_solution().data(),
+                          current_saddle_point_state_.get_dual_gradient().data(),
+                          problem_ptr->constraint_lower_bounds.data(),
+                          problem_ptr->constraint_upper_bounds.data()),
+    thrust::make_zip_iterator(potential_next_dual_solution_.data(), reflected_dual_.data()),
+    dual_size_h_,
+    dual_reflected_major_projection<f_t>(dual_step_size.data()),
+    stream_view_.value());
+}
+
 template <typename f_t>
 struct dual_reflected_major_projection_batch {
   HDI thrust::tuple<f_t, f_t> operator()(
@@ -626,6 +656,21 @@ struct dual_reflected_projection {
   const f_t* scalar_;
 };
 
+template <typename i_t, typename f_t>
+void pdhg_solver_t<i_t, f_t>::dual_reflected_projection_transform(
+  rmm::device_uvector<f_t>& dual_step_size)
+{
+  cub::DeviceTransform::Transform(
+    cuda::std::make_tuple(current_saddle_point_state_.get_dual_solution().data(),
+                          current_saddle_point_state_.get_dual_gradient().data(),
+                          problem_ptr->constraint_lower_bounds.data(),
+                          problem_ptr->constraint_upper_bounds.data()),
+    reflected_dual_.data(),
+    dual_size_h_,
+    dual_reflected_projection<f_t>(dual_step_size.data()),
+    stream_view_.value());
+}
+
 template <typename f_t>
 struct dual_reflected_projection_batch {
   HDI f_t
@@ -989,16 +1034,15 @@ void pdhg_solver_t<i_t, f_t>::compute_next_primal_dual_solution_reflected(
       // Compute next dual
       compute_A_x();
 
-      if (!batch_mode_) {
-        cub::DeviceTransform::Transform(
-          cuda::std::make_tuple(current_saddle_point_state_.get_dual_solution().data(),
-                                current_saddle_point_state_.get_dual_gradient().data(),
-                                problem_ptr->constraint_lower_bounds.data(),
-                                problem_ptr->constraint_upper_bounds.data()),
-          thrust::make_zip_iterator(potential_next_dual_solution_.data(), reflected_dual_.data()),
-          dual_size_h_,
-          dual_reflected_major_projection<f_t>(dual_step_size.data()),
-          stream_view_.value());
+      if (mgpu_engine_ != nullptr) {
+        for (auto& shard : mgpu_engine_->shards) {
+          raft::device_setter guard(shard->device_id);
+          auto& sub_pdlp = *shard->sub_pdlp;
+          sub_pdlp.pdhg_solver_.dual_reflected_major_projection_transform(
+            sub_pdlp.get_dual_step_size());
+        }
+      } else if (!batch_mode_) {
+        dual_reflected_major_projection_transform(dual_step_size);
       } else {
         cub::DeviceFor::Bulk(potential_next_dual_solution_.size(),
                              dual_reflected_major_projection_bulk_op<f_t>{
@@ -1036,16 +1080,15 @@ void pdhg_solver_t<i_t, f_t>::compute_next_primal_dual_solution_reflected(
             current_saddle_point_state_.get_current_AtY());
 #endif
 
-      if (!batch_mode_) {
-        cub::DeviceTransform::Transform(
-          cuda::std::make_tuple(current_saddle_point_state_.get_primal_solution().data(),
-                                problem_ptr->objective_coefficients.data(),
-                                current_saddle_point_state_.get_current_AtY().data(),
-                                problem_ptr->variable_bounds.data()),
-          reflected_primal_.data(),
-          primal_size_h_,
-          primal_reflected_projection<f_t>(primal_step_size.data()),
-          stream_view_.value());
+      if (mgpu_engine_ != nullptr) {
+        for (auto& shard : mgpu_engine_->shards) {
+          raft::device_setter guard(shard->device_id);
+          auto& sub_pdlp = *shard->sub_pdlp;
+          sub_pdlp.pdhg_solver_.primal_reflected_projection_transform(
+            sub_pdlp.get_primal_step_size());
+        }
+      } else if (!batch_mode_) {
+        primal_reflected_projection_transform(primal_step_size);
       } else {
         cub::DeviceFor::Bulk(reflected_primal_.size(),
                              primal_reflected_projection_bulk_op<f_t>{
@@ -1097,16 +1140,15 @@ void pdhg_solver_t<i_t, f_t>::compute_next_primal_dual_solution_reflected(
       // Compute next dual
       compute_A_x();
 
-      if (!batch_mode_) {
-        cub::DeviceTransform::Transform(
-          cuda::std::make_tuple(current_saddle_point_state_.get_dual_solution().data(),
-                                current_saddle_point_state_.get_dual_gradient().data(),
-                                problem_ptr->constraint_lower_bounds.data(),
-                                problem_ptr->constraint_upper_bounds.data()),
-          reflected_dual_.data(),
-          dual_size_h_,
-          dual_reflected_projection<f_t>(dual_step_size.data()),
-          stream_view_.value());
+      if (mgpu_engine_ != nullptr) {
+        for (auto& shard : mgpu_engine_->shards) {
+          raft::device_setter guard(shard->device_id);
+          auto& sub_pdlp = *shard->sub_pdlp;
+          sub_pdlp.pdhg_solver_.dual_reflected_projection_transform(
+            sub_pdlp.get_dual_step_size());
+        }
+      } else if (!batch_mode_) {
+        dual_reflected_projection_transform(dual_step_size);
       } else {
         cub::DeviceFor::Bulk(reflected_dual_.size(),
                              dual_reflected_projection_bulk_op<f_t>{
diff --git a/cpp/src/pdlp/pdhg.hpp b/cpp/src/pdlp/pdhg.hpp
index 3a1795ce6f..628c3897e2 100644
--- a/cpp/src/pdlp/pdhg.hpp
+++ b/cpp/src/pdlp/pdhg.hpp
@@ -91,6 +91,9 @@ class pdhg_solver_t {
   // "if (mgpu_engine_) for shard..." blocks in compute_next_*).
   void primal_reflected_major_projection_transform(
     rmm::device_uvector<f_t>& primal_step_size);
+  void dual_reflected_major_projection_transform(rmm::device_uvector<f_t>& dual_step_size);
+  void primal_reflected_projection_transform(rmm::device_uvector<f_t>& primal_step_size);
+  void dual_reflected_projection_transform(rmm::device_uvector<f_t>& dual_step_size);
 
   // Master PDLP wires up the engine pointer here after the engine is built.
   // Shards' pdhg_solver_ leaves this null so each shard runs single-GPU SpMV
diff --git a/cpp/src/pdlp/pdlp.cu b/cpp/src/pdlp/pdlp.cu
index 168f997724..37de2d8537 100644
--- a/cpp/src/pdlp/pdlp.cu
+++ b/cpp/src/pdlp/pdlp.cu
@@ -42,6 +42,7 @@
 
 #include <cmath>
 #include <optional>
+#include <type_traits>
 #include <unordered_set>
 
 namespace cuopt::linear_programming::detail {
@@ -327,6 +328,19 @@ pdlp_solver_t<i_t, f_t>::pdlp_solver_t(problem_t<i_t, f_t>& op_problem,
   cuopt_expects(num_gpus == settings.num_gpus && settings.num_gpus > 1,
                 error_type_t::ValidationError,
                 "This constructor should only be used for distributed PDLP (num_gpus > 1)");
+
+  // Distributed PDLP is currently double-only. The body is guarded with
+  // `if constexpr` so the float instantiation never references the
+  // multi_gpu_engine_t<i_t, float> / partition_loader_t<i_t, float> symbols
+  // (those are intentionally not instantiated in their .cu files), keeping
+  // the link clean. Trying to use distributed PDLP with f_t = float will
+  // throw at runtime instead.
+  if constexpr (!std::is_same_v<f_t, double>) {
+    cuopt_expects(false,
+                  error_type_t::ValidationError,
+                  "Distributed PDLP (num_gpus > 1) currently requires double precision");
+    return;
+  } else {
   // 2. Load partition
   std::vector<i_t> parts;
   if (!settings.multi_gpu_partition_file.empty()) {
@@ -509,6 +523,7 @@ pdlp_solver_t<i_t, f_t>::pdlp_solver_t(problem_t<i_t, f_t>& op_problem,
         shard->stream.view());
     }
   }
+  }  // end if constexpr (std::is_same_v<f_t, double>)
 }
 
 template <typename i_t, typename f_t>

From de19f38f5e771e8470d0ae8e711676fb47912c4f Mon Sep 17 00:00:00 2001
From: Bulle Mostovoi <vmostovoi@nvidia.com>
Date: Fri, 22 May 2026 13:40:31 +0200
Subject: [PATCH 20/67] support spmvop on multi-gpu

---
 cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.hpp | 4 ++--
 cpp/src/pdlp/distributed_pdlp/shard.cu             | 2 ++
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.hpp b/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.hpp
index 9ea007947e..e9f48b9666 100644
--- a/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.hpp
+++ b/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.hpp
@@ -223,7 +223,7 @@ struct multi_gpu_engine_t {
   {
     halo_exchange_var();
     for_each_shard([&](auto& shard) {
-      shard.sub_pdlp->pdhg_solver_.compute_A_x();
+      shard.sub_pdlp->pdhg_solver_.spmvop_A_x();
     });
   }
 
@@ -232,7 +232,7 @@ struct multi_gpu_engine_t {
   {
     halo_exchange_cstr();
     for_each_shard([&](auto& shard) {
-      shard.sub_pdlp->pdhg_solver_.compute_At_y();
+      shard.sub_pdlp->pdhg_solver_.spmvop_At_y();
     });
   }
 
diff --git a/cpp/src/pdlp/distributed_pdlp/shard.cu b/cpp/src/pdlp/distributed_pdlp/shard.cu
index bbc02559cf..06c6f8c8de 100644
--- a/cpp/src/pdlp/distributed_pdlp/shard.cu
+++ b/cpp/src/pdlp/distributed_pdlp/shard.cu
@@ -195,6 +195,8 @@ pdlp_shard_t<i_t, f_t>::pdlp_shard_t(int device_id,
   scaling.set_h_bound_rescaling(h_bound_rescaling);
   scaling.set_h_objective_rescaling(h_objective_rescaling);
 
+  sub_pdlp->pdhg_solver_.get_cusparse_view().create_spmv_op_plans(
+    /* is_reflected */ true);
   // ---- 6. Build per-peer halo-exchange plans (ported from metis_tests). ----
   // For each peer p, we precompute:
   //   send_indices_d[p] : local indices to gather (uploaded from host send plan)

From 0030a6c5d7b3f9e22c3da791c25a09869679f3e0 Mon Sep 17 00:00:00 2001
From: Bulle Mostovoi <vmostovoi@nvidia.com>
Date: Fri, 22 May 2026 14:06:47 +0200
Subject: [PATCH 21/67] compile ready

---
 .../initial_scaling_strategy/initial_scaling.cu    | 14 ++++++++++----
 cpp/src/pdlp/pdhg.hpp                              |  6 +++---
 cpp/src/pdlp/pdlp.cu                               |  4 +++-
 3 files changed, 16 insertions(+), 8 deletions(-)

diff --git a/cpp/src/pdlp/initial_scaling_strategy/initial_scaling.cu b/cpp/src/pdlp/initial_scaling_strategy/initial_scaling.cu
index eb1bae2e95..fd6e02079e 100644
--- a/cpp/src/pdlp/initial_scaling_strategy/initial_scaling.cu
+++ b/cpp/src/pdlp/initial_scaling_strategy/initial_scaling.cu
@@ -938,15 +938,21 @@ void pdlp_initial_scaling_strategy_t<i_t, f_t>::set_cummulative_scaling(
 template <typename i_t, typename f_t>
 void pdlp_initial_scaling_strategy_t<i_t, f_t>::set_h_bound_rescaling(f_t value)
 {
-  h_bound_rescaling = value;
-  bound_rescaling_.set_value_async(value, stream_view_);
+  std::fill(h_bound_rescaling_.begin(), h_bound_rescaling_.end(), value);
+  thrust::fill(handle_ptr_->get_thrust_policy(),
+               bound_rescaling_.begin(),
+               bound_rescaling_.end(),
+               value);
 }
 
 template <typename i_t, typename f_t>
 void pdlp_initial_scaling_strategy_t<i_t, f_t>::set_h_objective_rescaling(f_t value)
 {
-  h_objective_rescaling = value;
-  objective_rescaling_.set_value_async(value, stream_view_);
+  std::fill(h_objective_rescaling_.begin(), h_objective_rescaling_.end(), value);
+  thrust::fill(handle_ptr_->get_thrust_policy(),
+               objective_rescaling_.begin(),
+               objective_rescaling_.end(),
+               value);
 }
 
 template <typename i_t, typename f_t>
diff --git a/cpp/src/pdlp/pdhg.hpp b/cpp/src/pdlp/pdhg.hpp
index 7b2e606864..8226d2cecc 100644
--- a/cpp/src/pdlp/pdhg.hpp
+++ b/cpp/src/pdlp/pdhg.hpp
@@ -91,7 +91,9 @@ class pdhg_solver_t {
   // cusparse path on the local matrix.
   void compute_At_y();
   void compute_A_x();
-
+  void spmvop_At_y();
+  void spmvop_A_x();
+  
   // Pure cub-transform extractions. Each one is byte-identical to the inline
   // cub call it replaces — no platform dispatch inside. Callers handle the
   // single-GPU vs per-shard branching at the call site (see the
@@ -124,8 +126,6 @@ class pdhg_solver_t {
 
   void compute_primal_projection_with_gradient(rmm::device_uvector<f_t>& primal_step_size);
   void compute_primal_projection(rmm::device_uvector<f_t>& primal_step_size);
-  void spmvop_At_y();
-  void spmvop_A_x();
 
   bool batch_mode_{false};
   raft::handle_t const* handle_ptr_{nullptr};
diff --git a/cpp/src/pdlp/pdlp.cu b/cpp/src/pdlp/pdlp.cu
index 8ceb712aff..ec7ed16c30 100644
--- a/cpp/src/pdlp/pdlp.cu
+++ b/cpp/src/pdlp/pdlp.cu
@@ -567,6 +567,7 @@ pdlp_solver_t<i_t, f_t>::pdlp_solver_t(problem_t<i_t, f_t>& op_problem,
 
   // Project initial primal solution
   if (settings_.hyper_params.project_initial_primal) {
+    // Use refine_initial_primal_projection ???
     using f_t2 = typename type_2<f_t>::type;
     for (auto& shard : multi_gpu_engine->shards) {
       raft::device_setter guard(shard->device_id);
@@ -2672,7 +2673,8 @@ optimization_problem_solution_t<i_t, f_t> pdlp_solver_t<i_t, f_t>::run_solver(co
         clamp<f_t, f_t2>(),
         stream_view_.value());
 
-      pdhg_solver_.refine_initial_primal_projection();
+      pdhg_solver_.refine_initial_primal_projection(
+        initial_scaling_strategy_.get_bound_rescaling_vector());
 
       if (!settings_.hyper_params.never_restart_to_average) {
         cuopt_expects(!batch_mode_,

From 172ebc29da1eb892da5a2fe22d2df1f57d93f773 Mon Sep 17 00:00:00 2001
From: Bulle Mostovoi <vmostovoi@nvidia.com>
Date: Fri, 22 May 2026 14:14:10 +0200
Subject: [PATCH 22/67] can run now

---
 cpp/src/pdlp/distributed_pdlp/shard.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/src/pdlp/distributed_pdlp/shard.cu b/cpp/src/pdlp/distributed_pdlp/shard.cu
index 06c6f8c8de..c66b03755e 100644
--- a/cpp/src/pdlp/distributed_pdlp/shard.cu
+++ b/cpp/src/pdlp/distributed_pdlp/shard.cu
@@ -225,6 +225,6 @@ pdlp_shard_t<i_t, f_t>::pdlp_shard_t(int device_id,
 }
 
 template struct pdlp_shard_t<int, double>;
-// template struct pdlp_shard_t<int, float>;
+template struct pdlp_shard_t<int, float>;
 
 }  // namespace cuopt::linear_programming::detail

From 23d07981d0dd7b84d953437aafd921292f4db4d8 Mon Sep 17 00:00:00 2001
From: Bulle Mostovoi <vmostovoi@nvidia.com>
Date: Fri, 22 May 2026 14:57:51 +0200
Subject: [PATCH 23/67] passing all tests, good merge

---
 cpp/src/pdlp/pdlp.cu | 33 ++++++++++++++++++++++++++-------
 1 file changed, 26 insertions(+), 7 deletions(-)

diff --git a/cpp/src/pdlp/pdlp.cu b/cpp/src/pdlp/pdlp.cu
index ec7ed16c30..b5fe5ad6ca 100644
--- a/cpp/src/pdlp/pdlp.cu
+++ b/cpp/src/pdlp/pdlp.cu
@@ -2665,13 +2665,31 @@ optimization_problem_solution_t<i_t, f_t> pdlp_solver_t<i_t, f_t>::run_solver(co
     // Project initial primal solution
     if (settings_.hyper_params.project_initial_primal) {
       using f_t2 = typename type_2<f_t>::type;
-      cub::DeviceTransform::Transform(
-        cuda::std::make_tuple(pdhg_solver_.get_primal_solution().data(),
-                              problem_wrap_container(op_problem_scaled_.variable_bounds)),
-        pdhg_solver_.get_primal_solution().data(),
-        pdhg_solver_.get_primal_solution().size(),
-        clamp<f_t, f_t2>(),
-        stream_view_.value());
+      if (batch_mode_) {
+        // In batch mode variable_bounds are shared and only the bound rescaling is per climber.
+        // Apply it here too so the initial point is projected into the correct scaled space.
+        cub::DeviceTransform::Transform(
+          cuda::std::make_tuple(
+            pdhg_solver_.get_primal_solution().data(),
+            thrust::make_transform_iterator(
+              thrust::make_zip_iterator(
+                problem_wrap_container(op_problem_scaled_.variable_bounds),
+                batch_wrapped_container(initial_scaling_strategy_.get_bound_rescaling_vector(),
+                                        primal_size_h_)),
+              scale_bounds_by_scalar_op<f_t>{})),
+          pdhg_solver_.get_primal_solution().data(),
+          pdhg_solver_.get_primal_solution().size(),
+          clamp<f_t, f_t2>(),
+          stream_view_.value());
+      } else {
+        cub::DeviceTransform::Transform(
+          cuda::std::make_tuple(pdhg_solver_.get_primal_solution().data(),
+                                problem_wrap_container(op_problem_scaled_.variable_bounds)),
+          pdhg_solver_.get_primal_solution().data(),
+          pdhg_solver_.get_primal_solution().size(),
+          clamp<f_t, f_t2>(),
+          stream_view_.value());
+      }
 
       pdhg_solver_.refine_initial_primal_projection(
         initial_scaling_strategy_.get_bound_rescaling_vector());
@@ -2718,6 +2736,7 @@ optimization_problem_solution_t<i_t, f_t> pdlp_solver_t<i_t, f_t>::run_solver(co
                                      restart_strategy_.last_restart_duality_gap_.dual_solution_,
                                      dummy);
       }
+      transpose_problem_fields(/*to_row=*/true);
     }
 
     if (verbose) {

From 30881ce2393292d2d4b7422f682857df074798c7 Mon Sep 17 00:00:00 2001
From: Bulle Mostovoi <vmostovoi@nvidia.com>
Date: Fri, 22 May 2026 16:24:24 +0200
Subject: [PATCH 24/67] fixed the errors hihi, finished distributed part for
 compute_fixed_error

---
 .../distributed_pdlp/multi_gpu_engine.hpp     |  99 ++++++++++---
 cpp/src/pdlp/pdhg.cu                          |  45 ++++++
 cpp/src/pdlp/pdhg.hpp                         |   9 ++
 cpp/src/pdlp/pdlp.cu                          | 133 ++++++++++++++----
 .../adaptive_step_size_strategy.cu            |  20 +++
 .../adaptive_step_size_strategy.hpp           |   7 +
 6 files changed, 266 insertions(+), 47 deletions(-)

diff --git a/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.hpp b/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.hpp
index e9f48b9666..6d9cf9d3a3 100644
--- a/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.hpp
+++ b/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.hpp
@@ -102,10 +102,12 @@ struct multi_gpu_engine_t {
 
   // -------- Halo exchange (variables / x) ---------------------------------
   // Fills the halo slice [owned_var_size, total_var_size) of the per-shard
-  // reflected_primal vector (the buffer A @ x reads). Step 1: thrust::gather
-  // per-peer outgoing values into staging buffers. Step 2: a single NCCL
-  // group with matched ncclSend / ncclRecv across all (rank, peer) pairs.
-  void halo_exchange_var()
+  // input buffer returned by `buf_access(pdhg)` (the buffer A @ x will read).
+  // Step 1: thrust::gather per-peer outgoing values into staging buffers.
+  // Step 2: a single NCCL group with matched ncclSend / ncclRecv across all
+  // (rank, peer) pairs.
+  template <typename BufAccess>
+  void halo_exchange_var(BufAccess&& buf_access)
   {
     const int nb = static_cast<int>(shards.size());
 
@@ -113,7 +115,7 @@ struct multi_gpu_engine_t {
     for (int r = 0; r < nb; ++r) {
       auto& s = *shards[r];
       raft::device_setter guard(s.device_id);
-      auto& x = s.sub_pdlp->pdhg_solver_.get_reflected_primal();
+      auto& x = buf_access(s.sub_pdlp->pdhg_solver_);
       for (int peer = 0; peer < nb; ++peer) {
         if (peer == r) continue;
         if (s.var_send_indices_d[peer].size() == 0) continue;
@@ -144,7 +146,7 @@ struct multi_gpu_engine_t {
       auto& s   = *shards[r];
       auto& rd  = s.rank_data;
       raft::device_setter guard(s.device_id);
-      auto& x   = s.sub_pdlp->pdhg_solver_.get_reflected_primal();
+      auto& x   = buf_access(s.sub_pdlp->pdhg_solver_);
       for (int peer = 0; peer < nb; ++peer) {
         if (peer == r) continue;
         f_t* recv_ptr = x.data() + rd.owned_var_size + rd.var_recv_offsets[peer];
@@ -160,16 +162,17 @@ struct multi_gpu_engine_t {
   }
 
   // -------- Halo exchange (constraints / y) -------------------------------
-  // Same as halo_exchange_var but for the per-shard dual solution (the buffer
-  // A_T @ y reads) and constraint halos.
-  void halo_exchange_cstr()
+  // Same as halo_exchange_var but for a constraint-shaped buffer (the input
+  // A_T @ y will read) and constraint halos.
+  template <typename BufAccess>
+  void halo_exchange_cstr(BufAccess&& buf_access)
   {
     const int nb = static_cast<int>(shards.size());
 
     for (int r = 0; r < nb; ++r) {
       auto& s = *shards[r];
       raft::device_setter guard(s.device_id);
-      auto& y = s.sub_pdlp->pdhg_solver_.get_dual_solution();
+      auto& y = buf_access(s.sub_pdlp->pdhg_solver_);
       for (int peer = 0; peer < nb; ++peer) {
         if (peer == r) continue;
         if (s.cstr_send_indices_d[peer].size() == 0) continue;
@@ -199,7 +202,7 @@ struct multi_gpu_engine_t {
       auto& s   = *shards[r];
       auto& rd  = s.rank_data;
       raft::device_setter guard(s.device_id);
-      auto& y   = s.sub_pdlp->pdhg_solver_.get_dual_solution();
+      auto& y   = buf_access(s.sub_pdlp->pdhg_solver_);
       for (int peer = 0; peer < nb; ++peer) {
         if (peer == r) continue;
         f_t* recv_ptr = y.data() + rd.owned_cstr_size + rd.cstr_recv_offsets[peer];
@@ -214,28 +217,78 @@ struct multi_gpu_engine_t {
     ncclGroupEnd();
   }
 
-  // -------- High-level: A @ x and A_T @ y ---------------------------------
-  // A @ x: halo-update the reflected_primal vector, then per-shard SpMV.
-  // Named distributed_* (rather than compute_*) to make call sites in pdhg.cu
-  // self-documenting and to avoid name collision with pdhg_solver_t's own
-  // compute_A_x / compute_At_y, which the engine dispatches into per shard.
-  void distributed_compute_A_x()
+  // -------- NCCL allreduce (sum, in place) --------------------------------
+  // Per-shard in-place sum-allreduce. Each shard's stream issues an
+  // ncclAllReduce(buf, buf, count, ncclFloat64, ncclSum, ...) inside a single
+  // group. After this returns, every shard's buffer holds the global sum.
+  //
+  // PtrAccess: pdlp_solver_t<i_t,f_t>& -> f_t*  (e.g. into step_size_strategy_).
+  template <typename PtrAccess>
+  void allreduce_sum_inplace(PtrAccess&& ptr_access, size_t count = 1)
+  {
+    ncclGroupStart();
+    for (auto& s : shards) {
+      raft::device_setter guard(s->device_id);
+      f_t* buf = ptr_access(*s->sub_pdlp);
+      ncclAllReduce(buf,
+                    buf,
+                    count,
+                    ncclFloat64,
+                    ncclSum,
+                    s->comm.get(),
+                    s->stream.view().value());
+    }
+    ncclGroupEnd();
+  }
+
+  // -------- Generic distributed SpMVs -------------------------------------
+  // distributed_spmv_A : halo-update the var-shaped input buffer returned by
+  // `in_buf(pdhg)`, then per-shard A @ in_buf -> out_desc.
+  // distributed_spmv_At: halo-update the cstr-shaped input buffer returned by
+  // `in_buf(pdhg)`, then per-shard A_T @ in_buf -> out_desc.
+  //
+  // Accessor signatures:
+  //   in_buf  (pdhg_solver_t<i_t,f_t>&) -> rmm::device_uvector<f_t>&
+  //   out_desc(pdhg_solver_t<i_t,f_t>&) -> cusparseDnVecDescr_t
+  template <typename InBufAccess, typename OutDescAccess>
+  void distributed_spmv_A(InBufAccess&& in_buf, OutDescAccess&& out_desc)
   {
-    halo_exchange_var();
+    halo_exchange_var(in_buf);
     for_each_shard([&](auto& shard) {
-      shard.sub_pdlp->pdhg_solver_.spmvop_A_x();
+      auto& sub_pdhg = shard.sub_pdlp->pdhg_solver_;
+      sub_pdhg.spmv_A_into(in_buf(sub_pdhg), out_desc(sub_pdhg));
     });
   }
 
-  // A_T @ y: halo-update the dual solution vector, then per-shard SpMV.
-  void distributed_compute_At_y()
+  template <typename InBufAccess, typename OutDescAccess>
+  void distributed_spmv_At(InBufAccess&& in_buf, OutDescAccess&& out_desc)
   {
-    halo_exchange_cstr();
+    halo_exchange_cstr(in_buf);
     for_each_shard([&](auto& shard) {
-      shard.sub_pdlp->pdhg_solver_.spmvop_At_y();
+      auto& sub_pdhg = shard.sub_pdlp->pdhg_solver_;
+      sub_pdhg.spmv_At_into(in_buf(sub_pdhg), out_desc(sub_pdhg));
     });
   }
 
+  // -------- High-level: A @ x and A_T @ y ---------------------------------
+  // Thin wrappers used from pdhg_solver_t::compute_A_x / compute_At_y when an
+  // engine is wired in. They use the canonical PDHG buffers/descriptors so the
+  // result lands where single-GPU PDHG would have put it (dual_gradient for A,
+  // current_AtY for A_T).
+  void distributed_compute_A_x()
+  {
+    distributed_spmv_A(
+      [](auto& pdhg) -> rmm::device_uvector<f_t>& { return pdhg.get_reflected_primal(); },
+      [](auto& pdhg) -> cusparseDnVecDescr_t { return pdhg.get_cusparse_view().dual_gradient; });
+  }
+
+  void distributed_compute_At_y()
+  {
+    distributed_spmv_At(
+      [](auto& pdhg) -> rmm::device_uvector<f_t>& { return pdhg.get_dual_solution(); },
+      [](auto& pdhg) -> cusparseDnVecDescr_t { return pdhg.get_cusparse_view().current_AtY; });
+  }
+
   // Engine-level stream for fork/join orchestration (master side).
   rmm::cuda_stream stream;
 
diff --git a/cpp/src/pdlp/pdhg.cu b/cpp/src/pdlp/pdhg.cu
index fb0fc9b611..56c61aedda 100644
--- a/cpp/src/pdlp/pdhg.cu
+++ b/cpp/src/pdlp/pdhg.cu
@@ -623,6 +623,51 @@ void pdhg_solver_t<i_t, f_t>::compute_A_x()
   }
 }
 
+template <typename i_t, typename f_t>
+void pdhg_solver_t<i_t, f_t>::spmv_At_into(rmm::device_uvector<f_t>& in_buf,
+                                           cusparseDnVecDescr_t out_desc)
+{
+  RAFT_CUSPARSE_TRY(cusparseDnVecSetValues(cusparse_view_.dual_solution, in_buf.data()));
+  RAFT_CUSPARSE_TRY(
+    raft::sparse::detail::cusparsespmv(handle_ptr_->get_cusparse_handle(),
+                                       CUSPARSE_OPERATION_NON_TRANSPOSE,
+                                       reusable_device_scalar_value_1_.data(),
+                                       cusparse_view_.A_T,
+                                       cusparse_view_.dual_solution,
+                                       reusable_device_scalar_value_0_.data(),
+                                       out_desc,
+                                       CUSPARSE_SPMV_CSR_ALG2,
+                                       (f_t*)cusparse_view_.buffer_transpose.data(),
+                                       stream_view_));
+  // Restore the canonical binding so subsequent code on this shard that reads
+  // cv.dual_solution sees the dual_solution_ buffer it was constructed with.
+  RAFT_CUSPARSE_TRY(cusparseDnVecSetValues(
+    cusparse_view_.dual_solution, current_saddle_point_state_.get_dual_solution().data()));
+}
+
+template <typename i_t, typename f_t>
+void pdhg_solver_t<i_t, f_t>::spmv_A_into(rmm::device_uvector<f_t>& in_buf,
+                                          cusparseDnVecDescr_t out_desc)
+{
+  RAFT_CUSPARSE_TRY(
+    cusparseDnVecSetValues(cusparse_view_.reflected_primal_solution, in_buf.data()));
+  RAFT_CUSPARSE_TRY(
+    raft::sparse::detail::cusparsespmv(handle_ptr_->get_cusparse_handle(),
+                                       CUSPARSE_OPERATION_NON_TRANSPOSE,
+                                       reusable_device_scalar_value_1_.data(),
+                                       cusparse_view_.A,
+                                       cusparse_view_.reflected_primal_solution,
+                                       reusable_device_scalar_value_0_.data(),
+                                       out_desc,
+                                       CUSPARSE_SPMV_CSR_ALG2,
+                                       (f_t*)cusparse_view_.buffer_non_transpose.data(),
+                                       stream_view_));
+  // Restore the canonical binding so subsequent code on this shard that reads
+  // cv.reflected_primal_solution sees the reflected_primal_ buffer.
+  RAFT_CUSPARSE_TRY(
+    cusparseDnVecSetValues(cusparse_view_.reflected_primal_solution, reflected_primal_.data()));
+}
+
 template <typename i_t, typename f_t>
 void pdhg_solver_t<i_t, f_t>::compute_primal_projection_with_gradient(
   rmm::device_uvector<f_t>& primal_step_size)
diff --git a/cpp/src/pdlp/pdhg.hpp b/cpp/src/pdlp/pdhg.hpp
index 8226d2cecc..8fbee24e71 100644
--- a/cpp/src/pdlp/pdhg.hpp
+++ b/cpp/src/pdlp/pdhg.hpp
@@ -93,6 +93,15 @@ class pdhg_solver_t {
   void compute_A_x();
   void spmvop_At_y();
   void spmvop_A_x();
+
+  // Parameterized SpMVs used by the multi-GPU engine.
+  // Both temporarily hijack a canonical input descriptor in cusparse_view_
+  // (cv.dual_solution for At, cv.reflected_primal_solution for A) to point at
+  // `in_buf.data()`, run the local SpMV into `out_desc`, then restore the
+  // descriptor to its original buffer so other code on this shard is unaffected.
+  // No multi-GPU dispatch inside — the engine is the orchestrator.
+  void spmv_At_into(rmm::device_uvector<f_t>& in_buf, cusparseDnVecDescr_t out_desc);
+  void spmv_A_into(rmm::device_uvector<f_t>& in_buf, cusparseDnVecDescr_t out_desc);
   
   // Pure cub-transform extractions. Each one is byte-identical to the inline
   // cub call it replaces — no platform dispatch inside. Callers handle the
diff --git a/cpp/src/pdlp/pdlp.cu b/cpp/src/pdlp/pdlp.cu
index b5fe5ad6ca..7203c11a42 100644
--- a/cpp/src/pdlp/pdlp.cu
+++ b/cpp/src/pdlp/pdlp.cu
@@ -2221,34 +2221,118 @@ void pdlp_solver_t<i_t, f_t>::compute_fixed_error(std::vector<int>& has_restarte
 
   // Computing the deltas
   // TODO batch mdoe: this only works if everyone restarts
-  cub::DeviceTransform::Transform(cuda::std::make_tuple(pdhg_solver_.get_reflected_primal().data(),
-                                                        pdhg_solver_.get_primal_solution().data()),
-                                  pdhg_solver_.get_saddle_point_state().get_delta_primal().data(),
-                                  pdhg_solver_.get_primal_solution().size(),
-                                  cuda::std::minus<f_t>{},
-                                  stream_view_.value());
-  cub::DeviceTransform::Transform(cuda::std::make_tuple(pdhg_solver_.get_reflected_dual().data(),
-                                                        pdhg_solver_.get_dual_solution().data()),
-                                  pdhg_solver_.get_saddle_point_state().get_delta_dual().data(),
-                                  pdhg_solver_.get_dual_solution().size(),
-                                  cuda::std::minus<f_t>{},
-                                  stream_view_.value());
+  if (multi_gpu_engine) {
+    // Go faire une fonction compute_delta_primal, compute_delta primal ? 
+    for (auto& shard : multi_gpu_engine->shards) {
+      raft::device_setter guard(shard->device_id);
+      auto& sub_pdhg = shard->sub_pdlp->pdhg_solver_;
+      cub::DeviceTransform::Transform(
+        cuda::std::make_tuple(sub_pdhg.get_reflected_primal().data(),
+                              sub_pdhg.get_primal_solution().data()),
+        sub_pdhg.get_saddle_point_state().get_delta_primal().data(),
+        sub_pdhg.get_primal_solution().size(),
+        cuda::std::minus<f_t>{},
+        shard->stream.view());
+      cub::DeviceTransform::Transform(
+        cuda::std::make_tuple(sub_pdhg.get_reflected_dual().data(),
+                              sub_pdhg.get_dual_solution().data()),
+        sub_pdhg.get_saddle_point_state().get_delta_dual().data(),
+        sub_pdhg.get_dual_solution().size(),
+        cuda::std::minus<f_t>{},
+        shard->stream.view());
+    }
+  } else {
+    cub::DeviceTransform::Transform(
+      cuda::std::make_tuple(pdhg_solver_.get_reflected_primal().data(),
+                            pdhg_solver_.get_primal_solution().data()),
+      pdhg_solver_.get_saddle_point_state().get_delta_primal().data(),
+      pdhg_solver_.get_primal_solution().size(),
+      cuda::std::minus<f_t>{},
+      stream_view_.value());
+    cub::DeviceTransform::Transform(
+      cuda::std::make_tuple(pdhg_solver_.get_reflected_dual().data(),
+                            pdhg_solver_.get_dual_solution().data()),
+      pdhg_solver_.get_saddle_point_state().get_delta_dual().data(),
+      pdhg_solver_.get_dual_solution().size(),
+      cuda::std::minus<f_t>{},
+      stream_view_.value());
+  }
 
   auto& cusparse_view = pdhg_solver_.get_cusparse_view();
-  // Sync to make sure all previous cuSparse operations are finished before setting the
-  // potential_next_dual_solution
-  RAFT_CUDA_TRY(cudaStreamSynchronize(stream_view_));
 
-  // Make potential_next_dual_solution point towards reflected dual solution to reuse the code
-  RAFT_CUSPARSE_TRY(cusparseDnVecSetValues(cusparse_view.potential_next_dual_solution,
-                                           (void*)pdhg_solver_.get_reflected_dual().data()));
+  if (multi_gpu_engine) {
 
-  if (batch_mode_)
-    RAFT_CUSPARSE_TRY(cusparseDnMatSetValues(cusparse_view.batch_potential_next_dual_solution,
+    // SpMV is the first operation in compute_interaction_and_movement so we can do halo before and call it naturally
+    // we then reduce the local dot products
+    multi_gpu_engine->halo_exchange_cstr(
+      [](auto& pdhg) -> rmm::device_uvector<f_t>& { return pdhg.get_reflected_dual(); });
+
+    for (auto& shard : multi_gpu_engine->shards) {
+      raft::device_setter guard(shard->device_id);
+      auto& sub_pdlp = *shard->sub_pdlp;
+      auto& sub_cv   = sub_pdlp.pdhg_solver_.get_cusparse_view();
+
+      RAFT_CUSPARSE_TRY(
+        cusparseDnVecSetValues(sub_cv.potential_next_dual_solution,
+                               (void*)sub_pdlp.pdhg_solver_.get_reflected_dual().data()));
+
+      sub_pdlp.step_size_strategy_.compute_interaction_and_movement(
+        sub_pdlp.pdhg_solver_.get_primal_tmp_resource(),
+        sub_cv,
+        sub_pdlp.pdhg_solver_.get_saddle_point_state());
+
+      RAFT_CUSPARSE_TRY(cusparseDnVecSetValues(
+        sub_cv.potential_next_dual_solution,
+        (void*)sub_pdlp.pdhg_solver_.get_potential_next_dual_solution().data()));
+    }
+
+    multi_gpu_engine->allreduce_sum_inplace(
+      [](auto& sp) -> f_t* { return sp.step_size_strategy_.get_interaction().data(); }, 1);
+    multi_gpu_engine->allreduce_sum_inplace(
+      [](auto& sp) -> f_t* {
+        return sp.step_size_strategy_.get_norm_squared_delta_primal().data();
+      },
+      1);
+    multi_gpu_engine->allreduce_sum_inplace(
+      [](auto& sp) -> f_t* {
+        return sp.step_size_strategy_.get_norm_squared_delta_dual().data();
+      },
+      1);
+
+    auto& s0 = *multi_gpu_engine->shards[0];
+    {
+      raft::device_setter guard(s0.device_id);
+      RAFT_CUDA_TRY(cudaStreamSynchronize(s0.stream.view().value()));
+    }
+    auto& src_sp = s0.sub_pdlp->step_size_strategy_;
+    raft::copy(step_size_strategy_.get_interaction().data(),
+               src_sp.get_interaction().data(),
+               1,
+               stream_view_);
+    raft::copy(step_size_strategy_.get_norm_squared_delta_primal().data(),
+               src_sp.get_norm_squared_delta_primal().data(),
+               1,
+               stream_view_);
+    raft::copy(step_size_strategy_.get_norm_squared_delta_dual().data(),
+               src_sp.get_norm_squared_delta_dual().data(),
+               1,
+               stream_view_);
+  } else {
+    // Sync to make sure all previous cuSparse operations are finished before setting the
+    // potential_next_dual_solution
+    RAFT_CUDA_TRY(cudaStreamSynchronize(stream_view_));
+
+    // Make potential_next_dual_solution point towards reflected dual solution to reuse the code
+    RAFT_CUSPARSE_TRY(cusparseDnVecSetValues(cusparse_view.potential_next_dual_solution,
                                              (void*)pdhg_solver_.get_reflected_dual().data()));
 
-  step_size_strategy_.compute_interaction_and_movement(
-    pdhg_solver_.get_primal_tmp_resource(), cusparse_view, pdhg_solver_.get_saddle_point_state());
+    if (batch_mode_)
+      RAFT_CUSPARSE_TRY(cusparseDnMatSetValues(cusparse_view.batch_potential_next_dual_solution,
+                                               (void*)pdhg_solver_.get_reflected_dual().data()));
+
+    step_size_strategy_.compute_interaction_and_movement(
+      pdhg_solver_.get_primal_tmp_resource(), cusparse_view, pdhg_solver_.get_saddle_point_state());
+  }
 
   if (batch_mode_) {
     const auto [grid_size, block_size] = kernel_config_from_batch_size(climber_strategies_.size());
@@ -2279,11 +2363,12 @@ void pdlp_solver_t<i_t, f_t>::compute_fixed_error(std::vector<int>& has_restarte
   // potential_next_dual_solution
   RAFT_CUDA_TRY(cudaStreamSynchronize(stream_view_));
 
-  // Put back
+  // Put back, already done in multi-gpu side
+  if (!multi_gpu_engine) {
   RAFT_CUSPARSE_TRY(
     cusparseDnVecSetValues(cusparse_view.potential_next_dual_solution,
                            (void*)pdhg_solver_.get_potential_next_dual_solution().data()));
-
+    }
   if (batch_mode_) {
     RAFT_CUSPARSE_TRY(
       cusparseDnMatSetValues(cusparse_view.batch_potential_next_dual_solution,
diff --git a/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.cu b/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.cu
index 1f137dc9ea..fb85be4280 100644
--- a/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.cu
+++ b/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.cu
@@ -309,6 +309,26 @@ adaptive_step_size_strategy_t<i_t, f_t>::get_norm_squared_delta_dual() const
   return norm_squared_delta_dual_;
 }
 
+template <typename i_t, typename f_t>
+rmm::device_uvector<f_t>& adaptive_step_size_strategy_t<i_t, f_t>::get_interaction()
+{
+  return interaction_;
+}
+
+template <typename i_t, typename f_t>
+rmm::device_uvector<f_t>&
+adaptive_step_size_strategy_t<i_t, f_t>::get_norm_squared_delta_primal()
+{
+  return norm_squared_delta_primal_;
+}
+
+template <typename i_t, typename f_t>
+rmm::device_uvector<f_t>&
+adaptive_step_size_strategy_t<i_t, f_t>::get_norm_squared_delta_dual()
+{
+  return norm_squared_delta_dual_;
+}
+
 template <typename i_t, typename f_t>
 void adaptive_step_size_strategy_t<i_t, f_t>::set_valid_step_size(i_t valid)
 {
diff --git a/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.hpp b/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.hpp
index 1e969150e7..896c6fa24e 100644
--- a/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.hpp
+++ b/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.hpp
@@ -81,6 +81,13 @@ class adaptive_step_size_strategy_t {
   const rmm::device_uvector<f_t>& get_norm_squared_delta_primal() const;
   const rmm::device_uvector<f_t>& get_norm_squared_delta_dual() const;
 
+  // Mutable overloads — used by the multi-GPU path to NCCL-allreduce the
+  // per-shard scalar contributions in place and to mirror them back to the
+  // master step_size_strategy_.
+  rmm::device_uvector<f_t>& get_interaction();
+  rmm::device_uvector<f_t>& get_norm_squared_delta_primal();
+  rmm::device_uvector<f_t>& get_norm_squared_delta_dual();
+
   void compute_interaction_and_movement(rmm::device_uvector<f_t>& tmp_primal,
                                         cusparse_view_t<i_t, f_t>& cusparse_view,
                                         saddle_point_state_t<i_t, f_t>& current_saddle_point_state);

From c33faf2d4d0ce0b00390553bae0c9c6e70b0c03d Mon Sep 17 00:00:00 2001
From: Bulle Mostovoi <vmostovoi@nvidia.com>
Date: Fri, 22 May 2026 16:27:00 +0200
Subject: [PATCH 25/67] style

---
 .../distributed_pdlp/multi_gpu_engine.hpp     |  60 +--
 .../pdlp/distributed_pdlp/partition_loader.cu |  12 +-
 cpp/src/pdlp/distributed_pdlp/shard.cu        |   6 +-
 .../initial_scaling.cu                        |   6 +-
 cpp/src/pdlp/pdhg.cu                          |  30 +-
 cpp/src/pdlp/pdhg.hpp                         |   5 +-
 cpp/src/pdlp/pdlp.cu                          | 424 +++++++++---------
 .../adaptive_step_size_strategy.cu            |   6 +-
 8 files changed, 259 insertions(+), 290 deletions(-)

diff --git a/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.hpp b/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.hpp
index 6d9cf9d3a3..001f9b760e 100644
--- a/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.hpp
+++ b/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.hpp
@@ -15,9 +15,9 @@
 #include <rmm/cuda_stream.hpp>
 #include <rmm/exec_policy.hpp>
 
+#include <thrust/gather.h>
 #include <cub/device/device_transform.cuh>
 #include <cuda/std/tuple>
-#include <thrust/gather.h>
 
 #include <nccl.h>
 
@@ -53,51 +53,35 @@ struct multi_gpu_engine_t {
   multi_gpu_engine_t(const multi_gpu_engine_t&)            = delete;
   multi_gpu_engine_t& operator=(const multi_gpu_engine_t&) = delete;
 
-
-
   template <typename Fn>
   void for_each_shard(Fn&& fn)
   {
     for (auto& s : shards) {
-      raft::device_setter guard(s->device_id);   
-      fn(*s);                                     
+      raft::device_setter guard(s->device_id);
+      fn(*s);
     }
   }
 
-  template <typename... InAccess,
-          typename OutAccess,
-          typename SizeAccess,
-          typename Op>
+  template <typename... InAccess, typename OutAccess, typename SizeAccess, typename Op>
   void distributed_transform(std::tuple<InAccess...> in_accessors,
-                            OutAccess                out,
-                            SizeAccess               sz,
-                            Op                       op)
+                             OutAccess out,
+                             SizeAccess sz,
+                             Op op)
   {
     for_each_shard([&](auto& shard) {
       auto& sub = *shard.sub_pdlp;
       // turns the Tuple of lambdas into a tuple of rmm::device_uvector
       auto cub_inputs = std::apply(
-        [&sub](auto&... acc) { return cuda::std::make_tuple(acc(sub)...); },
-        in_accessors);
+        [&sub](auto&... acc) { return cuda::std::make_tuple(acc(sub)...); }, in_accessors);
 
-      cub::DeviceTransform::Transform(cub_inputs,
-                                      out(sub),
-                                      sz(sub),
-                                      op,
-                                      shard.stream.view());
+      cub::DeviceTransform::Transform(cub_inputs, out(sub), sz(sub), op, shard.stream.view());
     });
   }
   // --- 2) convenience: single input accessor (delegates) ---
-  template <typename InAccess,
-  typename OutAccess,
-  typename SizeAccess,
-  typename Op>
-  void distributed_transform(InAccess   in,
-                  OutAccess  out,
-                  SizeAccess sz,
-                  Op         op)
+  template <typename InAccess, typename OutAccess, typename SizeAccess, typename Op>
+  void distributed_transform(InAccess in, OutAccess out, SizeAccess sz, Op op)
   {
-  distributed_transform(std::make_tuple(in), out, sz, op);
+    distributed_transform(std::make_tuple(in), out, sz, op);
   }
 
   // -------- Halo exchange (variables / x) ---------------------------------
@@ -143,10 +127,10 @@ struct multi_gpu_engine_t {
       }
     }
     for (int r = 0; r < nb; ++r) {
-      auto& s   = *shards[r];
-      auto& rd  = s.rank_data;
+      auto& s  = *shards[r];
+      auto& rd = s.rank_data;
       raft::device_setter guard(s.device_id);
-      auto& x   = buf_access(s.sub_pdlp->pdhg_solver_);
+      auto& x = buf_access(s.sub_pdlp->pdhg_solver_);
       for (int peer = 0; peer < nb; ++peer) {
         if (peer == r) continue;
         f_t* recv_ptr = x.data() + rd.owned_var_size + rd.var_recv_offsets[peer];
@@ -199,10 +183,10 @@ struct multi_gpu_engine_t {
       }
     }
     for (int r = 0; r < nb; ++r) {
-      auto& s   = *shards[r];
-      auto& rd  = s.rank_data;
+      auto& s  = *shards[r];
+      auto& rd = s.rank_data;
       raft::device_setter guard(s.device_id);
-      auto& y   = buf_access(s.sub_pdlp->pdhg_solver_);
+      auto& y = buf_access(s.sub_pdlp->pdhg_solver_);
       for (int peer = 0; peer < nb; ++peer) {
         if (peer == r) continue;
         f_t* recv_ptr = y.data() + rd.owned_cstr_size + rd.cstr_recv_offsets[peer];
@@ -230,13 +214,7 @@ struct multi_gpu_engine_t {
     for (auto& s : shards) {
       raft::device_setter guard(s->device_id);
       f_t* buf = ptr_access(*s->sub_pdlp);
-      ncclAllReduce(buf,
-                    buf,
-                    count,
-                    ncclFloat64,
-                    ncclSum,
-                    s->comm.get(),
-                    s->stream.view().value());
+      ncclAllReduce(buf, buf, count, ncclFloat64, ncclSum, s->comm.get(), s->stream.view().value());
     }
     ncclGroupEnd();
   }
diff --git a/cpp/src/pdlp/distributed_pdlp/partition_loader.cu b/cpp/src/pdlp/distributed_pdlp/partition_loader.cu
index 007df4ce1c..b9bc71ae9e 100644
--- a/cpp/src/pdlp/distributed_pdlp/partition_loader.cu
+++ b/cpp/src/pdlp/distributed_pdlp/partition_loader.cu
@@ -19,9 +19,9 @@ std::vector<i_t> partition_loader_t<i_t, f_t>::parse_distributed_pdlp_partition_
 {
   std::ifstream part_file(file);
   cuopt_expects(part_file.is_open(),
-  error_type_t::ValidationError,
-  "Failed to open partition file: %s",
-  file.c_str());
+                error_type_t::ValidationError,
+                "Failed to open partition file: %s",
+                file.c_str());
 
   // One integer per line; operator>> skips whitespace so blank lines and
   // trailing newlines are tolerated.
@@ -33,9 +33,9 @@ std::vector<i_t> partition_loader_t<i_t, f_t>::parse_distributed_pdlp_partition_
 
   // We must have hit EOF cleanly; any other state means a malformed token.
   cuopt_expects(part_file.eof(),
-  error_type_t::ValidationError,
-  "Malformed partition file (expected one integer per line): %s",
-  file.c_str());
+                error_type_t::ValidationError,
+                "Malformed partition file (expected one integer per line): %s",
+                file.c_str());
 
   return parts;
 }
diff --git a/cpp/src/pdlp/distributed_pdlp/shard.cu b/cpp/src/pdlp/distributed_pdlp/shard.cu
index c66b03755e..33aac38103 100644
--- a/cpp/src/pdlp/distributed_pdlp/shard.cu
+++ b/cpp/src/pdlp/distributed_pdlp/shard.cu
@@ -202,9 +202,7 @@ pdlp_shard_t<i_t, f_t>::pdlp_shard_t(int device_id,
   //   send_indices_d[p] : local indices to gather (uploaded from host send plan)
   //   send_buf_d[p]     : f_t staging buffer sized to match
   // Self-peer slot is present but empty (size 0). Used in engine halo exchange.
-  auto build_send_plan = [&](auto const& send_per_peer,
-                             auto& indices_d,
-                             auto& buf_d) {
+  auto build_send_plan = [&](auto const& send_per_peer, auto& indices_d, auto& buf_d) {
     const std::size_t n_peers = send_per_peer.size();
     indices_d.reserve(n_peers);
     buf_d.reserve(n_peers);
@@ -218,7 +216,7 @@ pdlp_shard_t<i_t, f_t>::pdlp_shard_t(int device_id,
       buf_d.emplace_back(std::move(buf));
     }
   };
-  build_send_plan(rank_data.var_send_per_peer,  var_send_indices_d,  var_send_buf_d);
+  build_send_plan(rank_data.var_send_per_peer, var_send_indices_d, var_send_buf_d);
   build_send_plan(rank_data.cstr_send_per_peer, cstr_send_indices_d, cstr_send_buf_d);
 
   handle.sync_stream(stream_view);
diff --git a/cpp/src/pdlp/initial_scaling_strategy/initial_scaling.cu b/cpp/src/pdlp/initial_scaling_strategy/initial_scaling.cu
index fd6e02079e..478753e9d9 100644
--- a/cpp/src/pdlp/initial_scaling_strategy/initial_scaling.cu
+++ b/cpp/src/pdlp/initial_scaling_strategy/initial_scaling.cu
@@ -939,10 +939,8 @@ template <typename i_t, typename f_t>
 void pdlp_initial_scaling_strategy_t<i_t, f_t>::set_h_bound_rescaling(f_t value)
 {
   std::fill(h_bound_rescaling_.begin(), h_bound_rescaling_.end(), value);
-  thrust::fill(handle_ptr_->get_thrust_policy(),
-               bound_rescaling_.begin(),
-               bound_rescaling_.end(),
-               value);
+  thrust::fill(
+    handle_ptr_->get_thrust_policy(), bound_rescaling_.begin(), bound_rescaling_.end(), value);
 }
 
 template <typename i_t, typename f_t>
diff --git a/cpp/src/pdlp/pdhg.cu b/cpp/src/pdlp/pdhg.cu
index 56c61aedda..969f5d0d30 100644
--- a/cpp/src/pdlp/pdhg.cu
+++ b/cpp/src/pdlp/pdhg.cu
@@ -8,8 +8,8 @@
 // pdlp.cuh defines pdlp_solver_t which the engine's compute_A_x/compute_At_y
 // template bodies dereference via shard.sub_pdlp->pdhg_solver_. Must be a
 // complete type at the point of template instantiation below.
-#include <pdlp/pdlp.cuh>
 #include <pdlp/distributed_pdlp/multi_gpu_engine.hpp>
+#include <pdlp/pdlp.cuh>
 #include <pdlp/pdlp_climber_strategy.hpp>
 #include <pdlp/pdlp_constants.hpp>
 #include <pdlp/swap_and_resize_helper.cuh>
@@ -628,21 +628,20 @@ void pdhg_solver_t<i_t, f_t>::spmv_At_into(rmm::device_uvector<f_t>& in_buf,
                                            cusparseDnVecDescr_t out_desc)
 {
   RAFT_CUSPARSE_TRY(cusparseDnVecSetValues(cusparse_view_.dual_solution, in_buf.data()));
-  RAFT_CUSPARSE_TRY(
-    raft::sparse::detail::cusparsespmv(handle_ptr_->get_cusparse_handle(),
-                                       CUSPARSE_OPERATION_NON_TRANSPOSE,
-                                       reusable_device_scalar_value_1_.data(),
-                                       cusparse_view_.A_T,
-                                       cusparse_view_.dual_solution,
-                                       reusable_device_scalar_value_0_.data(),
-                                       out_desc,
-                                       CUSPARSE_SPMV_CSR_ALG2,
-                                       (f_t*)cusparse_view_.buffer_transpose.data(),
-                                       stream_view_));
+  RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsespmv(handle_ptr_->get_cusparse_handle(),
+                                                       CUSPARSE_OPERATION_NON_TRANSPOSE,
+                                                       reusable_device_scalar_value_1_.data(),
+                                                       cusparse_view_.A_T,
+                                                       cusparse_view_.dual_solution,
+                                                       reusable_device_scalar_value_0_.data(),
+                                                       out_desc,
+                                                       CUSPARSE_SPMV_CSR_ALG2,
+                                                       (f_t*)cusparse_view_.buffer_transpose.data(),
+                                                       stream_view_));
   // Restore the canonical binding so subsequent code on this shard that reads
   // cv.dual_solution sees the dual_solution_ buffer it was constructed with.
-  RAFT_CUSPARSE_TRY(cusparseDnVecSetValues(
-    cusparse_view_.dual_solution, current_saddle_point_state_.get_dual_solution().data()));
+  RAFT_CUSPARSE_TRY(cusparseDnVecSetValues(cusparse_view_.dual_solution,
+                                           current_saddle_point_state_.get_dual_solution().data()));
 }
 
 template <typename i_t, typename f_t>
@@ -1434,8 +1433,7 @@ void pdhg_solver_t<i_t, f_t>::compute_next_primal_dual_solution_reflected(
         for (auto& shard : mgpu_engine_->shards) {
           raft::device_setter guard(shard->device_id);
           auto& sub_pdlp = *shard->sub_pdlp;
-          sub_pdlp.pdhg_solver_.dual_reflected_projection_transform(
-            sub_pdlp.get_dual_step_size());
+          sub_pdlp.pdhg_solver_.dual_reflected_projection_transform(sub_pdlp.get_dual_step_size());
         }
       } else if (!batch_mode_) {
         dual_reflected_projection_transform(dual_step_size);
diff --git a/cpp/src/pdlp/pdhg.hpp b/cpp/src/pdlp/pdhg.hpp
index 8fbee24e71..e38ea9389c 100644
--- a/cpp/src/pdlp/pdhg.hpp
+++ b/cpp/src/pdlp/pdhg.hpp
@@ -102,13 +102,12 @@ class pdhg_solver_t {
   // No multi-GPU dispatch inside — the engine is the orchestrator.
   void spmv_At_into(rmm::device_uvector<f_t>& in_buf, cusparseDnVecDescr_t out_desc);
   void spmv_A_into(rmm::device_uvector<f_t>& in_buf, cusparseDnVecDescr_t out_desc);
-  
+
   // Pure cub-transform extractions. Each one is byte-identical to the inline
   // cub call it replaces — no platform dispatch inside. Callers handle the
   // single-GPU vs per-shard branching at the call site (see the
   // "if (mgpu_engine_) for shard..." blocks in compute_next_*).
-  void primal_reflected_major_projection_transform(
-    rmm::device_uvector<f_t>& primal_step_size);
+  void primal_reflected_major_projection_transform(rmm::device_uvector<f_t>& primal_step_size);
   void dual_reflected_major_projection_transform(rmm::device_uvector<f_t>& dual_step_size);
   void primal_reflected_projection_transform(rmm::device_uvector<f_t>& primal_step_size);
   void dual_reflected_projection_transform(rmm::device_uvector<f_t>& dual_step_size);
diff --git a/cpp/src/pdlp/pdlp.cu b/cpp/src/pdlp/pdlp.cu
index 7203c11a42..302f62e56a 100644
--- a/cpp/src/pdlp/pdlp.cu
+++ b/cpp/src/pdlp/pdlp.cu
@@ -44,8 +44,8 @@
 #include <algorithm>
 #include <cmath>
 #include <optional>
-#include <type_traits>
 #include <tuple>
+#include <type_traits>
 #include <unordered_set>
 
 namespace cuopt::linear_programming::detail {
@@ -398,189 +398,195 @@ pdlp_solver_t<i_t, f_t>::pdlp_solver_t(problem_t<i_t, f_t>& op_problem,
                   "Distributed PDLP (num_gpus > 1) currently requires double precision");
     return;
   } else {
-  // 2. Load partition
-  std::vector<i_t> parts;
-  if (!settings.multi_gpu_partition_file.empty()) {
-    parts = partition_loader_t<i_t, f_t>::parse_distributed_pdlp_partition_file(
-      settings.multi_gpu_partition_file);
-  } else {
-    cuopt_expects(false,
-      error_type_t::RuntimeError,
-      "Metis partitioning inside cuopt not implemented yet; "
-      "provide a --parts file via settings.multi_gpu_partition_file");
-  }
-
-  // always compute initial step size before scaling and primal_weight after scaling to do like
-  // cuPDLPx
-  assert(settings_.hyper_params.compute_initial_primal_weight_before_scaling &&
-         "compute_initial_primal_weight_before_scaling must be true in distributed mode");
-  assert(!settings_.hyper_params.compute_initial_step_size_before_scaling &&
-         "compute_initial_step_size_before_scaling must be false in distributed mode");
-
-  compute_initial_primal_weight();
-
-  // scale globally before dispatching to shards
-  initial_scaling_strategy_.scale_problem();
-
-  compute_initial_step_size();
-  step_size_strategy_.get_primal_and_dual_stepsizes(primal_step_size_, dual_step_size_);
-
-  const f_t initial_step_size_global     = get_step_size_h(0);
-  const f_t initial_primal_weight_global = get_primal_weight_h(0);
-
-  // 4. Copy both scaled and unscaled pb
-  auto const stream = op_problem_scaled_.handle_ptr->get_stream();
-  i_t const n_cstr  = op_problem_scaled_.n_constraints;
-  i_t const n_vars  = op_problem_scaled_.n_variables;
-  i_t const nnz     = op_problem_scaled_.nnz;
-
-  // Shared topology (taken from the scaled problem, but identical on both).
-  std::vector<i_t> h_A_row_offsets(n_cstr + 1);
-  std::vector<i_t> h_A_col_indices(nnz);
-  std::vector<i_t> h_A_t_row_offsets(n_vars + 1);
-  std::vector<i_t> h_A_t_col_indices(nnz);
-  raft::copy(h_A_row_offsets.data(), op_problem_scaled_.offsets.data(), n_cstr + 1, stream);
-  raft::copy(h_A_col_indices.data(), op_problem_scaled_.variables.data(), nnz, stream);
-  raft::copy(
-    h_A_t_row_offsets.data(), op_problem_scaled_.reverse_offsets.data(), n_vars + 1, stream);
-  raft::copy(h_A_t_col_indices.data(), op_problem_scaled_.reverse_constraints.data(), nnz, stream);
-
-  // Paired value arrays for A and A_T.
-  std::vector<f_t> h_A_values(nnz);
-  std::vector<f_t> h_A_values_scaled(nnz);
-  std::vector<f_t> h_A_t_values(nnz);
-  std::vector<f_t> h_A_t_values_scaled(nnz);
-  raft::copy(h_A_values.data(), problem_ptr->coefficients.data(), nnz, stream);
-  raft::copy(h_A_t_values.data(), problem_ptr->reverse_coefficients.data(), nnz, stream);
-  raft::copy(h_A_values_scaled.data(), op_problem_scaled_.coefficients.data(), nnz, stream);
-  raft::copy(
-    h_A_t_values_scaled.data(), op_problem_scaled_.reverse_coefficients.data(), nnz, stream);
+    // 2. Load partition
+    std::vector<i_t> parts;
+    if (!settings.multi_gpu_partition_file.empty()) {
+      parts = partition_loader_t<i_t, f_t>::parse_distributed_pdlp_partition_file(
+        settings.multi_gpu_partition_file);
+    } else {
+      cuopt_expects(false,
+                    error_type_t::RuntimeError,
+                    "Metis partitioning inside cuopt not implemented yet; "
+                    "provide a --parts file via settings.multi_gpu_partition_file");
+    }
 
-  using f_t2 = typename type_2<f_t>::type;
+    // Always compute the initial primal weight before scaling and the initial step size after
+    // scaling, to match cuPDLPx (enforced by the asserts below).
+    assert(settings_.hyper_params.compute_initial_primal_weight_before_scaling &&
+           "compute_initial_primal_weight_before_scaling must be true in distributed mode");
+    assert(!settings_.hyper_params.compute_initial_step_size_before_scaling &&
+           "compute_initial_step_size_before_scaling must be false in distributed mode");
+
+    compute_initial_primal_weight();
+
+    // scale globally before dispatching to shards
+    initial_scaling_strategy_.scale_problem();
+
+    compute_initial_step_size();
+    step_size_strategy_.get_primal_and_dual_stepsizes(primal_step_size_, dual_step_size_);
+
+    const f_t initial_step_size_global     = get_step_size_h(0);
+    const f_t initial_primal_weight_global = get_primal_weight_h(0);
+
+    // 4. Copy both the scaled and unscaled problem data into host vectors
+    auto const stream = op_problem_scaled_.handle_ptr->get_stream();
+    i_t const n_cstr  = op_problem_scaled_.n_constraints;
+    i_t const n_vars  = op_problem_scaled_.n_variables;
+    i_t const nnz     = op_problem_scaled_.nnz;
+
+    // Shared topology (taken from the scaled problem, but identical on both).
+    std::vector<i_t> h_A_row_offsets(n_cstr + 1);
+    std::vector<i_t> h_A_col_indices(nnz);
+    std::vector<i_t> h_A_t_row_offsets(n_vars + 1);
+    std::vector<i_t> h_A_t_col_indices(nnz);
+    raft::copy(h_A_row_offsets.data(), op_problem_scaled_.offsets.data(), n_cstr + 1, stream);
+    raft::copy(h_A_col_indices.data(), op_problem_scaled_.variables.data(), nnz, stream);
+    raft::copy(
+      h_A_t_row_offsets.data(), op_problem_scaled_.reverse_offsets.data(), n_vars + 1, stream);
+    raft::copy(
+      h_A_t_col_indices.data(), op_problem_scaled_.reverse_constraints.data(), nnz, stream);
+
+    // Paired value arrays for A and A_T.
+    std::vector<f_t> h_A_values(nnz);
+    std::vector<f_t> h_A_values_scaled(nnz);
+    std::vector<f_t> h_A_t_values(nnz);
+    std::vector<f_t> h_A_t_values_scaled(nnz);
+    raft::copy(h_A_values.data(), problem_ptr->coefficients.data(), nnz, stream);
+    raft::copy(h_A_t_values.data(), problem_ptr->reverse_coefficients.data(), nnz, stream);
+    raft::copy(h_A_values_scaled.data(), op_problem_scaled_.coefficients.data(), nnz, stream);
+    raft::copy(
+      h_A_t_values_scaled.data(), op_problem_scaled_.reverse_coefficients.data(), nnz, stream);
 
-  std::vector<f_t> h_obj(n_vars);
-  std::vector<f_t> h_obj_scaled(n_vars);
-  std::vector<f_t2> h_var_bounds_packed(n_vars);
-  std::vector<f_t2> h_var_bounds_scaled_packed(n_vars);
-  std::vector<f_t> h_cstr_lower(n_cstr);
-  std::vector<f_t> h_cstr_upper(n_cstr);
-  std::vector<f_t> h_cstr_lower_scaled(n_cstr);
-  std::vector<f_t> h_cstr_upper_scaled(n_cstr);
-
-  raft::copy(h_obj.data(), problem_ptr->objective_coefficients.data(), n_vars, stream);
-  raft::copy(h_obj_scaled.data(), op_problem_scaled_.objective_coefficients.data(), n_vars, stream);
-  raft::copy(h_var_bounds_packed.data(), problem_ptr->variable_bounds.data(), n_vars, stream);
-  raft::copy(
-    h_var_bounds_scaled_packed.data(), op_problem_scaled_.variable_bounds.data(), n_vars, stream);
-  raft::copy(h_cstr_lower.data(), problem_ptr->constraint_lower_bounds.data(), n_cstr, stream);
-  raft::copy(h_cstr_upper.data(), problem_ptr->constraint_upper_bounds.data(), n_cstr, stream);
-  raft::copy(
-    h_cstr_lower_scaled.data(), op_problem_scaled_.constraint_lower_bounds.data(), n_cstr, stream);
-  raft::copy(
-    h_cstr_upper_scaled.data(), op_problem_scaled_.constraint_upper_bounds.data(), n_cstr, stream);
-
-  // 5. Get full scaling factors on host
-  std::vector<f_t> h_cummulative_cstr_scaling(n_cstr);
-  std::vector<f_t> h_cummulative_var_scaling(n_vars);
-  raft::copy(h_cummulative_cstr_scaling.data(),
-             initial_scaling_strategy_.get_constraint_matrix_scaling_vector().data(),
-             n_cstr,
-             stream);
-  raft::copy(h_cummulative_var_scaling.data(),
-             initial_scaling_strategy_.get_variable_scaling_vector().data(),
-             n_vars,
-             stream);
-  const f_t h_bound_rescaling     = initial_scaling_strategy_.get_h_bound_rescaling();
-  const f_t h_objective_rescaling = initial_scaling_strategy_.get_h_objective_rescaling();
-
-  op_problem_scaled_.handle_ptr->sync_stream(stream);
-
-  // Unpack interleaved {lower, upper} into separate vectors for both
-  // versions, so the shard ctor's slicing loop is uniform.
-  std::vector<f_t> h_var_lower(n_vars), h_var_upper(n_vars);
-  std::vector<f_t> h_var_lower_scaled(n_vars), h_var_upper_scaled(n_vars);
-  for (i_t i = 0; i < n_vars; ++i) {
-    h_var_lower[i]        = h_var_bounds_packed[i].x;
-    h_var_upper[i]        = h_var_bounds_packed[i].y;
-    h_var_lower_scaled[i] = h_var_bounds_scaled_packed[i].x;
-    h_var_upper_scaled[i] = h_var_bounds_scaled_packed[i].y;
-  }
-
-  // 6. Build per-rank data and meta-data.
-  std::vector<rank_data_t<i_t, f_t>> sub_pdlp_rank_data =
-    partition_loader_t<i_t, f_t>::create_rank_data_from_parts(parts,
-                                                              h_A_row_offsets,
-                                                              h_A_col_indices,
-                                                              h_A_values,
-                                                              h_A_values_scaled,
-                                                              h_A_t_row_offsets,
-                                                              h_A_t_col_indices,
-                                                              h_A_t_values,
-                                                              h_A_t_values_scaled,
-                                                              settings.num_gpus,
-                                                              n_cstr,
-                                                              n_vars,
-                                                              nnz);
-
-  // 7. Build the per-shard PDLP settings:
-  pdlp_solver_settings_t<i_t, f_t> sub_pdlp_settings                    = settings;
-  sub_pdlp_settings.num_gpus                                            = 1;
-  sub_pdlp_settings.multi_gpu_partition_file                            = "";
-  sub_pdlp_settings.is_distributed_sub_pdlp                             = true;
-  sub_pdlp_settings.hyper_params.default_l_inf_ruiz_iterations          = 0;
-  sub_pdlp_settings.hyper_params.default_alpha_pock_chambolle_rescaling = 0.0;
-
-  // 8. Construct the engine, creates NCCL comms and shards
-  multi_gpu_engine.emplace(std::move(sub_pdlp_rank_data),
-                           h_obj,
-                           h_var_lower,
-                           h_var_upper,
-                           h_cstr_lower,
-                           h_cstr_upper,
-                           h_obj_scaled,
-                           h_var_lower_scaled,
-                           h_var_upper_scaled,
-                           h_cstr_lower_scaled,
-                           h_cstr_upper_scaled,
-                           h_cummulative_cstr_scaling,
-                           h_cummulative_var_scaling,
-                           h_bound_rescaling,
-                           h_objective_rescaling,
-                           op_problem_scaled_.maximize,
-                           op_problem_scaled_.objective_offset,
-                           op_problem_scaled_.presolve_data.objective_scaling_factor,
-                           sub_pdlp_settings);
-
-  for (auto& shard : multi_gpu_engine->shards) {
-    raft::device_setter guard(shard->device_id);
-    auto& sub = *shard->sub_pdlp;
-    raft::copy(sub.step_size_.data(), step_size_.data(), 1, shard->stream);
-    raft::copy(sub.primal_weight_.data(), primal_weight_.data(), 1, shard->stream);
-    raft::copy(sub.best_primal_weight_.data(), best_primal_weight_.data(), 1, shard->stream);
-    raft::copy(sub.primal_step_size_.data(), primal_step_size_.data(), 1, shard->stream);
-    raft::copy(sub.dual_step_size_.data(), dual_step_size_.data(), 1, shard->stream);
-  }
-
-  // Wire the engine into the master pdhg_solver_. Shards' pdhg_solver_ keep
-  // mgpu_engine_ == nullptr so they run plain single-GPU SpMV on local A.
-  pdhg_solver_.set_multi_gpu_engine(&*multi_gpu_engine);
-
-  // Project initial primal solution
-  if (settings_.hyper_params.project_initial_primal) {
-    // Use refine_initial_primal_projection ???
     using f_t2 = typename type_2<f_t>::type;
+
+    std::vector<f_t> h_obj(n_vars);
+    std::vector<f_t> h_obj_scaled(n_vars);
+    std::vector<f_t2> h_var_bounds_packed(n_vars);
+    std::vector<f_t2> h_var_bounds_scaled_packed(n_vars);
+    std::vector<f_t> h_cstr_lower(n_cstr);
+    std::vector<f_t> h_cstr_upper(n_cstr);
+    std::vector<f_t> h_cstr_lower_scaled(n_cstr);
+    std::vector<f_t> h_cstr_upper_scaled(n_cstr);
+
+    raft::copy(h_obj.data(), problem_ptr->objective_coefficients.data(), n_vars, stream);
+    raft::copy(
+      h_obj_scaled.data(), op_problem_scaled_.objective_coefficients.data(), n_vars, stream);
+    raft::copy(h_var_bounds_packed.data(), problem_ptr->variable_bounds.data(), n_vars, stream);
+    raft::copy(
+      h_var_bounds_scaled_packed.data(), op_problem_scaled_.variable_bounds.data(), n_vars, stream);
+    raft::copy(h_cstr_lower.data(), problem_ptr->constraint_lower_bounds.data(), n_cstr, stream);
+    raft::copy(h_cstr_upper.data(), problem_ptr->constraint_upper_bounds.data(), n_cstr, stream);
+    raft::copy(h_cstr_lower_scaled.data(),
+               op_problem_scaled_.constraint_lower_bounds.data(),
+               n_cstr,
+               stream);
+    raft::copy(h_cstr_upper_scaled.data(),
+               op_problem_scaled_.constraint_upper_bounds.data(),
+               n_cstr,
+               stream);
+
+    // 5. Get full scaling factors on host
+    std::vector<f_t> h_cummulative_cstr_scaling(n_cstr);
+    std::vector<f_t> h_cummulative_var_scaling(n_vars);
+    raft::copy(h_cummulative_cstr_scaling.data(),
+               initial_scaling_strategy_.get_constraint_matrix_scaling_vector().data(),
+               n_cstr,
+               stream);
+    raft::copy(h_cummulative_var_scaling.data(),
+               initial_scaling_strategy_.get_variable_scaling_vector().data(),
+               n_vars,
+               stream);
+    const f_t h_bound_rescaling     = initial_scaling_strategy_.get_h_bound_rescaling();
+    const f_t h_objective_rescaling = initial_scaling_strategy_.get_h_objective_rescaling();
+
+    op_problem_scaled_.handle_ptr->sync_stream(stream);
+
+    // Unpack interleaved {lower, upper} into separate vectors for both
+    // versions, so the shard ctor's slicing loop is uniform.
+    std::vector<f_t> h_var_lower(n_vars), h_var_upper(n_vars);
+    std::vector<f_t> h_var_lower_scaled(n_vars), h_var_upper_scaled(n_vars);
+    for (i_t i = 0; i < n_vars; ++i) {
+      h_var_lower[i]        = h_var_bounds_packed[i].x;
+      h_var_upper[i]        = h_var_bounds_packed[i].y;
+      h_var_lower_scaled[i] = h_var_bounds_scaled_packed[i].x;
+      h_var_upper_scaled[i] = h_var_bounds_scaled_packed[i].y;
+    }
+
+    // 6. Build per-rank data and meta-data.
+    std::vector<rank_data_t<i_t, f_t>> sub_pdlp_rank_data =
+      partition_loader_t<i_t, f_t>::create_rank_data_from_parts(parts,
+                                                                h_A_row_offsets,
+                                                                h_A_col_indices,
+                                                                h_A_values,
+                                                                h_A_values_scaled,
+                                                                h_A_t_row_offsets,
+                                                                h_A_t_col_indices,
+                                                                h_A_t_values,
+                                                                h_A_t_values_scaled,
+                                                                settings.num_gpus,
+                                                                n_cstr,
+                                                                n_vars,
+                                                                nnz);
+
+    // 7. Build the per-shard PDLP settings:
+    pdlp_solver_settings_t<i_t, f_t> sub_pdlp_settings                    = settings;
+    sub_pdlp_settings.num_gpus                                            = 1;
+    sub_pdlp_settings.multi_gpu_partition_file                            = "";
+    sub_pdlp_settings.is_distributed_sub_pdlp                             = true;
+    sub_pdlp_settings.hyper_params.default_l_inf_ruiz_iterations          = 0;
+    sub_pdlp_settings.hyper_params.default_alpha_pock_chambolle_rescaling = 0.0;
+
+    // 8. Construct the engine, creates NCCL comms and shards
+    multi_gpu_engine.emplace(std::move(sub_pdlp_rank_data),
+                             h_obj,
+                             h_var_lower,
+                             h_var_upper,
+                             h_cstr_lower,
+                             h_cstr_upper,
+                             h_obj_scaled,
+                             h_var_lower_scaled,
+                             h_var_upper_scaled,
+                             h_cstr_lower_scaled,
+                             h_cstr_upper_scaled,
+                             h_cummulative_cstr_scaling,
+                             h_cummulative_var_scaling,
+                             h_bound_rescaling,
+                             h_objective_rescaling,
+                             op_problem_scaled_.maximize,
+                             op_problem_scaled_.objective_offset,
+                             op_problem_scaled_.presolve_data.objective_scaling_factor,
+                             sub_pdlp_settings);
+
     for (auto& shard : multi_gpu_engine->shards) {
       raft::device_setter guard(shard->device_id);
       auto& sub = *shard->sub_pdlp;
-      cub::DeviceTransform::Transform(
-        cuda::std::make_tuple(sub.pdhg_solver_.get_primal_solution().data(),
-                              sub.get_op_problem_scaled().variable_bounds.data()),
-        sub.pdhg_solver_.get_primal_solution().data(),
-        sub.pdhg_solver_.get_primal_solution().size(),
-        clamp<f_t, f_t2>(),
-        shard->stream.view());
+      raft::copy(sub.step_size_.data(), step_size_.data(), 1, shard->stream);
+      raft::copy(sub.primal_weight_.data(), primal_weight_.data(), 1, shard->stream);
+      raft::copy(sub.best_primal_weight_.data(), best_primal_weight_.data(), 1, shard->stream);
+      raft::copy(sub.primal_step_size_.data(), primal_step_size_.data(), 1, shard->stream);
+      raft::copy(sub.dual_step_size_.data(), dual_step_size_.data(), 1, shard->stream);
+    }
+
+    // Wire the engine into the master pdhg_solver_. Shards' pdhg_solver_ keep
+    // mgpu_engine_ == nullptr so they run plain single-GPU SpMV on local A.
+    pdhg_solver_.set_multi_gpu_engine(&*multi_gpu_engine);
+
+    // Project initial primal solution
+    if (settings_.hyper_params.project_initial_primal) {
+      // Use refine_initial_primal_projection ???
+      using f_t2 = typename type_2<f_t>::type;
+      for (auto& shard : multi_gpu_engine->shards) {
+        raft::device_setter guard(shard->device_id);
+        auto& sub = *shard->sub_pdlp;
+        cub::DeviceTransform::Transform(
+          cuda::std::make_tuple(sub.pdhg_solver_.get_primal_solution().data(),
+                                sub.get_op_problem_scaled().variable_bounds.data()),
+          sub.pdhg_solver_.get_primal_solution().data(),
+          sub.pdhg_solver_.get_primal_solution().size(),
+          clamp<f_t, f_t2>(),
+          shard->stream.view());
+      }
     }
-  }
   }  // end if constexpr (std::is_same_v<f_t, double>)
 }
 
@@ -2222,24 +2228,22 @@ void pdlp_solver_t<i_t, f_t>::compute_fixed_error(std::vector<int>& has_restarte
   // Computing the deltas
   // TODO batch mdoe: this only works if everyone restarts
   if (multi_gpu_engine) {
-    // Go faire une fonction compute_delta_primal, compute_delta primal ? 
+    // TODO: factor this out into compute_delta_primal / compute_delta_dual helpers?
     for (auto& shard : multi_gpu_engine->shards) {
       raft::device_setter guard(shard->device_id);
       auto& sub_pdhg = shard->sub_pdlp->pdhg_solver_;
-      cub::DeviceTransform::Transform(
-        cuda::std::make_tuple(sub_pdhg.get_reflected_primal().data(),
-                              sub_pdhg.get_primal_solution().data()),
-        sub_pdhg.get_saddle_point_state().get_delta_primal().data(),
-        sub_pdhg.get_primal_solution().size(),
-        cuda::std::minus<f_t>{},
-        shard->stream.view());
-      cub::DeviceTransform::Transform(
-        cuda::std::make_tuple(sub_pdhg.get_reflected_dual().data(),
-                              sub_pdhg.get_dual_solution().data()),
-        sub_pdhg.get_saddle_point_state().get_delta_dual().data(),
-        sub_pdhg.get_dual_solution().size(),
-        cuda::std::minus<f_t>{},
-        shard->stream.view());
+      cub::DeviceTransform::Transform(cuda::std::make_tuple(sub_pdhg.get_reflected_primal().data(),
+                                                            sub_pdhg.get_primal_solution().data()),
+                                      sub_pdhg.get_saddle_point_state().get_delta_primal().data(),
+                                      sub_pdhg.get_primal_solution().size(),
+                                      cuda::std::minus<f_t>{},
+                                      shard->stream.view());
+      cub::DeviceTransform::Transform(cuda::std::make_tuple(sub_pdhg.get_reflected_dual().data(),
+                                                            sub_pdhg.get_dual_solution().data()),
+                                      sub_pdhg.get_saddle_point_state().get_delta_dual().data(),
+                                      sub_pdhg.get_dual_solution().size(),
+                                      cuda::std::minus<f_t>{},
+                                      shard->stream.view());
     }
   } else {
     cub::DeviceTransform::Transform(
@@ -2249,21 +2253,19 @@ void pdlp_solver_t<i_t, f_t>::compute_fixed_error(std::vector<int>& has_restarte
       pdhg_solver_.get_primal_solution().size(),
       cuda::std::minus<f_t>{},
       stream_view_.value());
-    cub::DeviceTransform::Transform(
-      cuda::std::make_tuple(pdhg_solver_.get_reflected_dual().data(),
-                            pdhg_solver_.get_dual_solution().data()),
-      pdhg_solver_.get_saddle_point_state().get_delta_dual().data(),
-      pdhg_solver_.get_dual_solution().size(),
-      cuda::std::minus<f_t>{},
-      stream_view_.value());
+    cub::DeviceTransform::Transform(cuda::std::make_tuple(pdhg_solver_.get_reflected_dual().data(),
+                                                          pdhg_solver_.get_dual_solution().data()),
+                                    pdhg_solver_.get_saddle_point_state().get_delta_dual().data(),
+                                    pdhg_solver_.get_dual_solution().size(),
+                                    cuda::std::minus<f_t>{},
+                                    stream_view_.value());
   }
 
   auto& cusparse_view = pdhg_solver_.get_cusparse_view();
 
   if (multi_gpu_engine) {
-
-    // SpMV is the first operation in compute_interaction_and_movement so we can do halo before and call it naturally
-    // we then reduce the local dot products
+    // SpMV is the first operation in compute_interaction_and_movement, so we can do the halo
+    // exchange beforehand and call it as usual; we then reduce the local dot products.
     multi_gpu_engine->halo_exchange_cstr(
       [](auto& pdhg) -> rmm::device_uvector<f_t>& { return pdhg.get_reflected_dual(); });
 
@@ -2294,9 +2296,7 @@ void pdlp_solver_t<i_t, f_t>::compute_fixed_error(std::vector<int>& has_restarte
       },
       1);
     multi_gpu_engine->allreduce_sum_inplace(
-      [](auto& sp) -> f_t* {
-        return sp.step_size_strategy_.get_norm_squared_delta_dual().data();
-      },
+      [](auto& sp) -> f_t* { return sp.step_size_strategy_.get_norm_squared_delta_dual().data(); },
       1);
 
     auto& s0 = *multi_gpu_engine->shards[0];
@@ -2365,10 +2365,10 @@ void pdlp_solver_t<i_t, f_t>::compute_fixed_error(std::vector<int>& has_restarte
 
   // Put back, already done in multi-gpu side
   if (!multi_gpu_engine) {
-  RAFT_CUSPARSE_TRY(
-    cusparseDnVecSetValues(cusparse_view.potential_next_dual_solution,
-                           (void*)pdhg_solver_.get_potential_next_dual_solution().data()));
-    }
+    RAFT_CUSPARSE_TRY(
+      cusparseDnVecSetValues(cusparse_view.potential_next_dual_solution,
+                             (void*)pdhg_solver_.get_potential_next_dual_solution().data()));
+  }
   if (batch_mode_) {
     RAFT_CUSPARSE_TRY(
       cusparseDnMatSetValues(cusparse_view.batch_potential_next_dual_solution,
@@ -2630,8 +2630,9 @@ optimization_problem_solution_t<i_t, f_t> pdlp_solver_t<i_t, f_t>::run_solver(co
     // Update FP32 matrix copies for mixed precision SpMV after scaling
     pdhg_solver_.get_cusparse_view().update_mixed_precision_matrices();
 
-    // Redirect cuSPARSE descriptors to use the original problem's structural data (offsets, indices),
-    // then free the duplicated structural vectors from the scaled copy to save device memory.
+    // Redirect cuSPARSE descriptors to use the original problem's structural data (offsets,
+    // indices), then free the duplicated structural vectors from the scaled copy to save device
+    // memory.
     pdhg_solver_.get_cusparse_view().redirect_cusparse_csr_structure_pointers(*problem_ptr);
     op_problem_scaled_.variables.resize(0, stream_view_);
     op_problem_scaled_.offsets.resize(0, stream_view_);
@@ -2846,7 +2847,6 @@ optimization_problem_solution_t<i_t, f_t> pdlp_solver_t<i_t, f_t>::run_solver(co
       "Initial primal_weight", primal_weight_.data(), primal_weight_.size(), std::cout);
 #endif
 
-
     if (!inside_mip_) {
       CUOPT_LOG_INFO(
         "   Iter    Primal Obj.      Dual Obj.    Gap        Primal Res.  Dual Res.   Time");
diff --git a/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.cu b/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.cu
index fb85be4280..2cb843ae86 100644
--- a/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.cu
+++ b/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.cu
@@ -316,15 +316,13 @@ rmm::device_uvector<f_t>& adaptive_step_size_strategy_t<i_t, f_t>::get_interacti
 }
 
 template <typename i_t, typename f_t>
-rmm::device_uvector<f_t>&
-adaptive_step_size_strategy_t<i_t, f_t>::get_norm_squared_delta_primal()
+rmm::device_uvector<f_t>& adaptive_step_size_strategy_t<i_t, f_t>::get_norm_squared_delta_primal()
 {
   return norm_squared_delta_primal_;
 }
 
 template <typename i_t, typename f_t>
-rmm::device_uvector<f_t>&
-adaptive_step_size_strategy_t<i_t, f_t>::get_norm_squared_delta_dual()
+rmm::device_uvector<f_t>& adaptive_step_size_strategy_t<i_t, f_t>::get_norm_squared_delta_dual()
 {
   return norm_squared_delta_dual_;
 }

From 98e0ce68d67f3b9701c7b196d490754401c18a31 Mon Sep 17 00:00:00 2001
From: Bulle Mostovoi <vmostovoi@nvidia.com>
Date: Tue, 26 May 2026 11:06:24 +0200
Subject: [PATCH 26/67] now manage halpern update in multi-gpu pdlp

---
 cpp/src/pdlp/pdlp.cu | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/cpp/src/pdlp/pdlp.cu b/cpp/src/pdlp/pdlp.cu
index 302f62e56a..b69ceccae5 100644
--- a/cpp/src/pdlp/pdlp.cu
+++ b/cpp/src/pdlp/pdlp.cu
@@ -3085,13 +3085,22 @@ optimization_problem_solution_t<i_t, f_t> pdlp_solver_t<i_t, f_t>::run_solver(co
           transpose_problem_fields(/*to_row=*/true);
         }
       }
-      halpern_update();
+      if (multi_gpu_engine_) {
+        multi_gpu_engine_->for_each_shard([&](auto& shard) { shard.sub_pdlp->halpern_update(); });
+      } else {
+        halpern_update();
+      }
     }
 
     ++total_pdlp_iterations_;
     ++internal_solver_iterations_;
-    if (settings_.hyper_params.never_restart_to_average)
-      restart_strategy_.increment_iteration_since_last_restart();
+    if (settings_.hyper_params.never_restart_to_average) {
+      if (multi_gpu_engine_) {
+        multi_gpu_engine_->for_each_shard([&](auto& shard) { shard.sub_pdlp->restart_strategy_.increment_iteration_since_last_restart(); });
+      } else {
+        restart_strategy_.increment_iteration_since_last_restart();
+      }
+    }
   }
   return optimization_problem_solution_t<i_t, f_t>{pdlp_termination_status_t::NumericalError,
                                                    stream_view_};

From 84128bf809348932fb6b540ae93d893feb7c4756 Mon Sep 17 00:00:00 2001
From: Bulle Mostovoi <vmostovoi@nvidia.com>
Date: Tue, 26 May 2026 11:46:46 +0200
Subject: [PATCH 27/67] small fix to calls of multi_gpu_engine_ and
 scale/unscale solutions. compiles and runs

---
 cpp/src/pdlp/pdlp.cu | 47 ++++++++++++++++++++++++++++++++++++--------
 1 file changed, 39 insertions(+), 8 deletions(-)

diff --git a/cpp/src/pdlp/pdlp.cu b/cpp/src/pdlp/pdlp.cu
index b69ceccae5..36ba854439 100644
--- a/cpp/src/pdlp/pdlp.cu
+++ b/cpp/src/pdlp/pdlp.cu
@@ -2896,6 +2896,9 @@ optimization_problem_solution_t<i_t, f_t> pdlp_solver_t<i_t, f_t>::run_solver(co
         // 1. At the very beginning of the solver, when no steps have been taken yet
         // 2. After a single step, since average of one step is the same step
         if (internal_solver_iterations_ <= 1) {
+          if (multi_gpu_engine) {
+            assert(false && "Not implemented");
+          }
           raft::copy(unscaled_primal_avg_solution_.data(),
                      pdhg_solver_.get_primal_solution().data(),
                      primal_size_h_,
@@ -2946,8 +2949,22 @@ optimization_problem_solution_t<i_t, f_t> pdlp_solver_t<i_t, f_t>::run_solver(co
                                                     unscaled_dual_avg_solution_);
       }
       if (settings_.hyper_params.use_adaptive_step_size_strategy) {
-        initial_scaling_strategy_.unscale_solutions(pdhg_solver_.get_primal_solution(),
-                                                    pdhg_solver_.get_dual_solution());
+        if (multi_gpu_engine) {
+          // Master's pdhg_solver_.{primal,dual}_solution_ is stale in mGPU mode
+          // (live state lives on shards). Unscale in place on each shard with
+          // the shard's own initial_scaling_strategy_, which already holds the
+          // global cumulative scaling factors for its owned slice (set up in
+          // shard.cu via set_cummulative_scaling). Halo slots have unit scaling
+          // so unscaling is a no-op there (their values are junk anyway).
+          multi_gpu_engine->for_each_shard([&](auto& shard) {
+            auto& sub = *shard.sub_pdlp;
+            sub.get_initial_scaling_strategy().unscale_solutions(
+              sub.pdhg_solver_.get_primal_solution(), sub.pdhg_solver_.get_dual_solution());
+          });
+        } else {
+          initial_scaling_strategy_.unscale_solutions(pdhg_solver_.get_primal_solution(),
+                                                      pdhg_solver_.get_dual_solution());
+        }
       } else {
         initial_scaling_strategy_.unscale_solutions(
           pdhg_solver_.get_potential_next_primal_solution(),
@@ -2981,8 +2998,20 @@ optimization_problem_solution_t<i_t, f_t> pdlp_solver_t<i_t, f_t>::run_solver(co
                                                     unscaled_dual_avg_solution_);
         }
         if (settings_.hyper_params.use_adaptive_step_size_strategy) {
-          initial_scaling_strategy_.scale_solutions(pdhg_solver_.get_primal_solution(),
-                                                    pdhg_solver_.get_dual_solution());
+          if (multi_gpu_engine) {
+            // Symmetric to the unscale dispatch above. Live state lives on
+            // shards; each shard's initial_scaling_strategy_ holds the global
+            // cumulative scaling factors for its owned slice (halo slots have
+            // unit scaling, so they're no-ops). Scale in place per shard.
+            multi_gpu_engine->for_each_shard([&](auto& shard) {
+              auto& sub = *shard.sub_pdlp;
+              sub.get_initial_scaling_strategy().scale_solutions(
+                sub.pdhg_solver_.get_primal_solution(), sub.pdhg_solver_.get_dual_solution());
+            });
+          } else {
+            initial_scaling_strategy_.scale_solutions(pdhg_solver_.get_primal_solution(),
+                                                      pdhg_solver_.get_dual_solution());
+          }
         } else {
           initial_scaling_strategy_.scale_solutions(
             pdhg_solver_.get_potential_next_primal_solution(),
@@ -3085,8 +3114,8 @@ optimization_problem_solution_t<i_t, f_t> pdlp_solver_t<i_t, f_t>::run_solver(co
           transpose_problem_fields(/*to_row=*/true);
         }
       }
-      if (multi_gpu_engine_) {
-        multi_gpu_engine_->for_each_shard([&](auto& shard) { shard.sub_pdlp->halpern_update(); });
+      if (multi_gpu_engine) {
+        multi_gpu_engine->for_each_shard([&](auto& shard) { shard.sub_pdlp->halpern_update(); });
       } else {
         halpern_update();
       }
@@ -3095,8 +3124,10 @@ optimization_problem_solution_t<i_t, f_t> pdlp_solver_t<i_t, f_t>::run_solver(co
     ++total_pdlp_iterations_;
     ++internal_solver_iterations_;
     if (settings_.hyper_params.never_restart_to_average) {
-      if (multi_gpu_engine_) {
-        multi_gpu_engine_->for_each_shard([&](auto& shard) { shard.sub_pdlp->restart_strategy_.increment_iteration_since_last_restart(); });
+      if (multi_gpu_engine) {
+        multi_gpu_engine->for_each_shard([&](auto& shard) {
+          shard.sub_pdlp->restart_strategy_.increment_iteration_since_last_restart();
+        });
       } else {
         restart_strategy_.increment_iteration_since_last_restart();
       }

From abe4dd23e41ee7cb7cdba4ca3ca7979874b39856 Mon Sep 17 00:00:00 2001
From: Bulle Mostovoi <vmostovoi@nvidia.com>
Date: Tue, 26 May 2026 11:54:45 +0200
Subject: [PATCH 28/67] comments

---
 cpp/src/pdlp/pdlp.cu | 12 ++----------
 1 file changed, 2 insertions(+), 10 deletions(-)

diff --git a/cpp/src/pdlp/pdlp.cu b/cpp/src/pdlp/pdlp.cu
index 36ba854439..e2aeb3f08c 100644
--- a/cpp/src/pdlp/pdlp.cu
+++ b/cpp/src/pdlp/pdlp.cu
@@ -2950,12 +2950,7 @@ optimization_problem_solution_t<i_t, f_t> pdlp_solver_t<i_t, f_t>::run_solver(co
       }
       if (settings_.hyper_params.use_adaptive_step_size_strategy) {
         if (multi_gpu_engine) {
-          // Master's pdhg_solver_.{primal,dual}_solution_ is stale in mGPU mode
-          // (live state lives on shards). Unscale in place on each shard with
-          // the shard's own initial_scaling_strategy_, which already holds the
-          // global cumulative scaling factors for its owned slice (set up in
-          // shard.cu via set_cummulative_scaling). Halo slots have unit scaling
-          // so unscaling is a no-op there (their values are junk anyway).
+          // The only branch in cuPDLPx
           multi_gpu_engine->for_each_shard([&](auto& shard) {
             auto& sub = *shard.sub_pdlp;
             sub.get_initial_scaling_strategy().unscale_solutions(
@@ -2999,10 +2994,7 @@ optimization_problem_solution_t<i_t, f_t> pdlp_solver_t<i_t, f_t>::run_solver(co
         }
         if (settings_.hyper_params.use_adaptive_step_size_strategy) {
           if (multi_gpu_engine) {
-            // Symmetric to the unscale dispatch above. Live state lives on
-            // shards; each shard's initial_scaling_strategy_ holds the global
-            // cumulative scaling factors for its owned slice (halo slots have
-            // unit scaling, so they're no-ops). Scale in place per shard.
+            // The only branch in cuPDLPx
             multi_gpu_engine->for_each_shard([&](auto& shard) {
               auto& sub = *shard.sub_pdlp;
               sub.get_initial_scaling_strategy().scale_solutions(

From 5c41497080dd3950c378d485a5ada75c1658f31f Mon Sep 17 00:00:00 2001
From: Bulle Mostovoi <vmostovoi@nvidia.com>
Date: Tue, 26 May 2026 12:06:51 +0200
Subject: [PATCH 29/67] added is multi gpu to pdhg

---
 cpp/src/pdlp/distributed_pdlp/shard.cu |  2 ++
 cpp/src/pdlp/pdhg.hpp                  | 16 ++++++++++++++--
 2 files changed, 16 insertions(+), 2 deletions(-)

diff --git a/cpp/src/pdlp/distributed_pdlp/shard.cu b/cpp/src/pdlp/distributed_pdlp/shard.cu
index 33aac38103..405e6fa05c 100644
--- a/cpp/src/pdlp/distributed_pdlp/shard.cu
+++ b/cpp/src/pdlp/distributed_pdlp/shard.cu
@@ -155,6 +155,8 @@ pdlp_shard_t<i_t, f_t>::pdlp_shard_t(int device_id,
   //         unit cumulative factors (sub-settings disable Ruiz / PC iters).
   sub_pdlp = std::make_unique<pdlp_solver_t<i_t, f_t>>(*sub_problem, settings, /*batch=*/false);
 
+  sub_pdlp->pdhg_solver_.set_is_multi_gpu(true);
+
   // Inject master-scaled buffers inside sub_pdlp
   auto& scaled = sub_pdlp->get_op_problem_scaled();
   raft::copy(scaled.coefficients.data(),
diff --git a/cpp/src/pdlp/pdhg.hpp b/cpp/src/pdlp/pdhg.hpp
index e38ea9389c..2e230eaf86 100644
--- a/cpp/src/pdlp/pdhg.hpp
+++ b/cpp/src/pdlp/pdhg.hpp
@@ -114,8 +114,19 @@ class pdhg_solver_t {
 
   // Master PDLP wires up the engine pointer here after the engine is built.
   // Shards' pdhg_solver_ leaves this null so each shard runs single-GPU SpMV
-  // on its local matrix.
-  void set_multi_gpu_engine(multi_gpu_engine_t<i_t, f_t>* engine) { mgpu_engine_ = engine; }
+  // on its local matrix. Also flips is_multi_gpu_ — convenience flag that any
+  // pdhg participating in a distributed run (master OR shard) carries true.
+  void set_multi_gpu_engine(multi_gpu_engine_t<i_t, f_t>* engine)
+  {
+    mgpu_engine_  = engine;
+    is_multi_gpu_ = (engine != nullptr);
+  }
+
+  // Mark a shard's pdhg_solver_ as part of a distributed run without giving it
+  // an engine (shards don't orchestrate; they only run local SpMV on owned
+  // rows). Called from shard.cu right after sub_pdlp is constructed.
+  void set_is_multi_gpu(bool v) { is_multi_gpu_ = v; }
+  bool is_multi_gpu() const { return is_multi_gpu_; }
 
   i_t total_pdhg_iterations_;
 
@@ -136,6 +147,7 @@ class pdhg_solver_t {
   void compute_primal_projection(rmm::device_uvector<f_t>& primal_step_size);
 
   bool batch_mode_{false};
+  bool is_multi_gpu_{false};
   raft::handle_t const* handle_ptr_{nullptr};
   rmm::cuda_stream_view stream_view_;
 

From 37b1fdafab439c4b0be39b7d467b31d0f23110b5 Mon Sep 17 00:00:00 2001
From: Bulle Mostovoi <vmostovoi@nvidia.com>
Date: Tue, 26 May 2026 12:24:45 +0200
Subject: [PATCH 30/67] added pdhg get mgpu engine

---
 cpp/src/pdlp/pdhg.hpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/cpp/src/pdlp/pdhg.hpp b/cpp/src/pdlp/pdhg.hpp
index 2e230eaf86..e4d16360a7 100644
--- a/cpp/src/pdlp/pdhg.hpp
+++ b/cpp/src/pdlp/pdhg.hpp
@@ -127,6 +127,7 @@ class pdhg_solver_t {
   // rows). Called from shard.cu right after sub_pdlp is constructed.
   void set_is_multi_gpu(bool v) { is_multi_gpu_ = v; }
   bool is_multi_gpu() const { return is_multi_gpu_; }
+  multi_gpu_engine_t<i_t, f_t>* get_mgpu_engine() const { return mgpu_engine_; }
 
   i_t total_pdhg_iterations_;
 

From 57c70615337bd12fe803938d5f1bc44c4d9fa7f1 Mon Sep 17 00:00:00 2001
From: Bulle Mostovoi <vmostovoi@nvidia.com>
Date: Tue, 26 May 2026 12:25:25 +0200
Subject: [PATCH 31/67] added non const convergence information getter

---
 cpp/src/pdlp/termination_strategy/termination_strategy.cu  | 7 +++++++
 cpp/src/pdlp/termination_strategy/termination_strategy.hpp | 1 +
 2 files changed, 8 insertions(+)

diff --git a/cpp/src/pdlp/termination_strategy/termination_strategy.cu b/cpp/src/pdlp/termination_strategy/termination_strategy.cu
index d1a88799d6..0320b420a8 100644
--- a/cpp/src/pdlp/termination_strategy/termination_strategy.cu
+++ b/cpp/src/pdlp/termination_strategy/termination_strategy.cu
@@ -195,6 +195,13 @@ pdlp_termination_strategy_t<i_t, f_t>::get_convergence_information() const
   return convergence_information_;
 }
 
+template <typename i_t, typename f_t>
+convergence_information_t<i_t, f_t>&
+pdlp_termination_strategy_t<i_t, f_t>::get_convergence_information()
+{
+  return convergence_information_;
+}
+
 template <typename i_t, typename f_t>
 const infeasibility_information_t<i_t, f_t>&
 pdlp_termination_strategy_t<i_t, f_t>::get_infeasibility_information() const
diff --git a/cpp/src/pdlp/termination_strategy/termination_strategy.hpp b/cpp/src/pdlp/termination_strategy/termination_strategy.hpp
index 5cd43d7be7..63b2e81ff4 100644
--- a/cpp/src/pdlp/termination_strategy/termination_strategy.hpp
+++ b/cpp/src/pdlp/termination_strategy/termination_strategy.hpp
@@ -187,6 +187,7 @@ class pdlp_termination_strategy_t {
   i_t get_optimal_solution_id() const;
 
   const convergence_information_t<i_t, f_t>& get_convergence_information() const;
+  convergence_information_t<i_t, f_t>& get_convergence_information();
   const infeasibility_information_t<i_t, f_t>& get_infeasibility_information() const;
 
   // Deep copy is used when save best primal so far is toggled

From 9f78d0534c232055b7da4e425379e9f86a436e08 Mon Sep 17 00:00:00 2001
From: Bulle Mostovoi <vmostovoi@nvidia.com>
Date: Tue, 26 May 2026 14:29:36 +0200
Subject: [PATCH 32/67] compute_convergence_information is now on multi-gpu

---
 .../distributed_pdlp/multi_gpu_engine.hpp     |  58 ++++
 cpp/src/pdlp/pdlp.cu                          |   8 +-
 .../convergence_information.cu                | 296 ++++++++++++++++--
 .../convergence_information.hpp               |  31 ++
 4 files changed, 360 insertions(+), 33 deletions(-)

diff --git a/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.hpp b/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.hpp
index 001f9b760e..438a878834 100644
--- a/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.hpp
+++ b/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.hpp
@@ -7,10 +7,12 @@
 #include <pdlp/distributed_pdlp/rank_data.hpp>
 #include <pdlp/distributed_pdlp/shard.hpp>
 #include <pdlp/pdhg.hpp>
+#include <utilities/cuda_helpers.cuh>
 
 #include <cuopt/linear_programming/pdlp/solver_settings.hpp>
 
 #include <raft/core/device_setter.hpp>
+#include <raft/linalg/detail/cublas_wrappers.hpp>
 
 #include <rmm/cuda_stream.hpp>
 #include <rmm/exec_policy.hpp>
@@ -27,6 +29,16 @@
 
 namespace cuopt::linear_programming::detail {
 
+// Element-wise sqrt functor. Defined at namespace scope (not as a local
+// extended HD lambda) because nvcc disallows extended __host__ __device__
+// lambdas appearing inside templates whose template arguments are
+// themselves local lambda types (which happens when distributed_l2_norm is
+// invoked with closure accessors).
+template <typename f_t>
+struct sqrt_inplace_op_t {
+  __host__ __device__ f_t operator()(f_t x) const { return raft::sqrt(x); }
+};
+
 template <typename i_t, typename f_t>
 struct multi_gpu_engine_t {
   // Constructs shards from rank_data
@@ -219,6 +231,52 @@ struct multi_gpu_engine_t {
     ncclGroupEnd();
   }
 
+  // -------- Distributed L2 norm ------------------------------------------
+  // Computes sqrt(Σ_k Σ_{i ∈ owned_k} buf_k[i]²) and writes the scalar into
+  // the buffer returned by `out_access` on EVERY shard.
+  //
+  // Algorithm:
+  //   1) per shard: out = cublasdot(buf[0:n_owned], buf[0:n_owned])  (partial Σ²)
+  //   2) NCCL allreduce SUM on out (count = 1)                       (global Σ²)
+  //   3) per shard: out = sqrt(out)
+  //
+  // The caller is responsible for clipping correctness via `size_access`
+  // (which picks `rank_data.owned_var_size` or `rank_data.owned_cstr_size`
+  // depending on the shape of the input buffer), and for mirroring the
+  // result back to master if downstream code needs it there.
+  //
+  // BufAccess  : pdlp_solver_t<i_t,f_t>& -> rmm::device_uvector<f_t>&
+  // OutAccess  : pdlp_solver_t<i_t,f_t>& -> f_t*   (single scalar in shard memory)
+  // SizeAccess : pdlp_shard_t<i_t,f_t>&  -> i_t    (owned slice length)
+  template <typename BufAccess, typename OutAccess, typename SizeAccess>
+  void distributed_l2_norm(BufAccess&& buf_access,
+                           OutAccess&& out_access,
+                           SizeAccess&& size_access)
+  {
+    for_each_shard([&](auto& shard) {
+      auto& sub   = *shard.sub_pdlp;
+      auto& buf   = buf_access(sub);
+      const i_t n = size_access(shard);
+      f_t* out    = out_access(sub);
+      RAFT_CUBLAS_TRY(raft::linalg::detail::cublasdot(shard.handle.get_cublas_handle(),
+                                                      static_cast<int>(n),
+                                                      buf.data(),
+                                                      1,
+                                                      buf.data(),
+                                                      1,
+                                                      out,
+                                                      shard.stream.view().value()));
+    });
+
+    allreduce_sum_inplace(out_access, /*count=*/1);
+
+    for_each_shard([&](auto& shard) {
+      f_t* out = out_access(*shard.sub_pdlp);
+      cub::DeviceTransform::Transform(
+        out, out, 1, sqrt_inplace_op_t<f_t>{}, shard.stream.view().value());
+    });
+  }
+
   // -------- Generic distributed SpMVs -------------------------------------
   // distributed_spmv_A : halo-update the var-shaped input buffer returned by
   // `in_buf(pdhg)`, then per-shard A @ in_buf -> out_desc.
diff --git a/cpp/src/pdlp/pdlp.cu b/cpp/src/pdlp/pdlp.cu
index e2aeb3f08c..9522ae4065 100644
--- a/cpp/src/pdlp/pdlp.cu
+++ b/cpp/src/pdlp/pdlp.cu
@@ -2289,15 +2289,13 @@ void pdlp_solver_t<i_t, f_t>::compute_fixed_error(std::vector<int>& has_restarte
     }
 
     multi_gpu_engine->allreduce_sum_inplace(
-      [](auto& sp) -> f_t* { return sp.step_size_strategy_.get_interaction().data(); }, 1);
+      [](auto& sp) -> f_t* { return sp.step_size_strategy_.get_interaction().data(); });
     multi_gpu_engine->allreduce_sum_inplace(
       [](auto& sp) -> f_t* {
         return sp.step_size_strategy_.get_norm_squared_delta_primal().data();
-      },
-      1);
+      });
     multi_gpu_engine->allreduce_sum_inplace(
-      [](auto& sp) -> f_t* { return sp.step_size_strategy_.get_norm_squared_delta_dual().data(); },
-      1);
+      [](auto& sp) -> f_t* { return sp.step_size_strategy_.get_norm_squared_delta_dual().data(); });
 
     auto& s0 = *multi_gpu_engine->shards[0];
     {
diff --git a/cpp/src/pdlp/termination_strategy/convergence_information.cu b/cpp/src/pdlp/termination_strategy/convergence_information.cu
index a6d6d14d96..28b33582ab 100644
--- a/cpp/src/pdlp/termination_strategy/convergence_information.cu
+++ b/cpp/src/pdlp/termination_strategy/convergence_information.cu
@@ -5,12 +5,16 @@
  */
 /* clang-format on */
 
+#include <pdlp/distributed_pdlp/multi_gpu_engine.hpp>
+#include <pdlp/pdlp.cuh>
 #include <pdlp/pdlp_climber_strategy.hpp>
 #include <pdlp/pdlp_constants.hpp>
 #include <pdlp/swap_and_resize_helper.cuh>
 #include <pdlp/termination_strategy/convergence_information.hpp>
 #include <pdlp/utils.cuh>
 
+#include <raft/core/device_setter.hpp>
+
 #include <mip_heuristics/mip_constants.hpp>
 
 #include <cuopt/error.hpp>
@@ -416,17 +420,89 @@ void convergence_information_t<i_t, f_t>::compute_convergence_information(
   print("dual_slack", dual_slack);
 #endif
 
+  if (current_pdhg_solver.is_multi_gpu())
+  {
+    auto* engine = current_pdhg_solver.get_mgpu_engine();
+    cuopt_assert(engine != nullptr,
+                 "mGPU branch reached but current_pdhg_solver has no engine (shard pdhg?)");
+    cuopt_expects(!settings.per_constraint_residual,
+                  error_type_t::ValidationError,
+                  "per_constraint_residual is not yet supported in multi-GPU mode");
+
+    // Prepares halo values in primal_solution
+    engine->halo_exchange_var(
+      [](pdhg_solver_t<i_t, f_t>& pdhg) -> rmm::device_uvector<f_t>& {
+        return pdhg.get_primal_solution();
+      });
+
+    // Compute the primal residual and objective on each shard
+    for (auto& shard : engine->shards) {
+      raft::device_setter guard(shard->device_id);
+      auto& sub_pdlp = *shard->sub_pdlp;
+      auto& sub_conv = sub_pdlp.get_current_termination_strategy().get_convergence_information();
+      sub_conv.compute_primal_residual(sub_conv.op_problem_cusparse_view_,
+                                       sub_pdlp.pdhg_solver_.get_dual_tmp_resource(),
+                                       sub_pdlp.pdhg_solver_.get_dual_solution());
+      sub_conv.compute_primal_objective_owned_partial(sub_pdlp.pdhg_solver_.get_primal_solution(),
+                                                      shard->rank_data.owned_var_size);
+    }
+
+    // Reduce all primal objectives across shards
+    cuopt_assert(!batch_mode_, "multi-GPU PDLP is not supported in batch mode");
+    engine->allreduce_sum_inplace(
+      [](pdlp_solver_t<i_t, f_t>& sp) -> f_t* {
+        return sp.get_current_termination_strategy()
+          .get_convergence_information()
+          .get_primal_objective()
+          .data();
+      });
+
+    // Get the reduced primal objective from the shard[0] (arbitrary) 
+    {
+      auto& s0 = *engine->shards[0];
+      raft::device_setter guard(s0.device_id);
+      auto& s0_conv =
+        s0.sub_pdlp->get_current_termination_strategy().get_convergence_information();
+      raft::copy(primal_objective_.data(), s0_conv.get_primal_objective().data(), 1, stream_view_);
+    }
+    apply_primal_objective_scaling_and_offset();
+  }
+  else {
   compute_primal_residual(
     op_problem_cusparse_view_, current_pdhg_solver.get_dual_tmp_resource(), dual_iterate);
-  compute_primal_objective(primal_iterate);
+  compute_primal_objective(primal_iterate);}
 
 #ifdef CUPDLP_DEBUG_MODE
   print("Primal Residual", primal_residual_);
 #endif
 
-  if (!batch_mode_)
+  // L2 Norm
+  if (current_pdhg_solver.is_multi_gpu()) {
+    auto* engine = current_pdhg_solver.get_mgpu_engine();
+    engine->distributed_l2_norm(
+      [](pdlp_solver_t<i_t, f_t>& sp) -> rmm::device_uvector<f_t>& {
+        return sp.get_current_termination_strategy()
+          .get_convergence_information()
+          .primal_residual_;
+      },
+      [](pdlp_solver_t<i_t, f_t>& sp) -> f_t* {
+        return sp.get_current_termination_strategy()
+          .get_convergence_information()
+          .l2_primal_residual_.data();
+      },
+      [](pdlp_shard_t<i_t, f_t>& shard) -> i_t { return shard.rank_data.owned_cstr_size; });
+
+    auto& s0 = *engine->shards[0];
+    raft::device_setter guard(s0.device_id);
+    raft::copy(l2_primal_residual_.data(),
+               s0.sub_pdlp->get_current_termination_strategy()
+                 .get_convergence_information()
+                 .l2_primal_residual_.data(),
+               1,
+               stream_view_);
+  } else if (!batch_mode_) {
     my_l2_norm<i_t, f_t>(primal_residual_, l2_primal_residual_, handle_ptr_);
-  else {
+  } else {
     segmented_sum_handler_.segmented_sum_helper(
       thrust::make_transform_iterator(primal_residual_.data(), power_two_func_t<f_t>{}),
       l2_primal_residual_.data(),
@@ -444,6 +520,7 @@ void convergence_information_t<i_t, f_t>::compute_convergence_information(
   print("Absolute Primal Residual", l2_primal_residual_);
 #endif
   // If per_constraint_residual is false we still need to perform the l2 since it's used in kkt
+  // Not supported in mGPU
   if (settings.per_constraint_residual) {
     // Compute the linf of (residual_i - rel * b_i)
     if (settings.save_best_primal_so_far) {
@@ -466,19 +543,98 @@ void convergence_information_t<i_t, f_t>::compute_convergence_information(
                                                    std::numeric_limits<f_t>::lowest());
   }
 
-  compute_dual_residual(op_problem_cusparse_view_,
-                        current_pdhg_solver.get_primal_tmp_resource(),
-                        primal_iterate,
-                        dual_slack);
-  compute_dual_objective(dual_iterate, primal_iterate, dual_slack);
+  if (current_pdhg_solver.is_multi_gpu()) {
+    auto* engine = current_pdhg_solver.get_mgpu_engine();
+
+    // 1) Halo-exchange the dual solution on every shard so the upcoming
+    //    A_T_shard @ dual SpMV inside compute_dual_residual reads correct
+    //    values in the cstr halo region.
+    engine->halo_exchange_cstr(
+      [](pdhg_solver_t<i_t, f_t>& pdhg) -> rmm::device_uvector<f_t>& {
+        return pdhg.get_dual_solution();
+      });
+
+    // 2-3) Per-shard:
+    //      - compute_dual_residual: shard.dual_residual_ has owned-var entries
+    //        correct, halo var entries garbage (their A_T row isn't on this
+    //        shard).
+    //      - compute_dual_objective_owned_partial: writes a *partial*
+    //        dot(slack[0:nv], x[0:nv]) + Σ primal_slack[0:nc] into
+    //        shard.dual_objective_, with NO scaling/offset. Relies on
+    //        primal_slack_ already populated by the per-shard
+    //        compute_primal_residual above.
+    for (auto& shard : engine->shards) {
+      raft::device_setter guard(shard->device_id);
+      auto& sub_pdlp = *shard->sub_pdlp;
+      auto& sub_conv = sub_pdlp.get_current_termination_strategy().get_convergence_information();
+      sub_conv.compute_dual_residual(sub_conv.op_problem_cusparse_view_,
+                                     sub_pdlp.pdhg_solver_.get_primal_tmp_resource(),
+                                     sub_pdlp.pdhg_solver_.get_primal_solution(),
+                                     sub_pdlp.pdhg_solver_.get_dual_slack());
+      sub_conv.compute_dual_objective_owned_partial(sub_pdlp.pdhg_solver_.get_primal_solution(),
+                                                    sub_pdlp.pdhg_solver_.get_dual_slack(),
+                                                    shard->rank_data.owned_var_size,
+                                                    shard->rank_data.owned_cstr_size);
+    }
+
+    // 4) Allreduce dual_objective_ across shards (sum, in place). Same
+    //    offset/scaling-after-allreduce reasoning as primal: applying offset
+    //    per-shard would over-count it Nshards times.
+    engine->allreduce_sum_inplace(
+      [](pdlp_solver_t<i_t, f_t>& sp) -> f_t* {
+        return sp.get_current_termination_strategy()
+          .get_convergence_information()
+          .get_dual_objective()
+          .data();
+      });
+
+    {
+      auto& s0 = *engine->shards[0];
+      raft::device_setter guard(s0.device_id);
+      auto& s0_conv =
+        s0.sub_pdlp->get_current_termination_strategy().get_convergence_information();
+      raft::copy(dual_objective_.data(), s0_conv.get_dual_objective().data(), 1, stream_view_);
+    }
+    apply_dual_objective_scaling_and_offset();
+  } else {
+    compute_dual_residual(op_problem_cusparse_view_,
+                          current_pdhg_solver.get_primal_tmp_resource(),
+                          primal_iterate,
+                          dual_slack);
+    compute_dual_objective(dual_iterate, primal_iterate, dual_slack);
+  }
 
 #ifdef CUPDLP_DEBUG_MODE
   print("Dual Residual", dual_residual_);
 #endif
 
-  if (!batch_mode_)
+  if (current_pdhg_solver.is_multi_gpu()) {
+    // Multi-GPU dual residual L2 norm: same pattern as the primal L2 above,
+    // but the dual residual is var-shaped so we clip to owned_var_size.
+    auto* engine = current_pdhg_solver.get_mgpu_engine();
+    engine->distributed_l2_norm(
+      [](pdlp_solver_t<i_t, f_t>& sp) -> rmm::device_uvector<f_t>& {
+        return sp.get_current_termination_strategy()
+          .get_convergence_information()
+          .dual_residual_;
+      },
+      [](pdlp_solver_t<i_t, f_t>& sp) -> f_t* {
+        return sp.get_current_termination_strategy()
+          .get_convergence_information()
+          .l2_dual_residual_.data();
+      },
+      [](pdlp_shard_t<i_t, f_t>& shard) -> i_t { return shard.rank_data.owned_var_size; });
+    auto& s0 = *engine->shards[0];
+    raft::device_setter guard(s0.device_id);
+    raft::copy(l2_dual_residual_.data(),
+               s0.sub_pdlp->get_current_termination_strategy()
+                 .get_convergence_information()
+                 .l2_dual_residual_.data(),
+               1,
+               stream_view_);
+  } else if (!batch_mode_) {
     my_l2_norm<i_t, f_t>(dual_residual_, l2_dual_residual_, handle_ptr_);
-  else {
+  } else {
     segmented_sum_handler_.segmented_sum_helper(
       thrust::make_transform_iterator(dual_residual_.data(), power_two_func_t<f_t>{}),
       l2_dual_residual_.data(),
@@ -509,6 +665,7 @@ void convergence_information_t<i_t, f_t>::compute_convergence_information(
                                                    std::numeric_limits<f_t>::lowest());
   }
 
+  // In mGPU, primal/dual objectives are already mirrored to master, so no special handling here.
   const auto [grid_size, block_size] = kernel_config_from_batch_size(climber_strategies_.size());
   compute_remaining_stats_kernel<i_t, f_t>
     <<<grid_size, block_size, 0, stream_view_>>>(this->view(), climber_strategies_.size());
@@ -615,6 +772,24 @@ __global__ void apply_objective_scaling_and_offset(raft::device_span<f_t> object
   objective[idx] = objective_scaling_factor * (objective[idx] + objective_offsets[idx]);
 }
 
+template <typename i_t, typename f_t>
+void convergence_information_t<i_t, f_t>::compute_primal_objective_owned_partial(
+  rmm::device_uvector<f_t>& primal_solution, i_t n_owned)
+{
+  raft::common::nvtx::range fun_scope("compute_primal_objective_owned_partial");
+  cuopt_assert(!batch_mode_, "owned-partial primal objective is only used in non-batch mGPU mode");
+  cuopt_assert(n_owned <= primal_size_h_,
+               "n_owned must be <= primal_size_h_ (owned slice is a prefix)");
+  RAFT_CUBLAS_TRY(raft::linalg::detail::cublasdot(handle_ptr_->get_cublas_handle(),
+                                                  static_cast<int>(n_owned),
+                                                  primal_solution.data(),
+                                                  primal_stride,
+                                                  problem_ptr->objective_coefficients.data(),
+                                                  primal_stride,
+                                                  primal_objective_.data(),
+                                                  stream_view_));
+}
+
 template <typename i_t, typename f_t>
 void convergence_information_t<i_t, f_t>::compute_primal_objective(
   rmm::device_uvector<f_t>& primal_solution)
@@ -643,21 +818,25 @@ void convergence_information_t<i_t, f_t>::compute_primal_objective(
 
   // Apply per-climber objective scaling and offset. objective_offsets_ is always populated
   // (defaults to the scalar problem offset replicated, or user-specified per-climber offsets).
-  {
-    const auto [grid_size, block_size] = kernel_config_from_batch_size(climber_strategies_.size());
-    apply_objective_scaling_and_offset<i_t, f_t><<<grid_size, block_size, 0, stream_view_>>>(
-      make_span(primal_objective_),
-      problem_ptr->presolve_data.objective_scaling_factor,
-      make_span(objective_offsets_),
-      climber_strategies_.size());
-    RAFT_CUDA_TRY(cudaPeekAtLastError());
-  }
+  apply_primal_objective_scaling_and_offset();
 
 #ifdef CUPDLP_DEBUG_MODE
   print("Primal objective", primal_objective_);
 #endif
 }
 
+template <typename i_t, typename f_t>
+void convergence_information_t<i_t, f_t>::apply_primal_objective_scaling_and_offset()
+{
+  const auto [grid_size, block_size] = kernel_config_from_batch_size(climber_strategies_.size());
+  apply_objective_scaling_and_offset<i_t, f_t><<<grid_size, block_size, 0, stream_view_>>>(
+    make_span(primal_objective_),
+    problem_ptr->presolve_data.objective_scaling_factor,
+    make_span(objective_offsets_),
+    climber_strategies_.size());
+  RAFT_CUDA_TRY(cudaPeekAtLastError());
+}
+
 template <typename i_t, typename f_t>
 void convergence_information_t<i_t, f_t>::compute_dual_residual(
   cusparse_view_t<i_t, f_t>& cusparse_view,
@@ -740,6 +919,51 @@ void convergence_information_t<i_t, f_t>::compute_dual_residual(
   }
 }
 
+template <typename i_t, typename f_t>
+void convergence_information_t<i_t, f_t>::compute_dual_objective_owned_partial(
+  rmm::device_uvector<f_t>& primal_solution,
+  rmm::device_uvector<f_t>& dual_slack,
+  i_t n_owned_var,
+  i_t n_owned_cstr)
+{
+  raft::common::nvtx::range fun_scope("compute_dual_objective_owned_partial");
+  cuopt_assert(!batch_mode_, "owned-partial dual objective is only used in non-batch mGPU mode");
+  cuopt_assert(hyper_params_.use_reflected_primal_dual,
+               "owned-partial dual objective requires use_reflected_primal_dual");
+  cuopt_assert(n_owned_var <= primal_size_h_,
+               "n_owned_var must be <= primal_size_h_ (owned slice is a prefix)");
+  cuopt_assert(n_owned_cstr <= dual_size_h_,
+               "n_owned_cstr must be <= dual_size_h_ (owned slice is a prefix)");
+
+  // dual_dot_ = dot(dual_slack[0:n_owned_var], primal_solution[0:n_owned_var])
+  RAFT_CUBLAS_TRY(raft::linalg::detail::cublasdot(handle_ptr_->get_cublas_handle(),
+                                                  static_cast<int>(n_owned_var),
+                                                  dual_slack.data(),
+                                                  primal_stride,
+                                                  primal_solution.data(),
+                                                  primal_stride,
+                                                  dual_dot_.data(),
+                                                  stream_view_));
+
+  // sum_primal_slack_ = Σ primal_slack_[0:n_owned_cstr]
+  // primal_slack_ is assumed populated for owned cstrs by a prior
+  // compute_primal_residual call on this same shard.
+  cub::DeviceReduce::Sum(rmm_tmp_buffer_.data(),
+                         size_of_buffer_,
+                         primal_slack_.data(),
+                         sum_primal_slack_.data(),
+                         static_cast<int>(n_owned_cstr),
+                         stream_view_);
+
+  // dual_objective_ = dual_dot_ + sum_primal_slack_ (still a partial sum).
+  cub::DeviceTransform::Transform(
+    cuda::std::make_tuple(dual_dot_.data(), sum_primal_slack_.data()),
+    dual_objective_.data(),
+    1,
+    cuda::std::plus<>{},
+    stream_view_);
+}
+
 template <typename i_t, typename f_t>
 void convergence_information_t<i_t, f_t>::compute_dual_objective(
   rmm::device_uvector<f_t>& dual_solution,
@@ -821,21 +1045,25 @@ void convergence_information_t<i_t, f_t>::compute_dual_objective(
   }
 
   // Apply per-climber objective scaling and offset.
-  {
-    const auto [grid_size, block_size] = kernel_config_from_batch_size(climber_strategies_.size());
-    apply_objective_scaling_and_offset<i_t, f_t><<<grid_size, block_size, 0, stream_view_>>>(
-      make_span(dual_objective_),
-      problem_ptr->presolve_data.objective_scaling_factor,
-      make_span(objective_offsets_),
-      climber_strategies_.size());
-    RAFT_CUDA_TRY(cudaPeekAtLastError());
-  }
+  apply_dual_objective_scaling_and_offset();
 
 #ifdef CUPDLP_DEBUG_MODE
   print("Dual objective", dual_objective_);
 #endif
 }
 
+template <typename i_t, typename f_t>
+void convergence_information_t<i_t, f_t>::apply_dual_objective_scaling_and_offset()
+{
+  const auto [grid_size, block_size] = kernel_config_from_batch_size(climber_strategies_.size());
+  apply_objective_scaling_and_offset<i_t, f_t><<<grid_size, block_size, 0, stream_view_>>>(
+    make_span(dual_objective_),
+    problem_ptr->presolve_data.objective_scaling_factor,
+    make_span(objective_offsets_),
+    climber_strategies_.size());
+  RAFT_CUDA_TRY(cudaPeekAtLastError());
+}
+
 template <typename i_t, typename f_t>
 void convergence_information_t<i_t, f_t>::compute_reduced_cost_from_primal_gradient(
   const rmm::device_uvector<f_t>& primal_gradient, const rmm::device_uvector<f_t>& primal_solution)
@@ -916,12 +1144,24 @@ const rmm::device_uvector<f_t>& convergence_information_t<i_t, f_t>::get_primal_
   return primal_objective_;
 }
 
+template <typename i_t, typename f_t>
+rmm::device_uvector<f_t>& convergence_information_t<i_t, f_t>::get_primal_objective()
+{
+  return primal_objective_;
+}
+
 template <typename i_t, typename f_t>
 const rmm::device_uvector<f_t>& convergence_information_t<i_t, f_t>::get_dual_objective() const
 {
   return dual_objective_;
 }
 
+template <typename i_t, typename f_t>
+rmm::device_uvector<f_t>& convergence_information_t<i_t, f_t>::get_dual_objective()
+{
+  return dual_objective_;
+}
+
 template <typename i_t, typename f_t>
 const rmm::device_uvector<f_t>& convergence_information_t<i_t, f_t>::get_l2_dual_residual() const
 {
diff --git a/cpp/src/pdlp/termination_strategy/convergence_information.hpp b/cpp/src/pdlp/termination_strategy/convergence_information.hpp
index 2389a60fae..6325622a2b 100644
--- a/cpp/src/pdlp/termination_strategy/convergence_information.hpp
+++ b/cpp/src/pdlp/termination_strategy/convergence_information.hpp
@@ -52,7 +52,10 @@ class convergence_information_t {
 
   // Needed for kkt restart & debug prints
   const rmm::device_uvector<f_t>& get_primal_objective() const;
+  // Non-const overload used by the multi-GPU branch to mirror / allreduce.
+  rmm::device_uvector<f_t>& get_primal_objective();
   const rmm::device_uvector<f_t>& get_dual_objective() const;
+  rmm::device_uvector<f_t>& get_dual_objective();
   const rmm::device_uvector<f_t>& get_l2_primal_residual() const;
   const rmm::device_uvector<f_t>& get_l2_dual_residual() const;
   const rmm::device_uvector<f_t>& get_relative_linf_primal_residual() const;
@@ -123,12 +126,40 @@ class convergence_information_t {
                                rmm::device_uvector<f_t>& tmp_dual,
                                [[maybe_unused]] const rmm::device_uvector<f_t>& dual_iterate);
 
+  // Multi-GPU shard helper: writes a partial dot(c[0:n_owned], x[0:n_owned])
+  // into primal_objective_ (no scaling, no offset). Master is responsible for
+  // allreduce SUM across shards and then applying scaling + offset once on the
+  // reduced value. n_owned must be <= primal_size_h_; pass owned_var_size on
+  // each shard.
+  void compute_primal_objective_owned_partial(rmm::device_uvector<f_t>& primal_solution,
+                                              i_t n_owned);
+
+  // Multi-GPU shard helper: writes a partial dual objective into
+  // dual_objective_ (no scaling, no offset). Computes
+  //   dual_dot_           = dot(dual_slack[0:n_owned_var], primal_solution[0:n_owned_var])
+  //   sum_primal_slack_   = Σ primal_slack_[0:n_owned_cstr]
+  //   dual_objective_     = dual_dot_ + sum_primal_slack_
+  // primal_slack_ is assumed already populated by a prior per-shard
+  // compute_primal_residual call. Use only in the use_reflected_primal_dual
+  // path (the multi-GPU mode).
+  void compute_dual_objective_owned_partial(rmm::device_uvector<f_t>& primal_solution,
+                                            rmm::device_uvector<f_t>& dual_slack,
+                                            i_t n_owned_var,
+                                            i_t n_owned_cstr);
+
   void swap_context(const thrust::universal_host_pinned_vector<swap_pair_t<i_t>>& swap_pairs);
   void resize_context(i_t new_size);
 
  private:
   void compute_primal_objective(rmm::device_uvector<f_t>& primal_solution);
 
+  // Applies per-climber objective scaling + offset to primal_objective_.
+  // Single-GPU path: called from compute_primal_objective right after the dot.
+  // Multi-GPU path: called on master once after allreduce of partial sums.
+  void apply_primal_objective_scaling_and_offset();
+  // Same as above but for dual_objective_.
+  void apply_dual_objective_scaling_and_offset();
+
   void compute_dual_residual(cusparse_view_t<i_t, f_t>& cusparse_view,
                              rmm::device_uvector<f_t>& tmp_primal,
                              rmm::device_uvector<f_t>& primal_solution,

From c484485d9debf9d5a1d7246dcf34f7f919d5f344 Mon Sep 17 00:00:00 2001
From: Bulle Mostovoi <vmostovoi@nvidia.com>
Date: Tue, 26 May 2026 16:14:44 +0200
Subject: [PATCH 33/67] fill_return_problem_solution is now ready !!

---
 .../distributed_pdlp/multi_gpu_engine.hpp     | 86 +++++++++++++++++++
 cpp/src/pdlp/pdlp.cu                          |  9 ++
 2 files changed, 95 insertions(+)

diff --git a/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.hpp b/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.hpp
index 438a878834..e04f2e26eb 100644
--- a/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.hpp
+++ b/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.hpp
@@ -17,7 +17,9 @@
 #include <rmm/cuda_stream.hpp>
 #include <rmm/exec_policy.hpp>
 
+#include <thrust/execution_policy.h>
 #include <thrust/gather.h>
+#include <thrust/scatter.h>
 #include <cub/device/device_transform.cuh>
 #include <cuda/std/tuple>
 
@@ -325,6 +327,90 @@ struct multi_gpu_engine_t {
       [](auto& pdhg) -> cusparseDnVecDescr_t { return pdhg.get_cusparse_view().current_AtY; });
   }
 
+  // -------- Solution gather (shards -> master) ----------------------------
+  // Assembles the global potential_next primal/dual solutions on the master
+  // pdhg_solver_ from the owned slices distributed across shards. Each shard's
+  // first owned_var_size (resp. owned_cstr_size) entries of its
+  // potential_next_primal_solution_ (resp. _dual_) are the live, up-to-date
+  // owned values; the master pdhg_solver_'s buffers are not updated during
+  // iterations and would otherwise return stale data.
+  //
+  // Used right before fill_return_problem_solution() at the return sites in
+  // pdlp_solver_t::check_termination() and pdlp_solver_t::check_limits(): the
+  // user-visible solution must contain gathered global values.
+  //
+  // Mirrors the metis_tests engine::get_x_output / get_y_output pattern:
+  // per shard: alloc small host tmp, copy owned slice device->host, sync,
+  // host-scatter via rank_data.local_to_global_{var,cstr} into a contiguous
+  // host buffer. Then one host->device copy into the master pdhg buffer.
+  void gather_potential_next_solutions_to_master(pdhg_solver_t<i_t, f_t>& master_pdhg)
+  {
+    const std::size_t total_vars =
+      master_pdhg.get_potential_next_primal_solution().size();
+    const std::size_t total_cstrs =
+      master_pdhg.get_potential_next_dual_solution().size();
+
+    std::vector<f_t> h_primal(total_vars);
+    std::vector<f_t> h_dual(total_cstrs);
+
+    for (auto& s_uptr : shards) {
+      auto& s = *s_uptr;
+      raft::device_setter guard(s.device_id);
+      const i_t nv = s.rank_data.owned_var_size;
+      const i_t nc = s.rank_data.owned_cstr_size;
+
+      std::vector<f_t> tmp_primal(nv);
+      std::vector<f_t> tmp_dual(nc);
+
+      if (nv > 0) {
+        RAFT_CUDA_TRY(
+          cudaMemcpyAsync(tmp_primal.data(),
+                          s.sub_pdlp->pdhg_solver_.get_potential_next_primal_solution().data(),
+                          static_cast<std::size_t>(nv) * sizeof(f_t),
+                          cudaMemcpyDeviceToHost,
+                          s.stream.view().value()));
+      }
+      if (nc > 0) {
+        RAFT_CUDA_TRY(
+          cudaMemcpyAsync(tmp_dual.data(),
+                          s.sub_pdlp->pdhg_solver_.get_potential_next_dual_solution().data(),
+                          static_cast<std::size_t>(nc) * sizeof(f_t),
+                          cudaMemcpyDeviceToHost,
+                          s.stream.view().value()));
+      }
+      RAFT_CUDA_TRY(cudaStreamSynchronize(s.stream.view().value()));
+
+      if (nv > 0) {
+        thrust::scatter(thrust::host,
+                        tmp_primal.begin(),
+                        tmp_primal.end(),
+                        s.rank_data.local_to_global_var.begin(),
+                        h_primal.begin());
+      }
+      if (nc > 0) {
+        thrust::scatter(thrust::host,
+                        tmp_dual.begin(),
+                        tmp_dual.end(),
+                        s.rank_data.local_to_global_cstr.begin(),
+                        h_dual.begin());
+      }
+    }
+
+    // Host -> master device. engine.stream lives on the master device
+    // (created at engine construction when master device was current).
+    RAFT_CUDA_TRY(cudaMemcpyAsync(master_pdhg.get_potential_next_primal_solution().data(),
+                                  h_primal.data(),
+                                  total_vars * sizeof(f_t),
+                                  cudaMemcpyHostToDevice,
+                                  stream.view().value()));
+    RAFT_CUDA_TRY(cudaMemcpyAsync(master_pdhg.get_potential_next_dual_solution().data(),
+                                  h_dual.data(),
+                                  total_cstrs * sizeof(f_t),
+                                  cudaMemcpyHostToDevice,
+                                  stream.view().value()));
+    RAFT_CUDA_TRY(cudaStreamSynchronize(stream.view().value()));
+  }
+
   // Engine-level stream for fork/join orchestration (master side).
   rmm::cuda_stream stream;
 
diff --git a/cpp/src/pdlp/pdlp.cu b/cpp/src/pdlp/pdlp.cu
index 9522ae4065..e9cf194d98 100644
--- a/cpp/src/pdlp/pdlp.cu
+++ b/cpp/src/pdlp/pdlp.cu
@@ -661,6 +661,9 @@ std::optional<optimization_problem_solution_t<i_t, f_t>> pdlp_solver_t<i_t, f_t>
     RAFT_CUDA_TRY(cudaDeviceSynchronize());
     std::cout << "Time Limit reached, returning current solution" << std::endl;
 #endif
+    if (auto* engine = pdhg_solver_.get_mgpu_engine()) {
+      engine->gather_potential_next_solutions_to_master(pdhg_solver_);
+    }
     return current_termination_strategy_.fill_return_problem_solution(
       internal_solver_iterations_,
       pdhg_solver_,
@@ -694,6 +697,9 @@ std::optional<optimization_problem_solution_t<i_t, f_t>> pdlp_solver_t<i_t, f_t>
       return finalize_batch_return_with_limit_reached(pdlp_termination_status_t::IterationLimit);
     }
 
+    if (auto* engine = pdhg_solver_.get_mgpu_engine()) {
+      engine->gather_potential_next_solutions_to_master(pdhg_solver_);
+    }
     return current_termination_strategy_.fill_return_problem_solution(
       internal_solver_iterations_,
       pdhg_solver_,
@@ -1371,6 +1377,9 @@ std::optional<optimization_problem_solution_t<i_t, f_t>> pdlp_solver_t<i_t, f_t>
 #endif
     print_final_termination_criteria(
       timer, current_termination_strategy_.get_convergence_information(), termination_current);
+    if (auto* engine = pdhg_solver_.get_mgpu_engine()) {
+      engine->gather_potential_next_solutions_to_master(pdhg_solver_);
+    }
     return current_termination_strategy_.fill_return_problem_solution(
       internal_solver_iterations_,
       pdhg_solver_,

From fc46080d24d7566729a1a505837cfc41e023997d Mon Sep 17 00:00:00 2001
From: Bulle Mostovoi <vmostovoi@nvidia.com>
Date: Tue, 26 May 2026 16:39:26 +0200
Subject: [PATCH 34/67] added reduced cost in gathering of solution, builds and
 runs

---
 .../distributed_pdlp/multi_gpu_engine.hpp     | 47 +++++++++++++++----
 cpp/src/pdlp/pdlp.cu                          | 12 +++--
 2 files changed, 47 insertions(+), 12 deletions(-)

diff --git a/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.hpp b/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.hpp
index e04f2e26eb..d156e889af 100644
--- a/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.hpp
+++ b/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.hpp
@@ -328,22 +328,30 @@ struct multi_gpu_engine_t {
   }
 
   // -------- Solution gather (shards -> master) ----------------------------
-  // Assembles the global potential_next primal/dual solutions on the master
-  // pdhg_solver_ from the owned slices distributed across shards. Each shard's
-  // first owned_var_size (resp. owned_cstr_size) entries of its
-  // potential_next_primal_solution_ (resp. _dual_) are the live, up-to-date
-  // owned values; the master pdhg_solver_'s buffers are not updated during
-  // iterations and would otherwise return stale data.
+  // Assembles the global potential_next primal/dual solutions and the
+  // reduced_cost on the master from the owned slices distributed across
+  // shards. Each shard's first owned_var_size (resp. owned_cstr_size) entries
+  // of its potential_next_primal_solution_ / reduced_cost_ (resp.
+  // potential_next_dual_solution_) are the live, up-to-date owned values; the
+  // master buffers are not updated during iterations and would otherwise
+  // return stale data.
   //
   // Used right before fill_return_problem_solution() at the return sites in
   // pdlp_solver_t::check_termination() and pdlp_solver_t::check_limits(): the
-  // user-visible solution must contain gathered global values.
+  // user-visible solution must contain gathered global values for primal,
+  // dual, and reduced_cost.
   //
   // Mirrors the metis_tests engine::get_x_output / get_y_output pattern:
   // per shard: alloc small host tmp, copy owned slice device->host, sync,
   // host-scatter via rank_data.local_to_global_{var,cstr} into a contiguous
-  // host buffer. Then one host->device copy into the master pdhg buffer.
-  void gather_potential_next_solutions_to_master(pdhg_solver_t<i_t, f_t>& master_pdhg)
+  // host buffer. Then one host->device copy into the master buffer per field.
+  //
+  // master_pdhg          : provides destinations for primal / dual.
+  // master_reduced_cost  : destination for the reduced_cost (var-shaped, lives
+  //                        in the master pdlp_solver_t's termination strategy
+  //                        convergence_information_).
+  void gather_potential_next_solutions_to_master(
+    pdhg_solver_t<i_t, f_t>& master_pdhg, rmm::device_uvector<f_t>& master_reduced_cost)
   {
     const std::size_t total_vars =
       master_pdhg.get_potential_next_primal_solution().size();
@@ -352,6 +360,7 @@ struct multi_gpu_engine_t {
 
     std::vector<f_t> h_primal(total_vars);
     std::vector<f_t> h_dual(total_cstrs);
+    std::vector<f_t> h_reduced_cost(total_vars);
 
     for (auto& s_uptr : shards) {
       auto& s = *s_uptr;
@@ -361,6 +370,11 @@ struct multi_gpu_engine_t {
 
       std::vector<f_t> tmp_primal(nv);
       std::vector<f_t> tmp_dual(nc);
+      std::vector<f_t> tmp_reduced_cost(nv);
+
+      auto& sub_reduced_cost = s.sub_pdlp->get_current_termination_strategy()
+                                 .get_convergence_information()
+                                 .get_reduced_cost();
 
       if (nv > 0) {
         RAFT_CUDA_TRY(
@@ -369,6 +383,11 @@ struct multi_gpu_engine_t {
                           static_cast<std::size_t>(nv) * sizeof(f_t),
                           cudaMemcpyDeviceToHost,
                           s.stream.view().value()));
+        RAFT_CUDA_TRY(cudaMemcpyAsync(tmp_reduced_cost.data(),
+                                      sub_reduced_cost.data(),
+                                      static_cast<std::size_t>(nv) * sizeof(f_t),
+                                      cudaMemcpyDeviceToHost,
+                                      s.stream.view().value()));
       }
       if (nc > 0) {
         RAFT_CUDA_TRY(
@@ -386,6 +405,11 @@ struct multi_gpu_engine_t {
                         tmp_primal.end(),
                         s.rank_data.local_to_global_var.begin(),
                         h_primal.begin());
+        thrust::scatter(thrust::host,
+                        tmp_reduced_cost.begin(),
+                        tmp_reduced_cost.end(),
+                        s.rank_data.local_to_global_var.begin(),
+                        h_reduced_cost.begin());
       }
       if (nc > 0) {
         thrust::scatter(thrust::host,
@@ -408,6 +432,11 @@ struct multi_gpu_engine_t {
                                   total_cstrs * sizeof(f_t),
                                   cudaMemcpyHostToDevice,
                                   stream.view().value()));
+    RAFT_CUDA_TRY(cudaMemcpyAsync(master_reduced_cost.data(),
+                                  h_reduced_cost.data(),
+                                  total_vars * sizeof(f_t),
+                                  cudaMemcpyHostToDevice,
+                                  stream.view().value()));
     RAFT_CUDA_TRY(cudaStreamSynchronize(stream.view().value()));
   }
 
diff --git a/cpp/src/pdlp/pdlp.cu b/cpp/src/pdlp/pdlp.cu
index e9cf194d98..7bd6d34473 100644
--- a/cpp/src/pdlp/pdlp.cu
+++ b/cpp/src/pdlp/pdlp.cu
@@ -662,7 +662,9 @@ std::optional<optimization_problem_solution_t<i_t, f_t>> pdlp_solver_t<i_t, f_t>
     std::cout << "Time Limit reached, returning current solution" << std::endl;
 #endif
     if (auto* engine = pdhg_solver_.get_mgpu_engine()) {
-      engine->gather_potential_next_solutions_to_master(pdhg_solver_);
+      engine->gather_potential_next_solutions_to_master(
+        pdhg_solver_,
+        current_termination_strategy_.get_convergence_information().get_reduced_cost());
     }
     return current_termination_strategy_.fill_return_problem_solution(
       internal_solver_iterations_,
@@ -698,7 +700,9 @@ std::optional<optimization_problem_solution_t<i_t, f_t>> pdlp_solver_t<i_t, f_t>
     }
 
     if (auto* engine = pdhg_solver_.get_mgpu_engine()) {
-      engine->gather_potential_next_solutions_to_master(pdhg_solver_);
+      engine->gather_potential_next_solutions_to_master(
+        pdhg_solver_,
+        current_termination_strategy_.get_convergence_information().get_reduced_cost());
     }
     return current_termination_strategy_.fill_return_problem_solution(
       internal_solver_iterations_,
@@ -1378,7 +1382,9 @@ std::optional<optimization_problem_solution_t<i_t, f_t>> pdlp_solver_t<i_t, f_t>
     print_final_termination_criteria(
       timer, current_termination_strategy_.get_convergence_information(), termination_current);
     if (auto* engine = pdhg_solver_.get_mgpu_engine()) {
-      engine->gather_potential_next_solutions_to_master(pdhg_solver_);
+      engine->gather_potential_next_solutions_to_master(
+        pdhg_solver_,
+        current_termination_strategy_.get_convergence_information().get_reduced_cost());
     }
     return current_termination_strategy_.fill_return_problem_solution(
       internal_solver_iterations_,

From 6538382755ed20571b649e0366afbebd95493053 Mon Sep 17 00:00:00 2001
From: Bulle Mostovoi <vmostovoi@nvidia.com>
Date: Wed, 27 May 2026 13:41:27 +0200
Subject: [PATCH 35/67] updated mgpu scale/unscale logic

---
 cpp/src/pdlp/pdlp.cu | 40 ++++++++++++++++++++++------------------
 1 file changed, 22 insertions(+), 18 deletions(-)

diff --git a/cpp/src/pdlp/pdlp.cu b/cpp/src/pdlp/pdlp.cu
index 7bd6d34473..c31c528d8d 100644
--- a/cpp/src/pdlp/pdlp.cu
+++ b/cpp/src/pdlp/pdlp.cu
@@ -2962,22 +2962,24 @@ optimization_problem_solution_t<i_t, f_t> pdlp_solver_t<i_t, f_t>::run_solver(co
                                                     unscaled_dual_avg_solution_);
       }
       if (settings_.hyper_params.use_adaptive_step_size_strategy) {
+        initial_scaling_strategy_.unscale_solutions(pdhg_solver_.get_primal_solution(),
+                                                    pdhg_solver_.get_dual_solution());
+      } else {
         if (multi_gpu_engine) {
-          // The only branch in cuPDLPx
+          // The only branch in cuPDLPx (Stable3)
           multi_gpu_engine->for_each_shard([&](auto& shard) {
             auto& sub = *shard.sub_pdlp;
             sub.get_initial_scaling_strategy().unscale_solutions(
-              sub.pdhg_solver_.get_primal_solution(), sub.pdhg_solver_.get_dual_solution());
+              sub.pdhg_solver_.get_potential_next_primal_solution(),
+              sub.pdhg_solver_.get_potential_next_dual_solution(),
+              sub.pdhg_solver_.get_dual_slack());
           });
         } else {
-          initial_scaling_strategy_.unscale_solutions(pdhg_solver_.get_primal_solution(),
-                                                      pdhg_solver_.get_dual_solution());
+          initial_scaling_strategy_.unscale_solutions(
+            pdhg_solver_.get_potential_next_primal_solution(),
+            pdhg_solver_.get_potential_next_dual_solution(),
+            pdhg_solver_.get_dual_slack());
         }
-      } else {
-        initial_scaling_strategy_.unscale_solutions(
-          pdhg_solver_.get_potential_next_primal_solution(),
-          pdhg_solver_.get_potential_next_dual_solution(),
-          pdhg_solver_.get_dual_slack());
       }
 
 #ifdef CUPDLP_DEBUG_MODE
@@ -3006,22 +3008,24 @@ optimization_problem_solution_t<i_t, f_t> pdlp_solver_t<i_t, f_t>::run_solver(co
                                                     unscaled_dual_avg_solution_);
         }
         if (settings_.hyper_params.use_adaptive_step_size_strategy) {
+          initial_scaling_strategy_.scale_solutions(pdhg_solver_.get_primal_solution(),
+                                                    pdhg_solver_.get_dual_solution());
+        } else {
           if (multi_gpu_engine) {
-            // The only branch in cuPDLPx
+            // The only branch in cuPDLPx (Stable3)
             multi_gpu_engine->for_each_shard([&](auto& shard) {
               auto& sub = *shard.sub_pdlp;
               sub.get_initial_scaling_strategy().scale_solutions(
-                sub.pdhg_solver_.get_primal_solution(), sub.pdhg_solver_.get_dual_solution());
+                sub.pdhg_solver_.get_potential_next_primal_solution(),
+                sub.pdhg_solver_.get_potential_next_dual_solution(),
+                sub.pdhg_solver_.get_dual_slack());
             });
           } else {
-            initial_scaling_strategy_.scale_solutions(pdhg_solver_.get_primal_solution(),
-                                                      pdhg_solver_.get_dual_solution());
+            initial_scaling_strategy_.scale_solutions(
+              pdhg_solver_.get_potential_next_primal_solution(),
+              pdhg_solver_.get_potential_next_dual_solution(),
+              pdhg_solver_.get_dual_slack());
           }
-        } else {
-          initial_scaling_strategy_.scale_solutions(
-            pdhg_solver_.get_potential_next_primal_solution(),
-            pdhg_solver_.get_potential_next_dual_solution(),
-            pdhg_solver_.get_dual_slack());
         }
       }
 

From a88285a9f8d4ed8ec568307d0fac1866ed404831 Mon Sep 17 00:00:00 2001
From: Bulle Mostovoi <vmostovoi@nvidia.com>
Date: Wed, 27 May 2026 14:09:56 +0200
Subject: [PATCH 36/67] wired mgpu restart

---
 cpp/src/pdlp/pdlp.cuh                         |   1 +
 .../restart_strategy/pdlp_restart_strategy.cu | 147 ++++++++++++++----
 2 files changed, 118 insertions(+), 30 deletions(-)

diff --git a/cpp/src/pdlp/pdlp.cuh b/cpp/src/pdlp/pdlp.cuh
index 63aef7b43a..17fb05080f 100644
--- a/cpp/src/pdlp/pdlp.cuh
+++ b/cpp/src/pdlp/pdlp.cuh
@@ -110,6 +110,7 @@ class pdlp_solver_t {
   {
     return initial_scaling_strategy_;
   }
+  detail::pdlp_restart_strategy_t<i_t, f_t>& get_restart_strategy() { return restart_strategy_; }
 
   // Per-shard primal/dual step sizes are private state on pdlp_solver_t but
   // are needed inside the multi-GPU dispatch paths that fan out a master cub
diff --git a/cpp/src/pdlp/restart_strategy/pdlp_restart_strategy.cu b/cpp/src/pdlp/restart_strategy/pdlp_restart_strategy.cu
index 17c7abcac5..00c5b16c8b 100644
--- a/cpp/src/pdlp/restart_strategy/pdlp_restart_strategy.cu
+++ b/cpp/src/pdlp/restart_strategy/pdlp_restart_strategy.cu
@@ -8,6 +8,8 @@
 #include <cuopt/error.hpp>
 
 #include <cuopt/linear_programming/pdlp/pdlp_hyper_params.cuh>
+#include <pdlp/distributed_pdlp/multi_gpu_engine.hpp>
+#include <pdlp/pdlp.cuh>
 #include <pdlp/pdlp_constants.hpp>
 #include <pdlp/restart_strategy/pdlp_restart_strategy.cuh>
 #include <pdlp/swap_and_resize_helper.cuh>
@@ -892,20 +894,64 @@ void pdlp_restart_strategy_t<i_t, f_t>::cupdlpx_restart(
     "If any, all should be true");
 
   // Computing the deltas
-  distance_squared_moved_from_last_restart_period(
-    pdhg_solver.get_potential_next_primal_solution(),
-    last_restart_duality_gap_.primal_solution_,
-    pdhg_solver.get_primal_tmp_resource(),
-    primal_size_h_,
-    1,
-    last_restart_duality_gap_.primal_distance_traveled_);
-  distance_squared_moved_from_last_restart_period(
-    pdhg_solver.get_potential_next_dual_solution(),
-    last_restart_duality_gap_.dual_solution_,
-    pdhg_solver.get_dual_tmp_resource(),
-    dual_size_h_,
-    1,
-    last_restart_duality_gap_.dual_distance_traveled_);
+  if (auto* engine = pdhg_solver.get_mgpu_engine()) {
+    engine->for_each_shard([&](auto& shard) {
+      auto& sub      = *shard.sub_pdlp;
+      auto& sub_rest = sub.get_restart_strategy();
+      sub_rest.distance_squared_moved_from_last_restart_period(
+        sub.pdhg_solver_.get_potential_next_primal_solution(),
+        sub_rest.last_restart_duality_gap_.primal_solution_,
+        sub.pdhg_solver_.get_primal_tmp_resource(),
+        shard.rank_data.owned_var_size,
+        1,
+        sub_rest.last_restart_duality_gap_.primal_distance_traveled_);
+      sub_rest.distance_squared_moved_from_last_restart_period(
+        sub.pdhg_solver_.get_potential_next_dual_solution(),
+        sub_rest.last_restart_duality_gap_.dual_solution_,
+        sub.pdhg_solver_.get_dual_tmp_resource(),
+        shard.rank_data.owned_cstr_size,
+        1,
+        sub_rest.last_restart_duality_gap_.dual_distance_traveled_);
+    });
+
+    engine->allreduce_sum_inplace([](pdlp_solver_t<i_t, f_t>& sp) -> f_t* {
+      return sp.get_restart_strategy().last_restart_duality_gap_.primal_distance_traveled_.data();
+    });
+    engine->allreduce_sum_inplace([](pdlp_solver_t<i_t, f_t>& sp) -> f_t* {
+      return sp.get_restart_strategy().last_restart_duality_gap_.dual_distance_traveled_.data();
+    });
+
+    auto& s0 = *engine->shards[0];
+    {
+      raft::device_setter guard(s0.device_id);
+      RAFT_CUDA_TRY(cudaStreamSynchronize(s0.stream.view().value()));
+    }
+    raft::copy(last_restart_duality_gap_.primal_distance_traveled_.data(),
+               s0.sub_pdlp->get_restart_strategy()
+                 .last_restart_duality_gap_.primal_distance_traveled_.data(),
+               1,
+               stream_view_);
+    raft::copy(last_restart_duality_gap_.dual_distance_traveled_.data(),
+               s0.sub_pdlp->get_restart_strategy()
+                 .last_restart_duality_gap_.dual_distance_traveled_.data(),
+               1,
+               stream_view_);
+  } else {
+    distance_squared_moved_from_last_restart_period(
+      pdhg_solver.get_potential_next_primal_solution(),
+      last_restart_duality_gap_.primal_solution_,
+      pdhg_solver.get_primal_tmp_resource(),
+      primal_size_h_,
+      1,
+      last_restart_duality_gap_.primal_distance_traveled_);
+    distance_squared_moved_from_last_restart_period(
+      pdhg_solver.get_potential_next_dual_solution(),
+      last_restart_duality_gap_.dual_solution_,
+      pdhg_solver.get_dual_tmp_resource(),
+      dual_size_h_,
+      1,
+      last_restart_duality_gap_.dual_distance_traveled_);
+  }
 
   auto view = make_cupdlpx_restart_view(last_restart_duality_gap_.primal_distance_traveled_,
                                         last_restart_duality_gap_.dual_distance_traveled_,
@@ -958,24 +1004,58 @@ void pdlp_restart_strategy_t<i_t, f_t>::cupdlpx_restart(
     best_primal_weight.set_element_async(0, best_primal_weight_value, stream_view_);
   }
 
+  // Broadcast the primal and dual step sizes to all shards
+  if (auto* engine = pdhg_solver.get_mgpu_engine()) {
+    RAFT_CUDA_TRY(cudaStreamSynchronize(stream_view_));
+    engine->for_each_shard([&](auto& shard) {
+      auto& sub = *shard.sub_pdlp;
+      raft::copy(sub.get_primal_step_size().data(),
+                 primal_step_size.data(), 1, shard.stream.view());
+      raft::copy(sub.get_dual_step_size().data(),
+                 dual_step_size.data(), 1, shard.stream.view());
+    });
+  }
   // TODO later batch mode: remove if you have per climber restart
 
-  raft::copy(last_restart_duality_gap_.primal_solution_.data(),
-             pdhg_solver.get_potential_next_primal_solution().data(),
-             last_restart_duality_gap_.primal_solution_.size(),
-             stream_view_);
-  raft::copy(pdhg_solver.get_primal_solution().data(),
-             pdhg_solver.get_potential_next_primal_solution().data(),
-             last_restart_duality_gap_.primal_solution_.size(),
-             stream_view_);
-  raft::copy(last_restart_duality_gap_.dual_solution_.data(),
-             pdhg_solver.get_potential_next_dual_solution().data(),
-             last_restart_duality_gap_.dual_solution_.size(),
-             stream_view_);
-  raft::copy(pdhg_solver.get_dual_solution().data(),
-             pdhg_solver.get_potential_next_dual_solution().data(),
-             last_restart_duality_gap_.dual_solution_.size(),
-             stream_view_);
+  if (auto* engine = pdhg_solver.get_mgpu_engine()) {
+    engine->for_each_shard([&](auto& shard) {
+      auto& sub      = *shard.sub_pdlp;
+      auto& sub_rest = sub.get_restart_strategy();
+      raft::copy(sub_rest.last_restart_duality_gap_.primal_solution_.data(),
+                 sub.pdhg_solver_.get_potential_next_primal_solution().data(),
+                 sub_rest.last_restart_duality_gap_.primal_solution_.size(),
+                 shard.stream.view());
+      raft::copy(sub.pdhg_solver_.get_primal_solution().data(),
+                 sub.pdhg_solver_.get_potential_next_primal_solution().data(),
+                 sub.pdhg_solver_.get_primal_solution().size(),
+                 shard.stream.view());
+      raft::copy(sub_rest.last_restart_duality_gap_.dual_solution_.data(),
+                 sub.pdhg_solver_.get_potential_next_dual_solution().data(),
+                 sub_rest.last_restart_duality_gap_.dual_solution_.size(),
+                 shard.stream.view());
+      raft::copy(sub.pdhg_solver_.get_dual_solution().data(),
+                 sub.pdhg_solver_.get_potential_next_dual_solution().data(),
+                 sub.pdhg_solver_.get_dual_solution().size(),
+                 shard.stream.view());
+    });
+  } else {
+    raft::copy(last_restart_duality_gap_.primal_solution_.data(),
+               pdhg_solver.get_potential_next_primal_solution().data(),
+               last_restart_duality_gap_.primal_solution_.size(),
+               stream_view_);
+    raft::copy(pdhg_solver.get_primal_solution().data(),
+               pdhg_solver.get_potential_next_primal_solution().data(),
+               last_restart_duality_gap_.primal_solution_.size(),
+               stream_view_);
+    raft::copy(last_restart_duality_gap_.dual_solution_.data(),
+               pdhg_solver.get_potential_next_dual_solution().data(),
+               last_restart_duality_gap_.dual_solution_.size(),
+               stream_view_);
+    raft::copy(pdhg_solver.get_dual_solution().data(),
+               pdhg_solver.get_potential_next_dual_solution().data(),
+               last_restart_duality_gap_.dual_solution_.size(),
+               stream_view_);
+  }
 
 #ifdef CUPDLP_DEBUG_MODE
   print("New last_restart_duality_gap_.primal_solution_",
@@ -990,6 +1070,13 @@ void pdlp_restart_strategy_t<i_t, f_t>::cupdlpx_restart(
     weighted_average_solution_.iterations_since_last_restart_ = 0;
     last_trial_fixed_point_error_[i] = std::numeric_limits<f_t>::infinity();
   }
+
+  if (auto* engine = pdhg_solver.get_mgpu_engine()) {
+    engine->for_each_shard([&](auto& shard) {
+      shard.sub_pdlp->get_restart_strategy().weighted_average_solution_.iterations_since_last_restart_ =
+        0;
+    });
+  }
 }
 
 template <typename i_t, typename f_t>

From b34c5f6b286add3911c85fe8f747f2b8f6ccc9c2 Mon Sep 17 00:00:00 2001
From: Bulle Mostovoi <vmostovoi@nvidia.com>
Date: Wed, 27 May 2026 14:38:56 +0200
Subject: [PATCH 37/67] allow single-GPU dummy run of distributed PDLP (seems to work locally)

---
 .../cuopt/linear_programming/constants.h      |  1 +
 cpp/src/math_optimization/solver_settings.cu  |  1 +
 cpp/src/pdlp/pdlp.cu                          | 24 +++++++++++++------
 cpp/src/pdlp/solve.cu                         |  4 +++-
 4 files changed, 22 insertions(+), 8 deletions(-)

diff --git a/cpp/include/cuopt/linear_programming/constants.h b/cpp/include/cuopt/linear_programming/constants.h
index 39685251b6..26ef3653e0 100644
--- a/cpp/include/cuopt/linear_programming/constants.h
+++ b/cpp/include/cuopt/linear_programming/constants.h
@@ -84,6 +84,7 @@
 #define CUOPT_NUM_CPU_THREADS          "num_cpu_threads"
 #define CUOPT_NUM_GPUS                 "num_gpus"
 #define CUOPT_MULTI_GPU_PARTITION_FILE "multi_gpu_partition_file"
+#define CUOPT_USE_DISTRIBUTED_PDLP     "use_distributed_pdlp"
 #define CUOPT_USER_PROBLEM_FILE        "user_problem_file"
 #define CUOPT_PRESOLVE_FILE            "presolve_file"
 #define CUOPT_RANDOM_SEED              "random_seed"
diff --git a/cpp/src/math_optimization/solver_settings.cu b/cpp/src/math_optimization/solver_settings.cu
index 6d7e7504e4..991b0d62c1 100644
--- a/cpp/src/math_optimization/solver_settings.cu
+++ b/cpp/src/math_optimization/solver_settings.cu
@@ -177,6 +177,7 @@ solver_settings_t<i_t, f_t>::solver_settings_t() : pdlp_settings(), mip_settings
     {CUOPT_DUAL_POSTSOLVE, &pdlp_settings.dual_postsolve, true},
     {CUOPT_BARRIER_ITERATIVE_REFINEMENT, &pdlp_settings.barrier_iterative_refinement, true},
     {CUOPT_MIP_PROBING, &mip_settings.probing, true},
+    {CUOPT_USE_DISTRIBUTED_PDLP, &pdlp_settings.hyper_params.use_distributed_pdlp, false},
   };
   // String parameters
   string_parameters = {
diff --git a/cpp/src/pdlp/pdlp.cu b/cpp/src/pdlp/pdlp.cu
index c31c528d8d..1e76fa4251 100644
--- a/cpp/src/pdlp/pdlp.cu
+++ b/cpp/src/pdlp/pdlp.cu
@@ -382,16 +382,14 @@ pdlp_solver_t<i_t, f_t>::pdlp_solver_t(problem_t<i_t, f_t>& op_problem,
   //    (problem_ptr, op_problem_scaled_, pdhg_solver_, strategies, etc.).
   : pdlp_solver_t(op_problem, settings, false)
 {
-  cuopt_expects(num_gpus == settings.num_gpus && settings.num_gpus > 1,
+  if (num_gpus == 1) {
+    std::cout << "CAREFUL: num_gpus == 1, running dummy version" << std::endl;
+  }
+  cuopt_expects(num_gpus == settings.num_gpus /*&& settings.num_gpus > 1*/,
                 error_type_t::ValidationError,
                 "This constructor should only be used for distributed PDLP (num_gpus > 1)");
 
-  // Distributed PDLP is currently double-only. The body is guarded with
-  // `if constexpr` so the float instantiation never references the
-  // multi_gpu_engine_t<i_t, float> / partition_loader_t<i_t, float> symbols
-  // (those are intentionally not instantiated in their .cu files), keeping
-  // the link clean. Trying to use distributed PDLP with f_t = float will
-  // throw at runtime instead.
+  // Distributed PDLP is currently double-only
   if constexpr (!std::is_same_v<f_t, double>) {
     cuopt_expects(false,
                   error_type_t::ValidationError,
@@ -403,6 +401,18 @@ pdlp_solver_t<i_t, f_t>::pdlp_solver_t(problem_t<i_t, f_t>& op_problem,
     if (!settings.multi_gpu_partition_file.empty()) {
       parts = partition_loader_t<i_t, f_t>::parse_distributed_pdlp_partition_file(
         settings.multi_gpu_partition_file);
+    } else if (num_gpus == 1) {
+      // Single-part dummy run: useful for exercising the mGPU code paths on a
+      // single physical GPU without a real Metis partition file. The downstream
+      // create_rank_data_from_parts expects a flat vector of length
+      // (n_constraints + n_variables) where each entry is the owning part-id
+      // (cstrs first, then vars). With nb_parts == 1, every entry is 0.
+      std::cout << "CAREFUL: num_gpus == 1, running dummy version (single part covering "
+                << op_problem_scaled_.n_constraints << " cstrs + "
+                << op_problem_scaled_.n_variables << " vars)" << std::endl;
+      parts = std::vector<i_t>(
+        static_cast<std::size_t>(op_problem_scaled_.n_constraints + op_problem_scaled_.n_variables),
+        0);
     } else {
       cuopt_expects(false,
                     error_type_t::RuntimeError,
diff --git a/cpp/src/pdlp/solve.cu b/cpp/src/pdlp/solve.cu
index 479340810c..e401ab35b6 100644
--- a/cpp/src/pdlp/solve.cu
+++ b/cpp/src/pdlp/solve.cu
@@ -771,9 +771,11 @@ static optimization_problem_solution_t<i_t, f_t> run_pdlp_solver(
   }
 #endif
   if (settings.hyper_params.use_distributed_pdlp) {
+    /*
     cuopt_expects(settings.num_gpus > 1,
                   error_type_t::ValidationError,
-                  "use_distributed_pdlp requires settings.num_gpus > 1");
+                  "use_distributed_pdlp requires settings.num_gpus > 1"); */
+    if (settings.num_gpus == 1) {std::cout << "CAREFUL: use_distributed_pdlp requires settings.num_gpus > 1" << std::endl;}
     cuopt_expects(!is_batch_mode,
                   error_type_t::ValidationError,
                   "Distributed PDLP does not support batch mode");

From b784a441395c092f13a94b276b8caad03a7cac7e Mon Sep 17 00:00:00 2001
From: Bulle Mostovoi <vmostovoi@nvidia.com>
Date: Wed, 27 May 2026 06:12:59 -0700
Subject: [PATCH 38/67] added dummy partitioner

---
 cpp/src/pdlp/CMakeLists.txt                   |  1 +
 .../distributed_pdlp/metis_partitioner.hpp    | 24 +++++
 cpp/src/pdlp/distributed_pdlp/partitioner.cu  | 87 +++++++++++++++++++
 cpp/src/pdlp/distributed_pdlp/partitioner.hpp | 63 ++++++++++++++
 cpp/src/pdlp/pdlp.cu                          | 39 +++++----
 5 files changed, 197 insertions(+), 17 deletions(-)
 create mode 100644 cpp/src/pdlp/distributed_pdlp/metis_partitioner.hpp
 create mode 100644 cpp/src/pdlp/distributed_pdlp/partitioner.cu
 create mode 100644 cpp/src/pdlp/distributed_pdlp/partitioner.hpp

diff --git a/cpp/src/pdlp/CMakeLists.txt b/cpp/src/pdlp/CMakeLists.txt
index 2bc2771c91..a6ef14e3ff 100644
--- a/cpp/src/pdlp/CMakeLists.txt
+++ b/cpp/src/pdlp/CMakeLists.txt
@@ -32,6 +32,7 @@ set(LP_CORE_FILES
   ${CMAKE_CURRENT_SOURCE_DIR}/distributed_pdlp/shard.cu
   ${CMAKE_CURRENT_SOURCE_DIR}/distributed_pdlp/multi_gpu_engine.cu
   ${CMAKE_CURRENT_SOURCE_DIR}/distributed_pdlp/partition_loader.cu
+  ${CMAKE_CURRENT_SOURCE_DIR}/distributed_pdlp/partitioner.cu
 )
 
 # C and Python adapter files
diff --git a/cpp/src/pdlp/distributed_pdlp/metis_partitioner.hpp b/cpp/src/pdlp/distributed_pdlp/metis_partitioner.hpp
new file mode 100644
index 0000000000..c4e37f57a9
--- /dev/null
+++ b/cpp/src/pdlp/distributed_pdlp/metis_partitioner.hpp
@@ -0,0 +1,24 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#pragma once
+
+#include <pdlp/distributed_pdlp/partitioner.hpp>
+
+namespace cuopt::linear_programming::detail {
+
+// METIS k-way partitioner on the constraint/variable bipartite graph induced by A.
+// Requires partitioner_input_t::A and A_t (or A row_offsets/col_indices only — the
+// implementation builds the bipartite adjacency the same way as metis_tests:
+// cstr nodes [0, nb_cstr), var nodes [nb_cstr, nb_cstr+nb_vars), edges from A and A_t).
+//
+// Wire into make_partitioner() once METIS is an optional cuOpt dependency.
+template <typename i_t, typename f_t>
+class metis_partitioner_t : public partitioner_i<i_t, f_t> {
+ public:
+  std::vector<i_t> partition(partitioner_input_t<i_t, f_t> const& input) const override;
+};
+
+}  // namespace cuopt::linear_programming::detail
diff --git a/cpp/src/pdlp/distributed_pdlp/partitioner.cu b/cpp/src/pdlp/distributed_pdlp/partitioner.cu
new file mode 100644
index 0000000000..bdbfcacf06
--- /dev/null
+++ b/cpp/src/pdlp/distributed_pdlp/partitioner.cu
@@ -0,0 +1,87 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include <pdlp/distributed_pdlp/partitioner.hpp>
+
+#include <cuopt/error.hpp>
+
+#include <algorithm>
+#include <cstddef>
+
+namespace cuopt::linear_programming::detail {
+
+template <typename i_t, typename f_t>
+std::vector<i_t> dummy_partitioner_t<i_t, f_t>::partition(
+  partitioner_input_t<i_t, f_t> const& input) const
+{
+  cuopt_expects(input.nb_parts > 0,
+                error_type_t::ValidationError,
+                "dummy_partitioner: nb_parts must be positive");
+  cuopt_expects(input.nb_cstr >= 0 && input.nb_vars >= 0,
+                error_type_t::ValidationError,
+                "dummy_partitioner: invalid problem dimensions");
+
+  const std::size_t nvtx =
+    static_cast<std::size_t>(input.nb_cstr) + static_cast<std::size_t>(input.nb_vars);
+  std::vector<i_t> parts(nvtx);
+  for (std::size_t i = 0; i < nvtx; ++i) {
+    parts[i] = static_cast<i_t>(i % static_cast<std::size_t>(input.nb_parts));
+  }
+  validate_partition(parts,
+                     static_cast<int>(input.nb_cstr),
+                     static_cast<int>(input.nb_vars),
+                     static_cast<int>(input.nb_parts),
+                     "dummy_partitioner");
+  return parts;
+}
+
+void validate_partition(std::vector<int> const& parts,
+                        int nb_cstr,
+                        int nb_vars,
+                        int nb_parts,
+                        char const* context)
+{
+  const std::size_t expected =
+    static_cast<std::size_t>(nb_cstr) + static_cast<std::size_t>(nb_vars);
+  cuopt_expects(parts.size() == expected,
+                error_type_t::ValidationError,
+                "%s: expected %zu part entries (cstrs + vars), got %zu",
+                context,
+                expected,
+                parts.size());
+  cuopt_expects(nb_parts > 0,
+                error_type_t::ValidationError,
+                "%s: nb_parts must be positive",
+                context);
+  if (parts.empty()) { return; }
+  const auto [min_it, max_it] = std::minmax_element(parts.begin(), parts.end());
+  cuopt_expects(*min_it >= 0,
+                error_type_t::ValidationError,
+                "%s: partition ids must be non-negative (min=%d)",
+                context,
+                static_cast<int>(*min_it));
+  cuopt_expects(*max_it < nb_parts,
+                error_type_t::ValidationError,
+                "%s: partition ids must be in [0, %d) (max=%d)",
+                context,
+                static_cast<int>(nb_parts),
+                static_cast<int>(*max_it));
+}
+
+template <typename i_t, typename f_t>
+std::unique_ptr<partitioner_i<i_t, f_t>> make_partitioner(partitioner_kind_t kind)
+{
+  switch (kind) {
+    case partitioner_kind_t::Dummy:
+      return std::make_unique<dummy_partitioner_t<i_t, f_t>>();
+  }
+  cuopt_expects(false, error_type_t::RuntimeError, "make_partitioner: unsupported partitioner kind");
+  return nullptr;
+}
+
+template class dummy_partitioner_t<int, double>;
+template std::unique_ptr<partitioner_i<int, double>> make_partitioner<int, double>(partitioner_kind_t);
+
+}  // namespace cuopt::linear_programming::detail
diff --git a/cpp/src/pdlp/distributed_pdlp/partitioner.hpp b/cpp/src/pdlp/distributed_pdlp/partitioner.hpp
new file mode 100644
index 0000000000..ee5798fd0b
--- /dev/null
+++ b/cpp/src/pdlp/distributed_pdlp/partitioner.hpp
@@ -0,0 +1,63 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <vector>
+
+namespace cuopt::linear_programming::detail {
+
+// Non-owning view of a host CSR matrix (A or A_t).
+template <typename i_t, typename f_t>
+struct csr_host_view_t {
+  std::vector<i_t> const* row_offsets{nullptr};
+  std::vector<i_t> const* col_indices{nullptr};
+  std::vector<f_t> const* values{nullptr};  // optional; unused by topology-only partitioners
+  i_t num_rows{0};
+  i_t num_cols{0};
+};
+
+// Inputs shared by all distributed-PDLP partitioners.
+// Returns a flat vector of length (nb_cstr + nb_vars): constraint part-ids first,
+// then variable part-ids, each in [0, nb_parts).
+template <typename i_t, typename f_t>
+struct partitioner_input_t {
+  i_t nb_cstr{0};
+  i_t nb_vars{0};
+  i_t nb_parts{0};
+  // Constraint matrix A (rows = constraints, cols = variables).
+  csr_host_view_t<i_t, f_t> A{};
+  // Transpose A_t (rows = variables, cols = constraints). Optional for partitioners
+  // that build a bipartite graph (e.g. METIS); dummy partitioner ignores both matrices.
+  csr_host_view_t<i_t, f_t> A_t{};
+};
+
+enum class partitioner_kind_t { Dummy /*, Metis */ };
+
+template <typename i_t, typename f_t>
+class partitioner_i {
+ public:
+  virtual ~partitioner_i() = default;
+  virtual std::vector<i_t> partition(partitioner_input_t<i_t, f_t> const& input) const = 0;
+};
+
+template <typename i_t, typename f_t>
+class dummy_partitioner_t : public partitioner_i<i_t, f_t> {
+ public:
+  std::vector<i_t> partition(partitioner_input_t<i_t, f_t> const& input) const override;
+};
+
+void validate_partition(std::vector<int> const& parts,
+                        int nb_cstr,
+                        int nb_vars,
+                        int nb_parts,
+                        char const* context = "partition");
+
+template <typename i_t, typename f_t>
+std::unique_ptr<partitioner_i<i_t, f_t>> make_partitioner(partitioner_kind_t kind);
+
+}  // namespace cuopt::linear_programming::detail
diff --git a/cpp/src/pdlp/pdlp.cu b/cpp/src/pdlp/pdlp.cu
index 1e76fa4251..203547367b 100644
--- a/cpp/src/pdlp/pdlp.cu
+++ b/cpp/src/pdlp/pdlp.cu
@@ -12,6 +12,7 @@
 
 #include <pdlp/cusparse_view.hpp>
 #include <pdlp/distributed_pdlp/partition_loader.hpp>
+#include <pdlp/distributed_pdlp/partitioner.hpp>
 #include <pdlp/pdlp.cuh>
 #include <pdlp/swap_and_resize_helper.cuh>
 #include <pdlp/utils.cuh>
@@ -396,28 +397,32 @@ pdlp_solver_t<i_t, f_t>::pdlp_solver_t(problem_t<i_t, f_t>& op_problem,
                   "Distributed PDLP (num_gpus > 1) currently requires double precision");
     return;
   } else {
-    // 2. Load partition
+    // 2. Load or compute partition
     std::vector<i_t> parts;
     if (!settings.multi_gpu_partition_file.empty()) {
       parts = partition_loader_t<i_t, f_t>::parse_distributed_pdlp_partition_file(
         settings.multi_gpu_partition_file);
-    } else if (num_gpus == 1) {
-      // Single-part dummy run: useful for exercising the mGPU code paths on a
-      // single physical GPU without a real Metis partition file. The downstream
-      // create_rank_data_from_parts expects a flat vector of length
-      // (n_constraints + n_variables) where each entry is the owning part-id
-      // (cstrs first, then vars). With nb_parts == 1, every entry is 0.
-      std::cout << "CAREFUL: num_gpus == 1, running dummy version (single part covering "
-                << op_problem_scaled_.n_constraints << " cstrs + "
-                << op_problem_scaled_.n_variables << " vars)" << std::endl;
-      parts = std::vector<i_t>(
-        static_cast<std::size_t>(op_problem_scaled_.n_constraints + op_problem_scaled_.n_variables),
-        0);
+      validate_partition(parts,
+                         op_problem_scaled_.n_constraints,
+                         op_problem_scaled_.n_variables,
+                         num_gpus,
+                         "partition file");
     } else {
-      cuopt_expects(false,
-                    error_type_t::RuntimeError,
-                    "Metis partitioning inside cuopt not implemented yet; "
-                    "provide a --parts file via settings.multi_gpu_partition_file");
+      if (num_gpus == 1) {
+        // Single-part dummy run: useful for exercising the mGPU code paths on a
+        // single physical GPU without a real partition file.
+        std::cout << "CAREFUL: num_gpus == 1, running dummy version (single part covering "
+                  << op_problem_scaled_.n_constraints << " cstrs + "
+                  << op_problem_scaled_.n_variables << " vars)" << std::endl;
+      }
+      partitioner_input_t<i_t, f_t> partition_input;
+      partition_input.nb_cstr  = op_problem_scaled_.n_constraints;
+      partition_input.nb_vars  = op_problem_scaled_.n_variables;
+      partition_input.nb_parts = num_gpus;
+      // Dummy partitioner ignores A / A_t for now; future METIS partitioners will
+      // fill these CSR views before calling partition().
+      auto partitioner = make_partitioner<i_t, f_t>(partitioner_kind_t::Dummy);
+      parts            = partitioner->partition(partition_input);
     }
 
     // always compute initial step size before scaling and primal_weight after scaling to do like

From ca7d7a91b33b72c60c885a7314619097d82e19ad Mon Sep 17 00:00:00 2001
From: Bulle Mostovoi <vmostovoi@nvidia.com>
Date: Wed, 27 May 2026 15:57:37 +0200
Subject: [PATCH 39/67] added stream forking for cuda graph

---
 .../pdlp/distributed_pdlp/multi_gpu_engine.cu | 13 +++++
 .../distributed_pdlp/multi_gpu_engine.hpp     | 55 +++++++++++++++++++
 cpp/src/pdlp/pdhg.cu                          | 17 ++++++
 3 files changed, 85 insertions(+)

diff --git a/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.cu b/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.cu
index a0b3f5dcc3..796153fd79 100644
--- a/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.cu
+++ b/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.cu
@@ -81,6 +81,19 @@ multi_gpu_engine_t<i_t, f_t>::multi_gpu_engine_t(
                                                                  objective_scaling_factor,
                                                                  sub_solver_settings));
   }
+
+  // 4. Allocate fork/join events for cross-stream graph capture splicing.
+  //    fork_event_ on the master device (whatever device is current when the
+  //    engine is constructed -- pdlp_solver_t's mGPU ctor runs on master).
+  //    join_events_[r] on shard r's device. event_handler_t uses the default
+  //    cudaEventCreate (no flags), matching the rest of the codebase.
+  //    Cleanup is automatic via event_handler_t's RAII destructor.
+  fork_event_ = std::make_unique<cuopt::event_handler_t>();
+  join_events_.reserve(nb_parts);
+  for (int r = 0; r < nb_parts; ++r) {
+    raft::device_setter guard(devices[r]);
+    join_events_.emplace_back(std::make_unique<cuopt::event_handler_t>());
+  }
 }
 
 template struct multi_gpu_engine_t<int, double>;
diff --git a/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.hpp b/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.hpp
index d156e889af..ade0da1c66 100644
--- a/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.hpp
+++ b/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.hpp
@@ -8,6 +8,7 @@
 #include <pdlp/distributed_pdlp/shard.hpp>
 #include <pdlp/pdhg.hpp>
 #include <utilities/cuda_helpers.cuh>
+#include <utilities/event_handler.cuh>
 
 #include <cuopt/linear_programming/pdlp/solver_settings.hpp>
 
@@ -446,6 +447,60 @@ struct multi_gpu_engine_t {
   // Shards stored by unique_ptr because pdlp_shard_t is immovable
   // (owns device-affine resources: handle, NCCL comm, RMM buffers).
   std::vector<std::unique_ptr<pdlp_shard_t<i_t, f_t>>> shards;
+
+  // ===== Fork/join events for CUDA graph capture spanning shard streams =====
+  //
+  // CUDA graph capture starts on the master pdhg stream (in pdhg_solver_t).
+  // The per-iteration work then dispatches kernels and NCCL collectives onto
+  // each shard's own stream. For these cross-stream operations to be
+  // recorded into the same captured graph (instead of escaping the capture
+  // and either invalidating it or being silently dropped), every shard
+  // stream must be "spliced" into the active capture via fork/join events.
+  //
+  //   master_stream ──record(fork_event_)──┐
+  //                                        ├─> shard_0.stream (waits) ──┐
+  //                                        ├─> shard_1.stream (waits) ──┤
+  //                                        └─> shard_{n-1}.stream     ──┘
+  //                                                                  (record join_events_[r])
+  //                                                                  master waits on each
+  //
+  // Pattern mirrors metis_tests/src/bench.cu. Events are reused across
+  // iterations (created once at engine construction) and cleaned up
+  // automatically by event_handler_t's RAII destructor.
+  //
+  // unique_ptr because event_handler_t is non-copyable and we need
+  // per-device construction (each join event must be created with its
+  // shard's device current).
+  std::unique_ptr<cuopt::event_handler_t> fork_event_;
+  std::vector<std::unique_ptr<cuopt::event_handler_t>> join_events_;
+
+  // fork_to_shards: record fork_event_ on `master_stream`, then make every
+  // shard stream wait on it. Inside a graph capture, this splices every
+  // shard stream into the same captured graph.
+  void fork_to_shards(rmm::cuda_stream_view master_stream)
+  {
+    fork_event_->record(master_stream);
+    for (auto& s : shards) {
+      raft::device_setter guard(s->device_id);
+      fork_event_->stream_wait(s->stream.view());
+    }
+  }
+
+  // join_from_shards: each shard records its join event on its own stream,
+  // then `master_stream` waits on every join event. Closes the captured
+  // sub-graph back into the master stream so cudaStreamEndCapture can
+  // produce a single graph spanning all streams.
+  void join_from_shards(rmm::cuda_stream_view master_stream)
+  {
+    const int nb = static_cast<int>(shards.size());
+    for (int r = 0; r < nb; ++r) {
+      raft::device_setter guard(shards[r]->device_id);
+      join_events_[r]->record(shards[r]->stream.view());
+    }
+    for (auto& e : join_events_) {
+      e->stream_wait(master_stream);
+    }
+  }
 };
 
 }  // namespace cuopt::linear_programming::detail
diff --git a/cpp/src/pdlp/pdhg.cu b/cpp/src/pdlp/pdhg.cu
index 969f5d0d30..df183dc7e6 100644
--- a/cpp/src/pdlp/pdhg.cu
+++ b/cpp/src/pdlp/pdhg.cu
@@ -1249,6 +1249,14 @@ void pdhg_solver_t<i_t, f_t>::compute_next_primal_dual_solution_reflected(
 
   if (should_major) {
     graph_all.run(should_major, [&]() {
+      // Multi-GPU: splice shard streams into the capture so their kernels and
+      // NCCL collectives are recorded into the same graph. Without this, work
+      // issued on shard.stream from inside this lambda would either invalidate
+      // the capture or run outside the graph, leaving the captured graph
+      // empty (or broken) -- which produces the cycling/stall behavior we
+      // observed on larger problems. Mirrors metis_tests bench.cu fork/join.
+      if (mgpu_engine_ != nullptr) { mgpu_engine_->fork_to_shards(stream_view_); }
+
       compute_At_y();
       if (mgpu_engine_ != nullptr) {
         for (auto& shard : mgpu_engine_->shards) {
@@ -1346,10 +1354,17 @@ void pdhg_solver_t<i_t, f_t>::compute_next_primal_dual_solution_reflected(
       print("potential_next_dual_solution_", potential_next_dual_solution_);
       print("reflected_dual_", reflected_dual_);
 #endif
+
+      // Multi-GPU: close the fork by joining every shard stream back into
+      // the master stream so cudaStreamEndCapture sees a single graph
+      // spanning all streams.
+      if (mgpu_engine_ != nullptr) { mgpu_engine_->join_from_shards(stream_view_); }
     });
 
   } else {
     graph_all.run(should_major, [&]() {
+      if (mgpu_engine_ != nullptr) { mgpu_engine_->fork_to_shards(stream_view_); }
+
       // Compute next primal
       compute_At_y();
 
@@ -1454,6 +1469,8 @@ void pdhg_solver_t<i_t, f_t>::compute_next_primal_dual_solution_reflected(
 #ifdef CUPDLP_DEBUG_MODE
       print("reflected_dual_", reflected_dual_);
 #endif
+
+      if (mgpu_engine_ != nullptr) { mgpu_engine_->join_from_shards(stream_view_); }
     });
   }
 }

From 0310d50a57dbb6b7f5752a8630f19cf663658795 Mon Sep 17 00:00:00 2001
From: Bulle Mostovoi <vmostovoi@nvidia.com>
Date: Wed, 27 May 2026 18:49:12 +0200
Subject: [PATCH 40/67] updated convergence information to use potential_next 
 rather than current in compute_primal/dual_residual, as the dual_iterate
 parameter

---
 .../convergence_information.cu                | 53 ++++++++++++-------
 1 file changed, 33 insertions(+), 20 deletions(-)

diff --git a/cpp/src/pdlp/termination_strategy/convergence_information.cu b/cpp/src/pdlp/termination_strategy/convergence_information.cu
index 28b33582ab..608590ffa0 100644
--- a/cpp/src/pdlp/termination_strategy/convergence_information.cu
+++ b/cpp/src/pdlp/termination_strategy/convergence_information.cu
@@ -429,22 +429,24 @@ void convergence_information_t<i_t, f_t>::compute_convergence_information(
                   error_type_t::ValidationError,
                   "per_constraint_residual is not yet supported in multi-GPU mode");
 
-    // Prepares halo values in primal_solution
+    // Prepares halo values in potential_next_primal_solution
+
     engine->halo_exchange_var(
       [](pdhg_solver_t<i_t, f_t>& pdhg) -> rmm::device_uvector<f_t>& {
-        return pdhg.get_primal_solution();
+        return pdhg.get_potential_next_primal_solution();
       });
 
-    // Compute the primal residual and objective on each shard
     for (auto& shard : engine->shards) {
       raft::device_setter guard(shard->device_id);
       auto& sub_pdlp = *shard->sub_pdlp;
       auto& sub_conv = sub_pdlp.get_current_termination_strategy().get_convergence_information();
-      sub_conv.compute_primal_residual(sub_conv.op_problem_cusparse_view_,
-                                       sub_pdlp.pdhg_solver_.get_dual_tmp_resource(),
-                                       sub_pdlp.pdhg_solver_.get_dual_solution());
-      sub_conv.compute_primal_objective_owned_partial(sub_pdlp.pdhg_solver_.get_primal_solution(),
-                                                      shard->rank_data.owned_var_size);
+      sub_conv.compute_primal_residual(
+        sub_conv.op_problem_cusparse_view_,
+        sub_pdlp.pdhg_solver_.get_dual_tmp_resource(),
+        sub_pdlp.pdhg_solver_.get_potential_next_dual_solution());
+      sub_conv.compute_primal_objective_owned_partial(
+        sub_pdlp.pdhg_solver_.get_potential_next_primal_solution(),
+        shard->rank_data.owned_var_size);
     }
 
     // Reduce all primal objectives across shards
@@ -546,12 +548,15 @@ void convergence_information_t<i_t, f_t>::compute_convergence_information(
   if (current_pdhg_solver.is_multi_gpu()) {
     auto* engine = current_pdhg_solver.get_mgpu_engine();
 
-    // 1) Halo-exchange the dual solution on every shard so the upcoming
-    //    A_T_shard @ dual SpMV inside compute_dual_residual reads correct
-    //    values in the cstr halo region.
+    // 1) Halo-exchange potential_next_dual_solution on every shard so the
+    //    A_T_shard @ y SpMV inside compute_dual_residual reads correct values
+    //    in the cstr halo region. The SpMV is driven through the eval view's
+    //    cv.dual_solution descriptor, which (cuPDLPx, see
+    //    cusparse_view.cu:931-937) is bound to _potential_next_dual -- not to
+    //    current.dual_solution. So we must halo-exchange the same buffer.
     engine->halo_exchange_cstr(
       [](pdhg_solver_t<i_t, f_t>& pdhg) -> rmm::device_uvector<f_t>& {
-        return pdhg.get_dual_solution();
+        return pdhg.get_potential_next_dual_solution();
       });
 
     // 2-3) Per-shard:
@@ -563,18 +568,26 @@ void convergence_information_t<i_t, f_t>::compute_convergence_information(
     //        shard.dual_objective_, with NO scaling/offset. Relies on
     //        primal_slack_ already populated by the per-shard
     //        compute_primal_residual above.
+    //
+    // Same primal_iterate fix as the primal block above: use the shard's
+    // (fresh, unscaled) potential_next_primal_solution, matching single-GPU
+    // cuPDLPx (pdlp.cu:1190-1203). The previous code's get_primal_solution()
+    // would mix scaled x with unscaled dual_slack in the dual_objective
+    // cublasdot.
     for (auto& shard : engine->shards) {
       raft::device_setter guard(shard->device_id);
       auto& sub_pdlp = *shard->sub_pdlp;
       auto& sub_conv = sub_pdlp.get_current_termination_strategy().get_convergence_information();
-      sub_conv.compute_dual_residual(sub_conv.op_problem_cusparse_view_,
-                                     sub_pdlp.pdhg_solver_.get_primal_tmp_resource(),
-                                     sub_pdlp.pdhg_solver_.get_primal_solution(),
-                                     sub_pdlp.pdhg_solver_.get_dual_slack());
-      sub_conv.compute_dual_objective_owned_partial(sub_pdlp.pdhg_solver_.get_primal_solution(),
-                                                    sub_pdlp.pdhg_solver_.get_dual_slack(),
-                                                    shard->rank_data.owned_var_size,
-                                                    shard->rank_data.owned_cstr_size);
+      sub_conv.compute_dual_residual(
+        sub_conv.op_problem_cusparse_view_,
+        sub_pdlp.pdhg_solver_.get_primal_tmp_resource(),
+        sub_pdlp.pdhg_solver_.get_potential_next_primal_solution(),
+        sub_pdlp.pdhg_solver_.get_dual_slack());
+      sub_conv.compute_dual_objective_owned_partial(
+        sub_pdlp.pdhg_solver_.get_potential_next_primal_solution(),
+        sub_pdlp.pdhg_solver_.get_dual_slack(),
+        shard->rank_data.owned_var_size,
+        shard->rank_data.owned_cstr_size);
     }
 
     // 4) Allreduce dual_objective_ across shards (sum, in place). Same

From f811bc8459f71f6690efce337daacf6db4892141 Mon Sep 17 00:00:00 2001
From: Bulle Mostovoi <vmostovoi@nvidia.com>
Date: Wed, 27 May 2026 19:49:54 +0200
Subject: [PATCH 41/67] disabled graph, can solve afiro hehe

---
 cpp/src/pdlp/distributed_pdlp/shard.cu | 8 +++++++-
 cpp/src/pdlp/pdlp.cu                   | 8 +++++++-
 2 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/cpp/src/pdlp/distributed_pdlp/shard.cu b/cpp/src/pdlp/distributed_pdlp/shard.cu
index 405e6fa05c..45f9f7a880 100644
--- a/cpp/src/pdlp/distributed_pdlp/shard.cu
+++ b/cpp/src/pdlp/distributed_pdlp/shard.cu
@@ -153,7 +153,13 @@ pdlp_shard_t<i_t, f_t>::pdlp_shard_t(int device_id,
   //         At this point sub_pdlp.op_problem_scaled_ is an unscaled copy
   //         of sub_problem and sub_pdlp.initial_scaling_strategy_ has
   //         unit cumulative factors (sub-settings disable Ruiz / PC iters).
-  sub_pdlp = std::make_unique<pdlp_solver_t<i_t, f_t>>(*sub_problem, settings, /*batch=*/false);
+  // NOTE: pass is_legacy_batch_mode=true to disable CUDA-graph capture inside
+  // sub_pdlp while debugging fake-mGPU divergence. The flag is a pure
+  // graph-capture toggle (ping_pong_graph_t / manual_cuda_graph_t) and does
+  // not change any algorithm semantics. Restore to false once the path is
+  // confirmed correct.
+  sub_pdlp = std::make_unique<pdlp_solver_t<i_t, f_t>>(
+    *sub_problem, settings, /*is_legacy_batch_mode=*/true);
 
   sub_pdlp->pdhg_solver_.set_is_multi_gpu(true);
 
diff --git a/cpp/src/pdlp/pdlp.cu b/cpp/src/pdlp/pdlp.cu
index 203547367b..ecc2e35c20 100644
--- a/cpp/src/pdlp/pdlp.cu
+++ b/cpp/src/pdlp/pdlp.cu
@@ -381,7 +381,13 @@ pdlp_solver_t<i_t, f_t>::pdlp_solver_t(problem_t<i_t, f_t>& op_problem,
                                        int num_gpus)
   // 1. Delegate to single-GPU ctor to bring up all the per-master state
   //    (problem_ptr, op_problem_scaled_, pdhg_solver_, strategies, etc.).
-  : pdlp_solver_t(op_problem, settings, false)
+  //
+  // NOTE: pass is_legacy_batch_mode=true to disable CUDA-graph capture on the
+  // master while we are debugging fake-mGPU divergence. The flag is a pure
+  // graph-capture toggle (see ping_pong_graph_t / manual_cuda_graph_t); it does
+  // not change any algorithm semantics. Restore to false once the path is
+  // confirmed correct.
+  : pdlp_solver_t(op_problem, settings, /*is_legacy_batch_mode=*/true)
 {
   if (num_gpus == 1) {
     std::cout << "CAREFUL: num_gpus == 1, running dummy version" << std::endl;

From 4d7e2fced7f3600ca45e6a171972483af876576a Mon Sep 17 00:00:00 2001
From: Bulle Mostovoi <vmostovoi@nvidia.com>
Date: Thu, 28 May 2026 12:49:58 +0200
Subject: [PATCH 42/67] added join_from_shards in convergence_info, now afiro
 is perfect 510 but a28 is 2100 vs 1500 hmmmm

---
 .../convergence_information.cu                     | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/cpp/src/pdlp/termination_strategy/convergence_information.cu b/cpp/src/pdlp/termination_strategy/convergence_information.cu
index 608590ffa0..7877a64c88 100644
--- a/cpp/src/pdlp/termination_strategy/convergence_information.cu
+++ b/cpp/src/pdlp/termination_strategy/convergence_information.cu
@@ -459,7 +459,10 @@ void convergence_information_t<i_t, f_t>::compute_convergence_information(
           .data();
       });
 
-    // Get the reduced primal objective from the shard[0] (arbitrary) 
+    // Get the reduced primal objective from the shard[0] (arbitrary)
+    // Race fix: master stream must wait for shard streams to finish the
+    // allreduce above before copying scalar data out of shard 0's buffer.
+    engine->join_from_shards(stream_view_);
     {
       auto& s0 = *engine->shards[0];
       raft::device_setter guard(s0.device_id);
@@ -494,6 +497,9 @@ void convergence_information_t<i_t, f_t>::compute_convergence_information(
       },
       [](pdlp_shard_t<i_t, f_t>& shard) -> i_t { return shard.rank_data.owned_cstr_size; });
 
+    // Race fix: master stream must wait for shard streams to finish the
+    // distributed L2 norm before copying scalar data out of shard 0.
+    engine->join_from_shards(stream_view_);
     auto& s0 = *engine->shards[0];
     raft::device_setter guard(s0.device_id);
     raft::copy(l2_primal_residual_.data(),
@@ -601,6 +607,9 @@ void convergence_information_t<i_t, f_t>::compute_convergence_information(
           .data();
       });
 
+    // Race fix: master stream must wait for shard streams to finish the
+    // allreduce above before copying scalar data out of shard 0's buffer.
+    engine->join_from_shards(stream_view_);
     {
       auto& s0 = *engine->shards[0];
       raft::device_setter guard(s0.device_id);
@@ -637,6 +646,9 @@ void convergence_information_t<i_t, f_t>::compute_convergence_information(
           .l2_dual_residual_.data();
       },
       [](pdlp_shard_t<i_t, f_t>& shard) -> i_t { return shard.rank_data.owned_var_size; });
+    // Race fix: master stream must wait for shard streams to finish the
+    // distributed L2 norm before copying scalar data out of shard 0.
+    engine->join_from_shards(stream_view_);
     auto& s0 = *engine->shards[0];
     raft::device_setter guard(s0.device_id);
     raft::copy(l2_dual_residual_.data(),

From 7ad460664e5c09328fa3ef3c32dcc2b7598cbc77 Mon Sep 17 00:00:00 2001
From: Bulle Mostovoi <vmostovoi@nvidia.com>
Date: Thu, 28 May 2026 13:52:22 +0200
Subject: [PATCH 43/67] use spmvop in mgpu and fixed small bug of
 increment_iteration_since_last_restart. now we have exact same iter for A28

---
 .../distributed_pdlp/multi_gpu_engine.hpp     | 26 ++++++++++++-------
 cpp/src/pdlp/pdlp.cu                          |  3 +--
 2 files changed, 18 insertions(+), 11 deletions(-)

diff --git a/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.hpp b/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.hpp
index ade0da1c66..637c342975 100644
--- a/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.hpp
+++ b/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.hpp
@@ -311,21 +311,29 @@ struct multi_gpu_engine_t {
 
   // -------- High-level: A @ x and A_T @ y ---------------------------------
   // Thin wrappers used from pdhg_solver_t::compute_A_x / compute_At_y when an
-  // engine is wired in. They use the canonical PDHG buffers/descriptors so the
-  // result lands where single-GPU PDHG would have put it (dual_gradient for A,
-  // current_AtY for A_T).
+  // engine is wired in. They drive the per-shard plan-based SpMV via the
+  // canonical cusparse_view bindings (no rebinding) so the descriptor binding
+  // is never disturbed by mGPU machinery.
+  //
+  // The halo-exchange MUST target the exact buffer the canonical descriptor
+  // is bound to in the PDHG cusparse_view (see cusparse_view.cu lines 516-519
+  // and 595-599):
+  //   - cv.reflected_primal_solution -> reflected_primal_ (var-shaped)
+  //   - cv.dual_solution             -> current.dual_solution_ (cstr-shaped)
+  // For 1 shard the halo-exchange is a no-op, but the buffer choice is what
+  // makes multi-shard correctness work, so we keep it accurate either way.
   void distributed_compute_A_x()
   {
-    distributed_spmv_A(
-      [](auto& pdhg) -> rmm::device_uvector<f_t>& { return pdhg.get_reflected_primal(); },
-      [](auto& pdhg) -> cusparseDnVecDescr_t { return pdhg.get_cusparse_view().dual_gradient; });
+    halo_exchange_var(
+      [](auto& pdhg) -> rmm::device_uvector<f_t>& { return pdhg.get_reflected_primal(); });
+    for_each_shard([](auto& shard) { shard.sub_pdlp->pdhg_solver_.spmvop_A_x(); });
   }
 
   void distributed_compute_At_y()
   {
-    distributed_spmv_At(
-      [](auto& pdhg) -> rmm::device_uvector<f_t>& { return pdhg.get_dual_solution(); },
-      [](auto& pdhg) -> cusparseDnVecDescr_t { return pdhg.get_cusparse_view().current_AtY; });
+    halo_exchange_cstr(
+      [](auto& pdhg) -> rmm::device_uvector<f_t>& { return pdhg.get_dual_solution(); });
+    for_each_shard([](auto& shard) { shard.sub_pdlp->pdhg_solver_.spmvop_At_y(); });
   }
 
   // -------- Solution gather (shards -> master) ----------------------------
diff --git a/cpp/src/pdlp/pdlp.cu b/cpp/src/pdlp/pdlp.cu
index ecc2e35c20..91263828c1 100644
--- a/cpp/src/pdlp/pdlp.cu
+++ b/cpp/src/pdlp/pdlp.cu
@@ -3154,12 +3154,11 @@ optimization_problem_solution_t<i_t, f_t> pdlp_solver_t<i_t, f_t>::run_solver(co
     ++total_pdlp_iterations_;
     ++internal_solver_iterations_;
     if (settings_.hyper_params.never_restart_to_average) {
+      restart_strategy_.increment_iteration_since_last_restart();
       if (multi_gpu_engine) {
         multi_gpu_engine->for_each_shard([&](auto& shard) {
           shard.sub_pdlp->restart_strategy_.increment_iteration_since_last_restart();
         });
-      } else {
-        restart_strategy_.increment_iteration_since_last_restart();
       }
     }
   }

From 03d1259e668b2103c0547e7b3a0826eb9c18f311 Mon Sep 17 00:00:00 2001
From: Bulle Mostovoi <vmostovoi@nvidia.com>
Date: Thu, 28 May 2026 13:57:10 +0200
Subject: [PATCH 44/67] re-enabled graph. not working

---
 cpp/src/pdlp/distributed_pdlp/shard.cu | 15 +++++++++------
 cpp/src/pdlp/pdlp.cu                   | 12 ++++++------
 2 files changed, 15 insertions(+), 12 deletions(-)

diff --git a/cpp/src/pdlp/distributed_pdlp/shard.cu b/cpp/src/pdlp/distributed_pdlp/shard.cu
index 45f9f7a880..93dc1403fc 100644
--- a/cpp/src/pdlp/distributed_pdlp/shard.cu
+++ b/cpp/src/pdlp/distributed_pdlp/shard.cu
@@ -153,13 +153,16 @@ pdlp_shard_t<i_t, f_t>::pdlp_shard_t(int device_id,
   //         At this point sub_pdlp.op_problem_scaled_ is an unscaled copy
   //         of sub_problem and sub_pdlp.initial_scaling_strategy_ has
   //         unit cumulative factors (sub-settings disable Ruiz / PC iters).
-  // NOTE: pass is_legacy_batch_mode=true to disable CUDA-graph capture inside
-  // sub_pdlp while debugging fake-mGPU divergence. The flag is a pure
-  // graph-capture toggle (ping_pong_graph_t / manual_cuda_graph_t) and does
-  // not change any algorithm semantics. Restore to false once the path is
-  // confirmed correct.
+  // Graph capture is enabled. The per-shard kernels invoked by the master's
+  // captured graph (compute_next_primal_dual_solution_reflected → for_each_shard
+  // → primal/dual_reflected_*_projection_transform on sub_pdlp's pdhg) are
+  // recorded into the same graph via the fork_to_shards / join_from_shards
+  // splicing on the master stream. Shards never own their own graph; their
+  // pdhg ping_pong_graph_t is only constructed because pdlp_solver_t requires
+  // it, but no graph.run() on a shard's pdhg is ever invoked from the mGPU
+  // path (compute_next_primal_dual_solution_reflected runs on master).
   sub_pdlp = std::make_unique<pdlp_solver_t<i_t, f_t>>(
-    *sub_problem, settings, /*is_legacy_batch_mode=*/true);
+    *sub_problem, settings, /*is_legacy_batch_mode=*/false);
 
   sub_pdlp->pdhg_solver_.set_is_multi_gpu(true);
 
diff --git a/cpp/src/pdlp/pdlp.cu b/cpp/src/pdlp/pdlp.cu
index 91263828c1..fd78a0ac9d 100644
--- a/cpp/src/pdlp/pdlp.cu
+++ b/cpp/src/pdlp/pdlp.cu
@@ -382,12 +382,12 @@ pdlp_solver_t<i_t, f_t>::pdlp_solver_t(problem_t<i_t, f_t>& op_problem,
   // 1. Delegate to single-GPU ctor to bring up all the per-master state
   //    (problem_ptr, op_problem_scaled_, pdhg_solver_, strategies, etc.).
   //
-  // NOTE: pass is_legacy_batch_mode=true to disable CUDA-graph capture on the
-  // master while we are debugging fake-mGPU divergence. The flag is a pure
-  // graph-capture toggle (see ping_pong_graph_t / manual_cuda_graph_t); it does
-  // not change any algorithm semantics. Restore to false once the path is
-  // confirmed correct.
-  : pdlp_solver_t(op_problem, settings, /*is_legacy_batch_mode=*/true)
+  // Graph capture is enabled here. The master's captured graph splices the
+  // shard streams via fork_to_shards/join_from_shards inside
+  // compute_next_primal_dual_solution_reflected (see pdhg.cu) so every
+  // per-shard kernel and NCCL collective is recorded into the same parent
+  // graph.
+  : pdlp_solver_t(op_problem, settings, /*is_legacy_batch_mode=*/false)
 {
   if (num_gpus == 1) {
     std::cout << "CAREFUL: num_gpus == 1, running dummy version" << std::endl;

From cdc912b50d0d2549c6f9a43576efcc7b4a2edb3c Mon Sep 17 00:00:00 2001
From: Bulle Mostovoi <vmostovoi@nvidia.com>
Date: Thu, 28 May 2026 14:40:47 +0200
Subject: [PATCH 45/67] Cleaner sync semantics, ez ez ez, single mGPU gives
 exact same results as base PDLP on afiro and a28, with graphs !!!! EZ

---
 .../pdlp/distributed_pdlp/multi_gpu_engine.cu | 18 ++---
 .../distributed_pdlp/multi_gpu_engine.hpp     | 81 ++++++++++---------
 cpp/src/pdlp/distributed_pdlp/shard.cu        | 11 ---
 cpp/src/pdlp/pdhg.cu                          | 21 ++++-
 cpp/src/pdlp/pdlp.cu                          |  6 --
 .../convergence_information.cu                | 17 ++--
 6 files changed, 75 insertions(+), 79 deletions(-)

diff --git a/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.cu b/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.cu
index 796153fd79..98f33b6c88 100644
--- a/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.cu
+++ b/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.cu
@@ -82,17 +82,17 @@ multi_gpu_engine_t<i_t, f_t>::multi_gpu_engine_t(
                                                                  sub_solver_settings));
   }
 
-  // 4. Allocate fork/join events for cross-stream graph capture splicing.
-  //    fork_event_ on the master device (whatever device is current when the
-  //    engine is constructed -- pdlp_solver_t's mGPU ctor runs on master).
-  //    join_events_[r] on shard r's device. event_handler_t uses the default
-  //    cudaEventCreate (no flags), matching the rest of the codebase.
-  //    Cleanup is automatic via event_handler_t's RAII destructor.
-  fork_event_ = std::make_unique<cuopt::event_handler_t>();
-  join_events_.reserve(nb_parts);
+  // Two separate sets of events
+  // graph_*_ready_event(s)_ are used inside graph capture
+  // sync_*_ready_event(s)_ are used when sync is needed outside of graph
+  graph_master_ready_event_ = std::make_unique<cuopt::event_handler_t>();
+  sync_master_ready_event_  = std::make_unique<cuopt::event_handler_t>();
+  graph_shard_ready_events_.reserve(nb_parts);
+  sync_shard_ready_events_.reserve(nb_parts);
   for (int r = 0; r < nb_parts; ++r) {
     raft::device_setter guard(devices[r]);
-    join_events_.emplace_back(std::make_unique<cuopt::event_handler_t>());
+    graph_shard_ready_events_.emplace_back(std::make_unique<cuopt::event_handler_t>());
+    sync_shard_ready_events_.emplace_back(std::make_unique<cuopt::event_handler_t>());
   }
 }
 
diff --git a/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.hpp b/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.hpp
index 637c342975..674c4c0ef2 100644
--- a/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.hpp
+++ b/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.hpp
@@ -456,56 +456,59 @@ struct multi_gpu_engine_t {
   // (owns device-affine resources: handle, NCCL comm, RMM buffers).
   std::vector<std::unique_ptr<pdlp_shard_t<i_t, f_t>>> shards;
 
-  // ===== Fork/join events for CUDA graph capture spanning shard streams =====
-  //
-  // CUDA graph capture starts on the master pdhg stream (in pdhg_solver_t).
-  // The per-iteration work then dispatches kernels and NCCL collectives onto
-  // each shard's own stream. For these cross-stream operations to be
-  // recorded into the same captured graph (instead of escaping the capture
-  // and either invalidating it or being silently dropped), every shard
-  // stream must be "spliced" into the active capture via fork/join events.
-  //
-  //   master_stream ──record(fork_event_)──┐
-  //                                        ├─> shard_0.stream (waits) ──┐
-  //                                        ├─> shard_1.stream (waits) ──┤
-  //                                        └─> shard_{n-1}.stream     ──┘
-  //                                                                  (record join_events_[r])
-  //                                                                  master waits on each
-  //
-  // Pattern mirrors metis_tests/src/bench.cu. Events are reused across
-  // iterations (created once at engine construction) and cleaned up
-  // automatically by event_handler_t's RAII destructor.
-  //
-  // unique_ptr because event_handler_t is non-copyable and we need
-  // per-device construction (each join event must be created with its
-  // shard's device current).
-  std::unique_ptr<cuopt::event_handler_t> fork_event_;
-  std::vector<std::unique_ptr<cuopt::event_handler_t>> join_events_;
-
-  // fork_to_shards: record fork_event_ on `master_stream`, then make every
-  // shard stream wait on it. Inside a graph capture, this splices every
-  // shard stream into the same captured graph.
-  void fork_to_shards(rmm::cuda_stream_view master_stream)
+  // ===== Cross-stream synchronization events =====
+  // two separate sets of events
+  // graph_*_ready_event(s)_ are used inside graph capture
+  // sync_*_ready_event(s)_ are used when sync is needed outside of graph
+  std::unique_ptr<cuopt::event_handler_t> graph_master_ready_event_;
+  std::vector<std::unique_ptr<cuopt::event_handler_t>> graph_shard_ready_events_;
+  std::unique_ptr<cuopt::event_handler_t> sync_master_ready_event_;
+  std::vector<std::unique_ptr<cuopt::event_handler_t>> sync_shard_ready_events_;
+
+  // Forks master stream to shards, so that the captured graph can see the work on the shards
+  void graph_capture_fork_to_shards(rmm::cuda_stream_view master_stream)
+  {
+    graph_master_ready_event_->record(master_stream);
+    for (auto& s : shards) {
+      raft::device_setter guard(s->device_id);
+      graph_master_ready_event_->stream_wait(s->stream.view());
+    }
+  }
+
+  // Joins shards back to master stream for correct graph capture
+  void graph_capture_join_from_shards(rmm::cuda_stream_view master_stream)
+  {
+    const int nb = static_cast<int>(shards.size());
+    for (int r = 0; r < nb; ++r) {
+      raft::device_setter guard(shards[r]->device_id);
+      graph_shard_ready_events_[r]->record(shards[r]->stream.view());
+    }
+    for (auto& e : graph_shard_ready_events_) {
+      e->stream_wait(master_stream);
+    }
+  }
+
+  // Functionally the same as graph_capture_fork_to_shards but on a different event to avoid race conditions
+  // Can be used as a way to sync shards with master stream
+  void sync_await_master(rmm::cuda_stream_view master_stream)
   {
-    fork_event_->record(master_stream);
+    sync_master_ready_event_->record(master_stream);
     for (auto& s : shards) {
       raft::device_setter guard(s->device_id);
-      fork_event_->stream_wait(s->stream.view());
+      sync_master_ready_event_->stream_wait(s->stream.view());
     }
   }
 
-  // join_from_shards: each shard records its join event on its own stream,
-  // then `master_stream` waits on every join event. Closes the captured
-  // sub-graph back into the master stream so cudaStreamEndCapture can
-  // produce a single graph spanning all streams.
-  void join_from_shards(rmm::cuda_stream_view master_stream)
+  // Functionally the same as graph_capture_join_from_shards but on different events
+  // Can be used as a way to sync master stream with shards
+  void sync_await_shards(rmm::cuda_stream_view master_stream)
   {
     const int nb = static_cast<int>(shards.size());
     for (int r = 0; r < nb; ++r) {
       raft::device_setter guard(shards[r]->device_id);
-      join_events_[r]->record(shards[r]->stream.view());
+      sync_shard_ready_events_[r]->record(shards[r]->stream.view());
     }
-    for (auto& e : join_events_) {
+    for (auto& e : sync_shard_ready_events_) {
       e->stream_wait(master_stream);
     }
   }
diff --git a/cpp/src/pdlp/distributed_pdlp/shard.cu b/cpp/src/pdlp/distributed_pdlp/shard.cu
index 93dc1403fc..3a49287362 100644
--- a/cpp/src/pdlp/distributed_pdlp/shard.cu
+++ b/cpp/src/pdlp/distributed_pdlp/shard.cu
@@ -150,17 +150,6 @@ pdlp_shard_t<i_t, f_t>::pdlp_shard_t(int device_id,
   handle.sync_stream(stream_view);
 
   // ---- 5. Build sub_pdlp (single-GPU mode; multi_gpu flags cleared by caller). ----
-  //         At this point sub_pdlp.op_problem_scaled_ is an unscaled copy
-  //         of sub_problem and sub_pdlp.initial_scaling_strategy_ has
-  //         unit cumulative factors (sub-settings disable Ruiz / PC iters).
-  // Graph capture is enabled. The per-shard kernels invoked by the master's
-  // captured graph (compute_next_primal_dual_solution_reflected → for_each_shard
-  // → primal/dual_reflected_*_projection_transform on sub_pdlp's pdhg) are
-  // recorded into the same graph via the fork_to_shards / join_from_shards
-  // splicing on the master stream. Shards never own their own graph; their
-  // pdhg ping_pong_graph_t is only constructed because pdlp_solver_t requires
-  // it, but no graph.run() on a shard's pdhg is ever invoked from the mGPU
-  // path (compute_next_primal_dual_solution_reflected runs on master).
   sub_pdlp = std::make_unique<pdlp_solver_t<i_t, f_t>>(
     *sub_problem, settings, /*is_legacy_batch_mode=*/false);
 
diff --git a/cpp/src/pdlp/pdhg.cu b/cpp/src/pdlp/pdhg.cu
index df183dc7e6..ec983fd01b 100644
--- a/cpp/src/pdlp/pdhg.cu
+++ b/cpp/src/pdlp/pdhg.cu
@@ -1245,6 +1245,8 @@ void pdhg_solver_t<i_t, f_t>::compute_next_primal_dual_solution_reflected(
 
   using f_t2 = typename type_2<f_t>::type;
 
+  if (mgpu_engine_ != nullptr) { mgpu_engine_->sync_await_shards(stream_view_); }
+
   // Compute next primal solution reflected.
 
   if (should_major) {
@@ -1255,7 +1257,9 @@ void pdhg_solver_t<i_t, f_t>::compute_next_primal_dual_solution_reflected(
       // the capture or run outside the graph, leaving the captured graph
       // empty (or broken) -- which produces the cycling/stall behavior we
       // observed on larger problems. Mirrors metis_tests bench.cu fork/join.
-      if (mgpu_engine_ != nullptr) { mgpu_engine_->fork_to_shards(stream_view_); }
+      if (mgpu_engine_ != nullptr) {
+        mgpu_engine_->graph_capture_fork_to_shards(stream_view_);
+      }
 
       compute_At_y();
       if (mgpu_engine_ != nullptr) {
@@ -1358,12 +1362,16 @@ void pdhg_solver_t<i_t, f_t>::compute_next_primal_dual_solution_reflected(
       // Multi-GPU: close the fork by joining every shard stream back into
       // the master stream so cudaStreamEndCapture sees a single graph
       // spanning all streams.
-      if (mgpu_engine_ != nullptr) { mgpu_engine_->join_from_shards(stream_view_); }
+      if (mgpu_engine_ != nullptr) {
+        mgpu_engine_->graph_capture_join_from_shards(stream_view_);
+      }
     });
 
   } else {
     graph_all.run(should_major, [&]() {
-      if (mgpu_engine_ != nullptr) { mgpu_engine_->fork_to_shards(stream_view_); }
+      if (mgpu_engine_ != nullptr) {
+        mgpu_engine_->graph_capture_fork_to_shards(stream_view_);
+      }
 
       // Compute next primal
       compute_At_y();
@@ -1470,9 +1478,14 @@ void pdhg_solver_t<i_t, f_t>::compute_next_primal_dual_solution_reflected(
       print("reflected_dual_", reflected_dual_);
 #endif
 
-      if (mgpu_engine_ != nullptr) { mgpu_engine_->join_from_shards(stream_view_); }
+      if (mgpu_engine_ != nullptr) {
+        mgpu_engine_->graph_capture_join_from_shards(stream_view_);
+      }
     });
   }
+
+  // sync to master stream after the graph is captured
+  if (mgpu_engine_ != nullptr) { mgpu_engine_->sync_await_master(stream_view_); }
 }
 
 template <typename i_t, typename f_t>
diff --git a/cpp/src/pdlp/pdlp.cu b/cpp/src/pdlp/pdlp.cu
index fd78a0ac9d..8cf37cd8a1 100644
--- a/cpp/src/pdlp/pdlp.cu
+++ b/cpp/src/pdlp/pdlp.cu
@@ -381,12 +381,6 @@ pdlp_solver_t<i_t, f_t>::pdlp_solver_t(problem_t<i_t, f_t>& op_problem,
                                        int num_gpus)
   // 1. Delegate to single-GPU ctor to bring up all the per-master state
   //    (problem_ptr, op_problem_scaled_, pdhg_solver_, strategies, etc.).
-  //
-  // Graph capture is enabled here. The master's captured graph splices the
-  // shard streams via fork_to_shards/join_from_shards inside
-  // compute_next_primal_dual_solution_reflected (see pdhg.cu) so every
-  // per-shard kernel and NCCL collective is recorded into the same parent
-  // graph.
   : pdlp_solver_t(op_problem, settings, /*is_legacy_batch_mode=*/false)
 {
   if (num_gpus == 1) {
diff --git a/cpp/src/pdlp/termination_strategy/convergence_information.cu b/cpp/src/pdlp/termination_strategy/convergence_information.cu
index 7877a64c88..da2340146a 100644
--- a/cpp/src/pdlp/termination_strategy/convergence_information.cu
+++ b/cpp/src/pdlp/termination_strategy/convergence_information.cu
@@ -460,9 +460,8 @@ void convergence_information_t<i_t, f_t>::compute_convergence_information(
       });
 
     // Get the reduced primal objective from the shard[0] (arbitrary)
-    // Race fix: master stream must wait for shard streams to finish the
-    // allreduce above before copying scalar data out of shard 0's buffer.
-    engine->join_from_shards(stream_view_);
+    // Sync shards with master stream to avoid race conditions
+    engine->sync_await_shards(stream_view_);
     {
       auto& s0 = *engine->shards[0];
       raft::device_setter guard(s0.device_id);
@@ -497,9 +496,8 @@ void convergence_information_t<i_t, f_t>::compute_convergence_information(
       },
       [](pdlp_shard_t<i_t, f_t>& shard) -> i_t { return shard.rank_data.owned_cstr_size; });
 
-    // Race fix: master stream must wait for shard streams to finish the
     // distributed L2 norm before copying scalar data out of shard 0.
-    engine->join_from_shards(stream_view_);
+    engine->sync_await_shards(stream_view_);
     auto& s0 = *engine->shards[0];
     raft::device_setter guard(s0.device_id);
     raft::copy(l2_primal_residual_.data(),
@@ -607,9 +605,8 @@ void convergence_information_t<i_t, f_t>::compute_convergence_information(
           .data();
       });
 
-    // Race fix: master stream must wait for shard streams to finish the
-    // allreduce above before copying scalar data out of shard 0's buffer.
-    engine->join_from_shards(stream_view_);
+    // Sync shards with master stream to avoid race conditions
+    engine->sync_await_shards(stream_view_);
     {
       auto& s0 = *engine->shards[0];
       raft::device_setter guard(s0.device_id);
@@ -646,9 +643,9 @@ void convergence_information_t<i_t, f_t>::compute_convergence_information(
           .l2_dual_residual_.data();
       },
       [](pdlp_shard_t<i_t, f_t>& shard) -> i_t { return shard.rank_data.owned_var_size; });
-    // Race fix: master stream must wait for shard streams to finish the
+
     // distributed L2 norm before copying scalar data out of shard 0.
-    engine->join_from_shards(stream_view_);
+    engine->sync_await_shards(stream_view_);
     auto& s0 = *engine->shards[0];
     raft::device_setter guard(s0.device_id);
     raft::copy(l2_dual_residual_.data(),

From 04d22cf161e55aef6f37add6ff3d988dcd2c34de Mon Sep 17 00:00:00 2001
From: Bulle Mostovoi <vmostovoi@nvidia.com>
Date: Fri, 29 May 2026 05:48:01 -0700
Subject: [PATCH 46/67] pad local matrices for easier integration and allow
 mismatch of nnz between A and A_t for shards

---
 cpp/src/pdlp/cusparse_view.cu                   | 17 +++++++++++------
 .../pdlp/distributed_pdlp/partition_loader.cu   | 10 ++++++++++
 2 files changed, 21 insertions(+), 6 deletions(-)

diff --git a/cpp/src/pdlp/cusparse_view.cu b/cpp/src/pdlp/cusparse_view.cu
index 396fd27499..1e3638cdbd 100644
--- a/cpp/src/pdlp/cusparse_view.cu
+++ b/cpp/src/pdlp/cusparse_view.cu
@@ -498,14 +498,17 @@ cusparse_view_t<i_t, f_t>::cusparse_view_t(
   // setup cusparse view
   A.create(op_problem_scaled.n_constraints,
            op_problem_scaled.n_variables,
-           op_problem_scaled.nnz,
+           static_cast<int64_t>(A_.size()),
            const_cast<i_t*>(op_problem_scaled.offsets.data()),
            const_cast<i_t*>(op_problem_scaled.variables.data()),
            const_cast<f_t*>(op_problem_scaled.coefficients.data()));
 
+  // A_T can have a different nnz than A in multi-GPU shards
+  // A is just what is needed to compute A_x for owned constraints
+  // A_T is just what is needed to compute A_T_y for owned variables
   A_T.create(op_problem_scaled.n_variables,
              op_problem_scaled.n_constraints,
-             op_problem_scaled.nnz,
+             static_cast<int64_t>(A_T_.size()),
              const_cast<i_t*>(A_T_offsets_.data()),
              const_cast<i_t*>(A_T_indices_.data()),
              const_cast<f_t*>(A_T_.data()));
@@ -914,14 +917,14 @@ cusparse_view_t<i_t, f_t>::cusparse_view_t(
   // setup cusparse view
   A.create(op_problem.n_constraints,
            op_problem.n_variables,
-           op_problem.nnz,
+           static_cast<int64_t>(A_.size()),
            const_cast<i_t*>(op_problem.offsets.data()),
            const_cast<i_t*>(op_problem.variables.data()),
            const_cast<f_t*>(op_problem.coefficients.data()));
 
   A_T.create(op_problem.n_variables,
              op_problem.n_constraints,
-             op_problem.nnz,
+             static_cast<int64_t>(A_T_.size()),
              const_cast<i_t*>(A_T_offsets_.data()),
              const_cast<i_t*>(A_T_indices_.data()),
              const_cast<f_t*>(A_T_.data()));
@@ -1129,16 +1132,18 @@ cusparse_view_t<i_t, f_t>::cusparse_view_t(
   // Copying them from the existing cuSparse view is a bad practice and creates segfault post
   // CUDA 12.4 Using the saved pointer of the existing cusparse view to make sure we capture the
   // correct pointer
+  // See comment in the PDHG cusparse_view_t ctor: bind the descriptor nnz to
+  // the actual value-buffer length so A and A_T stay symmetric and shard-safe.
   A.create(op_problem.n_constraints,
            op_problem.n_variables,
-           op_problem.nnz,
+           static_cast<int64_t>(A_.size()),
            const_cast<i_t*>(A_offsets_.data()),
            const_cast<i_t*>(A_indices_.data()),
            const_cast<f_t*>(A_.data()));
 
   A_T.create(op_problem.n_variables,
              op_problem.n_constraints,
-             op_problem.nnz,
+             static_cast<int64_t>(existing_cusparse_view.A_T_.size()),
              const_cast<i_t*>(existing_cusparse_view.A_T_offsets_.data()),
              const_cast<i_t*>(existing_cusparse_view.A_T_indices_.data()),
              const_cast<f_t*>(existing_cusparse_view.A_T_.data()));
diff --git a/cpp/src/pdlp/distributed_pdlp/partition_loader.cu b/cpp/src/pdlp/distributed_pdlp/partition_loader.cu
index b9bc71ae9e..5014607736 100644
--- a/cpp/src/pdlp/distributed_pdlp/partition_loader.cu
+++ b/cpp/src/pdlp/distributed_pdlp/partition_loader.cu
@@ -171,6 +171,16 @@ std::vector<rank_data_t<i_t, f_t>> partition_loader_t<i_t, f_t>::create_rank_dat
 
     rd.total_var_size  = rd.owned_var_size + needed_vars.size();
     rd.total_cstr_size = rd.owned_cstr_size + needed_cstrs.size();
+
+    // Pad row-offset arrays so cuSPARSE sees the local matrices as
+    // (total_cstr x total_var) for A and (total_var x total_cstr) for A_T
+    const i_t a_last_nnz =
+      rd.h_A_row_offsets.empty() ? i_t{0} : rd.h_A_row_offsets.back();
+    rd.h_A_row_offsets.resize(rd.total_cstr_size + 1, a_last_nnz);
+
+    const i_t at_last_nnz =
+      rd.h_A_t_row_offsets.empty() ? i_t{0} : rd.h_A_t_row_offsets.back();
+    rd.h_A_t_row_offsets.resize(rd.total_var_size + 1, at_last_nnz);
   }
 
   // 3. Generate local indices for contiguous [[self], [peer1], ..., [peer_k]]

From b41df4583df78ec095efc351df8c10159d78ccdb Mon Sep 17 00:00:00 2001
From: Bulle Mostovoi <vmostovoi@nvidia.com>
Date: Fri, 29 May 2026 06:24:42 -0700
Subject: [PATCH 47/67] copy scalars to host rather than direct d2d. better

---
 cpp/src/pdlp/pdlp.cu                          | 21 ++++++++++++----
 cpp/src/pdlp/pdlp.cuh                         |  5 ++++
 .../restart_strategy/pdlp_restart_strategy.cu | 24 +++++++++++++++----
 3 files changed, 40 insertions(+), 10 deletions(-)

diff --git a/cpp/src/pdlp/pdlp.cu b/cpp/src/pdlp/pdlp.cu
index 8cf37cd8a1..241b9a5aeb 100644
--- a/cpp/src/pdlp/pdlp.cu
+++ b/cpp/src/pdlp/pdlp.cu
@@ -572,14 +572,25 @@ pdlp_solver_t<i_t, f_t>::pdlp_solver_t(problem_t<i_t, f_t>& op_problem,
                              op_problem_scaled_.presolve_data.objective_scaling_factor,
                              sub_pdlp_settings);
 
+    // Copy to host and then to shards.
+    // More robust than cudaDeviceEnablePeerAccess and cost-free-ish.
+    f_t h_step_size{}, h_primal_weight{}, h_best_primal_weight{};
+    f_t h_primal_step_size{}, h_dual_step_size{};
+    raft::copy(&h_step_size, step_size_.data(), 1, stream_view_);
+    raft::copy(&h_primal_weight, primal_weight_.data(), 1, stream_view_);
+    raft::copy(&h_best_primal_weight, best_primal_weight_.data(), 1, stream_view_);
+    raft::copy(&h_primal_step_size, primal_step_size_.data(), 1, stream_view_);
+    raft::copy(&h_dual_step_size, dual_step_size_.data(), 1, stream_view_);
+    handle_ptr_->sync_stream(stream_view_);
+
     for (auto& shard : multi_gpu_engine->shards) {
       raft::device_setter guard(shard->device_id);
       auto& sub = *shard->sub_pdlp;
-      raft::copy(sub.step_size_.data(), step_size_.data(), 1, shard->stream);
-      raft::copy(sub.primal_weight_.data(), primal_weight_.data(), 1, shard->stream);
-      raft::copy(sub.best_primal_weight_.data(), best_primal_weight_.data(), 1, shard->stream);
-      raft::copy(sub.primal_step_size_.data(), primal_step_size_.data(), 1, shard->stream);
-      raft::copy(sub.dual_step_size_.data(), dual_step_size_.data(), 1, shard->stream);
+      raft::copy(sub.step_size_.data(), &h_step_size, 1, shard->stream);
+      raft::copy(sub.primal_weight_.data(), &h_primal_weight, 1, shard->stream);
+      raft::copy(sub.best_primal_weight_.data(), &h_best_primal_weight, 1, shard->stream);
+      raft::copy(sub.primal_step_size_.data(), &h_primal_step_size, 1, shard->stream);
+      raft::copy(sub.dual_step_size_.data(), &h_dual_step_size, 1, shard->stream);
     }
 
     // Wire the engine into the master pdhg_solver_. Shards' pdhg_solver_ keep
diff --git a/cpp/src/pdlp/pdlp.cuh b/cpp/src/pdlp/pdlp.cuh
index 17fb05080f..15ddfdaad3 100644
--- a/cpp/src/pdlp/pdlp.cuh
+++ b/cpp/src/pdlp/pdlp.cuh
@@ -117,6 +117,11 @@ class pdlp_solver_t {
   // call across all shards' pdhg_solver_t::*_transform methods.
   rmm::device_uvector<f_t>& get_primal_step_size() { return primal_step_size_; }
   rmm::device_uvector<f_t>& get_dual_step_size() { return dual_step_size_; }
+  // Multi-GPU restart broadcast needs to mirror master's primal_weight /
+  // best_primal_weight onto every shard after each cuPDLPx restart so that
+  // downstream shard-side restart machinery stays in sync with master.
+  rmm::device_uvector<f_t>& get_primal_weight() { return primal_weight_; }
+  rmm::device_uvector<f_t>& get_best_primal_weight() { return best_primal_weight_; }
 
  private:
   void print_termination_criteria(const timer_t& timer, bool is_average = false);
diff --git a/cpp/src/pdlp/restart_strategy/pdlp_restart_strategy.cu b/cpp/src/pdlp/restart_strategy/pdlp_restart_strategy.cu
index 00c5b16c8b..b7d49fc32f 100644
--- a/cpp/src/pdlp/restart_strategy/pdlp_restart_strategy.cu
+++ b/cpp/src/pdlp/restart_strategy/pdlp_restart_strategy.cu
@@ -1004,15 +1004,29 @@ void pdlp_restart_strategy_t<i_t, f_t>::cupdlpx_restart(
     best_primal_weight.set_element_async(0, best_primal_weight_value, stream_view_);
   }
 
-  // Broadcast the primal and dual step sizes to all shards
+  // mGPU: Broadcast all primal-weight / step-size scalars updated by the cuPDLPx
+  // restart on the master to every shard so the restart-state on
+  // each shard stays in sync with master.
   if (auto* engine = pdhg_solver.get_mgpu_engine()) {
     RAFT_CUDA_TRY(cudaStreamSynchronize(stream_view_));
+
+    f_t h_primal_step_size{}, h_dual_step_size{};
+    f_t h_primal_weight{}, h_best_primal_weight{};
+
+    raft::copy(&h_primal_step_size, primal_step_size.data(), 1, stream_view_);
+    raft::copy(&h_dual_step_size, dual_step_size.data(), 1, stream_view_);
+    raft::copy(&h_primal_weight, primal_weight.data(), 1, stream_view_);
+    raft::copy(&h_best_primal_weight, best_primal_weight.data(), 1, stream_view_);
+    RAFT_CUDA_TRY(cudaStreamSynchronize(stream_view_));
+
     engine->for_each_shard([&](auto& shard) {
       auto& sub = *shard.sub_pdlp;
-      raft::copy(sub.get_primal_step_size().data(),
-                 primal_step_size.data(), 1, shard.stream.view());
-      raft::copy(sub.get_dual_step_size().data(),
-                 dual_step_size.data(), 1, shard.stream.view());
+      raft::copy(
+        sub.get_primal_step_size().data(), &h_primal_step_size, 1, shard.stream.view());
+      raft::copy(sub.get_dual_step_size().data(), &h_dual_step_size, 1, shard.stream.view());
+      raft::copy(sub.get_primal_weight().data(), &h_primal_weight, 1, shard.stream.view());
+      raft::copy(
+        sub.get_best_primal_weight().data(), &h_best_primal_weight, 1, shard.stream.view());
     });
   }
   // TODO later batch mode: remove if you have per climber restart

From a1ffe1d791d203a903956769e89cde5452309d91 Mon Sep 17 00:00:00 2001
From: Bulle Mostovoi <vmostovoi@nvidia.com>
Date: Fri, 29 May 2026 06:29:39 -0700
Subject: [PATCH 48/67] force re-inject offset and variables to undo the sort,
 cheap and ugly but works

---
 cpp/src/pdlp/distributed_pdlp/shard.cu | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/cpp/src/pdlp/distributed_pdlp/shard.cu b/cpp/src/pdlp/distributed_pdlp/shard.cu
index 3a49287362..356e10a03c 100644
--- a/cpp/src/pdlp/distributed_pdlp/shard.cu
+++ b/cpp/src/pdlp/distributed_pdlp/shard.cu
@@ -155,12 +155,26 @@ pdlp_shard_t<i_t, f_t>::pdlp_shard_t(int device_id,
 
   sub_pdlp->pdhg_solver_.set_is_multi_gpu(true);
 
-  // Inject master-scaled buffers inside sub_pdlp
+  // Re-inject master-scaled buffers inside sub_pdlp.
+  // Need to also re-inject the offsets and variables arrays to revert
+  // the csrsort done by problem_t's constructor.
   auto& scaled = sub_pdlp->get_op_problem_scaled();
+  raft::copy(scaled.offsets.data(),
+             rank_data.h_A_row_offsets.data(),
+             rank_data.h_A_row_offsets.size(),
+             stream_view);
+  raft::copy(scaled.variables.data(),
+             rank_data.h_A_col_indices.data(),
+             rank_data.h_A_col_indices.size(),
+             stream_view);
   raft::copy(scaled.coefficients.data(),
              rank_data.h_A_values_scaled.data(),
              rank_data.h_A_values_scaled.size(),
              stream_view);
+  // A_T side: all three arrays were already overridden together from
+  // rank_data on sub_problem (see step 4 above) and deep-copied into the
+  // scaled problem, so reverse_offsets / reverse_constraints already match
+  // h_A_t_values_scaled's order. Only the values need a SCALED swap-in.
   raft::copy(scaled.reverse_coefficients.data(),
              rank_data.h_A_t_values_scaled.data(),
              rank_data.h_A_t_values_scaled.size(),

From c9394d9d00147ab740765ef02ba1ec6a77de2514 Mon Sep 17 00:00:00 2001
From: Bulle Mostovoi <vmostovoi@nvidia.com>
Date: Fri, 29 May 2026 06:48:38 -0700
Subject: [PATCH 49/67] few style changes, better args and prints

---
 cpp/cuopt_cli.cpp                             | 18 +++++++++--
 .../cuopt/linear_programming/constants.h      |  1 +
 .../pdlp/solver_settings.hpp                  |  3 ++
 cpp/src/math_optimization/solver_settings.cu  |  3 +-
 cpp/src/pdlp/pdlp.cu                          | 30 ++++++++++++-------
 cpp/src/pdlp/pdlp.cuh                         |  2 +-
 cpp/src/pdlp/solve.cu                         | 25 ++++++++++++----
 7 files changed, 60 insertions(+), 22 deletions(-)

diff --git a/cpp/cuopt_cli.cpp b/cpp/cuopt_cli.cpp
index 39aab47170..7c0a9111d9 100644
--- a/cpp/cuopt_cli.cpp
+++ b/cpp/cuopt_cli.cpp
@@ -426,10 +426,22 @@ int main(int argc, char* argv[])
   std::vector<rmm::mr::cuda_async_memory_resource> memory_resources;
 
   if (memory_backend == cuopt::linear_programming::memory_backend_t::GPU) {
-    const int num_gpus = settings.get_parameter<int>(CUOPT_NUM_GPUS);
+    // Distributed PDLP scales one shard per GPU and uses its own knob; everything else
+    // (concurrent, batch, MIP) uses num_gpus which is capped at 2.
+    // For distributed PDLP, -1 means "auto-detect": resolve to the visible device
+    // count so the RMM memory pools match what solve.cu will eventually dispatch.
+    const bool use_distributed_pdlp = settings.get_parameter<bool>(CUOPT_USE_DISTRIBUTED_PDLP);
+    int requested_gpus =
+      use_distributed_pdlp ? settings.get_parameter<int>(CUOPT_DISTRIBUTED_PDLP_NUM_GPUS)
+                           : settings.get_parameter<int>(CUOPT_NUM_GPUS);
+    if (use_distributed_pdlp && requested_gpus == -1) {
+      requested_gpus = raft::device_setter::get_device_count();
+    }
+    const int provisioned_gpus =
+      std::min(raft::device_setter::get_device_count(), requested_gpus);
 
-    memory_resources.reserve(std::min(raft::device_setter::get_device_count(), num_gpus));
-    for (int i = 0; i < std::min(raft::device_setter::get_device_count(), num_gpus); ++i) {
+    memory_resources.reserve(provisioned_gpus);
+    for (int i = 0; i < provisioned_gpus; ++i) {
       RAFT_CUDA_TRY(cudaSetDevice(i));
       memory_resources.emplace_back();
       rmm::mr::set_per_device_resource(rmm::cuda_device_id{i}, memory_resources.back());
diff --git a/cpp/include/cuopt/linear_programming/constants.h b/cpp/include/cuopt/linear_programming/constants.h
index 26ef3653e0..3346ab3565 100644
--- a/cpp/include/cuopt/linear_programming/constants.h
+++ b/cpp/include/cuopt/linear_programming/constants.h
@@ -83,6 +83,7 @@
 #define CUOPT_SOLUTION_FILE            "solution_file"
 #define CUOPT_NUM_CPU_THREADS          "num_cpu_threads"
 #define CUOPT_NUM_GPUS                 "num_gpus"
+#define CUOPT_DISTRIBUTED_PDLP_NUM_GPUS "distributed_pdlp_num_gpus"
 #define CUOPT_MULTI_GPU_PARTITION_FILE "multi_gpu_partition_file"
 #define CUOPT_USE_DISTRIBUTED_PDLP     "use_distributed_pdlp"
 #define CUOPT_USER_PROBLEM_FILE        "user_problem_file"
diff --git a/cpp/include/cuopt/linear_programming/pdlp/solver_settings.hpp b/cpp/include/cuopt/linear_programming/pdlp/solver_settings.hpp
index e8beef007d..efdbd5733c 100644
--- a/cpp/include/cuopt/linear_programming/pdlp/solver_settings.hpp
+++ b/cpp/include/cuopt/linear_programming/pdlp/solver_settings.hpp
@@ -307,6 +307,9 @@ class pdlp_solver_settings_t {
   presolver_t presolver{presolver_t::Default};
   bool dual_postsolve{true};
   int num_gpus{1};
+  // Number of GPUs to use specifically for distributed PDLP (use_distributed_pdlp=true).
+  // -1 means auto-detect
+  int distributed_pdlp_num_gpus{-1};
   std::string multi_gpu_partition_file{""};
   // Set to true inside the shards
   bool is_distributed_sub_pdlp{false};
diff --git a/cpp/src/math_optimization/solver_settings.cu b/cpp/src/math_optimization/solver_settings.cu
index 991b0d62c1..207e53f20d 100644
--- a/cpp/src/math_optimization/solver_settings.cu
+++ b/cpp/src/math_optimization/solver_settings.cu
@@ -138,8 +138,9 @@ solver_settings_t<i_t, f_t>::solver_settings_t() : pdlp_settings(), mip_settings
     {CUOPT_MIP_IMPLIED_BOUND_CUTS, &mip_settings.implied_bound_cuts, -1, 1, -1},
     {CUOPT_MIP_STRONG_CHVATAL_GOMORY_CUTS, &mip_settings.strong_chvatal_gomory_cuts, -1, 1, -1},
     {CUOPT_MIP_REDUCED_COST_STRENGTHENING, &mip_settings.reduced_cost_strengthening, -1, std::numeric_limits<i_t>::max(), -1},
-    {CUOPT_NUM_GPUS, &pdlp_settings.num_gpus, 1, 576, 1},
+    {CUOPT_NUM_GPUS, &pdlp_settings.num_gpus, 1, 2, 1},
     {CUOPT_NUM_GPUS, &mip_settings.num_gpus, 1, 2, 1},
+    {CUOPT_DISTRIBUTED_PDLP_NUM_GPUS, &pdlp_settings.distributed_pdlp_num_gpus, -1, 576, -1},
     {CUOPT_MIP_BATCH_PDLP_STRONG_BRANCHING, &mip_settings.mip_batch_pdlp_strong_branching, 0, 2, 0},
     {CUOPT_MIP_BATCH_PDLP_RELIABILITY_BRANCHING, &mip_settings.mip_batch_pdlp_reliability_branching, 0, 2, 0},
     {CUOPT_MIP_STRONG_BRANCHING_SIMPLEX_ITERATION_LIMIT, &mip_settings.strong_branching_simplex_iteration_limit, -1,std::numeric_limits<i_t>::max(), -1},
diff --git a/cpp/src/pdlp/pdlp.cu b/cpp/src/pdlp/pdlp.cu
index 241b9a5aeb..a061a2d468 100644
--- a/cpp/src/pdlp/pdlp.cu
+++ b/cpp/src/pdlp/pdlp.cu
@@ -378,23 +378,29 @@ pdlp_solver_t<i_t, f_t>::pdlp_solver_t(problem_t<i_t, f_t>& op_problem,
 template <typename i_t, typename f_t>
 pdlp_solver_t<i_t, f_t>::pdlp_solver_t(problem_t<i_t, f_t>& op_problem,
                                        pdlp_solver_settings_t<i_t, f_t> const& settings,
-                                       int num_gpus)
+                                       int distributed_pdlp_num_gpus)
   // 1. Delegate to single-GPU ctor to bring up all the per-master state
   //    (problem_ptr, op_problem_scaled_, pdhg_solver_, strategies, etc.).
   : pdlp_solver_t(op_problem, settings, /*is_legacy_batch_mode=*/false)
 {
-  if (num_gpus == 1) {
-    std::cout << "CAREFUL: num_gpus == 1, running dummy version" << std::endl;
+  CUOPT_LOG_INFO("Solving with distributed PDLP on %d GPU",
+                 distributed_pdlp_num_gpus);
+  if (distributed_pdlp_num_gpus == 1) {
+    std::cout << "CAREFUL !!: distributed_pdlp_num_gpus == 1, running single-shard dummy path, "
+                 "if you want to set the number of GPUs to use for distributed PDLP, set the "
+                 "parameter --distributed-pdlp-num-gpus"
+              << std::endl;
   }
-  cuopt_expects(num_gpus == settings.num_gpus /*&& settings.num_gpus > 1*/,
+  cuopt_expects(distributed_pdlp_num_gpus == settings.distributed_pdlp_num_gpus,
                 error_type_t::ValidationError,
-                "This constructor should only be used for distributed PDLP (num_gpus > 1)");
+                "This constructor's distributed_pdlp_num_gpus argument must match "
+                "settings.distributed_pdlp_num_gpus");
 
   // Distributed PDLP is currently double-only
   if constexpr (!std::is_same_v<f_t, double>) {
     cuopt_expects(false,
                   error_type_t::ValidationError,
-                  "Distributed PDLP (num_gpus > 1) currently requires double precision");
+                  "Distributed PDLP currently requires double precision");
     return;
   } else {
     // 2. Load or compute partition
@@ -405,20 +411,21 @@ pdlp_solver_t<i_t, f_t>::pdlp_solver_t(problem_t<i_t, f_t>& op_problem,
       validate_partition(parts,
                          op_problem_scaled_.n_constraints,
                          op_problem_scaled_.n_variables,
-                         num_gpus,
+                         distributed_pdlp_num_gpus,
                          "partition file");
     } else {
-      if (num_gpus == 1) {
+      if (distributed_pdlp_num_gpus == 1) {
         // Single-part dummy run: useful for exercising the mGPU code paths on a
         // single physical GPU without a real partition file.
-        std::cout << "CAREFUL: num_gpus == 1, running dummy version (single part covering "
+        std::cout << "CAREFUL: distributed_pdlp_num_gpus == 1, running dummy version (single "
+                     "part covering "
                   << op_problem_scaled_.n_constraints << " cstrs + "
                   << op_problem_scaled_.n_variables << " vars)" << std::endl;
       }
       partitioner_input_t<i_t, f_t> partition_input;
       partition_input.nb_cstr  = op_problem_scaled_.n_constraints;
       partition_input.nb_vars  = op_problem_scaled_.n_variables;
-      partition_input.nb_parts = num_gpus;
+      partition_input.nb_parts = distributed_pdlp_num_gpus;
       // Dummy partitioner ignores A / A_t for now; future METIS partitioners will
       // fill these CSR views before calling partition().
       auto partitioner = make_partitioner<i_t, f_t>(partitioner_kind_t::Dummy);
@@ -538,7 +545,7 @@ pdlp_solver_t<i_t, f_t>::pdlp_solver_t(problem_t<i_t, f_t>& op_problem,
                                                                 h_A_t_col_indices,
                                                                 h_A_t_values,
                                                                 h_A_t_values_scaled,
-                                                                settings.num_gpus,
+                                                                settings.distributed_pdlp_num_gpus,
                                                                 n_cstr,
                                                                 n_vars,
                                                                 nnz);
@@ -546,6 +553,7 @@ pdlp_solver_t<i_t, f_t>::pdlp_solver_t(problem_t<i_t, f_t>& op_problem,
     // 7. Build the per-shard PDLP settings:
     pdlp_solver_settings_t<i_t, f_t> sub_pdlp_settings                    = settings;
     sub_pdlp_settings.num_gpus                                            = 1;
+    sub_pdlp_settings.distributed_pdlp_num_gpus                           = 1;
     sub_pdlp_settings.multi_gpu_partition_file                            = "";
     sub_pdlp_settings.is_distributed_sub_pdlp                             = true;
     sub_pdlp_settings.hyper_params.default_l_inf_ruiz_iterations          = 0;
diff --git a/cpp/src/pdlp/pdlp.cuh b/cpp/src/pdlp/pdlp.cuh
index 15ddfdaad3..14651eab3f 100644
--- a/cpp/src/pdlp/pdlp.cuh
+++ b/cpp/src/pdlp/pdlp.cuh
@@ -66,7 +66,7 @@ class pdlp_solver_t {
   // Distributed Solver Constructor
   pdlp_solver_t(problem_t<i_t, f_t>& op_problem,
                 pdlp_solver_settings_t<i_t, f_t> const& settings,
-                int num_gpus);
+                int distributed_pdlp_num_gpus);
 
   optimization_problem_solution_t<i_t, f_t> run_solver(const timer_t& timer);
 
diff --git a/cpp/src/pdlp/solve.cu b/cpp/src/pdlp/solve.cu
index e401ab35b6..338083f03a 100644
--- a/cpp/src/pdlp/solve.cu
+++ b/cpp/src/pdlp/solve.cu
@@ -771,16 +771,29 @@ static optimization_problem_solution_t<i_t, f_t> run_pdlp_solver(
   }
 #endif
   if (settings.hyper_params.use_distributed_pdlp) {
-    /*
-    cuopt_expects(settings.num_gpus > 1,
+    // Resolve the -1 "auto-detect" sentinel to the actual visible-device count on
+    // the master process
+    pdlp_solver_settings_t<i_t, f_t> settings_resolved = settings;
+    if (settings_resolved.distributed_pdlp_num_gpus == -1) {
+      settings_resolved.distributed_pdlp_num_gpus = raft::device_setter::get_device_count();
+      CUOPT_LOG_INFO("distributed_pdlp_num_gpus == -1: auto-detected %d visible CUDA device",
+                     settings_resolved.distributed_pdlp_num_gpus);
+    }
+    cuopt_expects(settings_resolved.distributed_pdlp_num_gpus >= 1,
                   error_type_t::ValidationError,
-                  "use_distributed_pdlp requires settings.num_gpus > 1"); */
-    if (settings.num_gpus == 1) {std::cout << "CAREFUL: use_distributed_pdlp requires settings.num_gpus > 1" << std::endl;}
+                  "distributed_pdlp_num_gpus must be >= 1 or -1 (auto-detect)");
+    if (settings_resolved.distributed_pdlp_num_gpus == 1) {
+      std::cout
+        << "CAREFUL: use_distributed_pdlp with distributed_pdlp_num_gpus == 1 runs the "
+           "single-shard dummy path"
+        << std::endl;
+    }
     cuopt_expects(!is_batch_mode,
                   error_type_t::ValidationError,
                   "Distributed PDLP does not support batch mode");
-    // Multi-GPU ctor; dispatched by 3rd-arg TYPE (int num_gpus, not bool batch).
-    detail::pdlp_solver_t<i_t, f_t> solver(problem, settings, settings.num_gpus);
+    // Multi-GPU ctor; dispatched by 3rd-arg TYPE (int, not bool batch).
+    detail::pdlp_solver_t<i_t, f_t> solver(
+      problem, settings_resolved, settings_resolved.distributed_pdlp_num_gpus);
     return solver.run_solver(timer);
   }
   detail::pdlp_solver_t<i_t, f_t> solver(problem, settings, is_batch_mode);

From 4faa7df79320fc5588796e6828642bce523ea726 Mon Sep 17 00:00:00 2001
From: Bulle Mostovoi <vmostovoi@nvidia.com>
Date: Fri, 29 May 2026 07:20:12 -0700
Subject: [PATCH 50/67] added disable_graph flag, afiro gets solved on
 non-graph just as if it was single

---
 .../cuopt/linear_programming/constants.h        |  1 +
 .../pdlp/pdlp_hyper_params.cuh                  |  3 +++
 cpp/src/math_optimization/solver_settings.cu    |  1 +
 cpp/src/pdlp/solve.cu                           |  3 +++
 cpp/src/pdlp/utilities/ping_pong_graph.cuh      | 17 ++++++++++++++++-
 5 files changed, 24 insertions(+), 1 deletion(-)

diff --git a/cpp/include/cuopt/linear_programming/constants.h b/cpp/include/cuopt/linear_programming/constants.h
index 3346ab3565..e695bb21d3 100644
--- a/cpp/include/cuopt/linear_programming/constants.h
+++ b/cpp/include/cuopt/linear_programming/constants.h
@@ -86,6 +86,7 @@
 #define CUOPT_DISTRIBUTED_PDLP_NUM_GPUS "distributed_pdlp_num_gpus"
 #define CUOPT_MULTI_GPU_PARTITION_FILE "multi_gpu_partition_file"
 #define CUOPT_USE_DISTRIBUTED_PDLP     "use_distributed_pdlp"
+#define CUOPT_PDLP_DISABLE_GRAPH       "pdlp_disable_graph"
 #define CUOPT_USER_PROBLEM_FILE        "user_problem_file"
 #define CUOPT_PRESOLVE_FILE            "presolve_file"
 #define CUOPT_RANDOM_SEED              "random_seed"
diff --git a/cpp/include/cuopt/linear_programming/pdlp/pdlp_hyper_params.cuh b/cpp/include/cuopt/linear_programming/pdlp/pdlp_hyper_params.cuh
index 962f06ee4a..c68dc86d6a 100644
--- a/cpp/include/cuopt/linear_programming/pdlp/pdlp_hyper_params.cuh
+++ b/cpp/include/cuopt/linear_programming/pdlp/pdlp_hyper_params.cuh
@@ -48,6 +48,9 @@ struct pdlp_hyper_params_t {
   bool use_reflected_primal_dual                                  = true;
   bool use_fixed_point_error                                      = true;
   bool use_distributed_pdlp                                       = false;
+  // Debug/diagnostic knob: when true, PDLP bypasses CUDA-graph capture in
+  // ping_pong_graph_t and executes each iteration eagerly
+  bool pdlp_disable_graph                                         = false;
   double reflection_coefficient                                   = 1.0;
   double restart_k_p                                              = 0.99;
   double restart_k_i                                              = 0.01;
diff --git a/cpp/src/math_optimization/solver_settings.cu b/cpp/src/math_optimization/solver_settings.cu
index 207e53f20d..629c8a8428 100644
--- a/cpp/src/math_optimization/solver_settings.cu
+++ b/cpp/src/math_optimization/solver_settings.cu
@@ -179,6 +179,7 @@ solver_settings_t<i_t, f_t>::solver_settings_t() : pdlp_settings(), mip_settings
     {CUOPT_BARRIER_ITERATIVE_REFINEMENT, &pdlp_settings.barrier_iterative_refinement, true},
     {CUOPT_MIP_PROBING, &mip_settings.probing, true},
     {CUOPT_USE_DISTRIBUTED_PDLP, &pdlp_settings.hyper_params.use_distributed_pdlp, false},
+    {CUOPT_PDLP_DISABLE_GRAPH, &pdlp_settings.hyper_params.pdlp_disable_graph, false},
   };
   // String parameters
   string_parameters = {
diff --git a/cpp/src/pdlp/solve.cu b/cpp/src/pdlp/solve.cu
index 338083f03a..70c488e3f3 100644
--- a/cpp/src/pdlp/solve.cu
+++ b/cpp/src/pdlp/solve.cu
@@ -756,6 +756,9 @@ static optimization_problem_solution_t<i_t, f_t> run_pdlp_solver(
   const timer_t& timer,
   bool is_batch_mode)
 {
+  detail::pdlp_graph_disabled_flag().store(settings.hyper_params.pdlp_disable_graph,
+                                           std::memory_order_relaxed);
+
   if (problem.n_constraints == 0) {
     CUOPT_LOG_CONDITIONAL_INFO(
       !settings.inside_mip,
diff --git a/cpp/src/pdlp/utilities/ping_pong_graph.cuh b/cpp/src/pdlp/utilities/ping_pong_graph.cuh
index dbc8fe5828..6b527f81b2 100644
--- a/cpp/src/pdlp/utilities/ping_pong_graph.cuh
+++ b/cpp/src/pdlp/utilities/ping_pong_graph.cuh
@@ -12,10 +12,25 @@
 
 #include <rmm/cuda_stream_view.hpp>
 
+#include <atomic>
 #include <utility>
 
 namespace cuopt::linear_programming::detail {
 
+// Debug/diagnostic toggle: when set, ping_pong_graph_t::run() bypasses CUDA
+// graph capture and executes its work eagerly on every iteration. Useful for
+// debugging.
+inline std::atomic<bool>& pdlp_graph_disabled_flag()
+{
+  static std::atomic<bool> s_flag{false};
+  return s_flag;
+}
+
+inline bool pdlp_graph_disabled()
+{
+  return pdlp_graph_disabled_flag().load(std::memory_order_relaxed);
+}
+
 // Two-slot CUDA-graph cache for PDLP. PDLP swaps pointers (rather than
 // copying vectors) at the end of adaptive pdhg step, so the captured graph
 // topology alternates between two layouts depending on iteration parity.
@@ -49,7 +64,7 @@ class ping_pong_graph_t {
 #ifdef CUPDLP_DEBUG_MODE
     work();
 #else
-    if (is_legacy_batch_mode_) {
+    if (is_legacy_batch_mode_ || pdlp_graph_disabled()) {
       work();
       return;
     }

From 61acddb5cd0c0df8f09086d87264759e66ac94dd Mon Sep 17 00:00:00 2001
From: Bulle Mostovoi <vmostovoi@nvidia.com>
Date: Sun, 31 May 2026 10:02:48 -0700
Subject: [PATCH 51/67] makes reductions in compute interaction and movement
 use owned_size rather than total size hehehehe

---
 cpp/src/pdlp/pdlp.cu                           |  5 ++++-
 .../adaptive_step_size_strategy.cu             | 18 ++++++++++++++----
 .../adaptive_step_size_strategy.hpp            |  6 +++++-
 3 files changed, 23 insertions(+), 6 deletions(-)

diff --git a/cpp/src/pdlp/pdlp.cu b/cpp/src/pdlp/pdlp.cu
index a061a2d468..3b77a1cf47 100644
--- a/cpp/src/pdlp/pdlp.cu
+++ b/cpp/src/pdlp/pdlp.cu
@@ -2327,10 +2327,13 @@ void pdlp_solver_t<i_t, f_t>::compute_fixed_error(std::vector<int>& has_restarte
         cusparseDnVecSetValues(sub_cv.potential_next_dual_solution,
                                (void*)sub_pdlp.pdhg_solver_.get_reflected_dual().data()));
 
+      // Ensure norm is on owned size
       sub_pdlp.step_size_strategy_.compute_interaction_and_movement(
         sub_pdlp.pdhg_solver_.get_primal_tmp_resource(),
         sub_cv,
-        sub_pdlp.pdhg_solver_.get_saddle_point_state());
+        sub_pdlp.pdhg_solver_.get_saddle_point_state(),
+        shard->rank_data.owned_var_size,
+        shard->rank_data.owned_cstr_size);
 
       RAFT_CUSPARSE_TRY(cusparseDnVecSetValues(
         sub_cv.potential_next_dual_solution,
diff --git a/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.cu b/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.cu
index 2cb843ae86..530a426117 100644
--- a/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.cu
+++ b/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.cu
@@ -364,8 +364,18 @@ template <typename i_t, typename f_t>
 void adaptive_step_size_strategy_t<i_t, f_t>::compute_interaction_and_movement(
   rmm::device_uvector<f_t>& tmp_primal,
   cusparse_view_t<i_t, f_t>& cusparse_view,
-  saddle_point_state_t<i_t, f_t>& current_saddle_point_state)
+  saddle_point_state_t<i_t, f_t>& current_saddle_point_state,
+  i_t owned_primal_size,
+  i_t owned_cstr_size)
 {
+  // mGPU needs to know owned size to restrict the reductions to the owned prefix
+  const i_t reduce_primal_size = (owned_primal_size >= 0)
+                                   ? owned_primal_size
+                                   : current_saddle_point_state.get_primal_size();
+  const i_t reduce_dual_size   = (owned_cstr_size >= 0)
+                                   ? owned_cstr_size
+                                   : current_saddle_point_state.get_dual_size();
+
   // QP would need this:
   // if iszero(problem.objective_matrix)
   //   primal_objective_interaction = 0.0
@@ -444,7 +454,7 @@ void adaptive_step_size_strategy_t<i_t, f_t>::compute_interaction_and_movement(
     // compute interaction (x'-x) . (A(y'-y))
     RAFT_CUBLAS_TRY(
       raft::linalg::detail::cublasdot(handle_ptr_->get_cublas_handle(),
-                                      current_saddle_point_state.get_primal_size(),
+                                      reduce_primal_size,
                                       tmp_primal.data(),
                                       primal_stride,
                                       current_saddle_point_state.get_delta_primal().data(),
@@ -462,7 +472,7 @@ void adaptive_step_size_strategy_t<i_t, f_t>::compute_interaction_and_movement(
     //               norm(delta_dual) ^ 2;
     RAFT_CUBLAS_TRY(
       raft::linalg::detail::cublasdot(handle_ptr_->get_cublas_handle(),
-                                      current_saddle_point_state.get_primal_size(),
+                                      reduce_primal_size,
                                       current_saddle_point_state.get_delta_primal().data(),
                                       primal_stride,
                                       current_saddle_point_state.get_delta_primal().data(),
@@ -472,7 +482,7 @@ void adaptive_step_size_strategy_t<i_t, f_t>::compute_interaction_and_movement(
 
     RAFT_CUBLAS_TRY(
       raft::linalg::detail::cublasdot(handle_ptr_->get_cublas_handle(),
-                                      current_saddle_point_state.get_dual_size(),
+                                      reduce_dual_size,
                                       current_saddle_point_state.get_delta_dual().data(),
                                       dual_stride,
                                       current_saddle_point_state.get_delta_dual().data(),
diff --git a/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.hpp b/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.hpp
index 896c6fa24e..238735e8ff 100644
--- a/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.hpp
+++ b/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.hpp
@@ -88,9 +88,13 @@ class adaptive_step_size_strategy_t {
   rmm::device_uvector<f_t>& get_norm_squared_delta_primal();
   rmm::device_uvector<f_t>& get_norm_squared_delta_dual();
 
+  // owned_primal_size / owned_cstr_size are mGPU overrides.
+  // mGPU needs to know owned size to restrict the reductions to the owned prefix
   void compute_interaction_and_movement(rmm::device_uvector<f_t>& tmp_primal,
                                         cusparse_view_t<i_t, f_t>& cusparse_view,
-                                        saddle_point_state_t<i_t, f_t>& current_saddle_point_state);
+                                        saddle_point_state_t<i_t, f_t>& current_saddle_point_state,
+                                        i_t owned_primal_size = -1,
+                                        i_t owned_cstr_size   = -1);
 
   void swap_context(const thrust::universal_host_pinned_vector<swap_pair_t<i_t>>& swap_pairs);
   void resize_context(i_t new_size);

From b8b59bfce89a26652d809dc2b9966d20febc28ef Mon Sep 17 00:00:00 2001
From: Bulle Mostovoi <vmostovoi@nvidia.com>
Date: Sun, 31 May 2026 11:49:27 -0700
Subject: [PATCH 52/67] added METIS partitioner, still need it in the env. It
 is FAST, but we lose a lot of time on actual partitioning and data movement.
 Everything seems to be working

---
 cpp/CMakeLists.txt                            |  37 +++++
 cpp/src/pdlp/CMakeLists.txt                   |   1 +
 .../distributed_pdlp/metis_partitioner.cu     | 142 ++++++++++++++++++
 cpp/src/pdlp/distributed_pdlp/partitioner.cu  |   3 +
 cpp/src/pdlp/distributed_pdlp/partitioner.hpp |   2 +-
 cpp/src/pdlp/utilities/mgpu_trace.cuh         |  52 +++++++
 6 files changed, 236 insertions(+), 1 deletion(-)
 create mode 100644 cpp/src/pdlp/distributed_pdlp/metis_partitioner.cu
 create mode 100644 cpp/src/pdlp/utilities/mgpu_trace.cuh

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index da7d4a4d35..d27072bcf9 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -342,6 +342,42 @@ set_target_properties(nccl_external PROPERTIES
 )
 message(STATUS "Using NCCL: ${NCCL_LIBRARY}")
 
+# ##################################################################################################
+# - METIS (graph partitioning for distributed PDLP) -----------------------------------------------
+# Found by searching CONDA_PREFIX first, then CUOPT_METIS_ROOT (cmake var or env)
+# if the user wants to pull METIS from a different conda env / system path.
+set(METIS_HINT_PREFIXES "")
+if (DEFINED ENV{CONDA_PREFIX} AND NOT "$ENV{CONDA_PREFIX}" STREQUAL "")
+    list(APPEND METIS_HINT_PREFIXES "$ENV{CONDA_PREFIX}")
+endif ()
+if (DEFINED CUOPT_METIS_ROOT AND NOT "${CUOPT_METIS_ROOT}" STREQUAL "")
+    list(APPEND METIS_HINT_PREFIXES "${CUOPT_METIS_ROOT}")
+endif ()
+if (DEFINED ENV{CUOPT_METIS_ROOT} AND NOT "$ENV{CUOPT_METIS_ROOT}" STREQUAL "")
+    list(APPEND METIS_HINT_PREFIXES "$ENV{CUOPT_METIS_ROOT}")
+endif ()
+find_path(METIS_INCLUDE_DIR
+    NAMES metis.h
+    HINTS ${METIS_HINT_PREFIXES}
+    PATH_SUFFIXES include
+)
+find_library(METIS_LIBRARY
+    NAMES metis libmetis
+    HINTS ${METIS_HINT_PREFIXES}
+    PATH_SUFFIXES lib lib64
+)
+if (NOT METIS_INCLUDE_DIR OR NOT METIS_LIBRARY)
+    message(FATAL_ERROR "METIS not found. Looked in: ${METIS_HINT_PREFIXES}. "
+                        "Install it via 'conda install -c conda-forge metis' in the active env, "
+                        "or set CUOPT_METIS_ROOT to a prefix containing include/metis.h and lib/libmetis.{so,a}.")
+endif ()
+add_library(metis_external UNKNOWN IMPORTED GLOBAL)
+set_target_properties(metis_external PROPERTIES
+    IMPORTED_LOCATION "${METIS_LIBRARY}"
+    INTERFACE_INCLUDE_DIRECTORIES "${METIS_INCLUDE_DIR}"
+)
+message(STATUS "Using METIS: ${METIS_LIBRARY}")
+
 # ##################################################################################################
 # - gRPC and Protobuf setup -----------------------------------------------------------------------
 
@@ -605,6 +641,7 @@ target_link_libraries(cuopt
         PRIVATE
         ${CUOPT_PRIVATE_CUDA_LIBS}
         nccl_external
+        metis_external
         $<$<BOOL:${CUOPT_ENABLE_GRPC}>:protobuf::libprotobuf>
         $<$<BOOL:${CUOPT_ENABLE_GRPC}>:gRPC::grpc++>
 )
diff --git a/cpp/src/pdlp/CMakeLists.txt b/cpp/src/pdlp/CMakeLists.txt
index a6ef14e3ff..863cf20962 100644
--- a/cpp/src/pdlp/CMakeLists.txt
+++ b/cpp/src/pdlp/CMakeLists.txt
@@ -33,6 +33,7 @@ set(LP_CORE_FILES
   ${CMAKE_CURRENT_SOURCE_DIR}/distributed_pdlp/multi_gpu_engine.cu
   ${CMAKE_CURRENT_SOURCE_DIR}/distributed_pdlp/partition_loader.cu
   ${CMAKE_CURRENT_SOURCE_DIR}/distributed_pdlp/partitioner.cu
+  ${CMAKE_CURRENT_SOURCE_DIR}/distributed_pdlp/metis_partitioner.cu
 )
 
 # C and Python adapter files
diff --git a/cpp/src/pdlp/distributed_pdlp/metis_partitioner.cu b/cpp/src/pdlp/distributed_pdlp/metis_partitioner.cu
new file mode 100644
index 0000000000..6ed80b0047
--- /dev/null
+++ b/cpp/src/pdlp/distributed_pdlp/metis_partitioner.cu
@@ -0,0 +1,142 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include <pdlp/distributed_pdlp/metis_partitioner.hpp>
+#include <pdlp/distributed_pdlp/partitioner.hpp>
+
+#include <utilities/logger.hpp>
+
+#include <cuopt/error.hpp>
+
+#include <metis.h>
+
+#include <chrono>
+#include <cstddef>
+#include <vector>
+
+namespace cuopt::linear_programming::detail {
+
+// Builds the bipartite constraint/variable graph induced by A and runs
+// METIS_PartGraphKway to assign each of the (nb_cstr + nb_vars) nodes to a
+// part in [0, nb_parts). Layout matches metis_tests:
+//   * nodes [0, nb_cstr)              : constraint nodes
+//   * nodes [nb_cstr, nb_cstr+nb_vars): variable nodes
+//   * undirected edges from each A nonzero (one half via A, one via A_t)
+// The output is consumed by partition_loader_t::create_rank_data_from_parts.
+template <typename i_t, typename f_t>
+std::vector<i_t> metis_partitioner_t<i_t, f_t>::partition(
+  partitioner_input_t<i_t, f_t> const& input) const
+{
+  cuopt_expects(input.nb_parts > 0,
+                error_type_t::ValidationError,
+                "metis_partitioner: nb_parts must be positive");
+  cuopt_expects(input.nb_cstr >= 0 && input.nb_vars >= 0,
+                error_type_t::ValidationError,
+                "metis_partitioner: invalid problem dimensions");
+
+  cuopt_expects(input.A.row_offsets != nullptr && input.A.col_indices != nullptr,
+                error_type_t::ValidationError,
+                "metis_partitioner: A.row_offsets and A.col_indices are required");
+  cuopt_expects(input.A_t.row_offsets != nullptr && input.A_t.col_indices != nullptr,
+                error_type_t::ValidationError,
+                "metis_partitioner: A_t.row_offsets and A_t.col_indices are required");
+
+  auto const& A_offsets   = *input.A.row_offsets;
+  auto const& A_cols      = *input.A.col_indices;
+  auto const& A_t_offsets = *input.A_t.row_offsets;
+  auto const& A_t_cols    = *input.A_t.col_indices;
+
+  cuopt_expects(static_cast<i_t>(A_offsets.size()) == input.nb_cstr + 1,
+                error_type_t::ValidationError,
+                "metis_partitioner: A.row_offsets size mismatch (expected nb_cstr+1)");
+  cuopt_expects(static_cast<i_t>(A_t_offsets.size()) == input.nb_vars + 1,
+                error_type_t::ValidationError,
+                "metis_partitioner: A_t.row_offsets size mismatch (expected nb_vars+1)");
+  cuopt_expects(A_cols.size() == A_t_cols.size(),
+                error_type_t::ValidationError,
+                "metis_partitioner: A and A_t nnz mismatch");
+
+  const i_t nb_cstr = input.nb_cstr;
+  const i_t nb_vars = input.nb_vars;
+  const i_t nnz     = static_cast<i_t>(A_cols.size());
+  const i_t nvtx    = nb_cstr + nb_vars;
+
+  // Bipartite CSR. Same construction as metis_tests/src/main.cpp:
+  //   xadj   has length nvtx + 1
+  //   adjncy has length 2 * nnz (each A nonzero contributes one half-edge
+  //          from cstr side via A and one half-edge from var side via A_t)
+  std::vector<idx_t> xadj(nvtx + 1);
+  std::vector<idx_t> adjncy(2 * static_cast<std::size_t>(nnz));
+
+  // cstr-side row offsets: A_offsets[0..nb_cstr] (no shift).
+  for (i_t i = 0; i <= nb_cstr; ++i) { xadj[i] = static_cast<idx_t>(A_offsets[i]); }
+  // var-side row offsets: A_t_offsets[0..nb_vars], shifted by +nnz so that
+  // they index into the second half of adjncy.
+  for (i_t i = 0; i <= nb_vars; ++i) {
+    xadj[nb_cstr + i] = static_cast<idx_t>(A_t_offsets[i]) + static_cast<idx_t>(nnz);
+  }
+
+  // cstr-side neighbours: A_cols[i] shifted by +nb_cstr to index into the
+  // variable node block.
+  for (i_t k = 0; k < nnz; ++k) {
+    adjncy[k] = static_cast<idx_t>(A_cols[k]) + static_cast<idx_t>(nb_cstr);
+  }
+  // var-side neighbours: A_t_cols[i] already in [0, nb_cstr).
+  for (i_t k = 0; k < nnz; ++k) {
+    adjncy[nnz + k] = static_cast<idx_t>(A_t_cols[k]);
+  }
+
+  idx_t metis_options[METIS_NOPTIONS];
+  METIS_SetDefaultOptions(metis_options);
+  metis_options[METIS_OPTION_OBJTYPE] = METIS_OBJTYPE_CUT;
+
+  idx_t metis_nvtx = static_cast<idx_t>(nvtx);
+  idx_t ncon       = 1;
+  idx_t nparts     = static_cast<idx_t>(input.nb_parts);
+  idx_t objval     = 0;
+  std::vector<idx_t> metis_parts(nvtx);
+
+  auto t0 = std::chrono::high_resolution_clock::now();
+  const int status = METIS_PartGraphKway(&metis_nvtx,
+                                         &ncon,
+                                         xadj.data(),
+                                         adjncy.data(),
+                                         /*vwgt=*/nullptr,
+                                         /*vsize=*/nullptr,
+                                         /*adjwgt=*/nullptr,
+                                         &nparts,
+                                         /*tpwgts=*/nullptr,
+                                         /*ubvec=*/nullptr,
+                                         metis_options,
+                                         &objval,
+                                         metis_parts.data());
+  auto t1 = std::chrono::high_resolution_clock::now();
+  const double dt = std::chrono::duration<double>(t1 - t0).count();
+  cuopt_expects(status == METIS_OK,
+                error_type_t::RuntimeError,
+                "METIS_PartGraphKway failed (status=%d)",
+                status);
+  CUOPT_LOG_INFO(
+    "METIS partitioned bipartite graph: nvtx=%d nnz=%d nb_parts=%d edge_cut=%lld in %.3fs",
+    static_cast<int>(nvtx),
+    static_cast<int>(nnz),
+    static_cast<int>(input.nb_parts),
+    static_cast<long long>(objval),
+    dt);
+
+  std::vector<i_t> parts(static_cast<std::size_t>(nvtx));
+  for (i_t i = 0; i < nvtx; ++i) { parts[i] = static_cast<i_t>(metis_parts[i]); }
+
+  validate_partition(parts,
+                     static_cast<int>(nb_cstr),
+                     static_cast<int>(nb_vars),
+                     static_cast<int>(input.nb_parts),
+                     "metis_partitioner");
+  return parts;
+}
+
+template class metis_partitioner_t<int, double>;
+
+}  // namespace cuopt::linear_programming::detail
diff --git a/cpp/src/pdlp/distributed_pdlp/partitioner.cu b/cpp/src/pdlp/distributed_pdlp/partitioner.cu
index bdbfcacf06..4b809986ce 100644
--- a/cpp/src/pdlp/distributed_pdlp/partitioner.cu
+++ b/cpp/src/pdlp/distributed_pdlp/partitioner.cu
@@ -3,6 +3,7 @@
  * SPDX-License-Identifier: Apache-2.0
  */
 
+#include <pdlp/distributed_pdlp/metis_partitioner.hpp>
 #include <pdlp/distributed_pdlp/partitioner.hpp>
 
 #include <cuopt/error.hpp>
@@ -76,6 +77,8 @@ std::unique_ptr<partitioner_i<i_t, f_t>> make_partitioner(partitioner_kind_t kin
   switch (kind) {
     case partitioner_kind_t::Dummy:
       return std::make_unique<dummy_partitioner_t<i_t, f_t>>();
+    case partitioner_kind_t::Metis:
+      return std::make_unique<metis_partitioner_t<i_t, f_t>>();
   }
   cuopt_expects(false, error_type_t::RuntimeError, "make_partitioner: unsupported partitioner kind");
   return nullptr;
diff --git a/cpp/src/pdlp/distributed_pdlp/partitioner.hpp b/cpp/src/pdlp/distributed_pdlp/partitioner.hpp
index ee5798fd0b..82650ad805 100644
--- a/cpp/src/pdlp/distributed_pdlp/partitioner.hpp
+++ b/cpp/src/pdlp/distributed_pdlp/partitioner.hpp
@@ -36,7 +36,7 @@ struct partitioner_input_t {
   csr_host_view_t<i_t, f_t> A_t{};
 };
 
-enum class partitioner_kind_t { Dummy /*, Metis */ };
+enum class partitioner_kind_t { Dummy, Metis };
 
 template <typename i_t, typename f_t>
 class partitioner_i {
diff --git a/cpp/src/pdlp/utilities/mgpu_trace.cuh b/cpp/src/pdlp/utilities/mgpu_trace.cuh
new file mode 100644
index 0000000000..06a848b18e
--- /dev/null
+++ b/cpp/src/pdlp/utilities/mgpu_trace.cuh
@@ -0,0 +1,52 @@
+/* clang-format off */
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+/* clang-format on */
+#pragma once
+
+// Lightweight env-gated tracing for multi-GPU PDLP diagnosis.
+//
+// Enable by setting CUOPT_MGPU_TRACE=1 in the environment.
+// All prints go to stderr (line-buffered + explicit flush) so they survive
+// a CUDA hang and interleave with cuOpt's normal output.
+//
+// Usage:
+//   MGPU_TRACE("entering compute_At_y");
+//   MGPU_TRACE_FMT("shard %d nnz=%lld", r, (long long)nnz);
+//
+// The guard reads the env var once on first use (thread-safe via static
+// initialization) and the cost when disabled is a single load + branch.
+
+#include <cstdio>
+#include <cstdlib>
+
+namespace cuopt::linear_programming::detail {
+
+inline bool mgpu_trace_enabled()
+{
+  static const bool enabled = []() {
+    const char* v = std::getenv("CUOPT_MGPU_TRACE");
+    return v != nullptr && v[0] != '\0' && v[0] != '0';
+  }();
+  return enabled;
+}
+
+}  // namespace cuopt::linear_programming::detail
+
+#define MGPU_TRACE(msg)                                                                        \
+  do {                                                                                         \
+    if (::cuopt::linear_programming::detail::mgpu_trace_enabled()) {                           \
+      std::fprintf(stderr, "[mgpu %s:%d] %s\n", __func__, __LINE__, (msg));                    \
+      std::fflush(stderr);                                                                     \
+    }                                                                                          \
+  } while (0)
+
+#define MGPU_TRACE_FMT(fmt, ...)                                                               \
+  do {                                                                                         \
+    if (::cuopt::linear_programming::detail::mgpu_trace_enabled()) {                           \
+      std::fprintf(stderr, "[mgpu %s:%d] " fmt "\n", __func__, __LINE__, __VA_ARGS__);         \
+      std::fflush(stderr);                                                                     \
+    }                                                                                          \
+  } while (0)

From 7d74e740ca3369ff10e9402573c1bed73dcae13a Mon Sep 17 00:00:00 2001
From: Bulle Mostovoi <vmostovoi@nvidia.com>
Date: Sun, 31 May 2026 11:53:44 -0700
Subject: [PATCH 53/67] forgot to push a file, maybe doesnt compile lol

---
 cpp/src/pdlp/pdlp.cu | 47 +++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 44 insertions(+), 3 deletions(-)

diff --git a/cpp/src/pdlp/pdlp.cu b/cpp/src/pdlp/pdlp.cu
index 3b77a1cf47..d80adf248d 100644
--- a/cpp/src/pdlp/pdlp.cu
+++ b/cpp/src/pdlp/pdlp.cu
@@ -426,9 +426,50 @@ pdlp_solver_t<i_t, f_t>::pdlp_solver_t(problem_t<i_t, f_t>& op_problem,
       partition_input.nb_cstr  = op_problem_scaled_.n_constraints;
       partition_input.nb_vars  = op_problem_scaled_.n_variables;
       partition_input.nb_parts = distributed_pdlp_num_gpus;
-      // Dummy partitioner ignores A / A_t for now; future METIS partitioners will
-      // fill these CSR views before calling partition().
-      auto partitioner = make_partitioner<i_t, f_t>(partitioner_kind_t::Dummy);
+
+      // Topology buffers: only needed for METIS (Dummy ignores them).
+      // Read CSR offsets and col indices from the (unscaled) problem; the
+      // partitioner only needs topology, not values, and scaled/unscaled share
+      // the same nonzero pattern.
+      std::vector<i_t> h_part_A_row_offsets;
+      std::vector<i_t> h_part_A_col_indices;
+      std::vector<i_t> h_part_A_t_row_offsets;
+      std::vector<i_t> h_part_A_t_col_indices;
+
+      const partitioner_kind_t kind =  partitioner_kind_t::Metis;
+      if (kind == partitioner_kind_t::Metis) {
+        const auto stream = op_problem_scaled_.handle_ptr->get_stream();
+        const i_t n_cstr  = op_problem_scaled_.n_constraints;
+        const i_t n_vars  = op_problem_scaled_.n_variables;
+        const i_t nnz     = op_problem_scaled_.nnz;
+        h_part_A_row_offsets.resize(n_cstr + 1);
+        h_part_A_col_indices.resize(nnz);
+        h_part_A_t_row_offsets.resize(n_vars + 1);
+        h_part_A_t_col_indices.resize(nnz);
+        raft::copy(
+          h_part_A_row_offsets.data(), op_problem_scaled_.offsets.data(), n_cstr + 1, stream);
+        raft::copy(
+          h_part_A_col_indices.data(), op_problem_scaled_.variables.data(), nnz, stream);
+        raft::copy(h_part_A_t_row_offsets.data(),
+                   op_problem_scaled_.reverse_offsets.data(),
+                   n_vars + 1,
+                   stream);
+        raft::copy(h_part_A_t_col_indices.data(),
+                   op_problem_scaled_.reverse_constraints.data(),
+                   nnz,
+                   stream);
+        op_problem_scaled_.handle_ptr->sync_stream(stream);
+
+        partition_input.A.row_offsets   = &h_part_A_row_offsets;
+        partition_input.A.col_indices   = &h_part_A_col_indices;
+        partition_input.A.num_rows      = n_cstr;
+        partition_input.A.num_cols      = n_vars;
+        partition_input.A_t.row_offsets = &h_part_A_t_row_offsets;
+        partition_input.A_t.col_indices = &h_part_A_t_col_indices;
+        partition_input.A_t.num_rows    = n_vars;
+        partition_input.A_t.num_cols    = n_cstr;
+      }
+      auto partitioner = make_partitioner<i_t, f_t>(kind);
       parts            = partitioner->partition(partition_input);
     }
 

From 859a299b0e3b296957a6de64418b2807796b87db Mon Sep 17 00:00:00 2001
From: Bulle Mostovoi <vmostovoi@nvidia.com>
Date: Mon, 1 Jun 2026 10:43:00 +0200
Subject: [PATCH 54/67] fixed dummy partitioner on single GPU

---
 cpp/src/pdlp/distributed_pdlp/metis_partitioner.cu | 8 ++++++++
 cpp/src/pdlp/pdlp.cu                               | 8 +++++++-
 2 files changed, 15 insertions(+), 1 deletion(-)

diff --git a/cpp/src/pdlp/distributed_pdlp/metis_partitioner.cu b/cpp/src/pdlp/distributed_pdlp/metis_partitioner.cu
index 6ed80b0047..73e2736251 100644
--- a/cpp/src/pdlp/distributed_pdlp/metis_partitioner.cu
+++ b/cpp/src/pdlp/distributed_pdlp/metis_partitioner.cu
@@ -32,6 +32,14 @@ std::vector<i_t> metis_partitioner_t<i_t, f_t>::partition(
   cuopt_expects(input.nb_parts > 0,
                 error_type_t::ValidationError,
                 "metis_partitioner: nb_parts must be positive");
+  // METIS_PartGraphKway internally does integer arithmetic of the form
+  // `nedges / nparts` and traps with SIGFPE when nparts == 1. The single-part
+  // case is also trivial (everything in part 0) so callers should route it to
+  // the Dummy partitioner instead (see pdlp_solver_t mGPU ctor).
+  cuopt_expects(input.nb_parts >= 2,
+                error_type_t::ValidationError,
+                "metis_partitioner: nb_parts must be >= 2 (METIS_PartGraphKway requirement); "
+                "use the Dummy partitioner for the single-shard case");
   cuopt_expects(input.nb_cstr >= 0 && input.nb_vars >= 0,
                 error_type_t::ValidationError,
                 "metis_partitioner: invalid problem dimensions");
diff --git a/cpp/src/pdlp/pdlp.cu b/cpp/src/pdlp/pdlp.cu
index d80adf248d..a747706639 100644
--- a/cpp/src/pdlp/pdlp.cu
+++ b/cpp/src/pdlp/pdlp.cu
@@ -436,7 +436,13 @@ pdlp_solver_t<i_t, f_t>::pdlp_solver_t(problem_t<i_t, f_t>& op_problem,
       std::vector<i_t> h_part_A_t_row_offsets;
       std::vector<i_t> h_part_A_t_col_indices;
 
-      const partitioner_kind_t kind =  partitioner_kind_t::Metis;
+      // METIS_PartGraphKway requires nparts >= 2; calling it with nparts == 1
+      // traps inside METIS (SIGFPE on integer division by zero). The
+      // num_gpus == 1 path is the single-shard dummy run anyway -- there's
+      // nothing for METIS to do, so route directly to Dummy which just places
+      // every vertex into part 0.
+      const partitioner_kind_t kind =
+        (distributed_pdlp_num_gpus == 1) ? partitioner_kind_t::Dummy : partitioner_kind_t::Metis;
       if (kind == partitioner_kind_t::Metis) {
         const auto stream = op_problem_scaled_.handle_ptr->get_stream();
         const i_t n_cstr  = op_problem_scaled_.n_constraints;

From 7daa7400e1f1b7a421a8ac9f9fbbba3d42489c16 Mon Sep 17 00:00:00 2001
From: Bulle Mostovoi <vmostovoi@nvidia.com>
Date: Mon, 1 Jun 2026 11:34:16 +0200
Subject: [PATCH 55/67] added some plumbing, will not load full problem on gpu

---
 cpp/src/pdlp/solve.cu  | 47 ++++++++++++++++++++++++++++++++++++++++++
 cpp/src/pdlp/solve.cuh | 40 +++++++++++++++++++++++++++++++++++
 2 files changed, 87 insertions(+)

diff --git a/cpp/src/pdlp/solve.cu b/cpp/src/pdlp/solve.cu
index 70c488e3f3..8081c42ffb 100644
--- a/cpp/src/pdlp/solve.cu
+++ b/cpp/src/pdlp/solve.cu
@@ -2143,10 +2143,49 @@ optimization_problem_solution_t<i_t, f_t> solve_lp(
   bool problem_checking,
   bool use_pdlp_solver_mode)
 {
+  // In distributed PDLP we can't allocate the full problem on the master device
+  if (settings.hyper_params.use_distributed_pdlp) {
+    return solve_lp_distributed_from_mps(
+      handle_ptr, mps_data_model, settings, problem_checking, use_pdlp_solver_mode);
+  }
   auto op_problem = mps_data_model_to_optimization_problem(handle_ptr, mps_data_model);
   return solve_lp(op_problem, settings, problem_checking, use_pdlp_solver_mode);
 }
 
+template <typename i_t, typename f_t>
+optimization_problem_solution_t<i_t, f_t> solve_lp_distributed_from_mps(
+  raft::handle_t const* handle_ptr,
+  const cuopt::linear_programming::io::mps_data_model_t<i_t, f_t>& mps_data_model,
+  pdlp_solver_settings_t<i_t, f_t> const& settings,
+  bool problem_checking,
+  bool use_pdlp_solver_mode)
+{
+  cuopt_expects(handle_ptr != nullptr,
+                error_type_t::ValidationError,
+                "solve_lp_distributed_from_mps: handle_ptr must not be null");
+  cuopt_expects(settings.hyper_params.use_distributed_pdlp,
+                error_type_t::ValidationError,
+                "solve_lp_distributed_from_mps: settings.hyper_params.use_distributed_pdlp "
+                "must be true");
+
+  pdlp_solver_settings_t<i_t, f_t> settings_resolved = settings;
+  if (settings_resolved.distributed_pdlp_num_gpus == -1) {
+    settings_resolved.distributed_pdlp_num_gpus = raft::device_setter::get_device_count();
+    CUOPT_LOG_INFO(
+      "solve_lp_distributed_from_mps: distributed_pdlp_num_gpus == -1, auto-detected "
+      "%d visible CUDA device(s)",
+      settings_resolved.distributed_pdlp_num_gpus);
+  }
+  if (settings_resolved.distributed_pdlp_num_gpus <= 1)
+  {
+    std::cout << "CAREFUL: use_distributed_pdlp with distributed_pdlp_num_gpus == 1 runs the "
+                 "single-shard dummy path"
+              << std::endl;
+  }
+  auto op_problem = mps_data_model_to_optimization_problem(handle_ptr, mps_data_model);
+  return solve_lp(op_problem, settings_resolved, problem_checking, use_pdlp_solver_mode);
+}
+
 // ============================================================================
 // CPU problem overloads (convert to GPU, solve, convert solution back)
 // ============================================================================
@@ -2287,6 +2326,14 @@ std::unique_ptr<lp_solution_interface_t<i_t, f_t>> solve_lp(
   template optimization_problem_t<int, F_TYPE> mps_data_model_to_optimization_problem(           \
     raft::handle_t const* handle_ptr,                                                            \
     const cuopt::linear_programming::io::mps_data_model_t<int, F_TYPE>& data_model);             \
+                                                                                                 \
+  template optimization_problem_solution_t<int, F_TYPE> solve_lp_distributed_from_mps(           \
+    raft::handle_t const* handle_ptr,                                                            \
+    const cuopt::linear_programming::io::mps_data_model_t<int, F_TYPE>& mps_data_model,          \
+    pdlp_solver_settings_t<int, F_TYPE> const& settings,                                         \
+    bool problem_checking,                                                                       \
+    bool use_pdlp_solver_mode);                                                                  \
+                                                                                                 \
   template void set_pdlp_solver_mode(pdlp_solver_settings_t<int, F_TYPE>& settings);
 
 #if MIP_INSTANTIATE_FLOAT
diff --git a/cpp/src/pdlp/solve.cuh b/cpp/src/pdlp/solve.cuh
index 90e5e4fe95..abb657943f 100644
--- a/cpp/src/pdlp/solve.cuh
+++ b/cpp/src/pdlp/solve.cuh
@@ -32,6 +32,46 @@ cuopt::linear_programming::optimization_problem_solution_t<i_t, f_t> solve_lp_wi
   const timer_t& timer,
   bool is_batch_mode = false);
 
+/**
+ * @brief Distributed-PDLP entry point that consumes the host MPS data model
+ *        directly, without ever materializing the full problem on a single
+ *        (master) GPU.
+ *
+ * This is the entry point intended for problems whose `nnz` exceeds the memory
+ * of a single device. Today (Step 1 of the mGPU memory refactor) it is a thin
+ * routing shim: it resolves `distributed_pdlp_num_gpus == -1` against the
+ * visible-device count and delegates to the legacy
+ * `mps_data_model_to_optimization_problem(...)` + device-side `solve_lp(...)`
+ * pipeline, which still allocates the full problem on master. The shim exists
+ * so the public-facing call site is already in place; subsequent commits will
+ * replace the body with:
+ *   1. host-side METIS partitioning straight off the MPS CSR
+ *   2. per-shard host CSR slicing
+ *   3. construction of an mGPU-native pdlp_solver_t whose master only holds
+ *      scalar metadata + gather buffers (no full A / A^T / scaled copies).
+ *
+ * Until then, behaviour and memory footprint are identical to the legacy path.
+ *
+ * @param handle_ptr  Master raft handle (its stream owns the gather buffers
+ *                    and any master-side aggregator allocations).
+ * @param mps_data_model  Host-resident MPS data (CPU vectors only).
+ * @param settings    User-supplied PDLP solver settings; the
+ *                    `distributed_pdlp_num_gpus == -1` sentinel is resolved
+ *                    here against the visible-device count.
+ * @param problem_checking      Forwarded to the eventual solver.
+ * @param use_pdlp_solver_mode  Forwarded to the eventual solver.
+ *
+ * @pre `settings.hyper_params.use_distributed_pdlp == true`.
+ */
+template <typename i_t, typename f_t>
+cuopt::linear_programming::optimization_problem_solution_t<i_t, f_t>
+solve_lp_distributed_from_mps(
+  raft::handle_t const* handle_ptr,
+  const cuopt::linear_programming::io::mps_data_model_t<i_t, f_t>& mps_data_model,
+  pdlp_solver_settings_t<i_t, f_t> const& settings,
+  bool problem_checking,
+  bool use_pdlp_solver_mode);
+
 /**
  * @brief Entry point for batch PDLP. Solves multiple LPs sharing the same constraint
  *        matrix structure in a single batched GPU run.

From 8a39e8c9e1b62cff57b09e80c013ae6ee53e30d4 Mon Sep 17 00:00:00 2001
From: Bulle Mostovoi <vmostovoi@nvidia.com>
Date: Mon, 1 Jun 2026 13:53:49 +0200
Subject: [PATCH 56/67] added guard rejecting the presolver in mGPU (not yet
 supported)

---
 cpp/src/pdlp/solve.cu | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/cpp/src/pdlp/solve.cu b/cpp/src/pdlp/solve.cu
index 8081c42ffb..b32bad87f8 100644
--- a/cpp/src/pdlp/solve.cu
+++ b/cpp/src/pdlp/solve.cu
@@ -2167,6 +2167,10 @@ optimization_problem_solution_t<i_t, f_t> solve_lp_distributed_from_mps(
                 error_type_t::ValidationError,
                 "solve_lp_distributed_from_mps: settings.hyper_params.use_distributed_pdlp "
                 "must be true");
+  cuopt_expects(settings.presolver == cuopt::linear_programming::presolver_t::None,
+                error_type_t::ValidationError,
+                "solve_lp_distributed_from_mps: presolve is not yet supported with "
+                "use_distributed_pdlp; please set settings.presolver = presolver_t::None");
 
   pdlp_solver_settings_t<i_t, f_t> settings_resolved = settings;
   if (settings_resolved.distributed_pdlp_num_gpus == -1) {

From 5a3b9ce521ac23d10a2356bcb2bb5413c66e98e0 Mon Sep 17 00:00:00 2001
From: Bulle Mostovoi <vmostovoi@nvidia.com>
Date: Tue, 2 Jun 2026 10:41:06 +0200
Subject: [PATCH 57/67] plumbed pdlp_distributed_solver with mps_data_model
 so data no longer transits through the master device

---
 cpp/src/pdlp/pdlp.cu  | 327 +++++++++++++++++-------------------------
 cpp/src/pdlp/pdlp.cuh |   7 +-
 cpp/src/pdlp/solve.cu | 104 ++++++++++----
 3 files changed, 211 insertions(+), 227 deletions(-)

diff --git a/cpp/src/pdlp/pdlp.cu b/cpp/src/pdlp/pdlp.cu
index a747706639..21291b853d 100644
--- a/cpp/src/pdlp/pdlp.cu
+++ b/cpp/src/pdlp/pdlp.cu
@@ -17,6 +17,7 @@
 #include <pdlp/swap_and_resize_helper.cuh>
 #include <pdlp/utils.cuh>
 
+#include <dual_simplex/sparse_matrix.hpp>
 #include <mip_heuristics/mip_constants.hpp>
 #include "cuopt/linear_programming/pdlp/solver_solution.hpp"
 #include "distributed_pdlp/multi_gpu_engine.hpp"
@@ -375,15 +376,28 @@ pdlp_solver_t<i_t, f_t>::pdlp_solver_t(problem_t<i_t, f_t>& op_problem,
   }
 }
 
+// ============================================================================
+// Distributed multi-GPU ctor.
+// Requires placeholder_problem to be a shape-0 problem; the actual problem is
+// read directly from the mps data model. Internal (master-side) attributes are
+// built from placeholder_problem, while the engine is built from the mps data.
+// ============================================================================
 template <typename i_t, typename f_t>
-pdlp_solver_t<i_t, f_t>::pdlp_solver_t(problem_t<i_t, f_t>& op_problem,
-                                       pdlp_solver_settings_t<i_t, f_t> const& settings,
-                                       int distributed_pdlp_num_gpus)
-  // 1. Delegate to single-GPU ctor to bring up all the per-master state
-  //    (problem_ptr, op_problem_scaled_, pdhg_solver_, strategies, etc.).
-  : pdlp_solver_t(op_problem, settings, /*is_legacy_batch_mode=*/false)
+pdlp_solver_t<i_t, f_t>::pdlp_solver_t(
+  problem_t<i_t, f_t>& placeholder_problem,
+  cuopt::linear_programming::io::mps_data_model_t<i_t, f_t> const& mps,
+  pdlp_solver_settings_t<i_t, f_t> const& settings)
+  // Makes all inner fields of the master zero-sized
+  : pdlp_solver_t(placeholder_problem, settings, /*is_legacy_batch_mode=*/false)
 {
-  CUOPT_LOG_INFO("Solving with distributed PDLP on %d GPU",
+  cuopt_expects(placeholder_problem.n_variables == 0 &&
+                  placeholder_problem.n_constraints == 0 &&
+                  placeholder_problem.nnz == 0,
+                error_type_t::ValidationError,
+                "Distributed mGPU pdlp_solver_t ctor requires a shape-0 "
+                "placeholder problem (n_variables == n_constraints == nnz == 0)");
+  const int distributed_pdlp_num_gpus = settings.distributed_pdlp_num_gpus;
+  CUOPT_LOG_INFO("Solving with distributed PDLP on %d GPU (mps direct path)",
                  distributed_pdlp_num_gpus);
   if (distributed_pdlp_num_gpus == 1) {
     std::cout << "CAREFUL !!: distributed_pdlp_num_gpus == 1, running single-shard dummy path, "
@@ -391,87 +405,125 @@ pdlp_solver_t<i_t, f_t>::pdlp_solver_t(problem_t<i_t, f_t>& op_problem,
                  "parameter --distributed-pdlp-num-gpus"
               << std::endl;
   }
-  cuopt_expects(distributed_pdlp_num_gpus == settings.distributed_pdlp_num_gpus,
-                error_type_t::ValidationError,
-                "This constructor's distributed_pdlp_num_gpus argument must match "
-                "settings.distributed_pdlp_num_gpus");
 
-  // Distributed PDLP is currently double-only
   if constexpr (!std::is_same_v<f_t, double>) {
     cuopt_expects(false,
                   error_type_t::ValidationError,
                   "Distributed PDLP currently requires double precision");
     return;
-  } else {
-    // 2. Load or compute partition
+  }
+    // ----- 1. Read problem shape and bulk data directly from mps (host) -----
+    const i_t n_vars = static_cast<i_t>(mps.get_objective_coefficients().size());
+    const i_t n_cstr = static_cast<i_t>(mps.get_constraint_lower_bounds().size());
+    const i_t nnz    = static_cast<i_t>(mps.get_constraint_matrix_values().size());
+    cuopt_expects(n_vars > 0,
+                  error_type_t::ValidationError,
+                  "Distributed PDLP from mps requires a non-empty objective");
+    cuopt_expects(n_cstr > 0,
+                  error_type_t::ValidationError,
+                  "Distributed PDLP from mps requires at least one constraint");
+    cuopt_expects(static_cast<i_t>(mps.get_constraint_matrix_offsets().size()) == n_cstr + 1,
+                  error_type_t::ValidationError,
+                  "mps constraint_matrix_offsets size must equal n_constraints + 1");
+    cuopt_expects(
+      static_cast<i_t>(mps.get_constraint_matrix_indices().size()) == nnz,
+      error_type_t::ValidationError,
+      "mps constraint_matrix_indices size must equal nnz (constraint_matrix_values size)");
+    cuopt_expects(static_cast<i_t>(mps.get_constraint_upper_bounds().size()) == n_cstr,
+                  error_type_t::ValidationError,
+                  "mps constraint_upper_bounds size must equal n_constraints");
+    cuopt_expects(static_cast<i_t>(mps.get_variable_lower_bounds().size()) == n_vars,
+                  error_type_t::ValidationError,
+                  "mps variable_lower_bounds size must equal n_variables");
+    cuopt_expects(static_cast<i_t>(mps.get_variable_upper_bounds().size()) == n_vars,
+                  error_type_t::ValidationError,
+                  "mps variable_upper_bounds size must equal n_variables");
+
+    const bool maximize           = mps.get_sense();
+    f_t objective_offset          = mps.get_objective_offset();
+    f_t objective_scaling_factor  = mps.get_objective_scaling_factor();
+
+    // Objective: copy (mutable so we can negate for maximize, matching
+    // problem_helpers.cuh::convert_to_maximization_problem).
+    std::vector<f_t> h_obj = mps.get_objective_coefficients();
+    if (maximize) {
+      for (auto& v : h_obj) v = -v;
+      objective_offset         = -objective_offset;
+      objective_scaling_factor = -objective_scaling_factor;
+    }
+
+    // Bounds (copy from mps; engine ctor takes by const ref to std::vector).
+    std::vector<f_t> h_var_lower  = mps.get_variable_lower_bounds();
+    std::vector<f_t> h_var_upper  = mps.get_variable_upper_bounds();
+    std::vector<f_t> h_cstr_lower = mps.get_constraint_lower_bounds();
+    std::vector<f_t> h_cstr_upper = mps.get_constraint_upper_bounds();
+
+    // A (CSR) — mutable copies for the engine + partitioner consumers below.
+    std::vector<i_t> h_A_row_offsets = mps.get_constraint_matrix_offsets();
+    std::vector<i_t> h_A_col_indices = mps.get_constraint_matrix_indices();
+    std::vector<f_t> h_A_values      = mps.get_constraint_matrix_values();
+
+    // ----- 2. Transpose A -> A^T on the host (one-shot CSR transpose) -----
+    // CSC(A) and CSR(A^T) share the same memory layout, so the CSC produced
+    // by dual_simplex::csr_matrix_t::to_compressed_col IS the CSR of A^T.
+    // O(nnz + n_vars) counting sort, same as problem_t::compute_transpose.
+    namespace ds = cuopt::linear_programming::dual_simplex;
+    ds::csr_matrix_t<i_t, f_t> A_csr(n_cstr, n_vars, nnz);
+    A_csr.row_start = h_A_row_offsets;
+    A_csr.j         = h_A_col_indices;
+    A_csr.x         = h_A_values;
+    ds::csc_matrix_t<i_t, f_t> AT_as_csc(n_vars, n_cstr, nnz);
+    A_csr.to_compressed_col(AT_as_csc);
+    std::vector<i_t> h_A_t_row_offsets = std::move(AT_as_csc.col_start);
+    std::vector<i_t> h_A_t_col_indices = std::move(AT_as_csc.i);
+    std::vector<f_t> h_A_t_values      = std::move(AT_as_csc.x);
+
+    // ----- 3. Identity scaling for V1 -----
+    // Real multi-GPU scaling is a TODO; ship the unscaled problem to shards as
+    // both "unscaled" and "scaled" so the engine and per-shard pdlp_solver_t
+    // can run end-to-end. Scaling factor vectors are 1.0 everywhere so the
+    // shard-side unscale at the end is a no-op.
+    std::vector<f_t> h_A_values_scaled              = h_A_values;
+    std::vector<f_t> h_A_t_values_scaled            = h_A_t_values;
+    std::vector<f_t> h_obj_scaled                   = h_obj;
+    std::vector<f_t> h_var_lower_scaled             = h_var_lower;
+    std::vector<f_t> h_var_upper_scaled             = h_var_upper;
+    std::vector<f_t> h_cstr_lower_scaled            = h_cstr_lower;
+    std::vector<f_t> h_cstr_upper_scaled            = h_cstr_upper;
+    std::vector<f_t> h_cummulative_cstr_scaling(n_cstr, f_t(1.0));
+    std::vector<f_t> h_cummulative_var_scaling(n_vars, f_t(1.0));
+    const f_t h_bound_rescaling                     = f_t(1.0);
+    const f_t h_objective_rescaling                 = f_t(1.0);
+
+    // ----- 4. Partition -----
     std::vector<i_t> parts;
     if (!settings.multi_gpu_partition_file.empty()) {
       parts = partition_loader_t<i_t, f_t>::parse_distributed_pdlp_partition_file(
         settings.multi_gpu_partition_file);
-      validate_partition(parts,
-                         op_problem_scaled_.n_constraints,
-                         op_problem_scaled_.n_variables,
-                         distributed_pdlp_num_gpus,
-                         "partition file");
+      validate_partition(parts, n_cstr, n_vars, distributed_pdlp_num_gpus, "partition file");
     } else {
       if (distributed_pdlp_num_gpus == 1) {
-        // Single-part dummy run: useful for exercising the mGPU code paths on a
-        // single physical GPU without a real partition file.
         std::cout << "CAREFUL: distributed_pdlp_num_gpus == 1, running dummy version (single "
                      "part covering "
-                  << op_problem_scaled_.n_constraints << " cstrs + "
-                  << op_problem_scaled_.n_variables << " vars)" << std::endl;
+                  << n_cstr << " cstrs + " << n_vars << " vars)" << std::endl;
       }
       partitioner_input_t<i_t, f_t> partition_input;
-      partition_input.nb_cstr  = op_problem_scaled_.n_constraints;
-      partition_input.nb_vars  = op_problem_scaled_.n_variables;
+      partition_input.nb_cstr  = n_cstr;
+      partition_input.nb_vars  = n_vars;
       partition_input.nb_parts = distributed_pdlp_num_gpus;
 
-      // Topology buffers: only needed for METIS (Dummy ignores them).
-      // Read CSR offsets and col indices from the (unscaled) problem; the
-      // partitioner only needs topology, not values, and scaled/unscaled share
-      // the same nonzero pattern.
-      std::vector<i_t> h_part_A_row_offsets;
-      std::vector<i_t> h_part_A_col_indices;
-      std::vector<i_t> h_part_A_t_row_offsets;
-      std::vector<i_t> h_part_A_t_col_indices;
-
-      // METIS_PartGraphKway requires nparts >= 2; calling it with nparts == 1
-      // traps inside METIS (SIGFPE on integer division by zero). The
-      // num_gpus == 1 path is the single-shard dummy run anyway -- there's
-      // nothing for METIS to do, so route directly to Dummy which just places
-      // every vertex into part 0.
+      // METIS_PartGraphKway requires nparts >= 2; route num_gpus == 1 to Dummy.
       const partitioner_kind_t kind =
         (distributed_pdlp_num_gpus == 1) ? partitioner_kind_t::Dummy : partitioner_kind_t::Metis;
       if (kind == partitioner_kind_t::Metis) {
-        const auto stream = op_problem_scaled_.handle_ptr->get_stream();
-        const i_t n_cstr  = op_problem_scaled_.n_constraints;
-        const i_t n_vars  = op_problem_scaled_.n_variables;
-        const i_t nnz     = op_problem_scaled_.nnz;
-        h_part_A_row_offsets.resize(n_cstr + 1);
-        h_part_A_col_indices.resize(nnz);
-        h_part_A_t_row_offsets.resize(n_vars + 1);
-        h_part_A_t_col_indices.resize(nnz);
-        raft::copy(
-          h_part_A_row_offsets.data(), op_problem_scaled_.offsets.data(), n_cstr + 1, stream);
-        raft::copy(
-          h_part_A_col_indices.data(), op_problem_scaled_.variables.data(), nnz, stream);
-        raft::copy(h_part_A_t_row_offsets.data(),
-                   op_problem_scaled_.reverse_offsets.data(),
-                   n_vars + 1,
-                   stream);
-        raft::copy(h_part_A_t_col_indices.data(),
-                   op_problem_scaled_.reverse_constraints.data(),
-                   nnz,
-                   stream);
-        op_problem_scaled_.handle_ptr->sync_stream(stream);
-
-        partition_input.A.row_offsets   = &h_part_A_row_offsets;
-        partition_input.A.col_indices   = &h_part_A_col_indices;
+        // partitioner_input_t holds non-const std::vector<i_t>* pointers; we
+        // already have the data in our local mutable buffers above.
+        partition_input.A.row_offsets   = &h_A_row_offsets;
+        partition_input.A.col_indices   = &h_A_col_indices;
         partition_input.A.num_rows      = n_cstr;
         partition_input.A.num_cols      = n_vars;
-        partition_input.A_t.row_offsets = &h_part_A_t_row_offsets;
-        partition_input.A_t.col_indices = &h_part_A_t_col_indices;
+        partition_input.A_t.row_offsets = &h_A_t_row_offsets;
+        partition_input.A_t.col_indices = &h_A_t_col_indices;
         partition_input.A_t.num_rows    = n_vars;
         partition_input.A_t.num_cols    = n_cstr;
       }
@@ -479,109 +531,7 @@ pdlp_solver_t<i_t, f_t>::pdlp_solver_t(problem_t<i_t, f_t>& op_problem,
       parts            = partitioner->partition(partition_input);
     }
 
-    // always compute initial step size before scaling and primal_weight after scaling to do like
-    // cuPDLPx
-    assert(settings_.hyper_params.compute_initial_primal_weight_before_scaling &&
-           "compute_initial_primal_weight_before_scaling must be true in distributed mode");
-    assert(!settings_.hyper_params.compute_initial_step_size_before_scaling &&
-           "compute_initial_step_size_before_scaling must be false in distributed mode");
-
-    compute_initial_primal_weight();
-
-    // scale globally before dispatching to shards
-    initial_scaling_strategy_.scale_problem();
-
-    compute_initial_step_size();
-    step_size_strategy_.get_primal_and_dual_stepsizes(primal_step_size_, dual_step_size_);
-
-    const f_t initial_step_size_global     = get_step_size_h(0);
-    const f_t initial_primal_weight_global = get_primal_weight_h(0);
-
-    // 4. Copy both scaled and unscaled pb
-    auto const stream = op_problem_scaled_.handle_ptr->get_stream();
-    i_t const n_cstr  = op_problem_scaled_.n_constraints;
-    i_t const n_vars  = op_problem_scaled_.n_variables;
-    i_t const nnz     = op_problem_scaled_.nnz;
-
-    // Shared topology (taken from the scaled problem, but identical on both).
-    std::vector<i_t> h_A_row_offsets(n_cstr + 1);
-    std::vector<i_t> h_A_col_indices(nnz);
-    std::vector<i_t> h_A_t_row_offsets(n_vars + 1);
-    std::vector<i_t> h_A_t_col_indices(nnz);
-    raft::copy(h_A_row_offsets.data(), op_problem_scaled_.offsets.data(), n_cstr + 1, stream);
-    raft::copy(h_A_col_indices.data(), op_problem_scaled_.variables.data(), nnz, stream);
-    raft::copy(
-      h_A_t_row_offsets.data(), op_problem_scaled_.reverse_offsets.data(), n_vars + 1, stream);
-    raft::copy(
-      h_A_t_col_indices.data(), op_problem_scaled_.reverse_constraints.data(), nnz, stream);
-
-    // Paired value arrays for A and A_T.
-    std::vector<f_t> h_A_values(nnz);
-    std::vector<f_t> h_A_values_scaled(nnz);
-    std::vector<f_t> h_A_t_values(nnz);
-    std::vector<f_t> h_A_t_values_scaled(nnz);
-    raft::copy(h_A_values.data(), problem_ptr->coefficients.data(), nnz, stream);
-    raft::copy(h_A_t_values.data(), problem_ptr->reverse_coefficients.data(), nnz, stream);
-    raft::copy(h_A_values_scaled.data(), op_problem_scaled_.coefficients.data(), nnz, stream);
-    raft::copy(
-      h_A_t_values_scaled.data(), op_problem_scaled_.reverse_coefficients.data(), nnz, stream);
-
-    using f_t2 = typename type_2<f_t>::type;
-
-    std::vector<f_t> h_obj(n_vars);
-    std::vector<f_t> h_obj_scaled(n_vars);
-    std::vector<f_t2> h_var_bounds_packed(n_vars);
-    std::vector<f_t2> h_var_bounds_scaled_packed(n_vars);
-    std::vector<f_t> h_cstr_lower(n_cstr);
-    std::vector<f_t> h_cstr_upper(n_cstr);
-    std::vector<f_t> h_cstr_lower_scaled(n_cstr);
-    std::vector<f_t> h_cstr_upper_scaled(n_cstr);
-
-    raft::copy(h_obj.data(), problem_ptr->objective_coefficients.data(), n_vars, stream);
-    raft::copy(
-      h_obj_scaled.data(), op_problem_scaled_.objective_coefficients.data(), n_vars, stream);
-    raft::copy(h_var_bounds_packed.data(), problem_ptr->variable_bounds.data(), n_vars, stream);
-    raft::copy(
-      h_var_bounds_scaled_packed.data(), op_problem_scaled_.variable_bounds.data(), n_vars, stream);
-    raft::copy(h_cstr_lower.data(), problem_ptr->constraint_lower_bounds.data(), n_cstr, stream);
-    raft::copy(h_cstr_upper.data(), problem_ptr->constraint_upper_bounds.data(), n_cstr, stream);
-    raft::copy(h_cstr_lower_scaled.data(),
-               op_problem_scaled_.constraint_lower_bounds.data(),
-               n_cstr,
-               stream);
-    raft::copy(h_cstr_upper_scaled.data(),
-               op_problem_scaled_.constraint_upper_bounds.data(),
-               n_cstr,
-               stream);
-
-    // 5. Get full scaling factors on host
-    std::vector<f_t> h_cummulative_cstr_scaling(n_cstr);
-    std::vector<f_t> h_cummulative_var_scaling(n_vars);
-    raft::copy(h_cummulative_cstr_scaling.data(),
-               initial_scaling_strategy_.get_constraint_matrix_scaling_vector().data(),
-               n_cstr,
-               stream);
-    raft::copy(h_cummulative_var_scaling.data(),
-               initial_scaling_strategy_.get_variable_scaling_vector().data(),
-               n_vars,
-               stream);
-    const f_t h_bound_rescaling     = initial_scaling_strategy_.get_h_bound_rescaling();
-    const f_t h_objective_rescaling = initial_scaling_strategy_.get_h_objective_rescaling();
-
-    op_problem_scaled_.handle_ptr->sync_stream(stream);
-
-    // Unpack interleaved {lower, upper} into separate vectors for both
-    // versions, so the shard ctor's slicing loop is uniform.
-    std::vector<f_t> h_var_lower(n_vars), h_var_upper(n_vars);
-    std::vector<f_t> h_var_lower_scaled(n_vars), h_var_upper_scaled(n_vars);
-    for (i_t i = 0; i < n_vars; ++i) {
-      h_var_lower[i]        = h_var_bounds_packed[i].x;
-      h_var_upper[i]        = h_var_bounds_packed[i].y;
-      h_var_lower_scaled[i] = h_var_bounds_scaled_packed[i].x;
-      h_var_upper_scaled[i] = h_var_bounds_scaled_packed[i].y;
-    }
-
-    // 6. Build per-rank data and meta-data.
+    // ----- 5. Build per-rank data -----
     std::vector<rank_data_t<i_t, f_t>> sub_pdlp_rank_data =
       partition_loader_t<i_t, f_t>::create_rank_data_from_parts(parts,
                                                                 h_A_row_offsets,
@@ -597,7 +547,7 @@ pdlp_solver_t<i_t, f_t>::pdlp_solver_t(problem_t<i_t, f_t>& op_problem,
                                                                 n_vars,
                                                                 nnz);
 
-    // 7. Build the per-shard PDLP settings:
+    // ----- 6. Per-shard settings -----
     pdlp_solver_settings_t<i_t, f_t> sub_pdlp_settings                    = settings;
     sub_pdlp_settings.num_gpus                                            = 1;
     sub_pdlp_settings.distributed_pdlp_num_gpus                           = 1;
@@ -606,7 +556,7 @@ pdlp_solver_t<i_t, f_t>::pdlp_solver_t(problem_t<i_t, f_t>& op_problem,
     sub_pdlp_settings.hyper_params.default_l_inf_ruiz_iterations          = 0;
     sub_pdlp_settings.hyper_params.default_alpha_pock_chambolle_rescaling = 0.0;
 
-    // 8. Construct the engine, creates NCCL comms and shards
+    // ----- 7. Construct the engine: NCCL comms + per-shard pdlp_solver_t -----
     multi_gpu_engine.emplace(std::move(sub_pdlp_rank_data),
                              h_obj,
                              h_var_lower,
@@ -622,13 +572,12 @@ pdlp_solver_t<i_t, f_t>::pdlp_solver_t(problem_t<i_t, f_t>& op_problem,
                              h_cummulative_var_scaling,
                              h_bound_rescaling,
                              h_objective_rescaling,
-                             op_problem_scaled_.maximize,
-                             op_problem_scaled_.objective_offset,
-                             op_problem_scaled_.presolve_data.objective_scaling_factor,
+                             maximize,
+                             objective_offset,
+                             objective_scaling_factor,
                              sub_pdlp_settings);
 
-    // Copy to host and then to shards.
-    // More robust than cudaDeviceEnablePeerAccess and cost-free-ish.
+    // ----- 8. Seed shard step-size / primal-weight scalars from the master -----
     f_t h_step_size{}, h_primal_weight{}, h_best_primal_weight{};
     f_t h_primal_step_size{}, h_dual_step_size{};
     raft::copy(&h_step_size, step_size_.data(), 1, stream_view_);
@@ -648,27 +597,17 @@ pdlp_solver_t<i_t, f_t>::pdlp_solver_t(problem_t<i_t, f_t>& op_problem,
       raft::copy(sub.dual_step_size_.data(), &h_dual_step_size, 1, shard->stream);
     }
 
-    // Wire the engine into the master pdhg_solver_. Shards' pdhg_solver_ keep
-    // mgpu_engine_ == nullptr so they run plain single-GPU SpMV on local A.
+    // Wire the engine into master's pdhg_solver_; shards keep mgpu_engine_ == nullptr.
     pdhg_solver_.set_multi_gpu_engine(&*multi_gpu_engine);
 
-    // Project initial primal solution
-    if (settings_.hyper_params.project_initial_primal) {
-      // Use refine_initial_primal_projection ???
-      using f_t2 = typename type_2<f_t>::type;
-      for (auto& shard : multi_gpu_engine->shards) {
-        raft::device_setter guard(shard->device_id);
-        auto& sub = *shard->sub_pdlp;
-        cub::DeviceTransform::Transform(
-          cuda::std::make_tuple(sub.pdhg_solver_.get_primal_solution().data(),
-                                sub.get_op_problem_scaled().variable_bounds.data()),
-          sub.pdhg_solver_.get_primal_solution().data(),
-          sub.pdhg_solver_.get_primal_solution().size(),
-          clamp<f_t, f_t2>(),
-          shard->stream.view());
-      }
-    }
-  }  // end if constexpr (std::is_same_v<f_t, double>)
+    // ----- 9. Resize master gather destinations to the full problem size -----
+    pdhg_solver_.get_potential_next_primal_solution().resize(n_vars, stream_view_);
+    pdhg_solver_.get_potential_next_dual_solution().resize(n_cstr, stream_view_);
+    current_termination_strategy_.get_convergence_information().get_reduced_cost().resize(
+      n_vars, stream_view_);
+    primal_size_h_ = n_vars;
+    dual_size_h_   = n_cstr;
+    handle_ptr_->sync_stream(stream_view_);
 }
 
 template <typename i_t, typename f_t>
diff --git a/cpp/src/pdlp/pdlp.cuh b/cpp/src/pdlp/pdlp.cuh
index 14651eab3f..3544de89fa 100644
--- a/cpp/src/pdlp/pdlp.cuh
+++ b/cpp/src/pdlp/pdlp.cuh
@@ -8,6 +8,7 @@
 #pragma once
 
 #include <branch_and_bound/shared_strong_branching_context.hpp>
+#include <cuopt/linear_programming/io/mps_data_model.hpp>
 #include <cuopt/linear_programming/pdlp/solver_settings.hpp>
 #include <cuopt/linear_programming/pdlp/solver_solution.hpp>
 
@@ -64,9 +65,9 @@ class pdlp_solver_t {
                 bool is_batch_mode = false);
 
   // Distributed Solver Constructor
-  pdlp_solver_t(problem_t<i_t, f_t>& op_problem,
-                pdlp_solver_settings_t<i_t, f_t> const& settings,
-                int distributed_pdlp_num_gpus);
+  pdlp_solver_t(problem_t<i_t, f_t>& placeholder_problem,
+                cuopt::linear_programming::io::mps_data_model_t<i_t, f_t> const& mps,
+                pdlp_solver_settings_t<i_t, f_t> const& settings);
 
   optimization_problem_solution_t<i_t, f_t> run_solver(const timer_t& timer);
 
diff --git a/cpp/src/pdlp/solve.cu b/cpp/src/pdlp/solve.cu
index b32bad87f8..ef273faf13 100644
--- a/cpp/src/pdlp/solve.cu
+++ b/cpp/src/pdlp/solve.cu
@@ -773,32 +773,15 @@ static optimization_problem_solution_t<i_t, f_t> run_pdlp_solver(
     }
   }
 #endif
-  if (settings.hyper_params.use_distributed_pdlp) {
-    // Resolve the -1 "auto-detect" sentinel to the actual visible-device count on
-    // the master process
-    pdlp_solver_settings_t<i_t, f_t> settings_resolved = settings;
-    if (settings_resolved.distributed_pdlp_num_gpus == -1) {
-      settings_resolved.distributed_pdlp_num_gpus = raft::device_setter::get_device_count();
-      CUOPT_LOG_INFO("distributed_pdlp_num_gpus == -1: auto-detected %d visible CUDA device",
-                     settings_resolved.distributed_pdlp_num_gpus);
-    }
-    cuopt_expects(settings_resolved.distributed_pdlp_num_gpus >= 1,
-                  error_type_t::ValidationError,
-                  "distributed_pdlp_num_gpus must be >= 1 or -1 (auto-detect)");
-    if (settings_resolved.distributed_pdlp_num_gpus == 1) {
-      std::cout
-        << "CAREFUL: use_distributed_pdlp with distributed_pdlp_num_gpus == 1 runs the "
-           "single-shard dummy path"
-        << std::endl;
-    }
-    cuopt_expects(!is_batch_mode,
-                  error_type_t::ValidationError,
-                  "Distributed PDLP does not support batch mode");
-    // Multi-GPU ctor; dispatched by 3rd-arg TYPE (int, not bool batch).
-    detail::pdlp_solver_t<i_t, f_t> solver(
-      problem, settings_resolved, settings_resolved.distributed_pdlp_num_gpus);
-    return solver.run_solver(timer);
-  }
+  // Distributed PDLP cannot enter through this path: by the time we have a
+  // problem_t, the full problem already lives on the master GPU, which defeats
+  // the purpose of distributed mode. Callers must route to
+  // solve_lp_distributed_from_mps via solve_lp(mps_data_model, ...).
+  cuopt_expects(!settings.hyper_params.use_distributed_pdlp,
+                error_type_t::ValidationError,
+                "Distributed PDLP must be entered via solve_lp(mps_data_model, ...) "
+                "so the master GPU never materializes the full problem. Call sites "
+                "with a problem_t cannot dispatch to distributed mode.");
   detail::pdlp_solver_t<i_t, f_t> solver(problem, settings, is_batch_mode);
   if (settings.inside_mip) { solver.set_inside_mip(true); }
   return solver.run_solver(timer);
@@ -2180,14 +2163,75 @@ optimization_problem_solution_t<i_t, f_t> solve_lp_distributed_from_mps(
       "%d visible CUDA device(s)",
       settings_resolved.distributed_pdlp_num_gpus);
   }
-  if (settings_resolved.distributed_pdlp_num_gpus <= 1)
-  {
+  if (settings_resolved.distributed_pdlp_num_gpus <= 1) {
     std::cout << "CAREFUL: use_distributed_pdlp with distributed_pdlp_num_gpus == 1 runs the "
                  "single-shard dummy path"
               << std::endl;
   }
-  auto op_problem = mps_data_model_to_optimization_problem(handle_ptr, mps_data_model);
-  return solve_lp(op_problem, settings_resolved, problem_checking, use_pdlp_solver_mode);
+  // PDLP precision validations (mirror the checks in run_pdlp; distributed
+  // path only supports the default-precision, non-batch double config).
+  cuopt_expects(settings_resolved.pdlp_precision == pdlp_precision_t::DefaultPrecision,
+                error_type_t::ValidationError,
+                "Distributed PDLP only supports DefaultPrecision (double).");
+  cuopt_expects(!settings_resolved.inside_mip,
+                error_type_t::ValidationError,
+                "Distributed PDLP is not yet supported from inside MIP.");
+
+  init_logger_t log(settings_resolved.log_file, settings_resolved.log_to_console);
+  print_version_info();
+  init_handler(handle_ptr);
+
+  const i_t n_vars = static_cast<i_t>(mps_data_model.get_objective_coefficients().size());
+  const i_t n_cstr = static_cast<i_t>(mps_data_model.get_constraint_lower_bounds().size());
+  const i_t nnz    = static_cast<i_t>(mps_data_model.get_constraint_matrix_values().size());
+  CUOPT_LOG_INFO("Solving a problem with %d constraints, %d variables (%d integers), and %d "
+                 "nonzeros (distributed mps-direct path)",
+                 n_cstr,
+                 n_vars,
+                 0,
+                 nnz);
+
+  auto lp_timer = cuopt::timer_t(settings_resolved.time_limit);
+
+  // Shape-0 placeholder: needed to build an empty pdlp_solver
+  cuopt::linear_programming::optimization_problem_t<i_t, f_t> placeholder_op(handle_ptr);
+  {
+    std::vector<i_t> empty_offsets = {0};
+    placeholder_op.set_csr_constraint_matrix(
+      nullptr, 0, nullptr, 0, empty_offsets.data(), static_cast<i_t>(empty_offsets.size()));
+  }
+  detail::problem_t<i_t, f_t> placeholder_problem(placeholder_op);
+
+  detail::pdlp_solver_t<i_t, f_t> solver(
+    placeholder_problem, mps_data_model, settings_resolved);
+
+  auto sol = solver.run_solver(lp_timer);
+
+  // Maximization post-processing (matches run_pdlp at solve.cu:835-839):
+  // PDLP internally solves the negated objective, so flip dual / reduced
+  // cost signs on the gathered solution before returning.
+  if (mps_data_model.get_sense()) {
+    adjust_dual_solution_and_reduced_cost(
+      sol.get_dual_solution(), sol.get_reduced_cost(), handle_ptr->get_stream());
+    handle_ptr->sync_stream();
+  }
+
+  sol.set_solve_time(lp_timer.elapsed_time());
+  CUOPT_LOG_INFO("PDLP finished");
+  if (sol.get_termination_status() != pdlp_termination_status_t::ConcurrentLimit) {
+    CUOPT_LOG_INFO("Status: %s   Objective: %.8e  Iterations: %d  Time: %.3fs",
+                   sol.get_termination_status_string().c_str(),
+                   sol.get_objective_value(),
+                   sol.get_additional_termination_information().number_of_steps_taken,
+                   sol.get_solve_time());
+  }
+
+  if (settings_resolved.sol_file != "") {
+    CUOPT_LOG_INFO("Writing solution to file %s", settings_resolved.sol_file.c_str());
+    sol.write_to_sol_file(settings_resolved.sol_file, handle_ptr->get_stream());
+  }
+
+  return sol;
 }
 
 // ============================================================================

From e4739b5a16c94d719187e28cd4ea3e32740c8f0b Mon Sep 17 00:00:00 2001
From: Bulle Mostovoi <vmostovoi@nvidia.com>
Date: Tue, 2 Jun 2026 15:21:20 +0200
Subject: [PATCH 58/67] removed usage of problem_t for distributed PDLP

---
 cpp/cuopt_cli.cpp                             |   7 +-
 .../distributed_pdlp/multi_gpu_engine.hpp     | 499 ++++++++++++++++++
 .../initial_scaling.cu                        | 120 +++--
 .../initial_scaling.cuh                       |  15 +-
 cpp/src/pdlp/pdlp.cu                          | 112 +++-
 cpp/src/pdlp/saddle_point.cu                  |   7 +-
 .../convergence_information.cu                |  71 +++
 .../convergence_information.hpp               |   5 +
 8 files changed, 790 insertions(+), 46 deletions(-)

diff --git a/cpp/cuopt_cli.cpp b/cpp/cuopt_cli.cpp
index 7c0a9111d9..0ea79bd4ec 100644
--- a/cpp/cuopt_cli.cpp
+++ b/cpp/cuopt_cli.cpp
@@ -176,7 +176,12 @@ int run_single_file(const std::string& file_path,
       auto solution = cuopt::linear_programming::solve_mip(problem_interface.get(), mip_settings);
     } else {
       auto& lp_settings = settings.get_pdlp_settings();
-      auto solution     = cuopt::linear_programming::solve_lp(problem_interface.get(), lp_settings);
+
+      if (lp_settings.hyper_params.use_distributed_pdlp) {
+        cuopt::linear_programming::solve_lp(handle_ptr.get(), mps_data_model, lp_settings);
+      } else {
+        cuopt::linear_programming::solve_lp(problem_interface.get(), lp_settings);
+      }
     }
   } catch (const std::exception& e) {
     fprintf(stderr, "cuopt_cli error: %s\n", e.what());
diff --git a/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.hpp b/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.hpp
index 674c4c0ef2..6ab4e35b71 100644
--- a/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.hpp
+++ b/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.hpp
@@ -12,6 +12,8 @@
 
 #include <cuopt/linear_programming/pdlp/solver_settings.hpp>
 
+#include <raft/sparse/detail/cusparse_wrappers.h>
+#include <raft/core/cusparse_macros.hpp>
 #include <raft/core/device_setter.hpp>
 #include <raft/linalg/detail/cublas_wrappers.hpp>
 
@@ -27,6 +29,7 @@
 #include <nccl.h>
 
 #include <memory>
+#include <random>
 #include <tuple>
 #include <vector>
 
@@ -336,6 +339,502 @@ struct multi_gpu_engine_t {
     for_each_shard([](auto& shard) { shard.sub_pdlp->pdhg_solver_.spmvop_At_y(); });
   }
 
+  // -------- Distributed Ruiz inf-scaling -----------------------------------
+  void alloc_global_var_scratch(i_t n_global_vars,
+                                std::vector<rmm::device_uvector<f_t>>& global_var_buf,
+                                std::vector<rmm::device_uvector<i_t>>& local_to_global_var_d)
+  {
+    const int nb = static_cast<int>(shards.size());  // one entry is appended per shard
+    global_var_buf.reserve(nb);
+    local_to_global_var_d.reserve(nb);
+    for (int r = 0; r < nb; ++r) {
+      auto& s = *shards[r];
+      raft::device_setter guard(s.device_id);  // work on this shard's device
+      global_var_buf.emplace_back(static_cast<std::size_t>(n_global_vars), s.stream.view());
+      local_to_global_var_d.emplace_back(static_cast<std::size_t>(s.rank_data.total_var_size),
+                                         s.stream.view());
+      if (s.rank_data.total_var_size > 0) {  // async H2D copy of the host local->global map
+        RAFT_CUDA_TRY(cudaMemcpyAsync(local_to_global_var_d.back().data(),
+                                      s.rank_data.local_to_global_var.data(),
+                                      sizeof(i_t) * s.rank_data.local_to_global_var.size(),
+                                      cudaMemcpyHostToDevice,
+                                      s.stream.view().value()));
+      }  // NOTE(review): copies .size() entries into total_var_size slots; assumes equal.
+    }
+  }
+
+  void reduce_iteration_variable_scaling_across_shards(
+    ncclRedOp_t op,
+    i_t n_global_vars,
+    std::vector<rmm::device_uvector<f_t>>& global_var_buf,
+    std::vector<rmm::device_uvector<i_t>>& local_to_global_var_d)
+  {
+    const int nb = static_cast<int>(shards.size());
+    // Combines all shards' iteration_variable_scaling per global column with `op`, in place.
+    // Zero global buffers, then scatter each shard's local values into their
+    // global column indices.
+    for (int r = 0; r < nb; ++r) {
+      auto& s = *shards[r];
+      raft::device_setter guard(s.device_id);
+      RAFT_CUDA_TRY(cudaMemsetAsync(global_var_buf[r].data(),
+                                    0,
+                                    sizeof(f_t) * static_cast<std::size_t>(n_global_vars),
+                                    s.stream.view().value()));
+      auto& iter_var_scaling =
+        s.sub_pdlp->get_initial_scaling_strategy().get_iteration_variable_scaling();
+      if (s.rank_data.total_var_size > 0) {
+        thrust::scatter(rmm::exec_policy_nosync(s.stream.view()),
+                        iter_var_scaling.begin(),
+                        iter_var_scaling.begin() + s.rank_data.total_var_size,
+                        local_to_global_var_d[r].begin(),
+                        global_var_buf[r].begin());
+      }
+    }
+    // In-place all-reduce; the NCCL element type follows f_t instead of assuming double.
+    ncclGroupStart();
+    for (int r = 0; r < nb; ++r) {
+      auto& s = *shards[r];
+      raft::device_setter guard(s.device_id);
+      ncclAllReduce(global_var_buf[r].data(),
+                    global_var_buf[r].data(),
+                    static_cast<size_t>(n_global_vars),
+                    sizeof(f_t) == sizeof(double) ? ncclFloat64 : ncclFloat32,
+                    op,
+                    s.comm.get(),
+                    s.stream.view().value());
+    }
+    ncclGroupEnd();  // NOTE(review): NCCL return codes are not checked in this function.
+
+    // Gather the global per-column value back into each shard's local iter vector.
+    for (int r = 0; r < nb; ++r) {
+      auto& s = *shards[r];
+      raft::device_setter guard(s.device_id);
+      auto& iter_var_scaling =
+        s.sub_pdlp->get_initial_scaling_strategy().get_iteration_variable_scaling();
+      if (s.rank_data.total_var_size > 0) {
+        thrust::gather(rmm::exec_policy_nosync(s.stream.view()),
+                       local_to_global_var_d[r].begin(),
+                       local_to_global_var_d[r].begin() + s.rank_data.total_var_size,
+                       global_var_buf[r].begin(),
+                       iter_var_scaling.begin());
+      }
+    }
+  }
+
+  void distributed_ruiz_inf_scaling(int num_iter, i_t n_global_vars)
+  {
+    if (num_iter <= 0 || n_global_vars <= 0) return;  // nothing to do
+    raft::common::nvtx::range scope("distributed_ruiz_inf_scaling");
+    // Runs num_iter passes; each pass is: local compute -> cross-shard MAX -> local fold.
+    std::vector<rmm::device_uvector<f_t>> global_var_buf;
+    std::vector<rmm::device_uvector<i_t>> local_to_global_var_d;
+    alloc_global_var_scratch(n_global_vars, global_var_buf, local_to_global_var_d);
+
+    for (int it = 0; it < num_iter; ++it) {
+      // 1) per-shard local kernel: writes iteration_variable_scaling (per-column
+      //    inf-norm partial) and iteration_constraint_matrix_scaling (row, complete).
+      for_each_shard([](auto& shard) {
+        shard.sub_pdlp->get_initial_scaling_strategy().ruiz_iter_compute_local_iteration_vectors();
+      });
+
+      // 2) cross-shard column inf-norm reduction (MAX), done in place on each shard's vector.
+      reduce_iteration_variable_scaling_across_shards(
+        ncclMax, n_global_vars, global_var_buf, local_to_global_var_d);
+
+      // 3) per-shard fold into the cumulative scaling, then reset of the iteration vectors.
+      for_each_shard([](auto& shard) {
+        shard.sub_pdlp->get_initial_scaling_strategy().ruiz_iter_apply_cumulative_update();
+      });
+    }
+
+    // Block until every shard's stream has drained, so the cumulative scaling is
+    // complete before later calls (e.g., the next distributed_max_singular_value).
+    for_each_shard([](auto& shard) { shard.stream.synchronize(); });
+  }
+
+  // Distributed Pock-Chambolle scaling (single pass). `alpha` is forwarded to each shard's
+  // local kernels; `n_global_vars` sizes the per-shard global column buffers (no-op if <= 0).
+  // Same steps as single-GPU pock_chambolle_scaling, except the per-column sum-of-powers is
+  // SUM-reduced across shards before the cumulative fold. Rows are owned exclusively, so the
+  // row half stays local. Intended to run after the distributed Ruiz pass (single-GPU order).
+  void distributed_pock_chambolle_scaling(f_t alpha, i_t n_global_vars)
+  {
+    if (n_global_vars <= 0) return;
+    raft::common::nvtx::range scope("distributed_pock_chambolle_scaling");
+
+    std::vector<rmm::device_uvector<f_t>> global_var_buf;
+    std::vector<rmm::device_uvector<i_t>> local_to_global_var_d;
+    alloc_global_var_scratch(n_global_vars, global_var_buf, local_to_global_var_d);
+
+    // 1) per-shard local kernels: row sum (complete) + column sum (partial).
+    for_each_shard([alpha](auto& shard) {
+      shard.sub_pdlp->get_initial_scaling_strategy().pock_chambolle_compute_local_iteration_vectors(
+        alpha);
+    });
+
+    // 2) cross-shard column sum-of-powers reduction (SUM), in place on each shard's vector.
+    reduce_iteration_variable_scaling_across_shards(
+      ncclSum, n_global_vars, global_var_buf, local_to_global_var_d);
+
+    // 3) per-shard fold into cumulative (cumulative /= sqrt(iteration)).
+    for_each_shard([](auto& shard) {
+      shard.sub_pdlp->get_initial_scaling_strategy().pock_chambolle_apply_cumulative_update();
+    });
+
+    for_each_shard([](auto& shard) { shard.stream.synchronize(); });  // wait for every shard
+  }
+
+  // -------- Distributed σ_max(A) via power iteration ----------------------
+  f_t distributed_max_singular_value(i_t n_global_cstrs,  // global constraint count (sizes z)
+                                     int max_iterations = 5000,  // cap on power iterations
+                                     f_t tolerance      = 1e-4)  // stop when ||z - σ²q|| < this
+  {
+    raft::common::nvtx::range scope("distributed_max_singular_value");
+
+    const int nb = static_cast<int>(shards.size());
+    const ncclDataType_t nccl_f_t = sizeof(f_t) == sizeof(double) ? ncclFloat64 : ncclFloat32;
+    // Generate the GLOBAL z[] sequence in cstr-index order from a fresh
+    // mt19937(1), once per call. It's m doubles regardless of N (cheap).
+    // Each shard then keeps only z[global_idx_for_owned_local_i].
+    std::vector<f_t> h_global_z(static_cast<std::size_t>(n_global_cstrs));
+    {
+      std::mt19937 gen(1);
+      std::normal_distribution<f_t> dist(f_t(0.0), f_t(1.0));
+      for (i_t i = 0; i < n_global_cstrs; ++i) {
+        h_global_z[i] = dist(gen);
+      }
+    }
+
+    // Per-shard scratch lives on each shard's device. We use total (owned +
+    // halo) sizes for q/z/atq because they're SpMV inputs that need halo
+    // space. Norms / dot are scalars.
+    // We use size-1 rmm::device_uvector instead of rmm::device_scalar for the
+    // per-shard scratch scalars: nvcc + libcudacxx <cuda/basic_any> fail the
+    // copy_constructible concept check when device_scalar<T> appears in a
+    // std::vector (the check transitively touches rmm::cuda_stream, which is
+    // non-copyable). device_uvector<T> avoids that path.
+    std::vector<rmm::device_uvector<f_t>> q;
+    std::vector<rmm::device_uvector<f_t>> z;
+    std::vector<rmm::device_uvector<f_t>> atq;
+    std::vector<rmm::device_uvector<f_t>> sigma_sq;
+    std::vector<rmm::device_uvector<f_t>> norm_q;
+    std::vector<rmm::device_uvector<f_t>> residual_norm;
+    std::vector<cusparseDnVecDescr_t> z_dn(nb, nullptr);
+    std::vector<cusparseDnVecDescr_t> atq_dn(nb, nullptr);
+    q.reserve(nb);
+    z.reserve(nb);
+    atq.reserve(nb);
+    sigma_sq.reserve(nb);
+    norm_q.reserve(nb);
+    residual_norm.reserve(nb);
+
+    for (int r = 0; r < nb; ++r) {
+      auto& s = *shards[r];
+      raft::device_setter guard(s.device_id);
+      const i_t cstr_total = s.rank_data.total_cstr_size;
+      const i_t var_total  = s.rank_data.total_var_size;
+      q.emplace_back(static_cast<std::size_t>(cstr_total), s.stream.view());
+      z.emplace_back(static_cast<std::size_t>(cstr_total), s.stream.view());
+      atq.emplace_back(static_cast<std::size_t>(var_total), s.stream.view());
+      sigma_sq.emplace_back(std::size_t{1}, s.stream.view());
+      norm_q.emplace_back(std::size_t{1}, s.stream.view());
+      residual_norm.emplace_back(std::size_t{1}, s.stream.view());
+      RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsecreatednvec(
+        &z_dn[r], static_cast<int64_t>(cstr_total), z.back().data()));
+      RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsecreatednvec(
+        &atq_dn[r], static_cast<int64_t>(var_total), atq.back().data()));
+
+      std::vector<f_t> h_owned_z(static_cast<std::size_t>(s.rank_data.owned_cstr_size));
+      for (i_t i = 0; i < s.rank_data.owned_cstr_size; ++i) {
+        const i_t g  = s.rank_data.local_to_global_cstr[i];
+        h_owned_z[i] = h_global_z[g];
+      }
+      if (s.rank_data.owned_cstr_size > 0) {
+        RAFT_CUDA_TRY(
+          cudaMemcpyAsync(z.back().data(),
+                          h_owned_z.data(),
+                          sizeof(f_t) * static_cast<std::size_t>(s.rank_data.owned_cstr_size),
+                          cudaMemcpyHostToDevice,
+                          s.stream.view().value()));
+      }
+      if (cstr_total > s.rank_data.owned_cstr_size) {
+        RAFT_CUDA_TRY(cudaMemsetAsync(
+          z.back().data() + s.rank_data.owned_cstr_size,
+          0,
+          sizeof(f_t) * static_cast<std::size_t>(cstr_total - s.rank_data.owned_cstr_size),
+          s.stream.view().value()));
+      }
+      // Sync to ensure h_owned_z stays valid through the H2D copy (it goes
+      // out of scope at end of this iteration of the per-shard loop).
+      s.stream.synchronize();
+    }
+
+    // Local halo-exchange helpers that work directly on per-shard external
+    // buffers (the engine's halo_exchange_var/cstr expect accessors that
+    // resolve through pdhg_solver_t, which doesn't see our scratch).
+    auto halo_exchange_cstr_bufs = [&](std::vector<rmm::device_uvector<f_t>>& bufs) {
+      for (int r = 0; r < nb; ++r) {
+        auto& s = *shards[r];
+        raft::device_setter guard(s.device_id);
+        auto& y = bufs[r];
+        for (int peer = 0; peer < nb; ++peer) {
+          if (peer == r) continue;
+          if (s.cstr_send_indices_d[peer].size() == 0) continue;
+          thrust::gather(rmm::exec_policy_nosync(s.stream.view()),
+                         s.cstr_send_indices_d[peer].begin(),
+                         s.cstr_send_indices_d[peer].end(),
+                         y.begin(),
+                         s.cstr_send_buf_d[peer].begin());
+        }
+      }
+      ncclGroupStart();
+      for (int r = 0; r < nb; ++r) {
+        auto& s = *shards[r];
+        raft::device_setter guard(s.device_id);
+        for (int peer = 0; peer < nb; ++peer) {
+          if (peer == r) continue;
+          ncclSend(s.cstr_send_buf_d[peer].data(),
+                   s.cstr_send_buf_d[peer].size(),
+                   nccl_f_t,
+                   peer,
+                   s.comm.get(),
+                   s.stream.view().value());
+        }
+      }
+      for (int r = 0; r < nb; ++r) {
+        auto& s  = *shards[r];
+        auto& rd = s.rank_data;
+        raft::device_setter guard(s.device_id);
+        auto& y = bufs[r];
+        for (int peer = 0; peer < nb; ++peer) {
+          if (peer == r) continue;
+          f_t* recv_ptr = y.data() + rd.owned_cstr_size + rd.cstr_recv_offsets[peer];
+          ncclRecv(recv_ptr,
+                   static_cast<size_t>(rd.cstr_recv_counts[peer]),
+                   nccl_f_t,
+                   peer,
+                   s.comm.get(),
+                   s.stream.view().value());
+        }
+      }
+      ncclGroupEnd();
+    };
+    auto halo_exchange_var_bufs = [&](std::vector<rmm::device_uvector<f_t>>& bufs) {
+      for (int r = 0; r < nb; ++r) {
+        auto& s = *shards[r];
+        raft::device_setter guard(s.device_id);
+        auto& x = bufs[r];
+        for (int peer = 0; peer < nb; ++peer) {
+          if (peer == r) continue;
+          if (s.var_send_indices_d[peer].size() == 0) continue;
+          thrust::gather(rmm::exec_policy_nosync(s.stream.view()),
+                         s.var_send_indices_d[peer].begin(),
+                         s.var_send_indices_d[peer].end(),
+                         x.begin(),
+                         s.var_send_buf_d[peer].begin());
+        }
+      }
+      ncclGroupStart();
+      for (int r = 0; r < nb; ++r) {
+        auto& s = *shards[r];
+        raft::device_setter guard(s.device_id);
+        for (int peer = 0; peer < nb; ++peer) {
+          if (peer == r) continue;
+          ncclSend(s.var_send_buf_d[peer].data(),
+                   s.var_send_buf_d[peer].size(),
+                   nccl_f_t,
+                   peer,
+                   s.comm.get(),
+                   s.stream.view().value());
+        }
+      }
+      for (int r = 0; r < nb; ++r) {
+        auto& s  = *shards[r];
+        auto& rd = s.rank_data;
+        raft::device_setter guard(s.device_id);
+        auto& x = bufs[r];
+        for (int peer = 0; peer < nb; ++peer) {
+          if (peer == r) continue;
+          f_t* recv_ptr = x.data() + rd.owned_var_size + rd.var_recv_offsets[peer];
+          ncclRecv(recv_ptr,
+                   static_cast<size_t>(rd.var_recv_counts[peer]),
+                   nccl_f_t,
+                   peer,
+                   s.comm.get(),
+                   s.stream.view().value());
+        }
+      }
+      ncclGroupEnd();
+    };
+
+    // Per-shard partial reductions over the OWNED cstr slice + NCCL allreduce.
+    // For norm: out := sqrt(Σ_r ||bufs[r][0:owned_cstr]||²).
+    // For dot : out := Σ_r <a[r][0:owned_cstr], b[r][0:owned_cstr]>.
+    auto distributed_norm_owned_cstr = [&](std::vector<rmm::device_uvector<f_t>>& bufs,
+                                           std::vector<rmm::device_uvector<f_t>>& out) {
+      for (int r = 0; r < nb; ++r) {
+        auto& s = *shards[r];
+        raft::device_setter guard(s.device_id);
+        const i_t n_owned = s.rank_data.owned_cstr_size;
+        RAFT_CUBLAS_TRY(raft::linalg::detail::cublasdot(s.handle.get_cublas_handle(),
+                                                        static_cast<int>(n_owned),
+                                                        bufs[r].data(),
+                                                        1,
+                                                        bufs[r].data(),
+                                                        1,
+                                                        out[r].data(),
+                                                        s.stream.view().value()));
+      }
+      ncclGroupStart();
+      for (int r = 0; r < nb; ++r) {
+        auto& s = *shards[r];
+        raft::device_setter guard(s.device_id);
+        ncclAllReduce(out[r].data(),
+                      out[r].data(),
+                      1,
+                      nccl_f_t,
+                      ncclSum,
+                      s.comm.get(),
+                      s.stream.view().value());
+      }
+      ncclGroupEnd();
+      for (int r = 0; r < nb; ++r) {
+        auto& s = *shards[r];
+        raft::device_setter guard(s.device_id);
+        cub::DeviceTransform::Transform(
+          out[r].data(), out[r].data(), 1, sqrt_inplace_op_t<f_t>{}, s.stream.view().value());
+      }
+    };
+    auto distributed_dot_owned_cstr = [&](std::vector<rmm::device_uvector<f_t>>& a,
+                                          std::vector<rmm::device_uvector<f_t>>& b,
+                                          std::vector<rmm::device_uvector<f_t>>& out) {
+      for (int r = 0; r < nb; ++r) {
+        auto& s = *shards[r];
+        raft::device_setter guard(s.device_id);
+        const i_t n_owned = s.rank_data.owned_cstr_size;
+        RAFT_CUBLAS_TRY(raft::linalg::detail::cublasdot(s.handle.get_cublas_handle(),
+                                                        static_cast<int>(n_owned),
+                                                        a[r].data(),
+                                                        1,
+                                                        b[r].data(),
+                                                        1,
+                                                        out[r].data(),
+                                                        s.stream.view().value()));
+      }
+      ncclGroupStart();
+      for (int r = 0; r < nb; ++r) {
+        auto& s = *shards[r];
+        raft::device_setter guard(s.device_id);
+        ncclAllReduce(out[r].data(),
+                      out[r].data(),
+                      1,
+                      nccl_f_t,
+                      ncclSum,
+                      s.comm.get(),
+                      s.stream.view().value());
+      }
+      ncclGroupEnd();
+    };
+
+    // ===== Power iteration =====
+    // Mirrors single-GPU compute_initial_step_size: z is the carried iterate
+    // (A Aᵀ q each step); at the top of each iteration q := z then q is
+    // normalized; the residual z − σ²q is written back into q only to drive
+    // the convergence check (next iteration's q := z discards it).
+    for (int it = 0; it < max_iterations; ++it) {
+      // q := z on the owned slice (the carried iterate), then normalize.
+      for (int r = 0; r < nb; ++r) {
+        auto& s = *shards[r];
+        raft::device_setter guard(s.device_id);
+        const i_t n_owned = s.rank_data.owned_cstr_size;
+        raft::copy(q[r].data(), z[r].data(), n_owned, s.stream.view());
+      }
+
+      // ||q||₂ over the global OWNED cstr slice (one allreduce-sum + sqrt).
+      distributed_norm_owned_cstr(q, norm_q);
+
+      // q /= ||q||₂ on owned slice (halo gets refreshed by next exchange).
+      for (int r = 0; r < nb; ++r) {
+        auto& s = *shards[r];
+        raft::device_setter guard(s.device_id);
+        const i_t n_owned = s.rank_data.owned_cstr_size;
+        cub::DeviceTransform::Transform(
+          q[r].data(),
+          q[r].data(),
+          n_owned,
+          [n = norm_q[r].data()] __device__(f_t v) { return v / *n; },
+          s.stream.view().value());
+      }
+
+      // atq = A^T q : halo-exchange q, then per-shard SpMV. spmv_At_into
+      // rebinds the dual_solution dnvec to q[r].data() and restores the
+      // canonical binding after the call (see pdhg.cu:643-644).
+      halo_exchange_cstr_bufs(q);
+      for (int r = 0; r < nb; ++r) {
+        auto& s = *shards[r];
+        raft::device_setter guard(s.device_id);
+        s.sub_pdlp->pdhg_solver_.spmv_At_into(q[r], atq_dn[r]);
+      }
+
+      // z = A atq : halo-exchange atq, then per-shard SpMV.
+      halo_exchange_var_bufs(atq);
+      for (int r = 0; r < nb; ++r) {
+        auto& s = *shards[r];
+        raft::device_setter guard(s.device_id);
+        s.sub_pdlp->pdhg_solver_.spmv_A_into(atq[r], z_dn[r]);
+      }
+
+      // σ² = q · z over the global OWNED cstr slice (= q^T A A^T q = σ_max²
+      // when q is the dominant left-singular vector).
+      distributed_dot_owned_cstr(q, z, sigma_sq);
+
+      // q := -σ² q + z (owned slice) — residual of the eigen-equation.
+      for (int r = 0; r < nb; ++r) {
+        auto& s = *shards[r];
+        raft::device_setter guard(s.device_id);
+        const i_t n_owned = s.rank_data.owned_cstr_size;
+        cub::DeviceTransform::Transform(
+          cuda::std::make_tuple(q[r].data(), z[r].data()),
+          q[r].data(),
+          n_owned,
+          [s2 = sigma_sq[r].data()] __device__(f_t qv, f_t zv) { return -(*s2) * qv + zv; },
+          s.stream.view().value());
+      }
+
+      // Convergence check via global residual norm.
+      distributed_norm_owned_cstr(q, residual_norm);
+      auto& s0 = *shards[0];
+      raft::device_setter guard0(s0.device_id);
+      f_t h_res{};
+      RAFT_CUDA_TRY(cudaMemcpyAsync(&h_res,
+                                    residual_norm[0].data(),
+                                    sizeof(f_t),
+                                    cudaMemcpyDeviceToHost,
+                                    s0.stream.view().value()));
+      s0.stream.synchronize();
+      if (h_res < tolerance) break;
+    }
+    // NOTE(review): if max_iterations <= 0 the loop never runs and sigma_sq is read unwritten.
+    // σ_max² is the same on every shard after the last allreduce.
+    auto& s0 = *shards[0];
+    raft::device_setter guard0(s0.device_id);
+    f_t sigma_sq_h{};
+    RAFT_CUDA_TRY(cudaMemcpyAsync(&sigma_sq_h,
+                                  sigma_sq[0].data(),
+                                  sizeof(f_t),
+                                  cudaMemcpyDeviceToHost,
+                                  s0.stream.view().value()));
+    s0.stream.synchronize();
+    // NOTE(review): the descriptors are not released if anything above throws (no RAII wrapper).
+    for (int r = 0; r < nb; ++r) {
+      raft::device_setter guard(shards[r]->device_id);
+      RAFT_CUSPARSE_TRY(cusparseDestroyDnVec(z_dn[r]));
+      RAFT_CUSPARSE_TRY(cusparseDestroyDnVec(atq_dn[r]));
+    }
+
+    return std::sqrt(std::max(sigma_sq_h, f_t(0)));  // clamp so a negative σ² never reaches sqrt
+  }
+
   // -------- Solution gather (shards -> master) ----------------------------
   // Assembles the global potential_next primal/dual solutions and the
   // reduced_cost on the master from the owned slices distributed across
diff --git a/cpp/src/pdlp/initial_scaling_strategy/initial_scaling.cu b/cpp/src/pdlp/initial_scaling_strategy/initial_scaling.cu
index 478753e9d9..dcc3e662b0 100644
--- a/cpp/src/pdlp/initial_scaling_strategy/initial_scaling.cu
+++ b/cpp/src/pdlp/initial_scaling_strategy/initial_scaling.cu
@@ -142,6 +142,10 @@ void pdlp_initial_scaling_strategy_t<i_t, f_t>::compute_scaling_vectors(
 {
   raft::common::nvtx::range fun_scope("compute_scaling_vectors");
 
+  // Skip scaling entirely for a shape-0 problem (distributed PDLP builds the
+  // master pdlp_solver_t from a shape-0 placeholder)
+  if (primal_size_h_ == 0 || dual_size_h_ == 0) return;
+
   if (hyper_params_.do_ruiz_scaling) { ruiz_inf_scaling(number_of_ruiz_iterations); }
   if (hyper_params_.do_pock_chambolle_scaling) { pock_chambolle_scaling(alpha); }
 }
@@ -213,6 +217,72 @@ __global__ void inf_norm_row_and_col_kernel(
   }
 }
 
+template <typename i_t, typename f_t>
+void pdlp_initial_scaling_strategy_t<i_t, f_t>::ruiz_iter_compute_local_iteration_vectors()
+{
+  // Local half of one Ruiz pass: inf norm over rows and columns of the scaled matrix.
+  const i_t number_of_blocks = (op_problem_scaled_.n_constraints + block_size - 1) / block_size;
+  const i_t number_of_threads = std::min(op_problem_scaled_.n_variables, (i_t)block_size);
+  if (number_of_blocks > 0 && number_of_threads > 0) {  // a zero-sized launch is a CUDA error
+    inf_norm_row_and_col_kernel<i_t, f_t><<<number_of_blocks, number_of_threads, 0, stream_view_>>>(
+      op_problem_scaled_.view(), this->view());
+    RAFT_CUDA_TRY(cudaPeekAtLastError());
+  }
+  if (running_mip_) { reset_integer_variables(); }
+}
+
+template <typename i_t, typename f_t>
+void pdlp_initial_scaling_strategy_t<i_t, f_t>::ruiz_iter_apply_cumulative_update()
+{  // Fold half of one Ruiz pass: update both cumulative vectors, then clear the iteration ones.
+  raft::linalg::binaryOp(cummulative_constraint_matrix_scaling_.data(),
+                         cummulative_constraint_matrix_scaling_.data(),
+                         iteration_constraint_matrix_scaling_.data(),
+                         dual_size_h_,
+                         a_divides_sqrt_b_bounded<f_t>(),
+                         stream_view_);
+  // Same in-place update for the variable (column) scaling, over primal_size_h_ entries.
+  raft::linalg::binaryOp(cummulative_variable_scaling_.data(),
+                         cummulative_variable_scaling_.data(),
+                         iteration_variable_scaling_.data(),
+                         primal_size_h_,
+                         a_divides_sqrt_b_bounded<f_t>(),  // presumably a / sqrt(b), bounded
+                         stream_view_);
+
+  // Reset the iteration_scaling vectors to all 0 (byte-wise memset on stream_view_)
+  RAFT_CUDA_TRY(cudaMemsetAsync(
+    iteration_constraint_matrix_scaling_.data(), 0.0, sizeof(f_t) * dual_size_h_, stream_view_));
+  RAFT_CUDA_TRY(cudaMemsetAsync(
+    iteration_variable_scaling_.data(), 0.0, sizeof(f_t) * primal_size_h_, stream_view_));
+}
+
+template <typename i_t, typename f_t>
+void pdlp_initial_scaling_strategy_t<i_t, f_t>::reset_scaling_state_for_distributed()
+{
+  if (primal_size_h_ == 0 || dual_size_h_ == 0) return;  // empty problem: nothing to reset
+
+  // Re-allocate the iteration vectors the ctor shrank to 0 and zero them.
+  iteration_constraint_matrix_scaling_.resize(static_cast<size_t>(dual_size_h_), stream_view_);
+  iteration_variable_scaling_.resize(static_cast<size_t>(primal_size_h_), stream_view_);
+  RAFT_CUDA_TRY(cudaMemsetAsync(
+    iteration_constraint_matrix_scaling_.data(), 0, sizeof(f_t) * dual_size_h_, stream_view_));
+  RAFT_CUDA_TRY(cudaMemsetAsync(
+    iteration_variable_scaling_.data(), 0, sizeof(f_t) * primal_size_h_, stream_view_));
+  // NOTE(review): the fills below run on handle_ptr_'s thrust policy, the memsets on stream_view_.
+  // Reset cumulative scaling + rescaling to identity (the ctor's stray
+  // Pock-Chambolle pass and shard.cu's set_cummulative_scaling left these in
+  // an arbitrary state; distributed scaling recomputes from a clean slate).
+  thrust::fill(handle_ptr_->get_thrust_policy(),
+               cummulative_constraint_matrix_scaling_.begin(),
+               cummulative_constraint_matrix_scaling_.end(),
+               f_t(1));
+  thrust::fill(handle_ptr_->get_thrust_policy(),
+               cummulative_variable_scaling_.begin(),
+               cummulative_variable_scaling_.end(),
+               f_t(1));
+  set_h_bound_rescaling(f_t(1));      // bound rescaling factor back to 1
+  set_h_objective_rescaling(f_t(1));  // objective rescaling factor back to 1
+}
+
 template <typename i_t, typename f_t>
 void pdlp_initial_scaling_strategy_t<i_t, f_t>::ruiz_inf_scaling(i_t number_of_ruiz_iterations)
 {
@@ -221,36 +291,8 @@ void pdlp_initial_scaling_strategy_t<i_t, f_t>::ruiz_inf_scaling(i_t number_of_r
   std::cout << "Doing ruiz_inf_scaling" << std::endl;
 #endif
   for (int i = 0; i < number_of_ruiz_iterations; i++) {
-    // find inf norm over rows and columns of the scaled matrix in given iteration (matrix is not
-    // actually updated, but the scaled value is computed and evaluated)
-    i_t number_of_blocks = op_problem_scaled_.n_constraints / block_size;
-    if (op_problem_scaled_.n_constraints % block_size) number_of_blocks++;
-    i_t number_of_threads = std::min(op_problem_scaled_.n_variables, (i_t)block_size);
-    inf_norm_row_and_col_kernel<i_t, f_t><<<number_of_blocks, number_of_threads, 0, stream_view_>>>(
-      op_problem_scaled_.view(), this->view());
-    RAFT_CUDA_TRY(cudaPeekAtLastError());
-
-    if (running_mip_) { reset_integer_variables(); }
-
-    raft::linalg::binaryOp(cummulative_constraint_matrix_scaling_.data(),
-                           cummulative_constraint_matrix_scaling_.data(),
-                           iteration_constraint_matrix_scaling_.data(),
-                           dual_size_h_,
-                           a_divides_sqrt_b_bounded<f_t>(),
-                           stream_view_);
-
-    raft::linalg::binaryOp(cummulative_variable_scaling_.data(),
-                           cummulative_variable_scaling_.data(),
-                           iteration_variable_scaling_.data(),
-                           primal_size_h_,
-                           a_divides_sqrt_b_bounded<f_t>(),
-                           stream_view_);
-
-    // Reset the iteration_scaling vectors to all 0
-    RAFT_CUDA_TRY(cudaMemsetAsync(
-      iteration_constraint_matrix_scaling_.data(), 0.0, sizeof(f_t) * dual_size_h_, stream_view_));
-    RAFT_CUDA_TRY(cudaMemsetAsync(
-      iteration_variable_scaling_.data(), 0.0, sizeof(f_t) * primal_size_h_, stream_view_));
+    ruiz_iter_compute_local_iteration_vectors();
+    ruiz_iter_apply_cumulative_update();
   }
 }
 
@@ -343,8 +385,12 @@ __global__ void pock_chambolle_scaling_kernel_col(
   if (threadIdx.x == 0) initial_scaling_view.iteration_variable_scaling[col] = accumulated_value;
 }
 
+// Local half of one Pock-Chambolle pass: writes the per-row and per-column
+// sums-of-powers into iteration_constraint_matrix_scaling_ /
+// iteration_variable_scaling_
 template <typename i_t, typename f_t>
-void pdlp_initial_scaling_strategy_t<i_t, f_t>::pock_chambolle_scaling(f_t alpha)
+void pdlp_initial_scaling_strategy_t<i_t, f_t>::pock_chambolle_compute_local_iteration_vectors(
+  f_t alpha)
 {
   // Reset the iteration_scaling vectors to all 0
   RAFT_CUDA_TRY(cudaMemsetAsync(
@@ -379,7 +425,12 @@ void pdlp_initial_scaling_strategy_t<i_t, f_t>::pock_chambolle_scaling(f_t alpha
       A_T_offsets_.data(),
       A_T_indices_.data());
   RAFT_CUDA_TRY(cudaPeekAtLastError());
+}
 
+// Fold half of one Pock-Chambolle pass: cumulative /= sqrt(iteration).
+template <typename i_t, typename f_t>
+void pdlp_initial_scaling_strategy_t<i_t, f_t>::pock_chambolle_apply_cumulative_update()
+{
   if (running_mip_) { reset_integer_variables(); }
 
   // divide the sqrt of the vectors of the sums from above to the respective scaling vectors
@@ -398,6 +449,13 @@ void pdlp_initial_scaling_strategy_t<i_t, f_t>::pock_chambolle_scaling(f_t alpha
                          stream_view_);
 }
 
+template <typename i_t, typename f_t>
+void pdlp_initial_scaling_strategy_t<i_t, f_t>::pock_chambolle_scaling(f_t alpha)
+{
+  pock_chambolle_compute_local_iteration_vectors(alpha);  // fill the iteration vectors
+  pock_chambolle_apply_cumulative_update();               // fold them into the cumulative scaling
+}
+
 template <typename i_t, typename f_t>
 __global__ void scale_problem_kernel(
   const typename pdlp_initial_scaling_strategy_t<i_t, f_t>::view_t initial_scaling_view,
diff --git a/cpp/src/pdlp/initial_scaling_strategy/initial_scaling.cuh b/cpp/src/pdlp/initial_scaling_strategy/initial_scaling.cuh
index dbdb604082..148ccce238 100644
--- a/cpp/src/pdlp/initial_scaling_strategy/initial_scaling.cuh
+++ b/cpp/src/pdlp/initial_scaling_strategy/initial_scaling.cuh
@@ -94,6 +94,20 @@ class pdlp_initial_scaling_strategy_t {
 
   void bound_objective_rescaling();
 
+  // Public for distributed PDLP
+  void compute_scaling_vectors(i_t number_of_ruiz_iterations, f_t alpha);
+
+  // ----- Distributed-PDLP hooks -----
+
+  void ruiz_iter_compute_local_iteration_vectors();
+  void ruiz_iter_apply_cumulative_update();
+  void pock_chambolle_compute_local_iteration_vectors(f_t alpha);
+  void pock_chambolle_apply_cumulative_update();
+  rmm::device_uvector<f_t>& get_iteration_variable_scaling() { return iteration_variable_scaling_; }
+
+  // Restore the clean pre-scaling state for the distributed path.
+  void reset_scaling_state_for_distributed();
+
   /**
    * @brief Gets the device-side view (with raw pointers), for ease of access
    *        inside cuda kernels
@@ -101,7 +115,6 @@ class pdlp_initial_scaling_strategy_t {
   view_t view();
 
  private:
-  void compute_scaling_vectors(i_t number_of_ruiz_iterations, f_t alpha);
   void ruiz_inf_scaling(i_t number_of_ruiz_iterations);
   void pock_chambolle_scaling(f_t alpha);
   void reset_integer_variables();
diff --git a/cpp/src/pdlp/pdlp.cu b/cpp/src/pdlp/pdlp.cu
index 21291b853d..013905b4fb 100644
--- a/cpp/src/pdlp/pdlp.cu
+++ b/cpp/src/pdlp/pdlp.cu
@@ -577,24 +577,71 @@ pdlp_solver_t<i_t, f_t>::pdlp_solver_t(
                              objective_scaling_factor,
                              sub_pdlp_settings);
 
-    // ----- 8. Seed shard step-size / primal-weight scalars from the master -----
-    f_t h_step_size{}, h_primal_weight{}, h_best_primal_weight{};
-    f_t h_primal_step_size{}, h_dual_step_size{};
-    raft::copy(&h_step_size, step_size_.data(), 1, stream_view_);
-    raft::copy(&h_primal_weight, primal_weight_.data(), 1, stream_view_);
-    raft::copy(&h_best_primal_weight, best_primal_weight_.data(), 1, stream_view_);
-    raft::copy(&h_primal_step_size, primal_step_size_.data(), 1, stream_view_);
-    raft::copy(&h_dual_step_size, dual_step_size_.data(), 1, stream_view_);
+    // ----- 8 Distributed Scaling -----
+    for (auto& shard : multi_gpu_engine->shards) {
+      raft::device_setter guard(shard->device_id);
+      shard->sub_pdlp->get_initial_scaling_strategy().reset_scaling_state_for_distributed();
+    }
+    for (auto& shard : multi_gpu_engine->shards) {
+      raft::device_setter guard(shard->device_id);
+      shard->stream.synchronize();
+    }
+
+    // Distributed scaling
+    if (settings_.hyper_params.do_ruiz_scaling) {
+      multi_gpu_engine->distributed_ruiz_inf_scaling(
+        settings_.hyper_params.default_l_inf_ruiz_iterations, n_vars);
+    }
+    if (settings_.hyper_params.do_pock_chambolle_scaling) {
+      multi_gpu_engine->distributed_pock_chambolle_scaling(
+        static_cast<f_t>(settings_.hyper_params.default_alpha_pock_chambolle_rescaling), n_vars);
+    }
+
+    for (auto& shard : multi_gpu_engine->shards) {
+      raft::device_setter guard(shard->device_id);
+      auto& scaling = shard->sub_pdlp->get_initial_scaling_strategy();
+      scaling.scale_problem();
+
+      shard->sub_pdlp->pdhg_solver_.get_cusparse_view().create_spmv_op_plans(
+        /*is_reflected=*/settings_.hyper_params.use_reflected_primal_dual);
+    }
+    for (auto& shard : multi_gpu_engine->shards) {
+      raft::device_setter guard(shard->device_id);
+      shard->stream.synchronize();
+    }
+
+    // ----- 8b. Seed initial step-size / primal-weight (distributed, scales to N shards) -----
+    constexpr f_t kStepSizeScale = f_t{0.998};
+    const f_t sigma_max          = multi_gpu_engine->distributed_max_singular_value(n_cstr);
+    const f_t h_primal_weight    = f_t{1};
+    const f_t h_step_size        = (sigma_max > f_t{0}) ? kStepSizeScale / sigma_max : f_t{1};
+    // With primal_weight = 1 the adaptive step-size strategy collapses to
+    // primal_step_size = step_size / primal_weight = step_size
+    // dual_step_size   = step_size * primal_weight = step_size.
+    const f_t h_primal_step_size = h_step_size;
+    const f_t h_dual_step_size   = h_step_size;
+
+    // Put the values on master
+    raft::copy(step_size_.data(), &h_step_size, 1, stream_view_);
+    raft::copy(primal_weight_.data(), &h_primal_weight, 1, stream_view_);
+    raft::copy(best_primal_weight_.data(), &h_primal_weight, 1, stream_view_);
+    raft::copy(primal_step_size_.data(), &h_primal_step_size, 1, stream_view_);
+    raft::copy(dual_step_size_.data(), &h_dual_step_size, 1, stream_view_);
     handle_ptr_->sync_stream(stream_view_);
 
+    // put the values on each shard
     for (auto& shard : multi_gpu_engine->shards) {
       raft::device_setter guard(shard->device_id);
       auto& sub = *shard->sub_pdlp;
       raft::copy(sub.step_size_.data(), &h_step_size, 1, shard->stream);
       raft::copy(sub.primal_weight_.data(), &h_primal_weight, 1, shard->stream);
-      raft::copy(sub.best_primal_weight_.data(), &h_best_primal_weight, 1, shard->stream);
-      raft::copy(sub.primal_step_size_.data(), &h_primal_step_size, 1, shard->stream);
-      raft::copy(sub.dual_step_size_.data(), &h_dual_step_size, 1, shard->stream);
+      raft::copy(sub.best_primal_weight_.data(), &h_primal_weight, 1, shard->stream);
+      raft::copy(sub.get_primal_step_size().data(), &h_primal_step_size, 1, shard->stream);
+      raft::copy(sub.get_dual_step_size().data(), &h_dual_step_size, 1, shard->stream);
+    }
+    for (auto& shard : multi_gpu_engine->shards) {
+      raft::device_setter guard(shard->device_id);
+      shard->stream.synchronize();
     }
 
     // Wire the engine into master's pdhg_solver_; shards keep mgpu_engine_ == nullptr.
@@ -607,6 +654,49 @@ pdlp_solver_t<i_t, f_t>::pdlp_solver_t(
       n_vars, stream_view_);
     primal_size_h_ = n_vars;
     dual_size_h_   = n_cstr;
+
+    // Distributed conergence_information::init_l2_norms
+    for (auto& shard : multi_gpu_engine->shards) {
+      raft::device_setter guard(shard->device_id);
+      shard->sub_pdlp->get_current_termination_strategy()
+        .get_convergence_information()
+        .compute_owned_reference_norm_partials(shard->rank_data.owned_var_size,
+                                               shard->rank_data.owned_cstr_size);
+    }
+    multi_gpu_engine->allreduce_sum_inplace([](pdlp_solver_t<i_t, f_t>& sp) -> f_t* {
+      return sp.get_current_termination_strategy()
+        .get_convergence_information()
+        .l2_norm_primal_right_hand_side_data();
+    });
+    multi_gpu_engine->allreduce_sum_inplace([](pdlp_solver_t<i_t, f_t>& sp) -> f_t* {
+      return sp.get_current_termination_strategy()
+        .get_convergence_information()
+        .l2_norm_primal_linear_objective_data();
+    });
+    for (auto& shard : multi_gpu_engine->shards) {
+      raft::device_setter guard(shard->device_id);
+      shard->sub_pdlp->get_current_termination_strategy()
+        .get_convergence_information()
+        .sqrt_reference_norms_inplace();
+      shard->stream.synchronize();
+    }
+    // Broadcast the values to the master
+    {
+      auto& s0      = *multi_gpu_engine->shards[0];
+      auto& s0_conv = s0.sub_pdlp->get_current_termination_strategy().get_convergence_information();
+      raft::device_setter guard(s0.device_id);
+      for (auto* ts : {&current_termination_strategy_, &average_termination_strategy_}) {
+        auto& ci = ts->get_convergence_information();
+        raft::copy(ci.l2_norm_primal_right_hand_side_data(),
+                   s0_conv.l2_norm_primal_right_hand_side_data(),
+                   1,
+                   stream_view_);
+        raft::copy(ci.l2_norm_primal_linear_objective_data(),
+                   s0_conv.l2_norm_primal_linear_objective_data(),
+                   1,
+                   stream_view_);
+      }
+    }
     handle_ptr_->sync_stream(stream_view_);
 }
 
diff --git a/cpp/src/pdlp/saddle_point.cu b/cpp/src/pdlp/saddle_point.cu
index f740176a3c..07a5d0146e 100644
--- a/cpp/src/pdlp/saddle_point.cu
+++ b/cpp/src/pdlp/saddle_point.cu
@@ -38,8 +38,11 @@ saddle_point_state_t<i_t, f_t>::saddle_point_state_t(
     current_AtY_{batch_size * primal_size, handle_ptr->get_stream()},
     next_AtY_{batch_size * primal_size, handle_ptr->get_stream()}
 {
-  EXE_CUOPT_EXPECTS(primal_size > 0, "Size of the primal problem must be larger than 0");
-  EXE_CUOPT_EXPECTS(dual_size > 0, "Size of the dual problem must be larger than 0");
+  // >= 0 (not > 0): distributed PDLP builds the master pdlp_solver_t from a
+  // shape-0 placeholder problem so the master never materializes per-variable
+  // / per-constraint vectors; size-0 device_uvectors are valid throughout.
+  EXE_CUOPT_EXPECTS(primal_size >= 0, "Size of the primal problem must be non-negative");
+  EXE_CUOPT_EXPECTS(dual_size >= 0, "Size of the dual problem must be non-negative");
 
   // Starting from all 0
   thrust::fill(
diff --git a/cpp/src/pdlp/termination_strategy/convergence_information.cu b/cpp/src/pdlp/termination_strategy/convergence_information.cu
index da2340146a..44ddd5b2a1 100644
--- a/cpp/src/pdlp/termination_strategy/convergence_information.cu
+++ b/cpp/src/pdlp/termination_strategy/convergence_information.cu
@@ -212,6 +212,77 @@ void convergence_information_t<i_t, f_t>::init_l2_norms()
   }
 }
 
+template <typename i_t, typename f_t>
+void convergence_information_t<i_t, f_t>::compute_owned_reference_norm_partials(
+  i_t owned_var_size, i_t owned_cstr_size)
+{
+  cuopt_assert(!batch_mode_, "owned reference-norm partials only used in non-batch mGPU mode");
+  cuopt_assert(owned_var_size <= primal_size_h_, "owned_var_size must be <= primal_size_h_");
+  cuopt_assert(owned_cstr_size <= dual_size_h_, "owned_cstr_size must be <= dual_size_h_");
+
+  // Σ objective[0:owned_var]²
+  RAFT_CUBLAS_TRY(raft::linalg::detail::cublasdot(handle_ptr_->get_cublas_handle(),
+                                                  static_cast<int>(owned_var_size),
+                                                  problem_ptr->objective_coefficients.data(),
+                                                  1,
+                                                  problem_ptr->objective_coefficients.data(),
+                                                  1,
+                                                  l2_norm_primal_linear_objective_.data(),
+                                                  stream_view_));
+
+  // rhs_sum_of_squares(lower[0:owned_cstr], upper[0:owned_cstr])  (no sqrt)
+  {
+    rmm::device_buffer d_temp_storage;
+    size_t bytes = 0;
+    auto zip_begin = thrust::make_zip_iterator(problem_ptr->constraint_lower_bounds.data(),
+                                               problem_ptr->constraint_upper_bounds.data());
+    cub::DeviceReduce::TransformReduce(nullptr,
+                                       bytes,
+                                       zip_begin,
+                                       l2_norm_primal_right_hand_side_.data(),
+                                       static_cast<int>(owned_cstr_size),
+                                       cuda::std::plus<>{},
+                                       rhs_sum_of_squares_t<f_t>{},
+                                       f_t(0),
+                                       stream_view_);
+    d_temp_storage.resize(bytes, stream_view_);
+    cub::DeviceReduce::TransformReduce(d_temp_storage.data(),
+                                       bytes,
+                                       zip_begin,
+                                       l2_norm_primal_right_hand_side_.data(),
+                                       static_cast<int>(owned_cstr_size),
+                                       cuda::std::plus<>{},
+                                       rhs_sum_of_squares_t<f_t>{},
+                                       f_t(0),
+                                       stream_view_);
+  }
+  RAFT_CUDA_TRY(cudaStreamSynchronize(stream_view_));
+}
+
+template <typename i_t, typename f_t>
+void convergence_information_t<i_t, f_t>::sqrt_reference_norms_inplace()
+{
+  cub::DeviceTransform::Transform(l2_norm_primal_linear_objective_.data(),
+                                  l2_norm_primal_linear_objective_.data(),
+                                  1,
+                                  sqrt_func_t<f_t>{},
+                                  stream_view_);
+  cub::DeviceTransform::Transform(l2_norm_primal_right_hand_side_.data(),
+                                  l2_norm_primal_right_hand_side_.data(),
+                                  1,
+                                  sqrt_func_t<f_t>{},
+                                  stream_view_);
+  // Broadcast slot [0] to all climbers (no-op outside batch mode).
+  thrust::fill(handle_ptr_->get_thrust_policy(),
+               l2_norm_primal_linear_objective_.begin(),
+               l2_norm_primal_linear_objective_.end(),
+               l2_norm_primal_linear_objective_.element(0, stream_view_));
+  thrust::fill(handle_ptr_->get_thrust_policy(),
+               l2_norm_primal_right_hand_side_.begin(),
+               l2_norm_primal_right_hand_side_.end(),
+               l2_norm_primal_right_hand_side_.element(0, stream_view_));
+}
+
 // ---------------------------------------------------------------------------
 // init_reduction_storage: allocate and size the temporary buffers used by
 // cub::DeviceReduce and cub::DeviceSegmentedReduce throughout solving.
diff --git a/cpp/src/pdlp/termination_strategy/convergence_information.hpp b/cpp/src/pdlp/termination_strategy/convergence_information.hpp
index 6325622a2b..7ff45e46f0 100644
--- a/cpp/src/pdlp/termination_strategy/convergence_information.hpp
+++ b/cpp/src/pdlp/termination_strategy/convergence_information.hpp
@@ -69,6 +69,11 @@ class convergence_information_t {
   const rmm::device_uvector<f_t>& get_l2_norm_primal_linear_objective() const;
   const rmm::device_uvector<f_t>& get_l2_norm_primal_right_hand_side() const;
 
+  void compute_owned_reference_norm_partials(i_t owned_var_size, i_t owned_cstr_size);
+  void sqrt_reference_norms_inplace();
+  f_t* l2_norm_primal_right_hand_side_data() { return l2_norm_primal_right_hand_side_.data(); }
+  f_t* l2_norm_primal_linear_objective_data() { return l2_norm_primal_linear_objective_.data(); }
+
   struct view_t {
     i_t primal_size;
     i_t dual_size;

From 1903f4bfea48d25ba4042bb7d2a02e0a41267718 Mon Sep 17 00:00:00 2001
From: Bulle Mostovoi <vmostovoi@nvidia.com>
Date: Tue, 2 Jun 2026 15:39:56 +0200
Subject: [PATCH 59/67] added a cuopt assert for solve_lp in mgpu mode

---
 cpp/src/pdlp/solve.cu | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/cpp/src/pdlp/solve.cu b/cpp/src/pdlp/solve.cu
index ef273faf13..feaeb7bd57 100644
--- a/cpp/src/pdlp/solve.cu
+++ b/cpp/src/pdlp/solve.cu
@@ -2126,13 +2126,11 @@ optimization_problem_solution_t<i_t, f_t> solve_lp(
   bool problem_checking,
   bool use_pdlp_solver_mode)
 {
-  // In distributed PDLP we can't allocate the full problem on the master device
-  if (settings.hyper_params.use_distributed_pdlp) {
+  cuopt_expects(settings.hyper_params.use_distributed_pdlp,
+                error_type_t::ValidationError,
+                "solve_lp from mps_data_model: settings.hyper_params.use_distributed_pdlp must be true");
     return solve_lp_distributed_from_mps(
       handle_ptr, mps_data_model, settings, problem_checking, use_pdlp_solver_mode);
-  }
-  auto op_problem = mps_data_model_to_optimization_problem(handle_ptr, mps_data_model);
-  return solve_lp(op_problem, settings, problem_checking, use_pdlp_solver_mode);
 }
 
 template <typename i_t, typename f_t>

From 0aacb4f702fe0a413623f07522dac6745f484692 Mon Sep 17 00:00:00 2001
From: Bulle Mostovoi <vmostovoi@nvidia.com>
Date: Tue, 2 Jun 2026 15:42:45 +0200
Subject: [PATCH 60/67] style

---
 cpp/cuopt_cli.cpp                             |   9 +-
 .../cuopt/linear_programming/constants.h      |  22 +-
 .../pdlp/pdlp_hyper_params.cuh                |  14 +-
 .../distributed_pdlp/metis_partitioner.cu     |  14 +-
 .../distributed_pdlp/multi_gpu_engine.hpp     |  18 +-
 .../pdlp/distributed_pdlp/partition_loader.cu |   6 +-
 cpp/src/pdlp/distributed_pdlp/partitioner.cu  |  25 +-
 cpp/src/pdlp/distributed_pdlp/partitioner.hpp |   2 +-
 cpp/src/pdlp/pdhg.cu                          |  16 +-
 cpp/src/pdlp/pdlp.cu                          | 586 +++++++++---------
 .../restart_strategy/pdlp_restart_strategy.cu |  17 +-
 cpp/src/pdlp/solve.cu                         |  27 +-
 cpp/src/pdlp/solve.cuh                        |   3 +-
 .../adaptive_step_size_strategy.cu            |  10 +-
 .../convergence_information.cu                | 101 ++-
 cpp/src/pdlp/utilities/mgpu_trace.cuh         |  24 +-
 16 files changed, 429 insertions(+), 465 deletions(-)

diff --git a/cpp/cuopt_cli.cpp b/cpp/cuopt_cli.cpp
index 0ea79bd4ec..b730067a28 100644
--- a/cpp/cuopt_cli.cpp
+++ b/cpp/cuopt_cli.cpp
@@ -436,14 +436,13 @@ int main(int argc, char* argv[])
     // For distributed PDLP, -1 means "auto-detect": resolve to the visible device
     // count so the RMM memory pools match what solve.cu will eventually dispatch.
     const bool use_distributed_pdlp = settings.get_parameter<bool>(CUOPT_USE_DISTRIBUTED_PDLP);
-    int requested_gpus =
-      use_distributed_pdlp ? settings.get_parameter<int>(CUOPT_DISTRIBUTED_PDLP_NUM_GPUS)
-                           : settings.get_parameter<int>(CUOPT_NUM_GPUS);
+    int requested_gpus              = use_distributed_pdlp
+                                        ? settings.get_parameter<int>(CUOPT_DISTRIBUTED_PDLP_NUM_GPUS)
+                                        : settings.get_parameter<int>(CUOPT_NUM_GPUS);
     if (use_distributed_pdlp && requested_gpus == -1) {
       requested_gpus = raft::device_setter::get_device_count();
     }
-    const int provisioned_gpus =
-      std::min(raft::device_setter::get_device_count(), requested_gpus);
+    const int provisioned_gpus = std::min(raft::device_setter::get_device_count(), requested_gpus);
 
     memory_resources.reserve(provisioned_gpus);
     for (int i = 0; i < provisioned_gpus; ++i) {
diff --git a/cpp/include/cuopt/linear_programming/constants.h b/cpp/include/cuopt/linear_programming/constants.h
index e695bb21d3..e2cc264cdc 100644
--- a/cpp/include/cuopt/linear_programming/constants.h
+++ b/cpp/include/cuopt/linear_programming/constants.h
@@ -80,18 +80,18 @@
 #define CUOPT_MIP_STRONG_BRANCHING_SIMPLEX_ITERATION_LIMIT \
   "mip_strong_branching_simplex_iteration_limit"
 
-#define CUOPT_SOLUTION_FILE            "solution_file"
-#define CUOPT_NUM_CPU_THREADS          "num_cpu_threads"
-#define CUOPT_NUM_GPUS                 "num_gpus"
+#define CUOPT_SOLUTION_FILE             "solution_file"
+#define CUOPT_NUM_CPU_THREADS           "num_cpu_threads"
+#define CUOPT_NUM_GPUS                  "num_gpus"
 #define CUOPT_DISTRIBUTED_PDLP_NUM_GPUS "distributed_pdlp_num_gpus"
-#define CUOPT_MULTI_GPU_PARTITION_FILE "multi_gpu_partition_file"
-#define CUOPT_USE_DISTRIBUTED_PDLP     "use_distributed_pdlp"
-#define CUOPT_PDLP_DISABLE_GRAPH       "pdlp_disable_graph"
-#define CUOPT_USER_PROBLEM_FILE        "user_problem_file"
-#define CUOPT_PRESOLVE_FILE            "presolve_file"
-#define CUOPT_RANDOM_SEED              "random_seed"
-#define CUOPT_PDLP_PRECISION           "pdlp_precision"
-#define CUOPT_MIP_SEMICONTINUOUS_BIG_M "mip_semi_continuous_big_m"
+#define CUOPT_MULTI_GPU_PARTITION_FILE  "multi_gpu_partition_file"
+#define CUOPT_USE_DISTRIBUTED_PDLP      "use_distributed_pdlp"
+#define CUOPT_PDLP_DISABLE_GRAPH        "pdlp_disable_graph"
+#define CUOPT_USER_PROBLEM_FILE         "user_problem_file"
+#define CUOPT_PRESOLVE_FILE             "presolve_file"
+#define CUOPT_RANDOM_SEED               "random_seed"
+#define CUOPT_PDLP_PRECISION            "pdlp_precision"
+#define CUOPT_MIP_SEMICONTINUOUS_BIG_M  "mip_semi_continuous_big_m"
 
 #define CUOPT_MIP_HYPER_HEURISTIC_POPULATION_SIZE     "mip_hyper_heuristic_population_size"
 #define CUOPT_MIP_HYPER_HEURISTIC_NUM_CPUFJ_THREADS   "mip_hyper_heuristic_num_cpufj_threads"
diff --git a/cpp/include/cuopt/linear_programming/pdlp/pdlp_hyper_params.cuh b/cpp/include/cuopt/linear_programming/pdlp/pdlp_hyper_params.cuh
index c68dc86d6a..0ce90e7228 100644
--- a/cpp/include/cuopt/linear_programming/pdlp/pdlp_hyper_params.cuh
+++ b/cpp/include/cuopt/linear_programming/pdlp/pdlp_hyper_params.cuh
@@ -50,13 +50,13 @@ struct pdlp_hyper_params_t {
   bool use_distributed_pdlp                                       = false;
   // Debug/diagnostic knob: when true, PDLP bypasses CUDA-graph capture in
   // ping_pong_graph_t and executes each iteration eagerly
-  bool pdlp_disable_graph                                         = false;
-  double reflection_coefficient                                   = 1.0;
-  double restart_k_p                                              = 0.99;
-  double restart_k_i                                              = 0.01;
-  double restart_k_d                                              = 0.0;
-  double restart_i_smooth                                         = 0.3;
-  bool use_conditional_major                                      = true;
+  bool pdlp_disable_graph       = false;
+  double reflection_coefficient = 1.0;
+  double restart_k_p            = 0.99;
+  double restart_k_i            = 0.01;
+  double restart_k_d            = 0.0;
+  double restart_i_smooth       = 0.3;
+  bool use_conditional_major    = true;
 };
 
 // TODO most likely we want to get rid of pdlp_solver_mode and just have prebuilt
diff --git a/cpp/src/pdlp/distributed_pdlp/metis_partitioner.cu b/cpp/src/pdlp/distributed_pdlp/metis_partitioner.cu
index 73e2736251..ecc60adda0 100644
--- a/cpp/src/pdlp/distributed_pdlp/metis_partitioner.cu
+++ b/cpp/src/pdlp/distributed_pdlp/metis_partitioner.cu
@@ -79,7 +79,9 @@ std::vector<i_t> metis_partitioner_t<i_t, f_t>::partition(
   std::vector<idx_t> adjncy(2 * static_cast<std::size_t>(nnz));
 
   // cstr-side row offsets: A_offsets[0..nb_cstr] (no shift).
-  for (i_t i = 0; i <= nb_cstr; ++i) { xadj[i] = static_cast<idx_t>(A_offsets[i]); }
+  for (i_t i = 0; i <= nb_cstr; ++i) {
+    xadj[i] = static_cast<idx_t>(A_offsets[i]);
+  }
   // var-side row offsets: A_t_offsets[0..nb_vars], shifted by +nnz so that
   // they index into the second half of adjncy.
   for (i_t i = 0; i <= nb_vars; ++i) {
@@ -106,7 +108,7 @@ std::vector<i_t> metis_partitioner_t<i_t, f_t>::partition(
   idx_t objval     = 0;
   std::vector<idx_t> metis_parts(nvtx);
 
-  auto t0 = std::chrono::high_resolution_clock::now();
+  auto t0          = std::chrono::high_resolution_clock::now();
   const int status = METIS_PartGraphKway(&metis_nvtx,
                                          &ncon,
                                          xadj.data(),
@@ -120,8 +122,8 @@ std::vector<i_t> metis_partitioner_t<i_t, f_t>::partition(
                                          metis_options,
                                          &objval,
                                          metis_parts.data());
-  auto t1 = std::chrono::high_resolution_clock::now();
-  const double dt = std::chrono::duration<double>(t1 - t0).count();
+  auto t1          = std::chrono::high_resolution_clock::now();
+  const double dt  = std::chrono::duration<double>(t1 - t0).count();
   cuopt_expects(status == METIS_OK,
                 error_type_t::RuntimeError,
                 "METIS_PartGraphKway failed (status=%d)",
@@ -135,7 +137,9 @@ std::vector<i_t> metis_partitioner_t<i_t, f_t>::partition(
     dt);
 
   std::vector<i_t> parts(static_cast<std::size_t>(nvtx));
-  for (i_t i = 0; i < nvtx; ++i) { parts[i] = static_cast<i_t>(metis_parts[i]); }
+  for (i_t i = 0; i < nvtx; ++i) {
+    parts[i] = static_cast<i_t>(metis_parts[i]);
+  }
 
   validate_partition(parts,
                      static_cast<int>(nb_cstr),
diff --git a/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.hpp b/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.hpp
index 6ab4e35b71..0297ecc0a6 100644
--- a/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.hpp
+++ b/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.hpp
@@ -255,9 +255,7 @@ struct multi_gpu_engine_t {
   // OutAccess  : pdlp_solver_t<i_t,f_t>& -> f_t*   (single scalar in shard memory)
   // SizeAccess : pdlp_shard_t<i_t,f_t>&  -> i_t    (owned slice length)
   template <typename BufAccess, typename OutAccess, typename SizeAccess>
-  void distributed_l2_norm(BufAccess&& buf_access,
-                           OutAccess&& out_access,
-                           SizeAccess&& size_access)
+  void distributed_l2_norm(BufAccess&& buf_access, OutAccess&& out_access, SizeAccess&& size_access)
   {
     for_each_shard([&](auto& shard) {
       auto& sub   = *shard.sub_pdlp;
@@ -858,13 +856,11 @@ struct multi_gpu_engine_t {
   // master_reduced_cost  : destination for the reduced_cost (var-shaped, lives
   //                        in the master pdlp_solver_t's termination strategy
   //                        convergence_information_).
-  void gather_potential_next_solutions_to_master(
-    pdhg_solver_t<i_t, f_t>& master_pdhg, rmm::device_uvector<f_t>& master_reduced_cost)
+  void gather_potential_next_solutions_to_master(pdhg_solver_t<i_t, f_t>& master_pdhg,
+                                                 rmm::device_uvector<f_t>& master_reduced_cost)
   {
-    const std::size_t total_vars =
-      master_pdhg.get_potential_next_primal_solution().size();
-    const std::size_t total_cstrs =
-      master_pdhg.get_potential_next_dual_solution().size();
+    const std::size_t total_vars  = master_pdhg.get_potential_next_primal_solution().size();
+    const std::size_t total_cstrs = master_pdhg.get_potential_next_dual_solution().size();
 
     std::vector<f_t> h_primal(total_vars);
     std::vector<f_t> h_dual(total_cstrs);
@@ -987,8 +983,8 @@ struct multi_gpu_engine_t {
     }
   }
 
-  // Functionnaly same as graph_capture_fork_to_shards but on a different event to avoid race conditions
-  // Can be used as a way to sync shards with master stream
+  // Functionally the same as graph_capture_fork_to_shards but on a different event to avoid race
+  // conditions. Can be used as a way to sync shards with master stream.
   void sync_await_master(rmm::cuda_stream_view master_stream)
   {
     sync_master_ready_event_->record(master_stream);
diff --git a/cpp/src/pdlp/distributed_pdlp/partition_loader.cu b/cpp/src/pdlp/distributed_pdlp/partition_loader.cu
index 5014607736..5c317f664e 100644
--- a/cpp/src/pdlp/distributed_pdlp/partition_loader.cu
+++ b/cpp/src/pdlp/distributed_pdlp/partition_loader.cu
@@ -174,12 +174,10 @@ std::vector<rank_data_t<i_t, f_t>> partition_loader_t<i_t, f_t>::create_rank_dat
 
     // Pad row-offset arrays so cuSPARSE sees the local matrices as
     // (total_cstr x total_var) for A and (total_var x total_cstr) for A_T
-    const i_t a_last_nnz =
-      rd.h_A_row_offsets.empty() ? i_t{0} : rd.h_A_row_offsets.back();
+    const i_t a_last_nnz = rd.h_A_row_offsets.empty() ? i_t{0} : rd.h_A_row_offsets.back();
     rd.h_A_row_offsets.resize(rd.total_cstr_size + 1, a_last_nnz);
 
-    const i_t at_last_nnz =
-      rd.h_A_t_row_offsets.empty() ? i_t{0} : rd.h_A_t_row_offsets.back();
+    const i_t at_last_nnz = rd.h_A_t_row_offsets.empty() ? i_t{0} : rd.h_A_t_row_offsets.back();
     rd.h_A_t_row_offsets.resize(rd.total_var_size + 1, at_last_nnz);
   }
 
diff --git a/cpp/src/pdlp/distributed_pdlp/partitioner.cu b/cpp/src/pdlp/distributed_pdlp/partitioner.cu
index 4b809986ce..bc84e521e2 100644
--- a/cpp/src/pdlp/distributed_pdlp/partitioner.cu
+++ b/cpp/src/pdlp/distributed_pdlp/partitioner.cu
@@ -38,11 +38,8 @@ std::vector<i_t> dummy_partitioner_t<i_t, f_t>::partition(
   return parts;
 }
 
-void validate_partition(std::vector<int> const& parts,
-                        int nb_cstr,
-                        int nb_vars,
-                        int nb_parts,
-                        char const* context)
+void validate_partition(
+  std::vector<int> const& parts, int nb_cstr, int nb_vars, int nb_parts, char const* context)
 {
   const std::size_t expected =
     static_cast<std::size_t>(nb_cstr) + static_cast<std::size_t>(nb_vars);
@@ -52,10 +49,8 @@ void validate_partition(std::vector<int> const& parts,
                 context,
                 expected,
                 parts.size());
-  cuopt_expects(nb_parts > 0,
-                error_type_t::ValidationError,
-                "%s: nb_parts must be positive",
-                context);
+  cuopt_expects(
+    nb_parts > 0, error_type_t::ValidationError, "%s: nb_parts must be positive", context);
   if (parts.empty()) { return; }
   const auto [min_it, max_it] = std::minmax_element(parts.begin(), parts.end());
   cuopt_expects(*min_it >= 0,
@@ -75,16 +70,16 @@ template <typename i_t, typename f_t>
 std::unique_ptr<partitioner_i<i_t, f_t>> make_partitioner(partitioner_kind_t kind)
 {
   switch (kind) {
-    case partitioner_kind_t::Dummy:
-      return std::make_unique<dummy_partitioner_t<i_t, f_t>>();
-    case partitioner_kind_t::Metis:
-      return std::make_unique<metis_partitioner_t<i_t, f_t>>();
+    case partitioner_kind_t::Dummy: return std::make_unique<dummy_partitioner_t<i_t, f_t>>();
+    case partitioner_kind_t::Metis: return std::make_unique<metis_partitioner_t<i_t, f_t>>();
   }
-  cuopt_expects(false, error_type_t::RuntimeError, "make_partitioner: unsupported partitioner kind");
+  cuopt_expects(
+    false, error_type_t::RuntimeError, "make_partitioner: unsupported partitioner kind");
   return nullptr;
 }
 
 template class dummy_partitioner_t<int, double>;
-template std::unique_ptr<partitioner_i<int, double>> make_partitioner<int, double>(partitioner_kind_t);
+template std::unique_ptr<partitioner_i<int, double>> make_partitioner<int, double>(
+  partitioner_kind_t);
 
 }  // namespace cuopt::linear_programming::detail
diff --git a/cpp/src/pdlp/distributed_pdlp/partitioner.hpp b/cpp/src/pdlp/distributed_pdlp/partitioner.hpp
index 82650ad805..2a2149db63 100644
--- a/cpp/src/pdlp/distributed_pdlp/partitioner.hpp
+++ b/cpp/src/pdlp/distributed_pdlp/partitioner.hpp
@@ -41,7 +41,7 @@ enum class partitioner_kind_t { Dummy, Metis };
 template <typename i_t, typename f_t>
 class partitioner_i {
  public:
-  virtual ~partitioner_i() = default;
+  virtual ~partitioner_i()                                                             = default;
   virtual std::vector<i_t> partition(partitioner_input_t<i_t, f_t> const& input) const = 0;
 };
 
diff --git a/cpp/src/pdlp/pdhg.cu b/cpp/src/pdlp/pdhg.cu
index ec983fd01b..b1f1a59ada 100644
--- a/cpp/src/pdlp/pdhg.cu
+++ b/cpp/src/pdlp/pdhg.cu
@@ -1257,9 +1257,7 @@ void pdhg_solver_t<i_t, f_t>::compute_next_primal_dual_solution_reflected(
       // the capture or run outside the graph, leaving the captured graph
       // empty (or broken) -- which produces the cycling/stall behavior we
       // observed on larger problems. Mirrors metis_tests bench.cu fork/join.
-      if (mgpu_engine_ != nullptr) {
-        mgpu_engine_->graph_capture_fork_to_shards(stream_view_);
-      }
+      if (mgpu_engine_ != nullptr) { mgpu_engine_->graph_capture_fork_to_shards(stream_view_); }
 
       compute_At_y();
       if (mgpu_engine_ != nullptr) {
@@ -1362,16 +1360,12 @@ void pdhg_solver_t<i_t, f_t>::compute_next_primal_dual_solution_reflected(
       // Multi-GPU: close the fork by joining every shard stream back into
       // the master stream so cudaStreamEndCapture sees a single graph
       // spanning all streams.
-      if (mgpu_engine_ != nullptr) {
-        mgpu_engine_->graph_capture_join_from_shards(stream_view_);
-      }
+      if (mgpu_engine_ != nullptr) { mgpu_engine_->graph_capture_join_from_shards(stream_view_); }
     });
 
   } else {
     graph_all.run(should_major, [&]() {
-      if (mgpu_engine_ != nullptr) {
-        mgpu_engine_->graph_capture_fork_to_shards(stream_view_);
-      }
+      if (mgpu_engine_ != nullptr) { mgpu_engine_->graph_capture_fork_to_shards(stream_view_); }
 
       // Compute next primal
       compute_At_y();
@@ -1478,9 +1472,7 @@ void pdhg_solver_t<i_t, f_t>::compute_next_primal_dual_solution_reflected(
       print("reflected_dual_", reflected_dual_);
 #endif
 
-      if (mgpu_engine_ != nullptr) {
-        mgpu_engine_->graph_capture_join_from_shards(stream_view_);
-      }
+      if (mgpu_engine_ != nullptr) { mgpu_engine_->graph_capture_join_from_shards(stream_view_); }
     });
   }
 
diff --git a/cpp/src/pdlp/pdlp.cu b/cpp/src/pdlp/pdlp.cu
index 013905b4fb..576ab417f1 100644
--- a/cpp/src/pdlp/pdlp.cu
+++ b/cpp/src/pdlp/pdlp.cu
@@ -390,8 +390,7 @@ pdlp_solver_t<i_t, f_t>::pdlp_solver_t(
   // Makes all inner feilds of master 0 size
   : pdlp_solver_t(placeholder_problem, settings, /*is_legacy_batch_mode=*/false)
 {
-  cuopt_expects(placeholder_problem.n_variables == 0 &&
-                  placeholder_problem.n_constraints == 0 &&
+  cuopt_expects(placeholder_problem.n_variables == 0 && placeholder_problem.n_constraints == 0 &&
                   placeholder_problem.nnz == 0,
                 error_type_t::ValidationError,
                 "Distributed mGPU pdlp_solver_t ctor requires a shape-0 "
@@ -407,297 +406,297 @@ pdlp_solver_t<i_t, f_t>::pdlp_solver_t(
   }
 
   if constexpr (!std::is_same_v<f_t, double>) {
-    cuopt_expects(false,
-                  error_type_t::ValidationError,
-                  "Distributed PDLP currently requires double precision");
+    cuopt_expects(
+      false, error_type_t::ValidationError, "Distributed PDLP currently requires double precision");
     return;
   }
-    // ----- 1. Read problem shape and bulk data directly from mps (host) -----
-    const i_t n_vars = static_cast<i_t>(mps.get_objective_coefficients().size());
-    const i_t n_cstr = static_cast<i_t>(mps.get_constraint_lower_bounds().size());
-    const i_t nnz    = static_cast<i_t>(mps.get_constraint_matrix_values().size());
-    cuopt_expects(n_vars > 0,
-                  error_type_t::ValidationError,
-                  "Distributed PDLP from mps requires a non-empty objective");
-    cuopt_expects(n_cstr > 0,
-                  error_type_t::ValidationError,
-                  "Distributed PDLP from mps requires at least one constraint");
-    cuopt_expects(static_cast<i_t>(mps.get_constraint_matrix_offsets().size()) == n_cstr + 1,
-                  error_type_t::ValidationError,
-                  "mps constraint_matrix_offsets size must equal n_constraints + 1");
-    cuopt_expects(
-      static_cast<i_t>(mps.get_constraint_matrix_indices().size()) == nnz,
-      error_type_t::ValidationError,
-      "mps constraint_matrix_indices size must equal nnz (constraint_matrix_values size)");
-    cuopt_expects(static_cast<i_t>(mps.get_constraint_upper_bounds().size()) == n_cstr,
-                  error_type_t::ValidationError,
-                  "mps constraint_upper_bounds size must equal n_constraints");
-    cuopt_expects(static_cast<i_t>(mps.get_variable_lower_bounds().size()) == n_vars,
-                  error_type_t::ValidationError,
-                  "mps variable_lower_bounds size must equal n_variables");
-    cuopt_expects(static_cast<i_t>(mps.get_variable_upper_bounds().size()) == n_vars,
-                  error_type_t::ValidationError,
-                  "mps variable_upper_bounds size must equal n_variables");
-
-    const bool maximize           = mps.get_sense();
-    f_t objective_offset          = mps.get_objective_offset();
-    f_t objective_scaling_factor  = mps.get_objective_scaling_factor();
-
-    // Objective: copy (mutable so we can negate for maximize, matching
-    // problem_helpers.cuh::convert_to_maximization_problem).
-    std::vector<f_t> h_obj = mps.get_objective_coefficients();
-    if (maximize) {
-      for (auto& v : h_obj) v = -v;
-      objective_offset         = -objective_offset;
-      objective_scaling_factor = -objective_scaling_factor;
-    }
-
-    // Bounds (copy from mps; engine ctor takes by const ref to std::vector).
-    std::vector<f_t> h_var_lower  = mps.get_variable_lower_bounds();
-    std::vector<f_t> h_var_upper  = mps.get_variable_upper_bounds();
-    std::vector<f_t> h_cstr_lower = mps.get_constraint_lower_bounds();
-    std::vector<f_t> h_cstr_upper = mps.get_constraint_upper_bounds();
-
-    // A (CSR) — mutable copies for the engine + partitioner consumers below.
-    std::vector<i_t> h_A_row_offsets = mps.get_constraint_matrix_offsets();
-    std::vector<i_t> h_A_col_indices = mps.get_constraint_matrix_indices();
-    std::vector<f_t> h_A_values      = mps.get_constraint_matrix_values();
-
-    // ----- 2. Transpose A -> A^T on the host (one-shot CSR transpose) -----
-    // CSC(A) and CSR(A^T) share the same memory layout, so the CSC produced
-    // by dual_simplex::csr_matrix_t::to_compressed_col IS the CSR of A^T.
-    // O(nnz + n_vars) counting sort, same as problem_t::compute_transpose.
-    namespace ds = cuopt::linear_programming::dual_simplex;
-    ds::csr_matrix_t<i_t, f_t> A_csr(n_cstr, n_vars, nnz);
-    A_csr.row_start = h_A_row_offsets;
-    A_csr.j         = h_A_col_indices;
-    A_csr.x         = h_A_values;
-    ds::csc_matrix_t<i_t, f_t> AT_as_csc(n_vars, n_cstr, nnz);
-    A_csr.to_compressed_col(AT_as_csc);
-    std::vector<i_t> h_A_t_row_offsets = std::move(AT_as_csc.col_start);
-    std::vector<i_t> h_A_t_col_indices = std::move(AT_as_csc.i);
-    std::vector<f_t> h_A_t_values      = std::move(AT_as_csc.x);
-
-    // ----- 3. Identity scaling for V1 -----
-    // Real multi-GPU scaling is a TODO; ship the unscaled problem to shards as
-    // both "unscaled" and "scaled" so the engine and per-shard pdlp_solver_t
-    // can run end-to-end. Scaling factor vectors are 1.0 everywhere so the
-    // shard-side unscale at the end is a no-op.
-    std::vector<f_t> h_A_values_scaled              = h_A_values;
-    std::vector<f_t> h_A_t_values_scaled            = h_A_t_values;
-    std::vector<f_t> h_obj_scaled                   = h_obj;
-    std::vector<f_t> h_var_lower_scaled             = h_var_lower;
-    std::vector<f_t> h_var_upper_scaled             = h_var_upper;
-    std::vector<f_t> h_cstr_lower_scaled            = h_cstr_lower;
-    std::vector<f_t> h_cstr_upper_scaled            = h_cstr_upper;
-    std::vector<f_t> h_cummulative_cstr_scaling(n_cstr, f_t(1.0));
-    std::vector<f_t> h_cummulative_var_scaling(n_vars, f_t(1.0));
-    const f_t h_bound_rescaling                     = f_t(1.0);
-    const f_t h_objective_rescaling                 = f_t(1.0);
-
-    // ----- 4. Partition -----
-    std::vector<i_t> parts;
-    if (!settings.multi_gpu_partition_file.empty()) {
-      parts = partition_loader_t<i_t, f_t>::parse_distributed_pdlp_partition_file(
-        settings.multi_gpu_partition_file);
-      validate_partition(parts, n_cstr, n_vars, distributed_pdlp_num_gpus, "partition file");
-    } else {
-      if (distributed_pdlp_num_gpus == 1) {
-        std::cout << "CAREFUL: distributed_pdlp_num_gpus == 1, running dummy version (single "
-                     "part covering "
-                  << n_cstr << " cstrs + " << n_vars << " vars)" << std::endl;
-      }
-      partitioner_input_t<i_t, f_t> partition_input;
-      partition_input.nb_cstr  = n_cstr;
-      partition_input.nb_vars  = n_vars;
-      partition_input.nb_parts = distributed_pdlp_num_gpus;
-
-      // METIS_PartGraphKway requires nparts >= 2; route num_gpus == 1 to Dummy.
-      const partitioner_kind_t kind =
-        (distributed_pdlp_num_gpus == 1) ? partitioner_kind_t::Dummy : partitioner_kind_t::Metis;
-      if (kind == partitioner_kind_t::Metis) {
-        // partitioner_input_t holds non-const std::vector<i_t>* pointers; we
-        // already have the data in our local mutable buffers above.
-        partition_input.A.row_offsets   = &h_A_row_offsets;
-        partition_input.A.col_indices   = &h_A_col_indices;
-        partition_input.A.num_rows      = n_cstr;
-        partition_input.A.num_cols      = n_vars;
-        partition_input.A_t.row_offsets = &h_A_t_row_offsets;
-        partition_input.A_t.col_indices = &h_A_t_col_indices;
-        partition_input.A_t.num_rows    = n_vars;
-        partition_input.A_t.num_cols    = n_cstr;
-      }
-      auto partitioner = make_partitioner<i_t, f_t>(kind);
-      parts            = partitioner->partition(partition_input);
-    }
-
-    // ----- 5. Build per-rank data -----
-    std::vector<rank_data_t<i_t, f_t>> sub_pdlp_rank_data =
-      partition_loader_t<i_t, f_t>::create_rank_data_from_parts(parts,
-                                                                h_A_row_offsets,
-                                                                h_A_col_indices,
-                                                                h_A_values,
-                                                                h_A_values_scaled,
-                                                                h_A_t_row_offsets,
-                                                                h_A_t_col_indices,
-                                                                h_A_t_values,
-                                                                h_A_t_values_scaled,
-                                                                settings.distributed_pdlp_num_gpus,
-                                                                n_cstr,
-                                                                n_vars,
-                                                                nnz);
-
-    // ----- 6. Per-shard settings -----
-    pdlp_solver_settings_t<i_t, f_t> sub_pdlp_settings                    = settings;
-    sub_pdlp_settings.num_gpus                                            = 1;
-    sub_pdlp_settings.distributed_pdlp_num_gpus                           = 1;
-    sub_pdlp_settings.multi_gpu_partition_file                            = "";
-    sub_pdlp_settings.is_distributed_sub_pdlp                             = true;
-    sub_pdlp_settings.hyper_params.default_l_inf_ruiz_iterations          = 0;
-    sub_pdlp_settings.hyper_params.default_alpha_pock_chambolle_rescaling = 0.0;
-
-    // ----- 7. Construct the engine: NCCL comms + per-shard pdlp_solver_t -----
-    multi_gpu_engine.emplace(std::move(sub_pdlp_rank_data),
-                             h_obj,
-                             h_var_lower,
-                             h_var_upper,
-                             h_cstr_lower,
-                             h_cstr_upper,
-                             h_obj_scaled,
-                             h_var_lower_scaled,
-                             h_var_upper_scaled,
-                             h_cstr_lower_scaled,
-                             h_cstr_upper_scaled,
-                             h_cummulative_cstr_scaling,
-                             h_cummulative_var_scaling,
-                             h_bound_rescaling,
-                             h_objective_rescaling,
-                             maximize,
-                             objective_offset,
-                             objective_scaling_factor,
-                             sub_pdlp_settings);
-
-    // ----- 8 Distributed Scaling -----
-    for (auto& shard : multi_gpu_engine->shards) {
-      raft::device_setter guard(shard->device_id);
-      shard->sub_pdlp->get_initial_scaling_strategy().reset_scaling_state_for_distributed();
-    }
-    for (auto& shard : multi_gpu_engine->shards) {
-      raft::device_setter guard(shard->device_id);
-      shard->stream.synchronize();
-    }
-
-    // Distributed scaling
-    if (settings_.hyper_params.do_ruiz_scaling) {
-      multi_gpu_engine->distributed_ruiz_inf_scaling(
-        settings_.hyper_params.default_l_inf_ruiz_iterations, n_vars);
-    }
-    if (settings_.hyper_params.do_pock_chambolle_scaling) {
-      multi_gpu_engine->distributed_pock_chambolle_scaling(
-        static_cast<f_t>(settings_.hyper_params.default_alpha_pock_chambolle_rescaling), n_vars);
-    }
-
-    for (auto& shard : multi_gpu_engine->shards) {
-      raft::device_setter guard(shard->device_id);
-      auto& scaling = shard->sub_pdlp->get_initial_scaling_strategy();
-      scaling.scale_problem();
-
-      shard->sub_pdlp->pdhg_solver_.get_cusparse_view().create_spmv_op_plans(
-        /*is_reflected=*/settings_.hyper_params.use_reflected_primal_dual);
-    }
-    for (auto& shard : multi_gpu_engine->shards) {
-      raft::device_setter guard(shard->device_id);
-      shard->stream.synchronize();
-    }
-
-    // ----- 8b. Seed initial step-size / primal-weight (distributed, scales to N shards) -----
-    constexpr f_t kStepSizeScale = f_t{0.998};
-    const f_t sigma_max          = multi_gpu_engine->distributed_max_singular_value(n_cstr);
-    const f_t h_primal_weight    = f_t{1};
-    const f_t h_step_size        = (sigma_max > f_t{0}) ? kStepSizeScale / sigma_max : f_t{1};
-    // With primal_weight = 1 the adaptive step-size strategy collapses to
-    // primal_step_size = step_size / primal_weight = step_size
-    // dual_step_size   = step_size * primal_weight = step_size.
-    const f_t h_primal_step_size = h_step_size;
-    const f_t h_dual_step_size   = h_step_size;
-
-    // Put the values on master
-    raft::copy(step_size_.data(), &h_step_size, 1, stream_view_);
-    raft::copy(primal_weight_.data(), &h_primal_weight, 1, stream_view_);
-    raft::copy(best_primal_weight_.data(), &h_primal_weight, 1, stream_view_);
-    raft::copy(primal_step_size_.data(), &h_primal_step_size, 1, stream_view_);
-    raft::copy(dual_step_size_.data(), &h_dual_step_size, 1, stream_view_);
-    handle_ptr_->sync_stream(stream_view_);
-
-    // put the values on each shard
-    for (auto& shard : multi_gpu_engine->shards) {
-      raft::device_setter guard(shard->device_id);
-      auto& sub = *shard->sub_pdlp;
-      raft::copy(sub.step_size_.data(), &h_step_size, 1, shard->stream);
-      raft::copy(sub.primal_weight_.data(), &h_primal_weight, 1, shard->stream);
-      raft::copy(sub.best_primal_weight_.data(), &h_primal_weight, 1, shard->stream);
-      raft::copy(sub.get_primal_step_size().data(), &h_primal_step_size, 1, shard->stream);
-      raft::copy(sub.get_dual_step_size().data(), &h_dual_step_size, 1, shard->stream);
-    }
-    for (auto& shard : multi_gpu_engine->shards) {
-      raft::device_setter guard(shard->device_id);
-      shard->stream.synchronize();
-    }
-
-    // Wire the engine into master's pdhg_solver_; shards keep mgpu_engine_ == nullptr.
-    pdhg_solver_.set_multi_gpu_engine(&*multi_gpu_engine);
-
-    // ----- 9. Resize master gather destinations to the full problem size -----
-    pdhg_solver_.get_potential_next_primal_solution().resize(n_vars, stream_view_);
-    pdhg_solver_.get_potential_next_dual_solution().resize(n_cstr, stream_view_);
-    current_termination_strategy_.get_convergence_information().get_reduced_cost().resize(
-      n_vars, stream_view_);
-    primal_size_h_ = n_vars;
-    dual_size_h_   = n_cstr;
-
-    // Distributed conergence_information::init_l2_norms
-    for (auto& shard : multi_gpu_engine->shards) {
-      raft::device_setter guard(shard->device_id);
-      shard->sub_pdlp->get_current_termination_strategy()
-        .get_convergence_information()
-        .compute_owned_reference_norm_partials(shard->rank_data.owned_var_size,
-                                               shard->rank_data.owned_cstr_size);
+  // ----- 1. Read problem shape and bulk data directly from mps (host) -----
+  const i_t n_vars = static_cast<i_t>(mps.get_objective_coefficients().size());
+  const i_t n_cstr = static_cast<i_t>(mps.get_constraint_lower_bounds().size());
+  const i_t nnz    = static_cast<i_t>(mps.get_constraint_matrix_values().size());
+  cuopt_expects(n_vars > 0,
+                error_type_t::ValidationError,
+                "Distributed PDLP from mps requires a non-empty objective");
+  cuopt_expects(n_cstr > 0,
+                error_type_t::ValidationError,
+                "Distributed PDLP from mps requires at least one constraint");
+  cuopt_expects(static_cast<i_t>(mps.get_constraint_matrix_offsets().size()) == n_cstr + 1,
+                error_type_t::ValidationError,
+                "mps constraint_matrix_offsets size must equal n_constraints + 1");
+  cuopt_expects(
+    static_cast<i_t>(mps.get_constraint_matrix_indices().size()) == nnz,
+    error_type_t::ValidationError,
+    "mps constraint_matrix_indices size must equal nnz (constraint_matrix_values size)");
+  cuopt_expects(static_cast<i_t>(mps.get_constraint_upper_bounds().size()) == n_cstr,
+                error_type_t::ValidationError,
+                "mps constraint_upper_bounds size must equal n_constraints");
+  cuopt_expects(static_cast<i_t>(mps.get_variable_lower_bounds().size()) == n_vars,
+                error_type_t::ValidationError,
+                "mps variable_lower_bounds size must equal n_variables");
+  cuopt_expects(static_cast<i_t>(mps.get_variable_upper_bounds().size()) == n_vars,
+                error_type_t::ValidationError,
+                "mps variable_upper_bounds size must equal n_variables");
+
+  const bool maximize          = mps.get_sense();
+  f_t objective_offset         = mps.get_objective_offset();
+  f_t objective_scaling_factor = mps.get_objective_scaling_factor();
+
+  // Objective: copy (mutable so we can negate for maximize, matching
+  // problem_helpers.cuh::convert_to_maximization_problem).
+  std::vector<f_t> h_obj = mps.get_objective_coefficients();
+  if (maximize) {
+    for (auto& v : h_obj)
+      v = -v;
+    objective_offset         = -objective_offset;
+    objective_scaling_factor = -objective_scaling_factor;
+  }
+
+  // Bounds (copy from mps; engine ctor takes by const ref to std::vector).
+  std::vector<f_t> h_var_lower  = mps.get_variable_lower_bounds();
+  std::vector<f_t> h_var_upper  = mps.get_variable_upper_bounds();
+  std::vector<f_t> h_cstr_lower = mps.get_constraint_lower_bounds();
+  std::vector<f_t> h_cstr_upper = mps.get_constraint_upper_bounds();
+
+  // A (CSR) — mutable copies for the engine + partitioner consumers below.
+  std::vector<i_t> h_A_row_offsets = mps.get_constraint_matrix_offsets();
+  std::vector<i_t> h_A_col_indices = mps.get_constraint_matrix_indices();
+  std::vector<f_t> h_A_values      = mps.get_constraint_matrix_values();
+
+  // ----- 2. Transpose A -> A^T on the host (one-shot CSR transpose) -----
+  // CSC(A) and CSR(A^T) share the same memory layout, so the CSC produced
+  // by dual_simplex::csr_matrix_t::to_compressed_col IS the CSR of A^T.
+  // O(nnz + n_vars) counting sort, same as problem_t::compute_transpose.
+  namespace ds = cuopt::linear_programming::dual_simplex;
+  ds::csr_matrix_t<i_t, f_t> A_csr(n_cstr, n_vars, nnz);
+  A_csr.row_start = h_A_row_offsets;
+  A_csr.j         = h_A_col_indices;
+  A_csr.x         = h_A_values;
+  ds::csc_matrix_t<i_t, f_t> AT_as_csc(n_vars, n_cstr, nnz);
+  A_csr.to_compressed_col(AT_as_csc);
+  std::vector<i_t> h_A_t_row_offsets = std::move(AT_as_csc.col_start);
+  std::vector<i_t> h_A_t_col_indices = std::move(AT_as_csc.i);
+  std::vector<f_t> h_A_t_values      = std::move(AT_as_csc.x);
+
+  // ----- 3. Identity scaling for V1 -----
+  // The host-side "scaled" buffers start as plain copies of the unscaled data and
+  // the cumulative scaling factors start at 1.0, so the engine and per-shard
+  // pdlp_solver_t can be constructed end-to-end. The distributed scaling itself
+  // (Ruiz / Pock-Chambolle, when enabled) is applied on the shards in step 8 below.
+  std::vector<f_t> h_A_values_scaled   = h_A_values;
+  std::vector<f_t> h_A_t_values_scaled = h_A_t_values;
+  std::vector<f_t> h_obj_scaled        = h_obj;
+  std::vector<f_t> h_var_lower_scaled  = h_var_lower;
+  std::vector<f_t> h_var_upper_scaled  = h_var_upper;
+  std::vector<f_t> h_cstr_lower_scaled = h_cstr_lower;
+  std::vector<f_t> h_cstr_upper_scaled = h_cstr_upper;
+  std::vector<f_t> h_cummulative_cstr_scaling(n_cstr, f_t(1.0));
+  std::vector<f_t> h_cummulative_var_scaling(n_vars, f_t(1.0));
+  const f_t h_bound_rescaling     = f_t(1.0);
+  const f_t h_objective_rescaling = f_t(1.0);
+
+  // ----- 4. Partition -----
+  std::vector<i_t> parts;
+  if (!settings.multi_gpu_partition_file.empty()) {
+    parts = partition_loader_t<i_t, f_t>::parse_distributed_pdlp_partition_file(
+      settings.multi_gpu_partition_file);
+    validate_partition(parts, n_cstr, n_vars, distributed_pdlp_num_gpus, "partition file");
+  } else {
+    if (distributed_pdlp_num_gpus == 1) {
+      std::cout << "CAREFUL: distributed_pdlp_num_gpus == 1, running dummy version (single "
+                   "part covering "
+                << n_cstr << " cstrs + " << n_vars << " vars)" << std::endl;
     }
-    multi_gpu_engine->allreduce_sum_inplace([](pdlp_solver_t<i_t, f_t>& sp) -> f_t* {
-      return sp.get_current_termination_strategy()
-        .get_convergence_information()
-        .l2_norm_primal_right_hand_side_data();
-    });
-    multi_gpu_engine->allreduce_sum_inplace([](pdlp_solver_t<i_t, f_t>& sp) -> f_t* {
-      return sp.get_current_termination_strategy()
-        .get_convergence_information()
-        .l2_norm_primal_linear_objective_data();
-    });
-    for (auto& shard : multi_gpu_engine->shards) {
-      raft::device_setter guard(shard->device_id);
-      shard->sub_pdlp->get_current_termination_strategy()
-        .get_convergence_information()
-        .sqrt_reference_norms_inplace();
-      shard->stream.synchronize();
+    partitioner_input_t<i_t, f_t> partition_input;
+    partition_input.nb_cstr  = n_cstr;
+    partition_input.nb_vars  = n_vars;
+    partition_input.nb_parts = distributed_pdlp_num_gpus;
+
+    // METIS_PartGraphKway requires nparts >= 2; route num_gpus == 1 to Dummy.
+    const partitioner_kind_t kind =
+      (distributed_pdlp_num_gpus == 1) ? partitioner_kind_t::Dummy : partitioner_kind_t::Metis;
+    if (kind == partitioner_kind_t::Metis) {
+      // partitioner_input_t holds non-const std::vector<i_t>* pointers; we
+      // already have the data in our local mutable buffers above.
+      partition_input.A.row_offsets   = &h_A_row_offsets;
+      partition_input.A.col_indices   = &h_A_col_indices;
+      partition_input.A.num_rows      = n_cstr;
+      partition_input.A.num_cols      = n_vars;
+      partition_input.A_t.row_offsets = &h_A_t_row_offsets;
+      partition_input.A_t.col_indices = &h_A_t_col_indices;
+      partition_input.A_t.num_rows    = n_vars;
+      partition_input.A_t.num_cols    = n_cstr;
     }
-    // Broadcast the values to the master
-    {
-      auto& s0      = *multi_gpu_engine->shards[0];
-      auto& s0_conv = s0.sub_pdlp->get_current_termination_strategy().get_convergence_information();
-      raft::device_setter guard(s0.device_id);
-      for (auto* ts : {&current_termination_strategy_, &average_termination_strategy_}) {
-        auto& ci = ts->get_convergence_information();
-        raft::copy(ci.l2_norm_primal_right_hand_side_data(),
-                   s0_conv.l2_norm_primal_right_hand_side_data(),
-                   1,
-                   stream_view_);
-        raft::copy(ci.l2_norm_primal_linear_objective_data(),
-                   s0_conv.l2_norm_primal_linear_objective_data(),
-                   1,
-                   stream_view_);
-      }
+    auto partitioner = make_partitioner<i_t, f_t>(kind);
+    parts            = partitioner->partition(partition_input);
+  }
+
+  // ----- 5. Build per-rank data -----
+  std::vector<rank_data_t<i_t, f_t>> sub_pdlp_rank_data =
+    partition_loader_t<i_t, f_t>::create_rank_data_from_parts(parts,
+                                                              h_A_row_offsets,
+                                                              h_A_col_indices,
+                                                              h_A_values,
+                                                              h_A_values_scaled,
+                                                              h_A_t_row_offsets,
+                                                              h_A_t_col_indices,
+                                                              h_A_t_values,
+                                                              h_A_t_values_scaled,
+                                                              settings.distributed_pdlp_num_gpus,
+                                                              n_cstr,
+                                                              n_vars,
+                                                              nnz);
+
+  // ----- 6. Per-shard settings -----
+  pdlp_solver_settings_t<i_t, f_t> sub_pdlp_settings                    = settings;
+  sub_pdlp_settings.num_gpus                                            = 1;
+  sub_pdlp_settings.distributed_pdlp_num_gpus                           = 1;
+  sub_pdlp_settings.multi_gpu_partition_file                            = "";
+  sub_pdlp_settings.is_distributed_sub_pdlp                             = true;
+  sub_pdlp_settings.hyper_params.default_l_inf_ruiz_iterations          = 0;
+  sub_pdlp_settings.hyper_params.default_alpha_pock_chambolle_rescaling = 0.0;
+
+  // ----- 7. Construct the engine: NCCL comms + per-shard pdlp_solver_t -----
+  multi_gpu_engine.emplace(std::move(sub_pdlp_rank_data),
+                           h_obj,
+                           h_var_lower,
+                           h_var_upper,
+                           h_cstr_lower,
+                           h_cstr_upper,
+                           h_obj_scaled,
+                           h_var_lower_scaled,
+                           h_var_upper_scaled,
+                           h_cstr_lower_scaled,
+                           h_cstr_upper_scaled,
+                           h_cummulative_cstr_scaling,
+                           h_cummulative_var_scaling,
+                           h_bound_rescaling,
+                           h_objective_rescaling,
+                           maximize,
+                           objective_offset,
+                           objective_scaling_factor,
+                           sub_pdlp_settings);
+
+  // ----- 8. Distributed scaling -----
+  for (auto& shard : multi_gpu_engine->shards) {
+    raft::device_setter guard(shard->device_id);
+    shard->sub_pdlp->get_initial_scaling_strategy().reset_scaling_state_for_distributed();
+  }
+  for (auto& shard : multi_gpu_engine->shards) {
+    raft::device_setter guard(shard->device_id);
+    shard->stream.synchronize();
+  }
+
+  // Distributed scaling
+  if (settings_.hyper_params.do_ruiz_scaling) {
+    multi_gpu_engine->distributed_ruiz_inf_scaling(
+      settings_.hyper_params.default_l_inf_ruiz_iterations, n_vars);
+  }
+  if (settings_.hyper_params.do_pock_chambolle_scaling) {
+    multi_gpu_engine->distributed_pock_chambolle_scaling(
+      static_cast<f_t>(settings_.hyper_params.default_alpha_pock_chambolle_rescaling), n_vars);
+  }
+
+  for (auto& shard : multi_gpu_engine->shards) {
+    raft::device_setter guard(shard->device_id);
+    auto& scaling = shard->sub_pdlp->get_initial_scaling_strategy();
+    scaling.scale_problem();
+
+    shard->sub_pdlp->pdhg_solver_.get_cusparse_view().create_spmv_op_plans(
+      /*is_reflected=*/settings_.hyper_params.use_reflected_primal_dual);
+  }
+  for (auto& shard : multi_gpu_engine->shards) {
+    raft::device_setter guard(shard->device_id);
+    shard->stream.synchronize();
+  }
+
+  // ----- 8b. Seed initial step-size / primal-weight (distributed, scales to N shards) -----
+  constexpr f_t kStepSizeScale = f_t{0.998};
+  const f_t sigma_max          = multi_gpu_engine->distributed_max_singular_value(n_cstr);
+  const f_t h_primal_weight    = f_t{1};
+  const f_t h_step_size        = (sigma_max > f_t{0}) ? kStepSizeScale / sigma_max : f_t{1};
+  // With primal_weight = 1 the adaptive step-size strategy collapses to
+  // primal_step_size = step_size / primal_weight = step_size
+  // dual_step_size   = step_size * primal_weight = step_size.
+  const f_t h_primal_step_size = h_step_size;
+  const f_t h_dual_step_size   = h_step_size;
+
+  // Put the values on master
+  raft::copy(step_size_.data(), &h_step_size, 1, stream_view_);
+  raft::copy(primal_weight_.data(), &h_primal_weight, 1, stream_view_);
+  raft::copy(best_primal_weight_.data(), &h_primal_weight, 1, stream_view_);
+  raft::copy(primal_step_size_.data(), &h_primal_step_size, 1, stream_view_);
+  raft::copy(dual_step_size_.data(), &h_dual_step_size, 1, stream_view_);
+  handle_ptr_->sync_stream(stream_view_);
+
+  // put the values on each shard
+  for (auto& shard : multi_gpu_engine->shards) {
+    raft::device_setter guard(shard->device_id);
+    auto& sub = *shard->sub_pdlp;
+    raft::copy(sub.step_size_.data(), &h_step_size, 1, shard->stream);
+    raft::copy(sub.primal_weight_.data(), &h_primal_weight, 1, shard->stream);
+    raft::copy(sub.best_primal_weight_.data(), &h_primal_weight, 1, shard->stream);
+    raft::copy(sub.get_primal_step_size().data(), &h_primal_step_size, 1, shard->stream);
+    raft::copy(sub.get_dual_step_size().data(), &h_dual_step_size, 1, shard->stream);
+  }
+  for (auto& shard : multi_gpu_engine->shards) {
+    raft::device_setter guard(shard->device_id);
+    shard->stream.synchronize();
+  }
+
+  // Wire the engine into master's pdhg_solver_; shards keep mgpu_engine_ == nullptr.
+  pdhg_solver_.set_multi_gpu_engine(&*multi_gpu_engine);
+
+  // ----- 9. Resize master gather destinations to the full problem size -----
+  pdhg_solver_.get_potential_next_primal_solution().resize(n_vars, stream_view_);
+  pdhg_solver_.get_potential_next_dual_solution().resize(n_cstr, stream_view_);
+  current_termination_strategy_.get_convergence_information().get_reduced_cost().resize(
+    n_vars, stream_view_);
+  primal_size_h_ = n_vars;
+  dual_size_h_   = n_cstr;
+
+  // Distributed convergence_information::init_l2_norms
+  for (auto& shard : multi_gpu_engine->shards) {
+    raft::device_setter guard(shard->device_id);
+    shard->sub_pdlp->get_current_termination_strategy()
+      .get_convergence_information()
+      .compute_owned_reference_norm_partials(shard->rank_data.owned_var_size,
+                                             shard->rank_data.owned_cstr_size);
+  }
+  multi_gpu_engine->allreduce_sum_inplace([](pdlp_solver_t<i_t, f_t>& sp) -> f_t* {
+    return sp.get_current_termination_strategy()
+      .get_convergence_information()
+      .l2_norm_primal_right_hand_side_data();
+  });
+  multi_gpu_engine->allreduce_sum_inplace([](pdlp_solver_t<i_t, f_t>& sp) -> f_t* {
+    return sp.get_current_termination_strategy()
+      .get_convergence_information()
+      .l2_norm_primal_linear_objective_data();
+  });
+  for (auto& shard : multi_gpu_engine->shards) {
+    raft::device_setter guard(shard->device_id);
+    shard->sub_pdlp->get_current_termination_strategy()
+      .get_convergence_information()
+      .sqrt_reference_norms_inplace();
+    shard->stream.synchronize();
+  }
+  // Broadcast the values to the master
+  {
+    auto& s0      = *multi_gpu_engine->shards[0];
+    auto& s0_conv = s0.sub_pdlp->get_current_termination_strategy().get_convergence_information();
+    raft::device_setter guard(s0.device_id);
+    for (auto* ts : {&current_termination_strategy_, &average_termination_strategy_}) {
+      auto& ci = ts->get_convergence_information();
+      raft::copy(ci.l2_norm_primal_right_hand_side_data(),
+                 s0_conv.l2_norm_primal_right_hand_side_data(),
+                 1,
+                 stream_view_);
+      raft::copy(ci.l2_norm_primal_linear_objective_data(),
+                 s0_conv.l2_norm_primal_linear_objective_data(),
+                 1,
+                 stream_view_);
     }
-    handle_ptr_->sync_stream(stream_view_);
+  }
+  handle_ptr_->sync_stream(stream_view_);
 }
 
 template <typename i_t, typename f_t>
@@ -2418,10 +2417,9 @@ void pdlp_solver_t<i_t, f_t>::compute_fixed_error(std::vector<int>& has_restarte
 
     multi_gpu_engine->allreduce_sum_inplace(
       [](auto& sp) -> f_t* { return sp.step_size_strategy_.get_interaction().data(); });
-    multi_gpu_engine->allreduce_sum_inplace(
-      [](auto& sp) -> f_t* {
-        return sp.step_size_strategy_.get_norm_squared_delta_primal().data();
-      });
+    multi_gpu_engine->allreduce_sum_inplace([](auto& sp) -> f_t* {
+      return sp.step_size_strategy_.get_norm_squared_delta_primal().data();
+    });
     multi_gpu_engine->allreduce_sum_inplace(
       [](auto& sp) -> f_t* { return sp.step_size_strategy_.get_norm_squared_delta_dual().data(); });
 
@@ -3022,9 +3020,7 @@ optimization_problem_solution_t<i_t, f_t> pdlp_solver_t<i_t, f_t>::run_solver(co
         // 1. At the very beginning of the solver, when no steps have been taken yet
         // 2. After a single step, since average of one step is the same step
         if (internal_solver_iterations_ <= 1) {
-          if (multi_gpu_engine) {
-            assert(false && "Not implemented");
-          }
+          if (multi_gpu_engine) { assert(false && "Not implemented"); }
           raft::copy(unscaled_primal_avg_solution_.data(),
                      pdhg_solver_.get_primal_solution().data(),
                      primal_size_h_,
diff --git a/cpp/src/pdlp/restart_strategy/pdlp_restart_strategy.cu b/cpp/src/pdlp/restart_strategy/pdlp_restart_strategy.cu
index b7d49fc32f..ee1d19b96b 100644
--- a/cpp/src/pdlp/restart_strategy/pdlp_restart_strategy.cu
+++ b/cpp/src/pdlp/restart_strategy/pdlp_restart_strategy.cu
@@ -931,11 +931,11 @@ void pdlp_restart_strategy_t<i_t, f_t>::cupdlpx_restart(
                  .last_restart_duality_gap_.primal_distance_traveled_.data(),
                1,
                stream_view_);
-    raft::copy(last_restart_duality_gap_.dual_distance_traveled_.data(),
-               s0.sub_pdlp->get_restart_strategy()
-                 .last_restart_duality_gap_.dual_distance_traveled_.data(),
-               1,
-               stream_view_);
+    raft::copy(
+      last_restart_duality_gap_.dual_distance_traveled_.data(),
+      s0.sub_pdlp->get_restart_strategy().last_restart_duality_gap_.dual_distance_traveled_.data(),
+      1,
+      stream_view_);
   } else {
     distance_squared_moved_from_last_restart_period(
       pdhg_solver.get_potential_next_primal_solution(),
@@ -1021,8 +1021,7 @@ void pdlp_restart_strategy_t<i_t, f_t>::cupdlpx_restart(
 
     engine->for_each_shard([&](auto& shard) {
       auto& sub = *shard.sub_pdlp;
-      raft::copy(
-        sub.get_primal_step_size().data(), &h_primal_step_size, 1, shard.stream.view());
+      raft::copy(sub.get_primal_step_size().data(), &h_primal_step_size, 1, shard.stream.view());
       raft::copy(sub.get_dual_step_size().data(), &h_dual_step_size, 1, shard.stream.view());
       raft::copy(sub.get_primal_weight().data(), &h_primal_weight, 1, shard.stream.view());
       raft::copy(
@@ -1087,8 +1086,8 @@ void pdlp_restart_strategy_t<i_t, f_t>::cupdlpx_restart(
 
   if (auto* engine = pdhg_solver.get_mgpu_engine()) {
     engine->for_each_shard([&](auto& shard) {
-      shard.sub_pdlp->get_restart_strategy().weighted_average_solution_.iterations_since_last_restart_ =
-        0;
+      shard.sub_pdlp->get_restart_strategy()
+        .weighted_average_solution_.iterations_since_last_restart_ = 0;
     });
   }
 }
diff --git a/cpp/src/pdlp/solve.cu b/cpp/src/pdlp/solve.cu
index feaeb7bd57..156a601b29 100644
--- a/cpp/src/pdlp/solve.cu
+++ b/cpp/src/pdlp/solve.cu
@@ -2126,11 +2126,12 @@ optimization_problem_solution_t<i_t, f_t> solve_lp(
   bool problem_checking,
   bool use_pdlp_solver_mode)
 {
-  cuopt_expects(settings.hyper_params.use_distributed_pdlp,
-                error_type_t::ValidationError,
-                "solve_lp from mps_data_model: settings.hyper_params.use_distributed_pdlp must be true");
-    return solve_lp_distributed_from_mps(
-      handle_ptr, mps_data_model, settings, problem_checking, use_pdlp_solver_mode);
+  cuopt_expects(
+    settings.hyper_params.use_distributed_pdlp,
+    error_type_t::ValidationError,
+    "solve_lp from mps_data_model: settings.hyper_params.use_distributed_pdlp must be true");
+  return solve_lp_distributed_from_mps(
+    handle_ptr, mps_data_model, settings, problem_checking, use_pdlp_solver_mode);
 }
 
 template <typename i_t, typename f_t>
@@ -2182,12 +2183,13 @@ optimization_problem_solution_t<i_t, f_t> solve_lp_distributed_from_mps(
   const i_t n_vars = static_cast<i_t>(mps_data_model.get_objective_coefficients().size());
   const i_t n_cstr = static_cast<i_t>(mps_data_model.get_constraint_lower_bounds().size());
   const i_t nnz    = static_cast<i_t>(mps_data_model.get_constraint_matrix_values().size());
-  CUOPT_LOG_INFO("Solving a problem with %d constraints, %d variables (%d integers), and %d "
-                 "nonzeros (distributed mps-direct path)",
-                 n_cstr,
-                 n_vars,
-                 0,
-                 nnz);
+  CUOPT_LOG_INFO(
+    "Solving a problem with %d constraints, %d variables (%d integers), and %d "
+    "nonzeros (distributed mps-direct path)",
+    n_cstr,
+    n_vars,
+    0,
+    nnz);
 
   auto lp_timer = cuopt::timer_t(settings_resolved.time_limit);
 
@@ -2200,8 +2202,7 @@ optimization_problem_solution_t<i_t, f_t> solve_lp_distributed_from_mps(
   }
   detail::problem_t<i_t, f_t> placeholder_problem(placeholder_op);
 
-  detail::pdlp_solver_t<i_t, f_t> solver(
-    placeholder_problem, mps_data_model, settings_resolved);
+  detail::pdlp_solver_t<i_t, f_t> solver(placeholder_problem, mps_data_model, settings_resolved);
 
   auto sol = solver.run_solver(lp_timer);
 
diff --git a/cpp/src/pdlp/solve.cuh b/cpp/src/pdlp/solve.cuh
index abb657943f..160f4602ba 100644
--- a/cpp/src/pdlp/solve.cuh
+++ b/cpp/src/pdlp/solve.cuh
@@ -64,8 +64,7 @@ cuopt::linear_programming::optimization_problem_solution_t<i_t, f_t> solve_lp_wi
  * @pre `settings.hyper_params.use_distributed_pdlp == true`.
  */
 template <typename i_t, typename f_t>
-cuopt::linear_programming::optimization_problem_solution_t<i_t, f_t>
-solve_lp_distributed_from_mps(
+cuopt::linear_programming::optimization_problem_solution_t<i_t, f_t> solve_lp_distributed_from_mps(
   raft::handle_t const* handle_ptr,
   const cuopt::linear_programming::io::mps_data_model_t<i_t, f_t>& mps_data_model,
   pdlp_solver_settings_t<i_t, f_t> const& settings,
diff --git a/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.cu b/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.cu
index 530a426117..aac777a44e 100644
--- a/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.cu
+++ b/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.cu
@@ -369,12 +369,10 @@ void adaptive_step_size_strategy_t<i_t, f_t>::compute_interaction_and_movement(
   i_t owned_cstr_size)
 {
   // mGPU needs to know owned size to restrict the reductions to the owned prefix
-  const i_t reduce_primal_size = (owned_primal_size >= 0)
-                                   ? owned_primal_size
-                                   : current_saddle_point_state.get_primal_size();
-  const i_t reduce_dual_size   = (owned_cstr_size >= 0)
-                                   ? owned_cstr_size
-                                   : current_saddle_point_state.get_dual_size();
+  const i_t reduce_primal_size =
+    (owned_primal_size >= 0) ? owned_primal_size : current_saddle_point_state.get_primal_size();
+  const i_t reduce_dual_size =
+    (owned_cstr_size >= 0) ? owned_cstr_size : current_saddle_point_state.get_dual_size();
 
   // QP would need this:
   // if iszero(problem.objective_matrix)
diff --git a/cpp/src/pdlp/termination_strategy/convergence_information.cu b/cpp/src/pdlp/termination_strategy/convergence_information.cu
index 44ddd5b2a1..1dfc8229da 100644
--- a/cpp/src/pdlp/termination_strategy/convergence_information.cu
+++ b/cpp/src/pdlp/termination_strategy/convergence_information.cu
@@ -213,8 +213,8 @@ void convergence_information_t<i_t, f_t>::init_l2_norms()
 }
 
 template <typename i_t, typename f_t>
-void convergence_information_t<i_t, f_t>::compute_owned_reference_norm_partials(
-  i_t owned_var_size, i_t owned_cstr_size)
+void convergence_information_t<i_t, f_t>::compute_owned_reference_norm_partials(i_t owned_var_size,
+                                                                                i_t owned_cstr_size)
 {
   cuopt_assert(!batch_mode_, "owned reference-norm partials only used in non-batch mGPU mode");
   cuopt_assert(owned_var_size <= primal_size_h_, "owned_var_size must be <= primal_size_h_");
@@ -233,7 +233,7 @@ void convergence_information_t<i_t, f_t>::compute_owned_reference_norm_partials(
   // rhs_sum_of_squares(lower[0:owned_cstr], upper[0:owned_cstr])  (no sqrt)
   {
     rmm::device_buffer d_temp_storage;
-    size_t bytes = 0;
+    size_t bytes   = 0;
     auto zip_begin = thrust::make_zip_iterator(problem_ptr->constraint_lower_bounds.data(),
                                                problem_ptr->constraint_upper_bounds.data());
     cub::DeviceReduce::TransformReduce(nullptr,
@@ -491,8 +491,7 @@ void convergence_information_t<i_t, f_t>::compute_convergence_information(
   print("dual_slack", dual_slack);
 #endif
 
-  if (current_pdhg_solver.is_multi_gpu())
-  {
+  if (current_pdhg_solver.is_multi_gpu()) {
     auto* engine = current_pdhg_solver.get_mgpu_engine();
     cuopt_assert(engine != nullptr,
                  "mGPU branch reached but current_pdhg_solver has no engine (shard pdhg?)");
@@ -502,19 +501,17 @@ void convergence_information_t<i_t, f_t>::compute_convergence_information(
 
     // Prepares halo values in potential_next_primal_solution
 
-    engine->halo_exchange_var(
-      [](pdhg_solver_t<i_t, f_t>& pdhg) -> rmm::device_uvector<f_t>& {
-        return pdhg.get_potential_next_primal_solution();
-      });
+    engine->halo_exchange_var([](pdhg_solver_t<i_t, f_t>& pdhg) -> rmm::device_uvector<f_t>& {
+      return pdhg.get_potential_next_primal_solution();
+    });
 
     for (auto& shard : engine->shards) {
       raft::device_setter guard(shard->device_id);
       auto& sub_pdlp = *shard->sub_pdlp;
       auto& sub_conv = sub_pdlp.get_current_termination_strategy().get_convergence_information();
-      sub_conv.compute_primal_residual(
-        sub_conv.op_problem_cusparse_view_,
-        sub_pdlp.pdhg_solver_.get_dual_tmp_resource(),
-        sub_pdlp.pdhg_solver_.get_potential_next_dual_solution());
+      sub_conv.compute_primal_residual(sub_conv.op_problem_cusparse_view_,
+                                       sub_pdlp.pdhg_solver_.get_dual_tmp_resource(),
+                                       sub_pdlp.pdhg_solver_.get_potential_next_dual_solution());
       sub_conv.compute_primal_objective_owned_partial(
         sub_pdlp.pdhg_solver_.get_potential_next_primal_solution(),
         shard->rank_data.owned_var_size);
@@ -522,13 +519,12 @@ void convergence_information_t<i_t, f_t>::compute_convergence_information(
 
     // Reduce all primal objectives across shards
     cuopt_assert(!batch_mode_, "multi-GPU PDLP is not supported in batch mode");
-    engine->allreduce_sum_inplace(
-      [](pdlp_solver_t<i_t, f_t>& sp) -> f_t* {
-        return sp.get_current_termination_strategy()
-          .get_convergence_information()
-          .get_primal_objective()
-          .data();
-      });
+    engine->allreduce_sum_inplace([](pdlp_solver_t<i_t, f_t>& sp) -> f_t* {
+      return sp.get_current_termination_strategy()
+        .get_convergence_information()
+        .get_primal_objective()
+        .data();
+    });
 
     // Get the reduced primal objective from the shard[0] (arbitrary)
     // Sync shards with master stream to avoid race conditions
@@ -536,16 +532,15 @@ void convergence_information_t<i_t, f_t>::compute_convergence_information(
     {
       auto& s0 = *engine->shards[0];
       raft::device_setter guard(s0.device_id);
-      auto& s0_conv =
-        s0.sub_pdlp->get_current_termination_strategy().get_convergence_information();
+      auto& s0_conv = s0.sub_pdlp->get_current_termination_strategy().get_convergence_information();
       raft::copy(primal_objective_.data(), s0_conv.get_primal_objective().data(), 1, stream_view_);
     }
     apply_primal_objective_scaling_and_offset();
+  } else {
+    compute_primal_residual(
+      op_problem_cusparse_view_, current_pdhg_solver.get_dual_tmp_resource(), dual_iterate);
+    compute_primal_objective(primal_iterate);
   }
-  else {
-  compute_primal_residual(
-    op_problem_cusparse_view_, current_pdhg_solver.get_dual_tmp_resource(), dual_iterate);
-  compute_primal_objective(primal_iterate);}
 
 #ifdef CUPDLP_DEBUG_MODE
   print("Primal Residual", primal_residual_);
@@ -556,9 +551,7 @@ void convergence_information_t<i_t, f_t>::compute_convergence_information(
     auto* engine = current_pdhg_solver.get_mgpu_engine();
     engine->distributed_l2_norm(
       [](pdlp_solver_t<i_t, f_t>& sp) -> rmm::device_uvector<f_t>& {
-        return sp.get_current_termination_strategy()
-          .get_convergence_information()
-          .primal_residual_;
+        return sp.get_current_termination_strategy().get_convergence_information().primal_residual_;
       },
       [](pdlp_solver_t<i_t, f_t>& sp) -> f_t* {
         return sp.get_current_termination_strategy()
@@ -629,10 +622,9 @@ void convergence_information_t<i_t, f_t>::compute_convergence_information(
     //    cv.dual_solution descriptor, which (cuPDLPx, see
     //    cusparse_view.cu:931-937) is bound to _potential_next_dual -- not to
     //    current.dual_solution. So we must halo-exchange the same buffer.
-    engine->halo_exchange_cstr(
-      [](pdhg_solver_t<i_t, f_t>& pdhg) -> rmm::device_uvector<f_t>& {
-        return pdhg.get_potential_next_dual_solution();
-      });
+    engine->halo_exchange_cstr([](pdhg_solver_t<i_t, f_t>& pdhg) -> rmm::device_uvector<f_t>& {
+      return pdhg.get_potential_next_dual_solution();
+    });
 
     // 2-3) Per-shard:
     //      - compute_dual_residual: shard.dual_residual_ has owned-var entries
@@ -653,11 +645,10 @@ void convergence_information_t<i_t, f_t>::compute_convergence_information(
       raft::device_setter guard(shard->device_id);
       auto& sub_pdlp = *shard->sub_pdlp;
       auto& sub_conv = sub_pdlp.get_current_termination_strategy().get_convergence_information();
-      sub_conv.compute_dual_residual(
-        sub_conv.op_problem_cusparse_view_,
-        sub_pdlp.pdhg_solver_.get_primal_tmp_resource(),
-        sub_pdlp.pdhg_solver_.get_potential_next_primal_solution(),
-        sub_pdlp.pdhg_solver_.get_dual_slack());
+      sub_conv.compute_dual_residual(sub_conv.op_problem_cusparse_view_,
+                                     sub_pdlp.pdhg_solver_.get_primal_tmp_resource(),
+                                     sub_pdlp.pdhg_solver_.get_potential_next_primal_solution(),
+                                     sub_pdlp.pdhg_solver_.get_dual_slack());
       sub_conv.compute_dual_objective_owned_partial(
         sub_pdlp.pdhg_solver_.get_potential_next_primal_solution(),
         sub_pdlp.pdhg_solver_.get_dual_slack(),
@@ -668,21 +659,19 @@ void convergence_information_t<i_t, f_t>::compute_convergence_information(
     // 4) Allreduce dual_objective_ across shards (sum, in place). Same
     //    offset/scaling-after-allreduce reasoning as primal: applying offset
     //    per-shard would over-count it Nshards times.
-    engine->allreduce_sum_inplace(
-      [](pdlp_solver_t<i_t, f_t>& sp) -> f_t* {
-        return sp.get_current_termination_strategy()
-          .get_convergence_information()
-          .get_dual_objective()
-          .data();
-      });
+    engine->allreduce_sum_inplace([](pdlp_solver_t<i_t, f_t>& sp) -> f_t* {
+      return sp.get_current_termination_strategy()
+        .get_convergence_information()
+        .get_dual_objective()
+        .data();
+    });
 
     // Sync shards with master stream to avoid race conditions
     engine->sync_await_shards(stream_view_);
     {
       auto& s0 = *engine->shards[0];
       raft::device_setter guard(s0.device_id);
-      auto& s0_conv =
-        s0.sub_pdlp->get_current_termination_strategy().get_convergence_information();
+      auto& s0_conv = s0.sub_pdlp->get_current_termination_strategy().get_convergence_information();
       raft::copy(dual_objective_.data(), s0_conv.get_dual_objective().data(), 1, stream_view_);
     }
     apply_dual_objective_scaling_and_offset();
@@ -704,9 +693,7 @@ void convergence_information_t<i_t, f_t>::compute_convergence_information(
     auto* engine = current_pdhg_solver.get_mgpu_engine();
     engine->distributed_l2_norm(
       [](pdlp_solver_t<i_t, f_t>& sp) -> rmm::device_uvector<f_t>& {
-        return sp.get_current_termination_strategy()
-          .get_convergence_information()
-          .dual_residual_;
+        return sp.get_current_termination_strategy().get_convergence_information().dual_residual_;
       },
       [](pdlp_solver_t<i_t, f_t>& sp) -> f_t* {
         return sp.get_current_termination_strategy()
@@ -758,7 +745,8 @@ void convergence_information_t<i_t, f_t>::compute_convergence_information(
                                                    std::numeric_limits<f_t>::lowest());
   }
 
-  // In mGPU, full primal_objective and dual_objective already mirrored to master so no special behaviour 
+  // In mGPU, full primal_objective and dual_objective already mirrored to master so no special
+  // behaviour
   const auto [grid_size, block_size] = kernel_config_from_batch_size(climber_strategies_.size());
   compute_remaining_stats_kernel<i_t, f_t>
     <<<grid_size, block_size, 0, stream_view_>>>(this->view(), climber_strategies_.size());
@@ -1049,12 +1037,11 @@ void convergence_information_t<i_t, f_t>::compute_dual_objective_owned_partial(
                          stream_view_);
 
   // dual_objective_ = dual_dot_ + sum_primal_slack_ (still a partial sum).
-  cub::DeviceTransform::Transform(
-    cuda::std::make_tuple(dual_dot_.data(), sum_primal_slack_.data()),
-    dual_objective_.data(),
-    1,
-    cuda::std::plus<>{},
-    stream_view_);
+  cub::DeviceTransform::Transform(cuda::std::make_tuple(dual_dot_.data(), sum_primal_slack_.data()),
+                                  dual_objective_.data(),
+                                  1,
+                                  cuda::std::plus<>{},
+                                  stream_view_);
 }
 
 template <typename i_t, typename f_t>
diff --git a/cpp/src/pdlp/utilities/mgpu_trace.cuh b/cpp/src/pdlp/utilities/mgpu_trace.cuh
index 06a848b18e..d9975d3202 100644
--- a/cpp/src/pdlp/utilities/mgpu_trace.cuh
+++ b/cpp/src/pdlp/utilities/mgpu_trace.cuh
@@ -35,18 +35,18 @@ inline bool mgpu_trace_enabled()
 
 }  // namespace cuopt::linear_programming::detail
 
-#define MGPU_TRACE(msg)                                                                        \
-  do {                                                                                         \
-    if (::cuopt::linear_programming::detail::mgpu_trace_enabled()) {                           \
-      std::fprintf(stderr, "[mgpu %s:%d] %s\n", __func__, __LINE__, (msg));                    \
-      std::fflush(stderr);                                                                     \
-    }                                                                                          \
+#define MGPU_TRACE(msg)                                                     \
+  do {                                                                      \
+    if (::cuopt::linear_programming::detail::mgpu_trace_enabled()) {        \
+      std::fprintf(stderr, "[mgpu %s:%d] %s\n", __func__, __LINE__, (msg)); \
+      std::fflush(stderr);                                                  \
+    }                                                                       \
   } while (0)
 
-#define MGPU_TRACE_FMT(fmt, ...)                                                               \
-  do {                                                                                         \
-    if (::cuopt::linear_programming::detail::mgpu_trace_enabled()) {                           \
-      std::fprintf(stderr, "[mgpu %s:%d] " fmt "\n", __func__, __LINE__, __VA_ARGS__);         \
-      std::fflush(stderr);                                                                     \
-    }                                                                                          \
+#define MGPU_TRACE_FMT(fmt, ...)                                                       \
+  do {                                                                                 \
+    if (::cuopt::linear_programming::detail::mgpu_trace_enabled()) {                   \
+      std::fprintf(stderr, "[mgpu %s:%d] " fmt "\n", __func__, __LINE__, __VA_ARGS__); \
+      std::fflush(stderr);                                                             \
+    }                                                                                  \
   } while (0)

From 6df81454a13b14e51bf504615996f066e72172ec Mon Sep 17 00:00:00 2001
From: Bulle Mostovoi <vmostovoi@nvidia.com>
Date: Tue, 2 Jun 2026 09:16:25 -0700
Subject: [PATCH 61/67] fixed bound/objective rescaling, now afiro on 8 shards
 works but hangs at the end

---
 .../distributed_pdlp/multi_gpu_engine.hpp     | 179 ++++++++++++++++++
 .../pdlp/distributed_pdlp/partition_loader.cu |   5 +-
 .../initial_scaling.cu                        |  47 ++++-
 .../initial_scaling.cuh                       |  16 ++
 cpp/src/pdlp/pdlp.cu                          |  13 ++
 5 files changed, 256 insertions(+), 4 deletions(-)

diff --git a/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.hpp b/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.hpp
index 0297ecc0a6..3a0fcb755d 100644
--- a/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.hpp
+++ b/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.hpp
@@ -18,16 +18,21 @@
 #include <raft/linalg/detail/cublas_wrappers.hpp>
 
 #include <rmm/cuda_stream.hpp>
+#include <rmm/device_buffer.hpp>
 #include <rmm/exec_policy.hpp>
 
 #include <thrust/execution_policy.h>
 #include <thrust/gather.h>
+#include <thrust/iterator/transform_iterator.h>
+#include <thrust/iterator/zip_iterator.h>
 #include <thrust/scatter.h>
+#include <cub/device/device_reduce.cuh>
 #include <cub/device/device_transform.cuh>
 #include <cuda/std/tuple>
 
 #include <nccl.h>
 
+#include <cmath>
 #include <memory>
 #include <random>
 #include <tuple>
@@ -45,6 +50,29 @@ struct sqrt_inplace_op_t {
   __host__ __device__ f_t operator()(f_t x) const { return raft::sqrt(x); }
 };
 
+// Squared-norm contribution of a constraint's [lower, upper] bound pair, used to
+// build the distributed bound rescaling (mirrors rhs_sum_of_squares_t). Defined
+// at namespace scope to avoid extended-lambda-in-template restrictions.
+template <typename f_t>
+struct mgpu_rhs_sq_op_t {
+  __host__ __device__ f_t operator()(const thrust::tuple<f_t, f_t>& t) const
+  {
+    const f_t lower = thrust::get<0>(t);
+    const f_t upper = thrust::get<1>(t);
+    f_t sum         = f_t(0);
+    if (isfinite(lower) && (lower != upper)) sum += lower * lower;
+    if (isfinite(upper)) sum += upper * upper;
+    return sum;
+  }
+};
+
+// Weighted square of an objective coefficient (mirrors weighted_square_op).
+template <typename f_t>
+struct mgpu_weighted_sq_op_t {
+  f_t weight;
+  __host__ __device__ f_t operator()(f_t v) const { return v * v * weight; }
+};
+
 template <typename i_t, typename f_t>
 struct multi_gpu_engine_t {
   // Constructs shards from rank_data
@@ -219,6 +247,63 @@ struct multi_gpu_engine_t {
     ncclGroupEnd();
   }
 
+  // -------- Broadcast owned constraint (row) scaling into halo ------------
+  void broadcast_constraint_scaling_to_halo()
+  {
+    const int nb = static_cast<int>(shards.size());
+    auto buf_access = [](pdlp_shard_t<i_t, f_t>& s) -> rmm::device_uvector<f_t>& {
+      return s.sub_pdlp->get_initial_scaling_strategy().get_cummulative_constraint_matrix_scaling();
+    };
+
+    // Gather each owner's owned scaling values that peers need.
+    for (int r = 0; r < nb; ++r) {
+      auto& s = *shards[r];
+      raft::device_setter guard(s.device_id);
+      auto& y = buf_access(s);
+      for (int peer = 0; peer < nb; ++peer) {
+        if (peer == r) continue;
+        if (s.cstr_send_indices_d[peer].size() == 0) continue;
+        thrust::gather(rmm::exec_policy_nosync(s.stream.view()),
+                       s.cstr_send_indices_d[peer].begin(),
+                       s.cstr_send_indices_d[peer].end(),
+                       y.begin(),
+                       s.cstr_send_buf_d[peer].begin());
+      }
+    }
+
+    ncclGroupStart();
+    for (int r = 0; r < nb; ++r) {
+      auto& s = *shards[r];
+      raft::device_setter guard(s.device_id);
+      for (int peer = 0; peer < nb; ++peer) {
+        if (peer == r) continue;
+        ncclSend(s.cstr_send_buf_d[peer].data(),
+                 s.cstr_send_buf_d[peer].size(),
+                 ncclFloat64,
+                 peer,
+                 s.comm.get(),
+                 s.stream.view().value());
+      }
+    }
+    for (int r = 0; r < nb; ++r) {
+      auto& s  = *shards[r];
+      auto& rd = s.rank_data;
+      raft::device_setter guard(s.device_id);
+      auto& y = buf_access(s);
+      for (int peer = 0; peer < nb; ++peer) {
+        if (peer == r) continue;
+        f_t* recv_ptr = y.data() + rd.owned_cstr_size + rd.cstr_recv_offsets[peer];
+        ncclRecv(recv_ptr,
+                 static_cast<size_t>(rd.cstr_recv_counts[peer]),
+                 ncclFloat64,
+                 peer,
+                 s.comm.get(),
+                 s.stream.view().value());
+      }
+    }
+    ncclGroupEnd();
+  }
+
   // -------- NCCL allreduce (sum, in place) --------------------------------
   // Per-shard in-place sum-allreduce. Each shard's stream issues an
   // ncclAllReduce(buf, buf, count, ncclFloat64, ncclSum, ...) inside a single
@@ -281,6 +366,100 @@ struct multi_gpu_engine_t {
     });
   }
 
+  // -------- Distributed bound / objective rescaling -----------------------
+  void distributed_bound_objective_rescaling(f_t c_scaling_weight)
+  {
+    const int nb = static_cast<int>(shards.size());
+
+    std::vector<rmm::device_uvector<f_t>> bound_sq;
+    std::vector<rmm::device_uvector<f_t>> obj_sq;
+    bound_sq.reserve(nb);
+    obj_sq.reserve(nb);
+
+    // 1) per-shard partial squared norms over OWNED entries only (halo rhs is
+    //    +/-inf and would otherwise double-count owned entries shared as halo).
+    for (int r = 0; r < nb; ++r) {
+      auto& s = *shards[r];
+      raft::device_setter guard(s.device_id);
+      bound_sq.emplace_back(1, s.stream.view());
+      obj_sq.emplace_back(1, s.stream.view());
+
+      const auto& scaled = s.sub_pdlp->get_initial_scaling_strategy().get_scaled_op_problem();
+      const int n_owned_cstr = static_cast<int>(s.rank_data.owned_cstr_size);
+      const int n_owned_var  = static_cast<int>(s.rank_data.owned_var_size);
+
+      auto bound_in = thrust::make_transform_iterator(
+        thrust::make_zip_iterator(scaled.constraint_lower_bounds.data(),
+                                  scaled.constraint_upper_bounds.data()),
+        mgpu_rhs_sq_op_t<f_t>{});
+      size_t tmp_bytes_b = 0;
+      cub::DeviceReduce::Sum(
+        nullptr, tmp_bytes_b, bound_in, bound_sq[r].data(), n_owned_cstr, s.stream.view().value());
+      rmm::device_buffer scratch_b(tmp_bytes_b, s.stream.view());
+      cub::DeviceReduce::Sum(scratch_b.data(),
+                             tmp_bytes_b,
+                             bound_in,
+                             bound_sq[r].data(),
+                             n_owned_cstr,
+                             s.stream.view().value());
+
+      auto obj_in = thrust::make_transform_iterator(scaled.objective_coefficients.data(),
+                                                    mgpu_weighted_sq_op_t<f_t>{c_scaling_weight});
+      size_t tmp_bytes_o = 0;
+      cub::DeviceReduce::Sum(
+        nullptr, tmp_bytes_o, obj_in, obj_sq[r].data(), n_owned_var, s.stream.view().value());
+      rmm::device_buffer scratch_o(tmp_bytes_o, s.stream.view());
+      cub::DeviceReduce::Sum(scratch_o.data(),
+                             tmp_bytes_o,
+                             obj_in,
+                             obj_sq[r].data(),
+                             n_owned_var,
+                             s.stream.view().value());
+    }
+
+    // 2) NCCL allreduce SUM -> every shard holds the global squared norms.
+    ncclGroupStart();
+    for (int r = 0; r < nb; ++r) {
+      auto& s = *shards[r];
+      raft::device_setter guard(s.device_id);
+      ncclAllReduce(bound_sq[r].data(),
+                    bound_sq[r].data(),
+                    1,
+                    ncclFloat64,
+                    ncclSum,
+                    s.comm.get(),
+                    s.stream.view().value());
+      ncclAllReduce(obj_sq[r].data(),
+                    obj_sq[r].data(),
+                    1,
+                    ncclFloat64,
+                    ncclSum,
+                    s.comm.get(),
+                    s.stream.view().value());
+    }
+    ncclGroupEnd();
+
+    // 3) derive the identical scalars and apply on every shard.
+    for (int r = 0; r < nb; ++r) {
+      auto& s = *shards[r];
+      raft::device_setter guard(s.device_id);
+      f_t h_bound_sq = f_t(0);
+      f_t h_obj_sq   = f_t(0);
+      raft::copy(&h_bound_sq, bound_sq[r].data(), 1, s.stream.view());
+      raft::copy(&h_obj_sq, obj_sq[r].data(), 1, s.stream.view());
+      s.stream.synchronize();
+      const f_t bound_rescaling     = f_t(1) / (std::sqrt(h_bound_sq) + f_t(1));
+      const f_t objective_rescaling = f_t(1) / (std::sqrt(h_obj_sq) + f_t(1));
+      s.sub_pdlp->get_initial_scaling_strategy().apply_distributed_bound_objective_rescaling(
+        bound_rescaling, objective_rescaling);
+    }
+    for (int r = 0; r < nb; ++r) {
+      auto& s = *shards[r];
+      raft::device_setter guard(s.device_id);
+      s.stream.synchronize();
+    }
+  }
+
   // -------- Generic distributed SpMVs -------------------------------------
   // distributed_spmv_A : halo-update the var-shaped input buffer returned by
   // `in_buf(pdhg)`, then per-shard A @ in_buf -> out_desc.
diff --git a/cpp/src/pdlp/distributed_pdlp/partition_loader.cu b/cpp/src/pdlp/distributed_pdlp/partition_loader.cu
index 5c317f664e..0ef1eaf4da 100644
--- a/cpp/src/pdlp/distributed_pdlp/partition_loader.cu
+++ b/cpp/src/pdlp/distributed_pdlp/partition_loader.cu
@@ -196,8 +196,7 @@ std::vector<rank_data_t<i_t, f_t>> partition_loader_t<i_t, f_t>::create_rank_dat
       if (peer == rank) continue;
       for (auto recv_cstr : rank_data[peer].cstr_send_per_peer[rank]) {
         rd.global_to_local_cstr[recv_cstr] = curr_id;
-        // rd.local_to_global_cstr.push_back(recv_cstr); // Not needed, we only do local_to_global
-        // on owned side
+        rd.local_to_global_cstr.push_back(recv_cstr);
         curr_id++;
       }
     }
@@ -212,7 +211,7 @@ std::vector<rank_data_t<i_t, f_t>> partition_loader_t<i_t, f_t>::create_rank_dat
       if (peer == rank) continue;
       for (auto recv_var : rank_data[peer].var_send_per_peer[rank]) {
         rd.global_to_local_var[recv_var] = curr_id;
-        // rd.local_to_global_var.push_back(recv_var); // same as over
+        rd.local_to_global_var.push_back(recv_var);
         curr_id++;
       }
     }
diff --git a/cpp/src/pdlp/initial_scaling_strategy/initial_scaling.cu b/cpp/src/pdlp/initial_scaling_strategy/initial_scaling.cu
index dcc3e662b0..cb498b3756 100644
--- a/cpp/src/pdlp/initial_scaling_strategy/initial_scaling.cu
+++ b/cpp/src/pdlp/initial_scaling_strategy/initial_scaling.cu
@@ -644,7 +644,8 @@ void pdlp_initial_scaling_strategy_t<i_t, f_t>::scale_problem()
     cuda::std::multiplies<f_t>{},
     stream_view_);
 
-  if (hyper_params_.bound_objective_rescaling && !running_mip_) {
+  if (hyper_params_.bound_objective_rescaling && !running_mip_ &&
+      !skip_distributed_local_rescaling_) {
     // Coefficients are computed on the already scaled values
     bound_objective_rescaling();
 
@@ -957,6 +958,50 @@ const problem_t<i_t, f_t>& pdlp_initial_scaling_strategy_t<i_t, f_t>::get_scaled
   return op_problem_scaled_;
 }
 
+template <typename i_t, typename f_t>
+void pdlp_initial_scaling_strategy_t<i_t, f_t>::apply_distributed_bound_objective_rescaling(
+  f_t bound_rescaling, f_t objective_rescaling)
+{
+  using f_t2 = typename type_2<f_t>::type;  // element type of variable_bounds (.x / .y pair)
+  // Multiplies the already-scaled problem data in place by the caller-supplied factors.
+  // Step 1: constraint bounds *= bound_rescaling (matches scale_problem() bound block)
+  cub::DeviceTransform::Transform(
+    cuda::std::make_tuple(op_problem_scaled_.constraint_lower_bounds.data(),
+                          op_problem_scaled_.constraint_upper_bounds.data()),
+    thrust::make_zip_iterator(op_problem_scaled_.constraint_lower_bounds.data(),
+                              op_problem_scaled_.constraint_upper_bounds.data()),
+    op_problem_scaled_.constraint_upper_bounds.size(),  // NOTE(review): assumes lower/upper sizes match
+    [bound_rescaling] __device__(f_t lower, f_t upper) -> thrust::tuple<f_t, f_t> {
+      return {lower * bound_rescaling, upper * bound_rescaling};
+    },
+    stream_view_.value());
+
+  // Step 2: variable bounds *= bound_rescaling (batch-1 path only; distributed is batch 1)
+  cub::DeviceTransform::Transform(
+    op_problem_scaled_.variable_bounds.data(),
+    op_problem_scaled_.variable_bounds.data(),
+    op_problem_scaled_.variable_bounds.size(),
+    [bound_rescaling] __device__(f_t2 variable_bounds) -> f_t2 {
+      return {variable_bounds.x * bound_rescaling, variable_bounds.y * bound_rescaling};
+    },
+    stream_view_);
+
+  // Step 3: objective coefficients *= objective_rescaling
+  cub::DeviceTransform::Transform(
+    op_problem_scaled_.objective_coefficients.data(),
+    op_problem_scaled_.objective_coefficients.data(),
+    op_problem_scaled_.objective_coefficients.size(),
+    [objective_rescaling] __device__(f_t c) -> f_t { return c * objective_rescaling; },
+    stream_view_);
+
+  // Store the factors (sets both host copies and the device rescaling vectors)
+  // so unscale_solutions() / scale_solutions() apply them consistently. The flag
+  // hyper_params_.bound_objective_rescaling stays true on shards so those paths
+  // are active; only scale_problem()'s local recompute is skipped.
+  set_h_bound_rescaling(bound_rescaling);
+  set_h_objective_rescaling(objective_rescaling);
+}
+
 template <typename i_t, typename f_t>
 const rmm::device_uvector<f_t>&
 pdlp_initial_scaling_strategy_t<i_t, f_t>::get_constraint_matrix_scaling_vector() const
diff --git a/cpp/src/pdlp/initial_scaling_strategy/initial_scaling.cuh b/cpp/src/pdlp/initial_scaling_strategy/initial_scaling.cuh
index 148ccce238..409df5340a 100644
--- a/cpp/src/pdlp/initial_scaling_strategy/initial_scaling.cuh
+++ b/cpp/src/pdlp/initial_scaling_strategy/initial_scaling.cuh
@@ -75,6 +75,11 @@ class pdlp_initial_scaling_strategy_t {
                          rmm::device_uvector<f_t>& dual_slack) const;
   void unscale_solutions(solution_t<i_t, f_t>& solution) const;
   const rmm::device_uvector<f_t>& get_constraint_matrix_scaling_vector() const;
+  // Mutable access so distributed PDLP can broadcast owned constraint scaling to halos
+  rmm::device_uvector<f_t>& get_cummulative_constraint_matrix_scaling()
+  {
+    return cummulative_constraint_matrix_scaling_;
+  }
   const rmm::device_uvector<f_t>& get_variable_scaling_vector() const;
   const problem_t<i_t, f_t>& get_scaled_op_problem();
 
@@ -94,6 +99,14 @@ class pdlp_initial_scaling_strategy_t {
 
   void bound_objective_rescaling();
 
+  // Distributed PDLP: apply an externally-computed GLOBAL bound / objective
+  // rescaling to the already-scaled problem.
+  void apply_distributed_bound_objective_rescaling(f_t bound_rescaling, f_t objective_rescaling);
+
+  // Distributed PDLP: when set to true, scale_problem() skips its LOCAL
+  // bound/objective rescaling; set it before calling scale_problem().
+  void set_skip_distributed_local_rescaling(bool value) { skip_distributed_local_rescaling_ = value; }
+
   // Public for distributed PDLP
   void compute_scaling_vectors(i_t number_of_ruiz_iterations, f_t alpha);
 
@@ -144,5 +157,8 @@ class pdlp_initial_scaling_strategy_t {
   rmm::device_uvector<i_t>& A_T_indices_;
   const pdlp_hyper_params::pdlp_hyper_params_t& hyper_params_;
   bool running_mip_;
+  // Distributed PDLP: when true, scale_problem() skips its local
+  // bound/objective rescaling (the global factor is applied separately).
+  bool skip_distributed_local_rescaling_{false};
 };
 }  // namespace cuopt::linear_programming::detail
diff --git a/cpp/src/pdlp/pdlp.cu b/cpp/src/pdlp/pdlp.cu
index 576ab417f1..4200b487c8 100644
--- a/cpp/src/pdlp/pdlp.cu
+++ b/cpp/src/pdlp/pdlp.cu
@@ -591,14 +591,21 @@ pdlp_solver_t<i_t, f_t>::pdlp_solver_t(
     multi_gpu_engine->distributed_ruiz_inf_scaling(
       settings_.hyper_params.default_l_inf_ruiz_iterations, n_vars);
   }
+  // push local scaling to halo
+  multi_gpu_engine->broadcast_constraint_scaling_to_halo();
   if (settings_.hyper_params.do_pock_chambolle_scaling) {
     multi_gpu_engine->distributed_pock_chambolle_scaling(
       static_cast<f_t>(settings_.hyper_params.default_alpha_pock_chambolle_rescaling), n_vars);
   }
+  // Refresh the halo constraint scaling after Pock-Chambolle
+  multi_gpu_engine->broadcast_constraint_scaling_to_halo();
 
   for (auto& shard : multi_gpu_engine->shards) {
     raft::device_setter guard(shard->device_id);
     auto& scaling = shard->sub_pdlp->get_initial_scaling_strategy();
+    // Skip the per-shard local bound/objective rescaling; the global factor is
+    // applied below. Keeps the unscale path active (flag stays true).
+    scaling.set_skip_distributed_local_rescaling(true);
     scaling.scale_problem();
 
     shard->sub_pdlp->pdhg_solver_.get_cusparse_view().create_spmv_op_plans(
@@ -609,6 +616,12 @@ pdlp_solver_t<i_t, f_t>::pdlp_solver_t(
     shard->stream.synchronize();
   }
 
+  // Global bound/objective rescaling: allreduce the owned partial squared-norms
+  if (settings_.hyper_params.bound_objective_rescaling && !inside_mip_) {
+    multi_gpu_engine->distributed_bound_objective_rescaling(
+      static_cast<f_t>(settings_.hyper_params.initial_primal_weight_c_scaling));
+  }
+
   // ----- 8b. Seed initial step-size / primal-weight (distributed, scales to N shards) -----
   constexpr f_t kStepSizeScale = f_t{0.998};
   const f_t sigma_max          = multi_gpu_engine->distributed_max_singular_value(n_cstr);

From df9f79366cfc9a997bd59480bae9ae623edafcc6 Mon Sep 17 00:00:00 2001
From: Bulle Mostovoi <vmostovoi@nvidia.com>
Date: Tue, 2 Jun 2026 12:29:09 -0700
Subject: [PATCH 62/67] actually disable the graph ^^ (kms)

---
 cpp/src/pdlp/solve.cu | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/cpp/src/pdlp/solve.cu b/cpp/src/pdlp/solve.cu
index 156a601b29..228bacfd21 100644
--- a/cpp/src/pdlp/solve.cu
+++ b/cpp/src/pdlp/solve.cu
@@ -2155,6 +2155,10 @@ optimization_problem_solution_t<i_t, f_t> solve_lp_distributed_from_mps(
                 "use_distributed_pdlp; please set settings.presolver = presolver_t::None");
 
   pdlp_solver_settings_t<i_t, f_t> settings_resolved = settings;
+
+  detail::pdlp_graph_disabled_flag().store(settings_resolved.hyper_params.pdlp_disable_graph,
+                                           std::memory_order_relaxed);
+
   if (settings_resolved.distributed_pdlp_num_gpus == -1) {
     settings_resolved.distributed_pdlp_num_gpus = raft::device_setter::get_device_count();
     CUOPT_LOG_INFO(

From 4c8bcd1a6710fc4b56b74d1d99ed21f39221b0c1 Mon Sep 17 00:00:00 2001
From: Bulle Mostovoi <vmostovoi@nvidia.com>
Date: Thu, 4 Jun 2026 12:21:34 +0200
Subject: [PATCH 63/67] added option to export parts file

---
 .../cuopt/linear_programming/constants.h       |  1 +
 .../pdlp/solver_settings.hpp                   |  5 +++++
 cpp/src/math_optimization/solver_settings.cu   |  1 +
 .../pdlp/distributed_pdlp/partition_loader.cu  | 18 ++++++++++++++++++
 .../pdlp/distributed_pdlp/partition_loader.hpp |  6 ++++++
 cpp/src/pdlp/pdlp.cu                           |  9 +++++++++
 6 files changed, 40 insertions(+)

diff --git a/cpp/include/cuopt/linear_programming/constants.h b/cpp/include/cuopt/linear_programming/constants.h
index e2cc264cdc..e24ca5c346 100644
--- a/cpp/include/cuopt/linear_programming/constants.h
+++ b/cpp/include/cuopt/linear_programming/constants.h
@@ -85,6 +85,7 @@
 #define CUOPT_NUM_GPUS                  "num_gpus"
 #define CUOPT_DISTRIBUTED_PDLP_NUM_GPUS "distributed_pdlp_num_gpus"
 #define CUOPT_MULTI_GPU_PARTITION_FILE  "multi_gpu_partition_file"
+#define CUOPT_MULTI_GPU_EXPORT_PARTITION_FILE "multi_gpu_export_partition_file"
 #define CUOPT_USE_DISTRIBUTED_PDLP      "use_distributed_pdlp"
 #define CUOPT_PDLP_DISABLE_GRAPH        "pdlp_disable_graph"
 #define CUOPT_USER_PROBLEM_FILE         "user_problem_file"
diff --git a/cpp/include/cuopt/linear_programming/pdlp/solver_settings.hpp b/cpp/include/cuopt/linear_programming/pdlp/solver_settings.hpp
index efdbd5733c..1443333df4 100644
--- a/cpp/include/cuopt/linear_programming/pdlp/solver_settings.hpp
+++ b/cpp/include/cuopt/linear_programming/pdlp/solver_settings.hpp
@@ -311,6 +311,11 @@ class pdlp_solver_settings_t {
   // -1 means auto-detect
   int distributed_pdlp_num_gpus{-1};
   std::string multi_gpu_partition_file{""};
+  // If non-empty, the partition computed for distributed PDLP is written to this
+  // path (one part-id per line) right after partitioning. The file can be fed
+  // back via multi_gpu_partition_file. Exposed as the multi_gpu_export_partition_file
+  // parameter (CLI: --multi-gpu-export-partition-file <path>).
+  std::string multi_gpu_export_partition_file{""};
   // Set to true inside the shards
   bool is_distributed_sub_pdlp{false};
   method_t method{method_t::Concurrent};
diff --git a/cpp/src/math_optimization/solver_settings.cu b/cpp/src/math_optimization/solver_settings.cu
index 629c8a8428..87324524f1 100644
--- a/cpp/src/math_optimization/solver_settings.cu
+++ b/cpp/src/math_optimization/solver_settings.cu
@@ -192,6 +192,7 @@ solver_settings_t<i_t, f_t>::solver_settings_t() : pdlp_settings(), mip_settings
     {CUOPT_PRESOLVE_FILE, &mip_settings.presolve_file, ""},
     {CUOPT_PRESOLVE_FILE, &pdlp_settings.presolve_file, ""},
     {CUOPT_MULTI_GPU_PARTITION_FILE, &pdlp_settings.multi_gpu_partition_file, ""},
+    {CUOPT_MULTI_GPU_EXPORT_PARTITION_FILE, &pdlp_settings.multi_gpu_export_partition_file, ""},
   };
   // clang-format on
 }
diff --git a/cpp/src/pdlp/distributed_pdlp/partition_loader.cu b/cpp/src/pdlp/distributed_pdlp/partition_loader.cu
index 0ef1eaf4da..a6db3a9fe8 100644
--- a/cpp/src/pdlp/distributed_pdlp/partition_loader.cu
+++ b/cpp/src/pdlp/distributed_pdlp/partition_loader.cu
@@ -40,6 +40,24 @@ std::vector<i_t> partition_loader_t<i_t, f_t>::parse_distributed_pdlp_partition_
   return parts;
 }
 
+template <typename i_t, typename f_t>
+void partition_loader_t<i_t, f_t>::export_distributed_pdlp_partition_file(
+  std::string const& file, std::vector<i_t> const& parts)
+{
+  // Writes `parts` to `file`, one part-id per line; open and write failures are both checked.
+  std::ofstream part_file(file);
+  cuopt_expects(part_file.is_open(),
+                error_type_t::ValidationError,
+                "Failed to open partition file for export: %s",
+                file.c_str());
+  for (auto const& part : parts) { part_file << part << '\n'; }
+  part_file.flush();  // push buffered data out now so a failed write is seen by good() below
+  cuopt_expects(part_file.good(),
+                error_type_t::RuntimeError,
+                "Failed while writing partition file: %s",
+                file.c_str());
+}
+
 template <typename i_t, typename f_t>
 std::vector<rank_data_t<i_t, f_t>> partition_loader_t<i_t, f_t>::create_rank_data_from_parts(
   const std::vector<i_t>& parts,
diff --git a/cpp/src/pdlp/distributed_pdlp/partition_loader.hpp b/cpp/src/pdlp/distributed_pdlp/partition_loader.hpp
index 915c24a828..ce12d241f9 100644
--- a/cpp/src/pdlp/distributed_pdlp/partition_loader.hpp
+++ b/cpp/src/pdlp/distributed_pdlp/partition_loader.hpp
@@ -19,6 +19,12 @@ struct partition_loader_t {
   // nb_cstr + nb_vars, indexed as in create_rank_data_from_parts (cstrs first, then vars).
   static std::vector<i_t> parse_distributed_pdlp_partition_file(std::string const& file);
 
+  // Write a partition vector to file in the same format
+  // parse_distributed_pdlp_partition_file reads back: one part-id per line. Useful for
+  // inspecting / reusing a computed partition (CLI: --multi-gpu-export-partition-file).
+  static void export_distributed_pdlp_partition_file(std::string const& file,
+                                                     std::vector<i_t> const& parts);
+
   // Slices the data to prepare a split from metis partitionning with halo communication
   static std::vector<rank_data_t<i_t, f_t>> create_rank_data_from_parts(
     const std::vector<i_t>& parts,
diff --git a/cpp/src/pdlp/pdlp.cu b/cpp/src/pdlp/pdlp.cu
index 4200b487c8..150311ae33 100644
--- a/cpp/src/pdlp/pdlp.cu
+++ b/cpp/src/pdlp/pdlp.cu
@@ -530,6 +530,14 @@ pdlp_solver_t<i_t, f_t>::pdlp_solver_t(
     parts            = partitioner->partition(partition_input);
   }
 
+  // Optionally dump the partition right after computing it (one part-id per line).
+  if (!settings.multi_gpu_export_partition_file.empty()) {
+    partition_loader_t<i_t, f_t>::export_distributed_pdlp_partition_file(
+      settings.multi_gpu_export_partition_file, parts);
+    CUOPT_LOG_INFO("Exported %zu part-ids to %s",  // use the logger rather than raw stdout
+                   parts.size(), settings.multi_gpu_export_partition_file.c_str());
+  }
+
   // ----- 5. Build per-rank data -----
   std::vector<rank_data_t<i_t, f_t>> sub_pdlp_rank_data =
     partition_loader_t<i_t, f_t>::create_rank_data_from_parts(parts,
@@ -551,6 +559,7 @@ pdlp_solver_t<i_t, f_t>::pdlp_solver_t(
   sub_pdlp_settings.num_gpus                                            = 1;
   sub_pdlp_settings.distributed_pdlp_num_gpus                           = 1;
   sub_pdlp_settings.multi_gpu_partition_file                            = "";
+  sub_pdlp_settings.multi_gpu_export_partition_file                     = "";
   sub_pdlp_settings.is_distributed_sub_pdlp                             = true;
   sub_pdlp_settings.hyper_params.default_l_inf_ruiz_iterations          = 0;
   sub_pdlp_settings.hyper_params.default_alpha_pock_chambolle_rescaling = 0.0;

From a8a8054b36333ffebeaba6312c2d998bfa9156ec Mon Sep 17 00:00:00 2001
From: Bulle Mostovoi <vmostovoi@nvidia.com>
Date: Thu, 4 Jun 2026 13:29:27 +0200
Subject: [PATCH 64/67] added test for import/export of parts file

---
 cpp/tests/linear_programming/pdlp_test.cu | 62 +++++++++++++++++++++++
 1 file changed, 62 insertions(+)

diff --git a/cpp/tests/linear_programming/pdlp_test.cu b/cpp/tests/linear_programming/pdlp_test.cu
index d29995efc5..b20ce4a1c9 100644
--- a/cpp/tests/linear_programming/pdlp_test.cu
+++ b/cpp/tests/linear_programming/pdlp_test.cu
@@ -8,12 +8,16 @@
 #include <branch_and_bound/shared_strong_branching_context.hpp>
 #include <mps_parser_internal.hpp>
 #include <pdlp/cusparse_view.hpp>
+#include <pdlp/distributed_pdlp/partition_loader.hpp>
+#include <pdlp/distributed_pdlp/partitioner.hpp>
 #include <pdlp/initial_scaling_strategy/initial_scaling.cuh>
 #include <pdlp/pdlp.cuh>
 #include <pdlp/pdlp_constants.hpp>
 #include <pdlp/solve.cuh>
 #include <pdlp/utils.cuh>
 
+#include <dual_simplex/sparse_matrix.hpp>
+
 #include "utilities/pdlp_test_utilities.cuh"
 
 #include "../mip/mip_utils.cuh"
@@ -91,6 +95,64 @@ TEST(pdlp_class, run_double)
     afiro_primal_objective, solution.get_additional_termination_information().primal_objective));
 }
 
+// Distributed-PDLP partition round-trip: partition the afiro constraint/variable
+// bipartite graph with METIS, write it out, read it back, and confirm the parsed
+// vector is identical to what the partitioner produced.
+TEST(pdlp_class, distributed_partition_metis_export_import_roundtrip)
+{
+  using namespace cuopt::linear_programming::detail;
+  namespace ds = cuopt::linear_programming::dual_simplex;
+
+  auto path = make_path_absolute("linear_programming/afiro_original.mps");
+  cuopt::linear_programming::io::mps_data_model_t<int, double> mps =
+    cuopt::linear_programming::io::parse_mps<int, double>(path, true);
+
+  const int n_vars = static_cast<int>(mps.get_objective_coefficients().size());
+  const int n_cstr = static_cast<int>(mps.get_constraint_lower_bounds().size());
+  const int nnz    = static_cast<int>(mps.get_constraint_matrix_values().size());
+
+  std::vector<int> h_A_row_offsets    = mps.get_constraint_matrix_offsets();
+  std::vector<int> h_A_col_indices    = mps.get_constraint_matrix_indices();
+  std::vector<double> h_A_values      = mps.get_constraint_matrix_values();
+
+  // Transpose A -> A^T (CSR of A^T == CSC of A), mirroring solve_lp_distributed_from_mps.
+  ds::csr_matrix_t<int, double> A_csr(n_cstr, n_vars, nnz);
+  A_csr.row_start = h_A_row_offsets;
+  A_csr.j         = h_A_col_indices;
+  A_csr.x         = h_A_values;
+  ds::csc_matrix_t<int, double> AT_as_csc(n_vars, n_cstr, nnz);
+  A_csr.to_compressed_col(AT_as_csc);
+  std::vector<int> h_A_t_row_offsets = AT_as_csc.col_start;  // used as CSR row offsets of A^T
+  std::vector<int> h_A_t_col_indices = AT_as_csc.i;          // used as CSR column indices of A^T
+
+  partitioner_input_t<int, double> input;
+  input.nb_cstr         = n_cstr;
+  input.nb_vars         = n_vars;
+  input.nb_parts        = 2;
+  input.A.row_offsets   = &h_A_row_offsets;  // input holds pointers: the vectors must outlive it
+  input.A.col_indices   = &h_A_col_indices;
+  input.A.num_rows      = n_cstr;
+  input.A.num_cols      = n_vars;
+  input.A_t.row_offsets = &h_A_t_row_offsets;
+  input.A_t.col_indices = &h_A_t_col_indices;
+  input.A_t.num_rows    = n_vars;
+  input.A_t.num_cols    = n_cstr;
+
+  auto partitioner       = make_partitioner<int, double>(partitioner_kind_t::Metis);
+  std::vector<int> parts = partitioner->partition(input);
+  ASSERT_EQ(parts.size(), static_cast<std::size_t>(n_cstr + n_vars));  // one id per graph vertex
+
+  std::string dir = ::testing::TempDir();  // gtest-provided scratch directory
+  if (!dir.empty() && dir.back() != '/') { dir.push_back('/'); }
+  const std::string out_path = dir + "afiro_metis_roundtrip.parts";
+
+  partition_loader_t<int, double>::export_distributed_pdlp_partition_file(out_path, parts);
+  std::vector<int> reloaded =
+    partition_loader_t<int, double>::parse_distributed_pdlp_partition_file(out_path);
+
+  EXPECT_EQ(parts, reloaded);
+}
+
 TEST(pdlp_class, precision_mixed)
 {
   using namespace cuopt::linear_programming::detail;

From 5abcd2e0feaa00efa9a43daa1be94cf4cb89f034 Mon Sep 17 00:00:00 2001
From: Bulle Mostovoi <vmostovoi@nvidia.com>
Date: Thu, 4 Jun 2026 14:43:36 +0200
Subject: [PATCH 65/67] added full solve tests

---
 cpp/tests/linear_programming/pdlp_test.cu | 104 ++++++++++++++++++++++
 1 file changed, 104 insertions(+)

diff --git a/cpp/tests/linear_programming/pdlp_test.cu b/cpp/tests/linear_programming/pdlp_test.cu
index b20ce4a1c9..65cc2f0d9f 100644
--- a/cpp/tests/linear_programming/pdlp_test.cu
+++ b/cpp/tests/linear_programming/pdlp_test.cu
@@ -49,11 +49,13 @@
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
 
+#include <algorithm>
 #include <chrono>
 #include <cmath>
 #include <cstdint>
 #include <limits>
 #include <sstream>
+#include <string>
 #include <thread>
 #include <tuple>
 #include <utility>
@@ -153,6 +155,108 @@ TEST(pdlp_class, distributed_partition_metis_export_import_roundtrip)
   EXPECT_EQ(parts, reloaded);
 }
 
+namespace {
+
+// Solve `mps_rel_path` with the single-GPU PDLP ("base") and with distributed PDLP
+// (distributed_pdlp_num_gpus = -1 => auto-detect; 1 GPU is fine), then expect the
+// distributed run to match the base run: termination status, step count (within 15%),
+// primal/dual objective, and the full primal/dual solution vectors. Value comparisons
+// use near_rel: |a - b| <= rel * (1 + |a|) with rel = loose_rel (1e-3).
+void expect_distributed_matches_base(raft::handle_t const& handle,
+                                     std::string const& mps_rel_path,
+                                     bool fixed_mps_format = false)
+{
+  constexpr double loose_rel = 1e-3;
+  auto near_rel              = [](double a, double b, double rel) {
+    return std::fabs(a - b) <= rel * (1.0 + std::fabs(a));
+  };
+
+  auto path = make_path_absolute(mps_rel_path);
+  io::mps_data_model_t<int, double> problem = io::parse_mps<int, double>(path, fixed_mps_format);
+
+  // Shared settings: PDLP, no presolve (distributed requires presolver == None, so the
+  // base run must match to keep the two problems identical).
+  pdlp_solver_settings_t<int, double> base_settings{};
+  base_settings.method    = method_t::PDLP;
+  base_settings.presolver = presolver_t::None;
+
+  // ----- base: single-GPU PDLP (materialize the full problem on one GPU) -----
+  auto base_op = mps_data_model_to_optimization_problem<int, double>(&handle, problem);
+  auto base    = solve_lp(base_op, base_settings);
+
+  // ----- distributed PDLP (identical settings, only the distributed flags flipped) -----
+  pdlp_solver_settings_t<int, double> dist_settings = base_settings;
+  dist_settings.hyper_params.use_distributed_pdlp    = true;
+  dist_settings.distributed_pdlp_num_gpus            = -1;
+  auto dist                                          = solve_lp(&handle, problem, dist_settings);
+
+  // ----- termination status (ASSERT: leave this helper if the base run is not optimal) -----
+  ASSERT_EQ(static_cast<int>(base.get_termination_status()), CUOPT_TERMINATION_STATUS_OPTIMAL)
+    << mps_rel_path << ": base did not reach optimal";
+  EXPECT_EQ(static_cast<int>(dist.get_termination_status()),
+            static_cast<int>(base.get_termination_status()))
+    << mps_rel_path << ": distributed termination status differs from base";
+
+  const auto& base_info = base.get_additional_termination_information();
+  const auto& dist_info = dist.get_additional_termination_information();
+
+  // ----- objectives -----
+  EXPECT_TRUE(near_rel(base_info.primal_objective, dist_info.primal_objective, loose_rel))
+    << mps_rel_path << ": primal objective base=" << base_info.primal_objective
+    << " distributed=" << dist_info.primal_objective;
+  EXPECT_TRUE(near_rel(base_info.dual_objective, dist_info.dual_objective, loose_rel))
+    << mps_rel_path << ": dual objective base=" << base_info.dual_objective
+    << " distributed=" << dist_info.dual_objective;
+
+  // ----- step count: within 15% of the larger of the two -----
+  const int base_steps = base_info.number_of_steps_taken;
+  const int dist_steps = dist_info.number_of_steps_taken;
+  const int max_steps  = std::max(base_steps, dist_steps);
+  const int step_diff  = std::max(base_steps, dist_steps) - std::min(base_steps, dist_steps);
+  EXPECT_LE(static_cast<double>(step_diff), 0.15 * max_steps)
+    << mps_rel_path << ": step counts differ by >15% (base=" << base_steps
+    << ", distributed=" << dist_steps << ")";
+
+  // ----- primal / dual solution vectors (element-wise, sizes must match first) -----
+  auto base_primal = cuopt::host_copy(base.get_primal_solution(), handle.get_stream());
+  auto dist_primal = cuopt::host_copy(dist.get_primal_solution(), handle.get_stream());
+  ASSERT_EQ(base_primal.size(), dist_primal.size()) << mps_rel_path << ": primal size mismatch";
+  for (std::size_t i = 0; i < base_primal.size(); ++i) {
+    EXPECT_TRUE(near_rel(base_primal[i], dist_primal[i], loose_rel))
+      << mps_rel_path << ": primal[" << i << "] base=" << base_primal[i]
+      << " distributed=" << dist_primal[i];
+  }
+
+  auto base_dual = cuopt::host_copy(base.get_dual_solution(), handle.get_stream());
+  auto dist_dual = cuopt::host_copy(dist.get_dual_solution(), handle.get_stream());
+  ASSERT_EQ(base_dual.size(), dist_dual.size()) << mps_rel_path << ": dual size mismatch";
+  for (std::size_t i = 0; i < base_dual.size(); ++i) {
+    EXPECT_TRUE(near_rel(base_dual[i], dist_dual[i], loose_rel))
+      << mps_rel_path << ": dual[" << i << "] base=" << base_dual[i]
+      << " distributed=" << dist_dual[i];
+  }
+}
+
+}  // namespace
+
+TEST(pdlp_class, distributed_parity_afiro)
+{
+  const raft::handle_t handle{};
+  expect_distributed_matches_base(handle, "linear_programming/afiro_original.mps", true);
+}
+
+TEST(pdlp_class, distributed_parity_neos3)
+{
+  const raft::handle_t handle{};  // test is named after the instance it actually solves
+  expect_distributed_matches_base(handle, "linear_programming/neos3/neos3.mps");
+}
+
+TEST(pdlp_class, distributed_parity_a2864)
+{
+  const raft::handle_t handle{};
+  expect_distributed_matches_base(handle, "linear_programming/a2864/a2864.mps");
+}
+
 TEST(pdlp_class, precision_mixed)
 {
   using namespace cuopt::linear_programming::detail;

From 0b0ce2ccd9b2d4f2e1273d7c5a548f81619836ba Mon Sep 17 00:00:00 2001
From: Bulle Mostovoi <vmostovoi@nvidia.com>
Date: Thu, 4 Jun 2026 16:15:58 +0200
Subject: [PATCH 66/67] added kaminpar partitioner and possibility to choose
 the partitioner

---
 cpp/CMakeLists.txt                            |  12 ++
 cpp/cmake/thirdparty/get_kaminpar.cmake       |  48 ++++++
 .../cuopt/linear_programming/constants.h      |   1 +
 .../pdlp/solver_settings.hpp                  |   8 +
 cpp/src/math_optimization/solver_settings.cu  |   1 +
 cpp/src/pdlp/CMakeLists.txt                   |   1 +
 .../distributed_pdlp/kaminpar_partitioner.cpp | 142 ++++++++++++++++++
 .../distributed_pdlp/kaminpar_partitioner.hpp |  23 +++
 .../distributed_pdlp/metis_partitioner.cu     |  13 +-
 cpp/src/pdlp/distributed_pdlp/partitioner.cu  |   3 +
 cpp/src/pdlp/distributed_pdlp/partitioner.hpp |   9 +-
 cpp/src/pdlp/pdlp.cu                          |  54 +++++--
 cpp/src/pdlp/solve.cu                         |   5 -
 13 files changed, 291 insertions(+), 29 deletions(-)
 create mode 100644 cpp/cmake/thirdparty/get_kaminpar.cmake
 create mode 100644 cpp/src/pdlp/distributed_pdlp/kaminpar_partitioner.cpp
 create mode 100644 cpp/src/pdlp/distributed_pdlp/kaminpar_partitioner.hpp

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index d27072bcf9..0bf2b0f3f7 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -378,6 +378,17 @@ set_target_properties(metis_external PROPERTIES
 )
 message(STATUS "Using METIS: ${METIS_LIBRARY}")
 
+# ##################################################################################################
+# - KaMinPar (multi-threaded partitioning for distributed PDLP) ------------------------------------
+# Brought in the RAPIDS way (rapids_cpm_find): uses an installed KaMinPar (deb/rpm/conda,
+# discovered via its CMake config) if present, otherwise builds the pinned source via CPM.
+# Distributed PDLP prefers KaMinPar over METIS.
+include(cmake/thirdparty/get_kaminpar.cmake)
+if (NOT TARGET KaMinPar::KaMinPar)
+    message(FATAL_ERROR "KaMinPar::KaMinPar was not made available by get_kaminpar.cmake")
+endif ()
+message(STATUS "Using KaMinPar (distributed PDLP prefers KaMinPar over METIS)")
+
 # ##################################################################################################
 # - gRPC and Protobuf setup -----------------------------------------------------------------------
 
@@ -642,6 +653,7 @@ target_link_libraries(cuopt
         ${CUOPT_PRIVATE_CUDA_LIBS}
         nccl_external
         metis_external
+        KaMinPar::KaMinPar
         $<$<BOOL:${CUOPT_ENABLE_GRPC}>:protobuf::libprotobuf>
         $<$<BOOL:${CUOPT_ENABLE_GRPC}>:gRPC::grpc++>
 )
diff --git a/cpp/cmake/thirdparty/get_kaminpar.cmake b/cpp/cmake/thirdparty/get_kaminpar.cmake
new file mode 100644
index 0000000000..d548a76115
--- /dev/null
+++ b/cpp/cmake/thirdparty/get_kaminpar.cmake
@@ -0,0 +1,48 @@
+# cmake-format: off
+# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+# cmake-format: on
+
+# Multi-threaded graph partitioner for distributed PDLP.
+# Uses rapids_cpm_find so a system / conda / .deb install of KaMinPar (which ships a
+# CMake config package exporting KaMinPar::KaMinPar) is used when available, and
+# otherwise the pinned source is cloned and built via CPM. KaMinPar depends on TBB,
+# which cuOpt already requires (see find_package(TBB) for papilo).
+function(find_and_configure_kaminpar)
+    set(oneValueArgs VERSION PINNED_TAG)  # VERSION: version to find; PINNED_TAG: git tag to build
+    cmake_parse_arguments(PKG "" "${oneValueArgs}" "" ${ARGN})
+
+    rapids_cpm_find(KaMinPar ${PKG_VERSION}
+        GLOBAL_TARGETS KaMinPar::KaMinPar
+        CPM_ARGS
+        GIT_REPOSITORY https://github.com/KaHIP/KaMinPar.git
+        GIT_TAG ${PKG_PINNED_TAG}
+        EXCLUDE_FROM_ALL
+        OPTIONS
+            "KAMINPAR_BUILD_APPS OFF"
+            "KAMINPAR_BUILD_TOOLS OFF"
+            "KAMINPAR_BUILD_TESTS OFF"
+            "KAMINPAR_BUILD_BENCHMARKS OFF"
+            "KAMINPAR_BUILD_EXAMPLES OFF"
+            "KAMINPAR_BUILD_DISTRIBUTED OFF"
+            # Timers use global state and force single-threaded use of the library
+            # interface; disable so cuOpt can call the partitioner freely.
+            "KAMINPAR_ENABLE_TIMERS OFF"
+            # Avoid an extra hard dependency on Google Sparsehash.
+            "KAMINPAR_BUILD_WITH_SPARSEHASH OFF"
+            # cuOpt's TBB is discovered via a legacy find that only exposes TBB::tbb
+            # (no TBB::tbbmalloc target); disable KaMinPar's optional tbbmalloc use.
+            "KAMINPAR_ENABLE_TBB_MALLOC OFF"
+            # Large LP constraint graphs can exceed 2^31 directed edges.
+            "KAMINPAR_64BIT_EDGE_IDS ON"
+            "INSTALL_KAMINPAR OFF"
+    )
+
+    if(KaMinPar_ADDED)  # source was fetched and added by CPM
+        message(VERBOSE "CUOPT: Using KaMinPar located in ${KaMinPar_SOURCE_DIR}")
+    else()  # otherwise report the location of the package that was found
+        message(VERBOSE "CUOPT: Using KaMinPar located in ${KaMinPar_DIR}")
+    endif()
+endfunction()
+
+find_and_configure_kaminpar(VERSION 3.7.3 PINNED_TAG v3.7.3)
diff --git a/cpp/include/cuopt/linear_programming/constants.h b/cpp/include/cuopt/linear_programming/constants.h
index e24ca5c346..420a03526b 100644
--- a/cpp/include/cuopt/linear_programming/constants.h
+++ b/cpp/include/cuopt/linear_programming/constants.h
@@ -86,6 +86,7 @@
 #define CUOPT_DISTRIBUTED_PDLP_NUM_GPUS "distributed_pdlp_num_gpus"
 #define CUOPT_MULTI_GPU_PARTITION_FILE  "multi_gpu_partition_file"
 #define CUOPT_MULTI_GPU_EXPORT_PARTITION_FILE "multi_gpu_export_partition_file"
+#define CUOPT_DISTRIBUTED_PDLP_PARTITIONER "distributed_pdlp_partitioner"
 #define CUOPT_USE_DISTRIBUTED_PDLP      "use_distributed_pdlp"
 #define CUOPT_PDLP_DISABLE_GRAPH        "pdlp_disable_graph"
 #define CUOPT_USER_PROBLEM_FILE         "user_problem_file"
diff --git a/cpp/include/cuopt/linear_programming/pdlp/solver_settings.hpp b/cpp/include/cuopt/linear_programming/pdlp/solver_settings.hpp
index 1443333df4..42ef1f592a 100644
--- a/cpp/include/cuopt/linear_programming/pdlp/solver_settings.hpp
+++ b/cpp/include/cuopt/linear_programming/pdlp/solver_settings.hpp
@@ -316,6 +316,14 @@ class pdlp_solver_settings_t {
   // back via multi_gpu_partition_file. Exposed as the multi_gpu_export_partition_file
   // parameter (CLI: --multi-gpu-export-partition-file <path>).
   std::string multi_gpu_export_partition_file{""};
+  // Which graph partitioner distributed PDLP uses. One of:
+  //   "auto"     - 1 GPU => Dummy; otherwise KaMinPar
+  //   "dummy"    - round-robin, no graph (trivial)
+  //   "metis"    - serial METIS_PartGraphKway
+  //   "kaminpar" - multi-threaded KaMinPar
+  // Exposed as the distributed_pdlp_partitioner parameter
+  // (CLI: --distributed-pdlp-partitioner <auto|dummy|metis|kaminpar>).
+  std::string distributed_pdlp_partitioner{"auto"};
   // Set to true inside the shards
   bool is_distributed_sub_pdlp{false};
   method_t method{method_t::Concurrent};
diff --git a/cpp/src/math_optimization/solver_settings.cu b/cpp/src/math_optimization/solver_settings.cu
index 87324524f1..254a3afb38 100644
--- a/cpp/src/math_optimization/solver_settings.cu
+++ b/cpp/src/math_optimization/solver_settings.cu
@@ -193,6 +193,7 @@ solver_settings_t<i_t, f_t>::solver_settings_t() : pdlp_settings(), mip_settings
     {CUOPT_PRESOLVE_FILE, &pdlp_settings.presolve_file, ""},
     {CUOPT_MULTI_GPU_PARTITION_FILE, &pdlp_settings.multi_gpu_partition_file, ""},
     {CUOPT_MULTI_GPU_EXPORT_PARTITION_FILE, &pdlp_settings.multi_gpu_export_partition_file, ""},
+    {CUOPT_DISTRIBUTED_PDLP_PARTITIONER, &pdlp_settings.distributed_pdlp_partitioner, "auto"},
   };
   // clang-format on
 }
diff --git a/cpp/src/pdlp/CMakeLists.txt b/cpp/src/pdlp/CMakeLists.txt
index 863cf20962..12f2550203 100644
--- a/cpp/src/pdlp/CMakeLists.txt
+++ b/cpp/src/pdlp/CMakeLists.txt
@@ -34,6 +34,7 @@ set(LP_CORE_FILES
   ${CMAKE_CURRENT_SOURCE_DIR}/distributed_pdlp/partition_loader.cu
   ${CMAKE_CURRENT_SOURCE_DIR}/distributed_pdlp/partitioner.cu
   ${CMAKE_CURRENT_SOURCE_DIR}/distributed_pdlp/metis_partitioner.cu
+  ${CMAKE_CURRENT_SOURCE_DIR}/distributed_pdlp/kaminpar_partitioner.cpp
 )
 
 # C and Python adapter files
diff --git a/cpp/src/pdlp/distributed_pdlp/kaminpar_partitioner.cpp b/cpp/src/pdlp/distributed_pdlp/kaminpar_partitioner.cpp
new file mode 100644
index 0000000000..e7bf943f92
--- /dev/null
+++ b/cpp/src/pdlp/distributed_pdlp/kaminpar_partitioner.cpp
@@ -0,0 +1,142 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+// Plain C++ translation unit (not .cu): KaMinPar's public header is C++20 host code
+// and pulls in TBB; keeping it out of nvcc avoids device-compiler friction.
+
+#include <pdlp/distributed_pdlp/kaminpar_partitioner.hpp>
+#include <pdlp/distributed_pdlp/partitioner.hpp>
+
+#include <utilities/logger.hpp>
+
+#include <cuopt/error.hpp>
+
+#include <kaminpar.h>
+
+#include <chrono>
+#include <cstddef>
+#include <span>
+#include <thread>
+#include <vector>
+
+namespace cuopt::linear_programming::detail {
+
+// Builds the bipartite constraint/variable graph induced by A (identical layout
+// to metis_partitioner_t) and runs the multi-threaded KaMinPar k-way kernel.
+//   * nodes [0, nb_cstr)              : constraint nodes
+//   * nodes [nb_cstr, nb_cstr+nb_vars): variable nodes
+//   * undirected edges from each A nonzero (one half via A, one via A_t)
+template <typename i_t, typename f_t>
+std::vector<i_t> kaminpar_partitioner_t<i_t, f_t>::partition(
+  partitioner_input_t<i_t, f_t> const& input) const
+{
+  cuopt_expects(input.nb_parts >= 1,
+                error_type_t::ValidationError,
+                "kaminpar_partitioner: nb_parts must be >= 1");
+  cuopt_expects(input.nb_cstr >= 0 && input.nb_vars >= 0,
+                error_type_t::ValidationError,
+                "kaminpar_partitioner: invalid problem dimensions");
+
+  // The k-way kernel needs at least 2 blocks. For the single-shard case the
+  // partition is trivial (everything in block 0); short-circuit so KaMinPar can
+  // still be selected with distributed_pdlp_num_gpus == 1 without crashing.
+  if (input.nb_parts == 1) {
+    CUOPT_LOG_INFO("KaMinPar: nb_parts == 1, returning trivial single-block partition");
+    return std::vector<i_t>(static_cast<std::size_t>(input.nb_cstr + input.nb_vars), i_t{0});
+  }
+  cuopt_expects(input.A.row_offsets != nullptr && input.A.col_indices != nullptr,
+                error_type_t::ValidationError,
+                "kaminpar_partitioner: A.row_offsets and A.col_indices are required");
+  cuopt_expects(input.A_t.row_offsets != nullptr && input.A_t.col_indices != nullptr,
+                error_type_t::ValidationError,
+                "kaminpar_partitioner: A_t.row_offsets and A_t.col_indices are required");
+
+  auto const& A_offsets   = *input.A.row_offsets;
+  auto const& A_cols      = *input.A.col_indices;
+  auto const& A_t_offsets = *input.A_t.row_offsets;
+  auto const& A_t_cols    = *input.A_t.col_indices;
+
+  cuopt_expects(static_cast<i_t>(A_offsets.size()) == input.nb_cstr + 1,
+                error_type_t::ValidationError,
+                "kaminpar_partitioner: A.row_offsets size mismatch (expected nb_cstr+1)");
+  cuopt_expects(static_cast<i_t>(A_t_offsets.size()) == input.nb_vars + 1,
+                error_type_t::ValidationError,
+                "kaminpar_partitioner: A_t.row_offsets size mismatch (expected nb_vars+1)");
+  cuopt_expects(A_cols.size() == A_t_cols.size(),
+                error_type_t::ValidationError,
+                "kaminpar_partitioner: A and A_t nnz mismatch");
+
+  const i_t nb_cstr = input.nb_cstr;
+  const i_t nb_vars = input.nb_vars;
+  const i_t nnz     = static_cast<i_t>(A_cols.size());
+  const i_t nvtx    = nb_cstr + nb_vars;
+
+  // Resolve thread count: <= 0 => all hardware threads (1 as a last resort).
+  int nthreads = input.nb_threads > 0 ? static_cast<int>(input.nb_threads) : 0;
+  if (nthreads <= 0) {
+    nthreads = static_cast<int>(std::thread::hardware_concurrency());
+    if (nthreads <= 0) { nthreads = 1; }
+  }
+
+  // Bipartite CSR using KaMinPar index types (EdgeID for offsets, NodeID for neighbours).
+  std::vector<kaminpar::shm::EdgeID> xadj(static_cast<std::size_t>(nvtx) + 1);
+  std::vector<kaminpar::shm::NodeID> adjncy(2 * static_cast<std::size_t>(nnz));
+
+  for (i_t i = 0; i <= nb_cstr; ++i) {
+    xadj[i] = static_cast<kaminpar::shm::EdgeID>(A_offsets[i]);
+  }
+  for (i_t i = 0; i <= nb_vars; ++i) {
+    xadj[nb_cstr + i] =
+      static_cast<kaminpar::shm::EdgeID>(A_t_offsets[i]) + static_cast<kaminpar::shm::EdgeID>(nnz);
+  }
+  // Fill both halves in one pass. The A_t half is indexed in std::size_t because
+  // nnz + k evaluated in i_t overflows once nnz exceeds half of the i_t range.
+  for (i_t k = 0; k < nnz; ++k) {
+    adjncy[k] =
+      static_cast<kaminpar::shm::NodeID>(A_cols[k]) + static_cast<kaminpar::shm::NodeID>(nb_cstr);
+    adjncy[static_cast<std::size_t>(nnz) + k] = static_cast<kaminpar::shm::NodeID>(A_t_cols[k]);
+  }
+
+  std::vector<kaminpar::shm::BlockID> block_of(static_cast<std::size_t>(nvtx));
+
+  kaminpar::KaMinPar engine(nthreads, kaminpar::shm::create_default_context());
+  engine.copy_graph(std::span<const kaminpar::shm::EdgeID>(xadj),
+                    std::span<const kaminpar::shm::NodeID>(adjncy));
+  engine.set_k(static_cast<kaminpar::shm::BlockID>(input.nb_parts));
+  // ~3% imbalance, matching METIS_PartGraphKway's default balance constraint.
+  engine.set_uniform_max_block_weights(0.03);
+
+  auto t0 = std::chrono::high_resolution_clock::now();
+  const kaminpar::shm::EdgeWeight edge_cut =
+    engine.compute_partition(std::span<kaminpar::shm::BlockID>(block_of));
+  auto t1         = std::chrono::high_resolution_clock::now();
+  const double dt = std::chrono::duration<double>(t1 - t0).count();
+
+  CUOPT_LOG_INFO(
+    "KaMinPar partitioned bipartite graph: nvtx=%d nnz=%d nb_parts=%d nthreads=%d edge_cut=%lld "
+    "in %.3fs",
+    static_cast<int>(nvtx),
+    static_cast<int>(nnz),
+    static_cast<int>(input.nb_parts),
+    nthreads,
+    static_cast<long long>(edge_cut),
+    dt);
+
+  std::vector<i_t> parts(static_cast<std::size_t>(nvtx));
+  for (i_t i = 0; i < nvtx; ++i) {
+    parts[i] = static_cast<i_t>(block_of[i]);
+  }
+
+  validate_partition(parts,
+                     static_cast<int>(nb_cstr),
+                     static_cast<int>(nb_vars),
+                     static_cast<int>(input.nb_parts),
+                     "kaminpar_partitioner");
+  return parts;
+}
+
+template class kaminpar_partitioner_t<int, double>;
+
+}  // namespace cuopt::linear_programming::detail
diff --git a/cpp/src/pdlp/distributed_pdlp/kaminpar_partitioner.hpp b/cpp/src/pdlp/distributed_pdlp/kaminpar_partitioner.hpp
new file mode 100644
index 0000000000..43fda76f9f
--- /dev/null
+++ b/cpp/src/pdlp/distributed_pdlp/kaminpar_partitioner.hpp
@@ -0,0 +1,23 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#pragma once
+
+#include <pdlp/distributed_pdlp/partitioner.hpp>
+
+namespace cuopt::linear_programming::detail {
+
+// Multi-threaded k-way partitioner backed by KaMinPar. Builds the same
+// constraint/variable bipartite graph as metis_partitioner_t and runs the
+// shared-memory KaMinPar kernel on partitioner_input_t::nb_threads threads
+// (<= 0 => all hardware threads). partition() returns one block id per node,
+// constraints first then variables; nb_parts == 1 yields the all-zero partition.
+template <typename i_t, typename f_t>
+class kaminpar_partitioner_t : public partitioner_i<i_t, f_t> {
+ public:
+  std::vector<i_t> partition(partitioner_input_t<i_t, f_t> const& input) const override;
+};
+
+}  // namespace cuopt::linear_programming::detail
diff --git a/cpp/src/pdlp/distributed_pdlp/metis_partitioner.cu b/cpp/src/pdlp/distributed_pdlp/metis_partitioner.cu
index ecc60adda0..9a4f0f50b1 100644
--- a/cpp/src/pdlp/distributed_pdlp/metis_partitioner.cu
+++ b/cpp/src/pdlp/distributed_pdlp/metis_partitioner.cu
@@ -32,18 +32,15 @@ std::vector<i_t> metis_partitioner_t<i_t, f_t>::partition(
   cuopt_expects(input.nb_parts > 0,
                 error_type_t::ValidationError,
                 "metis_partitioner: nb_parts must be positive");
-  // METIS_PartGraphKway internally does integer arithmetic of the form
-  // `nedges / nparts` and traps with SIGFPE when nparts == 1. The single-part
-  // case is also trivial (everything in part 0) so callers should route it to
-  // the Dummy partitioner instead (see pdlp_solver_t mGPU ctor).
-  cuopt_expects(input.nb_parts >= 2,
-                error_type_t::ValidationError,
-                "metis_partitioner: nb_parts must be >= 2 (METIS_PartGraphKway requirement); "
-                "use the Dummy partitioner for the single-shard case");
   cuopt_expects(input.nb_cstr >= 0 && input.nb_vars >= 0,
                 error_type_t::ValidationError,
                 "metis_partitioner: invalid problem dimensions");
 
+  if (input.nb_parts == 1) {
+    CUOPT_LOG_INFO("METIS: nb_parts == 1, returning trivial single-block partition");
+    return std::vector<i_t>(static_cast<std::size_t>(input.nb_cstr + input.nb_vars), i_t{0});
+  }
+
   cuopt_expects(input.A.row_offsets != nullptr && input.A.col_indices != nullptr,
                 error_type_t::ValidationError,
                 "metis_partitioner: A.row_offsets and A.col_indices are required");
diff --git a/cpp/src/pdlp/distributed_pdlp/partitioner.cu b/cpp/src/pdlp/distributed_pdlp/partitioner.cu
index bc84e521e2..e3866c3ad1 100644
--- a/cpp/src/pdlp/distributed_pdlp/partitioner.cu
+++ b/cpp/src/pdlp/distributed_pdlp/partitioner.cu
@@ -3,6 +3,7 @@
  * SPDX-License-Identifier: Apache-2.0
  */
 
+#include <pdlp/distributed_pdlp/kaminpar_partitioner.hpp>
 #include <pdlp/distributed_pdlp/metis_partitioner.hpp>
 #include <pdlp/distributed_pdlp/partitioner.hpp>
 
@@ -72,6 +73,8 @@ std::unique_ptr<partitioner_i<i_t, f_t>> make_partitioner(partitioner_kind_t kin
   switch (kind) {
     case partitioner_kind_t::Dummy: return std::make_unique<dummy_partitioner_t<i_t, f_t>>();
     case partitioner_kind_t::Metis: return std::make_unique<metis_partitioner_t<i_t, f_t>>();
+    case partitioner_kind_t::KaMinPar:
+      return std::make_unique<kaminpar_partitioner_t<i_t, f_t>>();
   }
   cuopt_expects(
     false, error_type_t::RuntimeError, "make_partitioner: unsupported partitioner kind");
diff --git a/cpp/src/pdlp/distributed_pdlp/partitioner.hpp b/cpp/src/pdlp/distributed_pdlp/partitioner.hpp
index 2a2149db63..70b2e34c06 100644
--- a/cpp/src/pdlp/distributed_pdlp/partitioner.hpp
+++ b/cpp/src/pdlp/distributed_pdlp/partitioner.hpp
@@ -29,6 +29,10 @@ struct partitioner_input_t {
   i_t nb_cstr{0};
   i_t nb_vars{0};
   i_t nb_parts{0};
+  // Number of CPU threads the partitioner may use. Only honored by the
+  // multi-threaded KaMinPar backend; <= 0 means "auto" (all hardware threads).
+  // Serial backends (METIS, Dummy) ignore it.
+  i_t nb_threads{0};
   // Constraint matrix A (rows = constraints, cols = variables).
   csr_host_view_t<i_t, f_t> A{};
   // Transpose A_t (rows = variables, cols = constraints). Optional for partitioners
@@ -36,7 +40,10 @@ struct partitioner_input_t {
   csr_host_view_t<i_t, f_t> A_t{};
 };
 
-enum class partitioner_kind_t { Dummy, Metis };
+// Dummy: round-robin, no graph (single-shard / debugging).
+// Metis: serial METIS_PartGraphKway.
+// KaMinPar: multi-threaded KaMinPar (preferred for multi-shard partitioning).
+enum class partitioner_kind_t { Dummy, Metis, KaMinPar };
 
 template <typename i_t, typename f_t>
 class partitioner_i {
diff --git a/cpp/src/pdlp/pdlp.cu b/cpp/src/pdlp/pdlp.cu
index 150311ae33..0514ae1d13 100644
--- a/cpp/src/pdlp/pdlp.cu
+++ b/cpp/src/pdlp/pdlp.cu
@@ -44,6 +44,7 @@
 #include <thrust/logical.h>
 
 #include <algorithm>
+#include <cctype>
 #include <cmath>
 #include <optional>
 #include <tuple>
@@ -398,12 +399,6 @@ pdlp_solver_t<i_t, f_t>::pdlp_solver_t(
   const int distributed_pdlp_num_gpus = settings.distributed_pdlp_num_gpus;
   CUOPT_LOG_INFO("Solving with distributed PDLP on %d GPU (mps direct path)",
                  distributed_pdlp_num_gpus);
-  if (distributed_pdlp_num_gpus == 1) {
-    std::cout << "CAREFUL !!: distributed_pdlp_num_gpus == 1, running single-shard dummy path, "
-                 "if you want to set the number of GPUs to use for distributed PDLP, set the "
-                 "parameter --distributed-pdlp-num-gpus"
-              << std::endl;
-  }
 
   if constexpr (!std::is_same_v<f_t, double>) {
     cuopt_expects(
@@ -501,20 +496,37 @@ pdlp_solver_t<i_t, f_t>::pdlp_solver_t(
       settings.multi_gpu_partition_file);
     validate_partition(parts, n_cstr, n_vars, distributed_pdlp_num_gpus, "partition file");
   } else {
-    if (distributed_pdlp_num_gpus == 1) {
-      std::cout << "CAREFUL: distributed_pdlp_num_gpus == 1, running dummy version (single "
-                   "part covering "
-                << n_cstr << " cstrs + " << n_vars << " vars)" << std::endl;
-    }
     partitioner_input_t<i_t, f_t> partition_input;
     partition_input.nb_cstr  = n_cstr;
     partition_input.nb_vars  = n_vars;
     partition_input.nb_parts = distributed_pdlp_num_gpus;
 
-    // METIS_PartGraphKway requires nparts >= 2; route num_gpus == 1 to Dummy.
-    const partitioner_kind_t kind =
-      (distributed_pdlp_num_gpus == 1) ? partitioner_kind_t::Dummy : partitioner_kind_t::Metis;
-    if (kind == partitioner_kind_t::Metis) {
+    // Resolve which partitioner to use.
+    std::string partitioner_choice = settings.distributed_pdlp_partitioner;
+    std::transform(partitioner_choice.begin(),
+                   partitioner_choice.end(),
+                   partitioner_choice.begin(),
+                   [](unsigned char c) { return std::tolower(c); });
+    partitioner_kind_t kind;
+    if (partitioner_choice.empty() || partitioner_choice == "auto") {
+      kind = (distributed_pdlp_num_gpus == 1) ? partitioner_kind_t::Dummy
+                                              : partitioner_kind_t::KaMinPar;
+    } else if (partitioner_choice == "dummy") {
+      kind = partitioner_kind_t::Dummy;
+    } else if (partitioner_choice == "metis") {
+      kind = partitioner_kind_t::Metis;
+    } else if (partitioner_choice == "kaminpar") {
+      kind = partitioner_kind_t::KaMinPar;
+    } else {
+      cuopt_expects(false,
+                    error_type_t::ValidationError,
+                    "Unknown distributed_pdlp_partitioner '%s' (expected auto|dummy|metis|kaminpar)",
+                    settings.distributed_pdlp_partitioner.c_str());
+      kind = partitioner_kind_t::Dummy;  // unreachable; silences -Wmaybe-uninitialized
+    }
+    const bool needs_graph =
+      (kind == partitioner_kind_t::Metis || kind == partitioner_kind_t::KaMinPar);
+    if (needs_graph) {
       // partitioner_input_t holds non-const std::vector<i_t>* pointers; we
       // already have the data in our local mutable buffers above.
       partition_input.A.row_offsets   = &h_A_row_offsets;
@@ -525,7 +537,19 @@ pdlp_solver_t<i_t, f_t>::pdlp_solver_t(
       partition_input.A_t.col_indices = &h_A_t_col_indices;
       partition_input.A_t.num_rows    = n_vars;
       partition_input.A_t.num_cols    = n_cstr;
+      // 0 => KaMinPar auto-detects and uses all hardware threads (ignored by METIS).
+      partition_input.nb_threads = 0;
     }
+    const char* kind_name = (kind == partitioner_kind_t::Dummy)      ? "dummy"
+                            : (kind == partitioner_kind_t::Metis)    ? "metis"
+                            : (kind == partitioner_kind_t::KaMinPar) ? "kaminpar"
+                                                                     : "unknown";
+    CUOPT_LOG_INFO("Partitioning %d constraints + %d variables into %d part(s) using the %s "
+                   "partitioner",
+                   n_cstr,
+                   n_vars,
+                   distributed_pdlp_num_gpus,
+                   kind_name);
     auto partitioner = make_partitioner<i_t, f_t>(kind);
     parts            = partitioner->partition(partition_input);
   }
diff --git a/cpp/src/pdlp/solve.cu b/cpp/src/pdlp/solve.cu
index 228bacfd21..595c06b20a 100644
--- a/cpp/src/pdlp/solve.cu
+++ b/cpp/src/pdlp/solve.cu
@@ -2166,11 +2166,6 @@ optimization_problem_solution_t<i_t, f_t> solve_lp_distributed_from_mps(
       "%d visible CUDA device(s)",
       settings_resolved.distributed_pdlp_num_gpus);
   }
-  if (settings_resolved.distributed_pdlp_num_gpus <= 1) {
-    std::cout << "CAREFUL: use_distributed_pdlp with distributed_pdlp_num_gpus == 1 runs the "
-                 "single-shard dummy path"
-              << std::endl;
-  }
   // PDLP precision validations (mirror the checks in run_pdlp; distributed
   // path only supports the default-precision, non-batch double config).
   cuopt_expects(settings_resolved.pdlp_precision == pdlp_precision_t::DefaultPrecision,

From 91b1ae5a619bb9edec2a6775a24304a95b73fdf6 Mon Sep 17 00:00:00 2001
From: Bulle Mostovoi <vmostovoi@nvidia.com>
Date: Thu, 4 Jun 2026 16:16:19 +0200
Subject: [PATCH 67/67] style

---
 .../cuopt/linear_programming/constants.h      | 26 +++++++++----------
 .../distributed_pdlp/multi_gpu_engine.hpp     |  6 ++---
 cpp/src/pdlp/distributed_pdlp/partitioner.cu  |  3 +--
 .../initial_scaling.cuh                       |  5 +++-
 cpp/src/pdlp/pdlp.cu                          | 26 ++++++++++---------
 cpp/tests/linear_programming/pdlp_test.cu     | 14 +++++-----
 6 files changed, 42 insertions(+), 38 deletions(-)

diff --git a/cpp/include/cuopt/linear_programming/constants.h b/cpp/include/cuopt/linear_programming/constants.h
index 420a03526b..29648d1a0f 100644
--- a/cpp/include/cuopt/linear_programming/constants.h
+++ b/cpp/include/cuopt/linear_programming/constants.h
@@ -80,20 +80,20 @@
 #define CUOPT_MIP_STRONG_BRANCHING_SIMPLEX_ITERATION_LIMIT \
   "mip_strong_branching_simplex_iteration_limit"
 
-#define CUOPT_SOLUTION_FILE             "solution_file"
-#define CUOPT_NUM_CPU_THREADS           "num_cpu_threads"
-#define CUOPT_NUM_GPUS                  "num_gpus"
-#define CUOPT_DISTRIBUTED_PDLP_NUM_GPUS "distributed_pdlp_num_gpus"
-#define CUOPT_MULTI_GPU_PARTITION_FILE  "multi_gpu_partition_file"
+#define CUOPT_SOLUTION_FILE                   "solution_file"
+#define CUOPT_NUM_CPU_THREADS                 "num_cpu_threads"
+#define CUOPT_NUM_GPUS                        "num_gpus"
+#define CUOPT_DISTRIBUTED_PDLP_NUM_GPUS       "distributed_pdlp_num_gpus"
+#define CUOPT_MULTI_GPU_PARTITION_FILE        "multi_gpu_partition_file"
 #define CUOPT_MULTI_GPU_EXPORT_PARTITION_FILE "multi_gpu_export_partition_file"
-#define CUOPT_DISTRIBUTED_PDLP_PARTITIONER "distributed_pdlp_partitioner"
-#define CUOPT_USE_DISTRIBUTED_PDLP      "use_distributed_pdlp"
-#define CUOPT_PDLP_DISABLE_GRAPH        "pdlp_disable_graph"
-#define CUOPT_USER_PROBLEM_FILE         "user_problem_file"
-#define CUOPT_PRESOLVE_FILE             "presolve_file"
-#define CUOPT_RANDOM_SEED               "random_seed"
-#define CUOPT_PDLP_PRECISION            "pdlp_precision"
-#define CUOPT_MIP_SEMICONTINUOUS_BIG_M  "mip_semi_continuous_big_m"
+#define CUOPT_DISTRIBUTED_PDLP_PARTITIONER    "distributed_pdlp_partitioner"
+#define CUOPT_USE_DISTRIBUTED_PDLP            "use_distributed_pdlp"
+#define CUOPT_PDLP_DISABLE_GRAPH              "pdlp_disable_graph"
+#define CUOPT_USER_PROBLEM_FILE               "user_problem_file"
+#define CUOPT_PRESOLVE_FILE                   "presolve_file"
+#define CUOPT_RANDOM_SEED                     "random_seed"
+#define CUOPT_PDLP_PRECISION                  "pdlp_precision"
+#define CUOPT_MIP_SEMICONTINUOUS_BIG_M        "mip_semi_continuous_big_m"
 
 #define CUOPT_MIP_HYPER_HEURISTIC_POPULATION_SIZE     "mip_hyper_heuristic_population_size"
 #define CUOPT_MIP_HYPER_HEURISTIC_NUM_CPUFJ_THREADS   "mip_hyper_heuristic_num_cpufj_threads"
diff --git a/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.hpp b/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.hpp
index 3a0fcb755d..89153e8bd7 100644
--- a/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.hpp
+++ b/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.hpp
@@ -250,7 +250,7 @@ struct multi_gpu_engine_t {
   // -------- Broadcast owned constraint (row) scaling into halo ------------
   void broadcast_constraint_scaling_to_halo()
   {
-    const int nb = static_cast<int>(shards.size());
+    const int nb    = static_cast<int>(shards.size());
     auto buf_access = [](pdlp_shard_t<i_t, f_t>& s) -> rmm::device_uvector<f_t>& {
       return s.sub_pdlp->get_initial_scaling_strategy().get_cummulative_constraint_matrix_scaling();
     };
@@ -384,7 +384,7 @@ struct multi_gpu_engine_t {
       bound_sq.emplace_back(1, s.stream.view());
       obj_sq.emplace_back(1, s.stream.view());
 
-      const auto& scaled = s.sub_pdlp->get_initial_scaling_strategy().get_scaled_op_problem();
+      const auto& scaled     = s.sub_pdlp->get_initial_scaling_strategy().get_scaled_op_problem();
       const int n_owned_cstr = static_cast<int>(s.rank_data.owned_cstr_size);
       const int n_owned_var  = static_cast<int>(s.rank_data.owned_var_size);
 
@@ -403,7 +403,7 @@ struct multi_gpu_engine_t {
                              n_owned_cstr,
                              s.stream.view().value());
 
-      auto obj_in = thrust::make_transform_iterator(scaled.objective_coefficients.data(),
+      auto obj_in        = thrust::make_transform_iterator(scaled.objective_coefficients.data(),
                                                     mgpu_weighted_sq_op_t<f_t>{c_scaling_weight});
       size_t tmp_bytes_o = 0;
       cub::DeviceReduce::Sum(
diff --git a/cpp/src/pdlp/distributed_pdlp/partitioner.cu b/cpp/src/pdlp/distributed_pdlp/partitioner.cu
index e3866c3ad1..727a8b56f9 100644
--- a/cpp/src/pdlp/distributed_pdlp/partitioner.cu
+++ b/cpp/src/pdlp/distributed_pdlp/partitioner.cu
@@ -73,8 +73,7 @@ std::unique_ptr<partitioner_i<i_t, f_t>> make_partitioner(partitioner_kind_t kin
   switch (kind) {
     case partitioner_kind_t::Dummy: return std::make_unique<dummy_partitioner_t<i_t, f_t>>();
     case partitioner_kind_t::Metis: return std::make_unique<metis_partitioner_t<i_t, f_t>>();
-    case partitioner_kind_t::KaMinPar:
-      return std::make_unique<kaminpar_partitioner_t<i_t, f_t>>();
+    case partitioner_kind_t::KaMinPar: return std::make_unique<kaminpar_partitioner_t<i_t, f_t>>();
   }
   cuopt_expects(
     false, error_type_t::RuntimeError, "make_partitioner: unsupported partitioner kind");
diff --git a/cpp/src/pdlp/initial_scaling_strategy/initial_scaling.cuh b/cpp/src/pdlp/initial_scaling_strategy/initial_scaling.cuh
index 409df5340a..13f639079d 100644
--- a/cpp/src/pdlp/initial_scaling_strategy/initial_scaling.cuh
+++ b/cpp/src/pdlp/initial_scaling_strategy/initial_scaling.cuh
@@ -105,7 +105,10 @@ class pdlp_initial_scaling_strategy_t {
 
   // Distributed PDLP: skip the LOCAL bound/objective rescaling inside
   // scale_problem()
-  void set_skip_distributed_local_rescaling(bool value) { skip_distributed_local_rescaling_ = value; }
+  void set_skip_distributed_local_rescaling(bool value)
+  {
+    skip_distributed_local_rescaling_ = value;
+  }
 
   // Public for distributed PDLP
   void compute_scaling_vectors(i_t number_of_ruiz_iterations, f_t alpha);
diff --git a/cpp/src/pdlp/pdlp.cu b/cpp/src/pdlp/pdlp.cu
index 0514ae1d13..71c6b0a48c 100644
--- a/cpp/src/pdlp/pdlp.cu
+++ b/cpp/src/pdlp/pdlp.cu
@@ -509,8 +509,8 @@ pdlp_solver_t<i_t, f_t>::pdlp_solver_t(
                    [](unsigned char c) { return std::tolower(c); });
     partitioner_kind_t kind;
     if (partitioner_choice.empty() || partitioner_choice == "auto") {
-      kind = (distributed_pdlp_num_gpus == 1) ? partitioner_kind_t::Dummy
-                                              : partitioner_kind_t::KaMinPar;
+      kind =
+        (distributed_pdlp_num_gpus == 1) ? partitioner_kind_t::Dummy : partitioner_kind_t::KaMinPar;
     } else if (partitioner_choice == "dummy") {
       kind = partitioner_kind_t::Dummy;
     } else if (partitioner_choice == "metis") {
@@ -518,10 +518,11 @@ pdlp_solver_t<i_t, f_t>::pdlp_solver_t(
     } else if (partitioner_choice == "kaminpar") {
       kind = partitioner_kind_t::KaMinPar;
     } else {
-      cuopt_expects(false,
-                    error_type_t::ValidationError,
-                    "Unknown distributed_pdlp_partitioner '%s' (expected auto|dummy|metis|kaminpar)",
-                    settings.distributed_pdlp_partitioner.c_str());
+      cuopt_expects(
+        false,
+        error_type_t::ValidationError,
+        "Unknown distributed_pdlp_partitioner '%s' (expected auto|dummy|metis|kaminpar)",
+        settings.distributed_pdlp_partitioner.c_str());
       kind = partitioner_kind_t::Dummy;  // unreachable; silences -Wmaybe-uninitialized
     }
     const bool needs_graph =
@@ -544,12 +545,13 @@ pdlp_solver_t<i_t, f_t>::pdlp_solver_t(
                             : (kind == partitioner_kind_t::Metis)    ? "metis"
                             : (kind == partitioner_kind_t::KaMinPar) ? "kaminpar"
                                                                      : "unknown";
-    CUOPT_LOG_INFO("Partitioning %d constraints + %d variables into %d part(s) using the %s "
-                   "partitioner",
-                   n_cstr,
-                   n_vars,
-                   distributed_pdlp_num_gpus,
-                   kind_name);
+    CUOPT_LOG_INFO(
+      "Partitioning %d constraints + %d variables into %d part(s) using the %s "
+      "partitioner",
+      n_cstr,
+      n_vars,
+      distributed_pdlp_num_gpus,
+      kind_name);
     auto partitioner = make_partitioner<i_t, f_t>(kind);
     parts            = partitioner->partition(partition_input);
   }
diff --git a/cpp/tests/linear_programming/pdlp_test.cu b/cpp/tests/linear_programming/pdlp_test.cu
index 65cc2f0d9f..d17cf2af6f 100644
--- a/cpp/tests/linear_programming/pdlp_test.cu
+++ b/cpp/tests/linear_programming/pdlp_test.cu
@@ -113,9 +113,9 @@ TEST(pdlp_class, distributed_partition_metis_export_import_roundtrip)
   const int n_cstr = static_cast<int>(mps.get_constraint_lower_bounds().size());
   const int nnz    = static_cast<int>(mps.get_constraint_matrix_values().size());
 
-  std::vector<int> h_A_row_offsets    = mps.get_constraint_matrix_offsets();
-  std::vector<int> h_A_col_indices    = mps.get_constraint_matrix_indices();
-  std::vector<double> h_A_values      = mps.get_constraint_matrix_values();
+  std::vector<int> h_A_row_offsets = mps.get_constraint_matrix_offsets();
+  std::vector<int> h_A_col_indices = mps.get_constraint_matrix_indices();
+  std::vector<double> h_A_values   = mps.get_constraint_matrix_values();
 
   // Transpose A -> A^T (CSR of A^T == CSC of A), mirroring solve_lp_distributed_from_mps.
   ds::csr_matrix_t<int, double> A_csr(n_cstr, n_vars, nnz);
@@ -171,7 +171,7 @@ void expect_distributed_matches_base(raft::handle_t const& handle,
     return std::fabs(a - b) <= rel * (1.0 + std::fabs(a));
   };
 
-  auto path = make_path_absolute(mps_rel_path);
+  auto path                                 = make_path_absolute(mps_rel_path);
   io::mps_data_model_t<int, double> problem = io::parse_mps<int, double>(path, fixed_mps_format);
 
   // Shared settings: PDLP, no presolve (distributed requires presolver == None, so the
@@ -186,9 +186,9 @@ void expect_distributed_matches_base(raft::handle_t const& handle,
 
   // ----- distributed PDLP (identical settings, only the distributed flags flipped) -----
   pdlp_solver_settings_t<int, double> dist_settings = base_settings;
-  dist_settings.hyper_params.use_distributed_pdlp    = true;
-  dist_settings.distributed_pdlp_num_gpus            = -1;
-  auto dist                                          = solve_lp(&handle, problem, dist_settings);
+  dist_settings.hyper_params.use_distributed_pdlp   = true;
+  dist_settings.distributed_pdlp_num_gpus           = -1;
+  auto dist                                         = solve_lp(&handle, problem, dist_settings);
 
   // ----- termination status -----
   ASSERT_EQ(static_cast<int>(base.get_termination_status()), CUOPT_TERMINATION_STATUS_OPTIMAL)