From 1e0bd53da23fd9e4c093603d41c3fa6a06e899e4 Mon Sep 17 00:00:00 2001 From: Bulle Mostovoi Date: Thu, 7 May 2026 15:07:26 +0200 Subject: [PATCH 01/67] first commit !! added multi_gpu_partition file to solver settings --- cpp/include/cuopt/linear_programming/constants.h | 1 + .../cuopt/linear_programming/pdlp/solver_settings.hpp | 1 + cpp/src/math_optimization/solver_settings.cu | 5 +++-- 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/cpp/include/cuopt/linear_programming/constants.h b/cpp/include/cuopt/linear_programming/constants.h index b251b3eaba..7e2682b997 100644 --- a/cpp/include/cuopt/linear_programming/constants.h +++ b/cpp/include/cuopt/linear_programming/constants.h @@ -77,6 +77,7 @@ #define CUOPT_SOLUTION_FILE "solution_file" #define CUOPT_NUM_CPU_THREADS "num_cpu_threads" #define CUOPT_NUM_GPUS "num_gpus" +#define CUOPT_MULTI_GPU_PARTITION_FILE "multi_gpu_partition_file" #define CUOPT_USER_PROBLEM_FILE "user_problem_file" #define CUOPT_PRESOLVE_FILE "presolve_file" #define CUOPT_RANDOM_SEED "random_seed" diff --git a/cpp/include/cuopt/linear_programming/pdlp/solver_settings.hpp b/cpp/include/cuopt/linear_programming/pdlp/solver_settings.hpp index bcf5a736f0..4585b9d1cf 100644 --- a/cpp/include/cuopt/linear_programming/pdlp/solver_settings.hpp +++ b/cpp/include/cuopt/linear_programming/pdlp/solver_settings.hpp @@ -286,6 +286,7 @@ class pdlp_solver_settings_t { presolver_t presolver{presolver_t::Default}; bool dual_postsolve{true}; int num_gpus{1}; + std::string multi_gpu_partition_file{""}; method_t method{method_t::Concurrent}; bool inside_mip{false}; // For concurrent termination diff --git a/cpp/src/math_optimization/solver_settings.cu b/cpp/src/math_optimization/solver_settings.cu index b968ad18ea..42ea533152 100644 --- a/cpp/src/math_optimization/solver_settings.cu +++ b/cpp/src/math_optimization/solver_settings.cu @@ -136,7 +136,7 @@ solver_settings_t::solver_settings_t() : pdlp_settings(), mip_settings {CUOPT_MIP_IMPLIED_BOUND_CUTS, 
&mip_settings.implied_bound_cuts, -1, 1, -1}, {CUOPT_MIP_STRONG_CHVATAL_GOMORY_CUTS, &mip_settings.strong_chvatal_gomory_cuts, -1, 1, -1}, {CUOPT_MIP_REDUCED_COST_STRENGTHENING, &mip_settings.reduced_cost_strengthening, -1, std::numeric_limits::max(), -1}, - {CUOPT_NUM_GPUS, &pdlp_settings.num_gpus, 1, 2, 1}, + {CUOPT_NUM_GPUS, &pdlp_settings.num_gpus, 1, 576, 1}, {CUOPT_NUM_GPUS, &mip_settings.num_gpus, 1, 2, 1}, {CUOPT_MIP_BATCH_PDLP_STRONG_BRANCHING, &mip_settings.mip_batch_pdlp_strong_branching, 0, 2, 0}, {CUOPT_MIP_BATCH_PDLP_RELIABILITY_BRANCHING, &mip_settings.mip_batch_pdlp_reliability_branching, 0, 2, 0}, @@ -182,7 +182,8 @@ solver_settings_t::solver_settings_t() : pdlp_settings(), mip_settings {CUOPT_USER_PROBLEM_FILE, &mip_settings.user_problem_file, ""}, {CUOPT_USER_PROBLEM_FILE, &pdlp_settings.user_problem_file, ""}, {CUOPT_PRESOLVE_FILE, &mip_settings.presolve_file, ""}, - {CUOPT_PRESOLVE_FILE, &pdlp_settings.presolve_file, ""} + {CUOPT_PRESOLVE_FILE, &pdlp_settings.presolve_file, ""}, + {CUOPT_MULTI_GPU_PARTITION_FILE, &pdlp_settings.multi_gpu_partition_file, ""}, }; // clang-format on } From 978d17bc5e81f10bb0f4305e5886b777251b4ad4 Mon Sep 17 00:00:00 2001 From: Bulle Mostovoi Date: Thu, 7 May 2026 17:51:27 +0200 Subject: [PATCH 02/67] slowly skeletonning --- .../pdlp/distributed_pdlp/communicator.cuh | 0 cpp/src/pdlp/distributed_pdlp/partition.cuh | 33 +++++++++++++++ cpp/src/pdlp/distributed_pdlp/shard.cu | 41 +++++++++++++++++++ cpp/src/pdlp/distributed_pdlp/shard.cuh | 24 +++++++++++ 4 files changed, 98 insertions(+) create mode 100644 cpp/src/pdlp/distributed_pdlp/communicator.cuh create mode 100644 cpp/src/pdlp/distributed_pdlp/partition.cuh create mode 100644 cpp/src/pdlp/distributed_pdlp/shard.cu create mode 100644 cpp/src/pdlp/distributed_pdlp/shard.cuh diff --git a/cpp/src/pdlp/distributed_pdlp/communicator.cuh b/cpp/src/pdlp/distributed_pdlp/communicator.cuh new file mode 100644 index 0000000000..e69de29bb2 diff --git 
a/cpp/src/pdlp/distributed_pdlp/partition.cuh b/cpp/src/pdlp/distributed_pdlp/partition.cuh new file mode 100644 index 0000000000..38457029be --- /dev/null +++ b/cpp/src/pdlp/distributed_pdlp/partition.cuh @@ -0,0 +1,33 @@ + + + +namespace cuopt::linear_programming::detail { + + +template +class partition_t { + public: + partition_t(const std::string& partition_file); + partition_t(const problem_t& op_problem); + + + size_t nb_parts; + + std::vector raw_parts; + std::vector cstr_parts; + std::vector var_parts; + std::vector> owned_cstr_per_part; + std::vector> owned_var_per_part; + std::vector> needed_cstr_per_part; + std::vector> needed_var_per_part; + std::vector>> sent_cstr_per_part; + std::vector>> sent_var_per_part; + std::vector>> received_cstr_per_part; + std::vector>> received_var_per_part; + + private: + void fill_data(); + void validate() const; + +}; +} // namespace cuopt::linear_programming::detail \ No newline at end of file diff --git a/cpp/src/pdlp/distributed_pdlp/shard.cu b/cpp/src/pdlp/distributed_pdlp/shard.cu new file mode 100644 index 0000000000..43a4526c29 --- /dev/null +++ b/cpp/src/pdlp/distributed_pdlp/shard.cu @@ -0,0 +1,41 @@ + + + +void pre_SpMV_communication(bool is_A_x){ + // Prepare the send_buffers + for (auto& shard: shards){ + comm_planner_t& plan = is_A_x ? shard.x_plan : shard.y_plan; + raft::device_setter guard(shard.device_id); + for (size_t peer = 0; peer < partition.nb_parts; peer++){ + if (peer == shard.rank) continue; + thrust::gather( + shard.handle.get_thrust_policy(), // TODO what exactly do we put here + plan.send_indices_per_peer[peer].begin(), + plan.send_indices_per_peer[peer].end(), + plan.full_local.begin(), + plan.send_buf_per_peer[peer].begin()); + } + } + // Will merge them if it works + ncclgroupstart() + // Send all the data current shard has to send + for (auto& shard: shards){ + comm_planner_t& plan = is_A_x ? 
shard.x_plan : shard.y_plan; + raft::device_setter guard(shard.device_id); + for (size_t peer = 0; peer < partition.nb_parts; peer++){ + if (peer == shard.rank) continue; + ncclSend(plan.send_buf_per_peer[peer].data(), plan.nb_elt_send_per_peer[peer], peer) + } + } + // Receive all the data current shard has to receive + for (auto& shard: shards){ + comm_planner_t& plan = is_A_x ? shard.x_plan : shard.y_plan; + raft::device_setter guard(shard.device_id); + for (size_t peer = 0; peer < partition.nb_parts; peer++){ + if (peer == shard.rank) continue; + f_t* recv_buff = &plan.full_local[offset_per_peer[peer]]; + ncclRecv(recv_buff, plan.nb_elt_recv_per_peer[peer], peer); + } + } + ncclgroupend() +} \ No newline at end of file diff --git a/cpp/src/pdlp/distributed_pdlp/shard.cuh b/cpp/src/pdlp/distributed_pdlp/shard.cuh new file mode 100644 index 0000000000..30449e04a0 --- /dev/null +++ b/cpp/src/pdlp/distributed_pdlp/shard.cuh @@ -0,0 +1,24 @@ + + +template +struct pdlp_shard_t { + size_t rank; + comm_planner_t x_plan; + comm_planner_t y_plan; +}; + + +template +struct comm_planner_t { + + // The indices of the data we have to send to the others + // Maybe could merge evrything if it gives a speedup but a bit harder to read + std::vector> send_indices_per_peer; + std::vector nb_elt_send_per_peer; + std::vector> send_buf_per_peer; + + // Where to start writing in full_local for each peer + std::vector offset_per_peer; + std::vector nb_elt_recv_per_peer; + rmm::device_uvector full_local; // The full var/cstr vector containing all local data then all remote data +}; \ No newline at end of file From dd0c0eff2de119511065cb1e40a726c6443fb102 Mon Sep 17 00:00:00 2001 From: Bulle Mostovoi Date: Thu, 7 May 2026 18:02:02 +0200 Subject: [PATCH 03/67] better shard.cuh --- cpp/src/pdlp/distributed_pdlp/shard.cuh | 30 ++++++++++++++++--------- 1 file changed, 20 insertions(+), 10 deletions(-) diff --git a/cpp/src/pdlp/distributed_pdlp/shard.cuh 
b/cpp/src/pdlp/distributed_pdlp/shard.cuh index 30449e04a0..6e4f7eabae 100644 --- a/cpp/src/pdlp/distributed_pdlp/shard.cuh +++ b/cpp/src/pdlp/distributed_pdlp/shard.cuh @@ -1,19 +1,12 @@ -template -struct pdlp_shard_t { - size_t rank; - comm_planner_t x_plan; - comm_planner_t y_plan; -}; - -template +template struct comm_planner_t { // The indices of the data we have to send to the others // Maybe could merge evrything if it gives a speedup but a bit harder to read - std::vector> send_indices_per_peer; + std::vector> send_indices_per_peer; std::vector nb_elt_send_per_peer; std::vector> send_buf_per_peer; @@ -21,4 +14,21 @@ struct comm_planner_t { std::vector offset_per_peer; std::vector nb_elt_recv_per_peer; rmm::device_uvector full_local; // The full var/cstr vector containing all local data then all remote data -}; \ No newline at end of file +}; + +template +struct pdlp_shard_t { + + // Local per-rank PDLP data + raft::handle_t handle; // owned: the actual handle for this shard's device/stream + problem_t local_problem; // owned: holds handle_ptr = &handle (back-ref) + saddle_point_state_t saddle_point; // owned: per-iter state, sized to local + cusparse_view_t cusparse_view; // owned: descriptors bound to local_problem + saddle_point + + // Specific multi-GPU data + int device_id; + ncclComm_t comm; + comm_planner_t x_plan, y_plan; +}; + + From 2037eca41a05ac925d36bd1482c3a1e29b525b49 Mon Sep 17 00:00:00 2001 From: Bulle Mostovoi Date: Sun, 10 May 2026 18:39:30 +0200 Subject: [PATCH 04/67] wip --- cpp/src/pdlp/distributed_pdlp/partition.cu | 24 ++++++++++++++++ cpp/src/pdlp/distributed_pdlp/partition.cuh | 32 ++++++++++++++------- cpp/src/pdlp/distributed_pdlp/shard.cu | 2 +- cpp/src/pdlp/distributed_pdlp/shard.cuh | 32 +++++++++++++++++---- 4 files changed, 73 insertions(+), 17 deletions(-) create mode 100644 cpp/src/pdlp/distributed_pdlp/partition.cu diff --git a/cpp/src/pdlp/distributed_pdlp/partition.cu b/cpp/src/pdlp/distributed_pdlp/partition.cu new file 
mode 100644 index 0000000000..3410b74fd1 --- /dev/null +++ b/cpp/src/pdlp/distributed_pdlp/partition.cu @@ -0,0 +1,24 @@ + +namespace cuopt::linear_programming::detail { + +template +partition_t::partition_t(const std::string& partition_file){ + +} + +template +partition_t::partition_t(const problem_t& op_problem) +{ + std::cout << "NOT IMPLEMENTED" << std::endl; + return; // TODO: Implement +} + +template +void export_to_file(const std::string& partition_file) const{ + std::cout << "NOT IMPLEMENTED" << std::endl; + return; // TODO: Implement +} + + + +} \ No newline at end of file diff --git a/cpp/src/pdlp/distributed_pdlp/partition.cuh b/cpp/src/pdlp/distributed_pdlp/partition.cuh index 38457029be..a5b5175105 100644 --- a/cpp/src/pdlp/distributed_pdlp/partition.cuh +++ b/cpp/src/pdlp/distributed_pdlp/partition.cuh @@ -4,26 +4,36 @@ namespace cuopt::linear_programming::detail { +template +struct rank_data_t { + // === Ownership === + std::vector owned_var_indices; // global indices of variables in S_r + std::vector owned_constr_indices; // global indices of constraints in T_r + // === Send plan: per peer, LOCAL positions to gather + send === + std::vector> y_send_per_peer; // [peer] -> local positions in T_r to send + std::vector> x_send_per_peer; // [peer] -> local positions in S_r to send + // === Recv plan: per peer, contiguous slot in halo region === + std::vector y_recv_counts; // [peer] -> count + std::vector y_recv_offsets; // [peer] -> offset in dual halo region + std::vector x_recv_counts; + std::vector x_recv_offsets; +}; + + template class partition_t { - public: - partition_t(const std::string& partition_file); + public: + // not sure, good luck hihi + partition_t(std::vector parts, std::vector A_row_offsets, std::vector A_indices, std::vector A_t_row_offsets, std::vector A_t_indices, ); partition_t(const problem_t& op_problem); - + void export_to_file(const std::string& partition_file) const; size_t nb_parts; std::vector raw_parts; std::vector 
cstr_parts; std::vector var_parts; - std::vector> owned_cstr_per_part; - std::vector> owned_var_per_part; - std::vector> needed_cstr_per_part; - std::vector> needed_var_per_part; - std::vector>> sent_cstr_per_part; - std::vector>> sent_var_per_part; - std::vector>> received_cstr_per_part; - std::vector>> received_var_per_part; + std::vector> rank_data; // [rank] -> partition data for this rank private: void fill_data(); diff --git a/cpp/src/pdlp/distributed_pdlp/shard.cu b/cpp/src/pdlp/distributed_pdlp/shard.cu index 43a4526c29..6de93ad3b8 100644 --- a/cpp/src/pdlp/distributed_pdlp/shard.cu +++ b/cpp/src/pdlp/distributed_pdlp/shard.cu @@ -17,7 +17,7 @@ void pre_SpMV_communication(bool is_A_x){ } } // Will merge them if it works - ncclgroupstart() + ncclgroupstart(); // Send all the data current shard has to send for (auto& shard: shards){ comm_planner_t& plan = is_A_x ? shard.x_plan : shard.y_plan; diff --git a/cpp/src/pdlp/distributed_pdlp/shard.cuh b/cpp/src/pdlp/distributed_pdlp/shard.cuh index 6e4f7eabae..127cc496f1 100644 --- a/cpp/src/pdlp/distributed_pdlp/shard.cuh +++ b/cpp/src/pdlp/distributed_pdlp/shard.cuh @@ -3,7 +3,6 @@ template struct comm_planner_t { - // The indices of the data we have to send to the others // Maybe could merge evrything if it gives a speedup but a bit harder to read std::vector> send_indices_per_peer; @@ -19,16 +18,39 @@ struct comm_planner_t { template struct pdlp_shard_t { + // Specific multi-GPU data + int device_id; + ncclComm_t comm; + comm_planner_t x_plan, y_plan; + // Local per-rank PDLP data raft::handle_t handle; // owned: the actual handle for this shard's device/stream problem_t local_problem; // owned: holds handle_ptr = &handle (back-ref) saddle_point_state_t saddle_point; // owned: per-iter state, sized to local cusparse_view_t cusparse_view; // owned: descriptors bound to local_problem + saddle_point - // Specific multi-GPU data - int device_id; - ncclComm_t comm; - comm_planner_t x_plan, y_plan; + 
rmm::device_uvector tmp_primal; + rmm::device_uvector tmp_dual; + rmm::device_uvector potential_next_primal; + rmm::device_uvector potential_next_dual; + rmm::device_uvector dual_slack; + rmm::device_uvector reflected_primal; // x, so it has primal_size + halo + rmm::device_uvector reflected_dual; // y, so it has dual_size + halo + + rmm::device_scalar reusable_one; // = 1.0 + rmm::device_scalar reusable_zero; // = 0.0 + rmm::device_scalar reusable_neg_one; // = -1.0 + + // ===== Missing for cuPDLP+ Halpern update ===== + rmm::device_uvector initial_primal; // snapshot at start of restart epoch + rmm::device_uvector initial_dual; + + i_t primal_size_h; + i_t dual_size_h; + i_t primal_halo_size; + i_t dual_halo_size; + i_t full_primal_size_h;// = primal_size_h + primal_halo_size + i_t full_dual_size_h; // = dual_size_h + dual_halo_size }; From 0f62eff269ce7ab5c7f5b2141c5178abeb61ec2c Mon Sep 17 00:00:00 2001 From: Bulle Mostovoi Date: Mon, 18 May 2026 18:06:27 +0200 Subject: [PATCH 05/67] added a bit of skeleton. 
Forward declared pdlp_solver in shard.hpp, the cycle seems to be fixed, cuopt compiles --- .../pdlp/distributed_pdlp/communicator.cuh | 0 .../distributed_pdlp/multi_gpu_engine.hpp | 14 +++++ cpp/src/pdlp/distributed_pdlp/partition.cu | 24 -------- cpp/src/pdlp/distributed_pdlp/partition.cuh | 43 -------------- .../distributed_pdlp/partition_loader.hpp | 2 + cpp/src/pdlp/distributed_pdlp/rank_data.hpp | 52 +++++++++++++++++ cpp/src/pdlp/distributed_pdlp/shard.cu | 54 ++++++------------ cpp/src/pdlp/distributed_pdlp/shard.cuh | 56 ------------------- cpp/src/pdlp/distributed_pdlp/shard.hpp | 31 ++++++++++ cpp/src/pdlp/pdlp.cuh | 4 ++ 10 files changed, 119 insertions(+), 161 deletions(-) delete mode 100644 cpp/src/pdlp/distributed_pdlp/communicator.cuh create mode 100644 cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.hpp delete mode 100644 cpp/src/pdlp/distributed_pdlp/partition.cu delete mode 100644 cpp/src/pdlp/distributed_pdlp/partition.cuh create mode 100644 cpp/src/pdlp/distributed_pdlp/partition_loader.hpp create mode 100644 cpp/src/pdlp/distributed_pdlp/rank_data.hpp delete mode 100644 cpp/src/pdlp/distributed_pdlp/shard.cuh create mode 100644 cpp/src/pdlp/distributed_pdlp/shard.hpp diff --git a/cpp/src/pdlp/distributed_pdlp/communicator.cuh b/cpp/src/pdlp/distributed_pdlp/communicator.cuh deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.hpp b/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.hpp new file mode 100644 index 0000000000..13ded70009 --- /dev/null +++ b/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.hpp @@ -0,0 +1,14 @@ +#pragma once + +#include + +#include + +namespace cuopt::linear_programming::detail { + +template +struct multi_gpu_engine_t { + std::vector> shards; +}; + +} // namespace cuopt::linear_programming::detail diff --git a/cpp/src/pdlp/distributed_pdlp/partition.cu b/cpp/src/pdlp/distributed_pdlp/partition.cu deleted file mode 100644 index 3410b74fd1..0000000000 --- 
a/cpp/src/pdlp/distributed_pdlp/partition.cu +++ /dev/null @@ -1,24 +0,0 @@ - -namespace cuopt::linear_programming::detail { - -template -partition_t::partition_t(const std::string& partition_file){ - -} - -template -partition_t::partition_t(const problem_t& op_problem) -{ - std::cout << "NOT IMPLEMENTED" << std::endl; - return; // TODO: Implement -} - -template -void export_to_file(const std::string& partition_file) const{ - std::cout << "NOT IMPLEMENTED" << std::endl; - return; // TODO: Implement -} - - - -} \ No newline at end of file diff --git a/cpp/src/pdlp/distributed_pdlp/partition.cuh b/cpp/src/pdlp/distributed_pdlp/partition.cuh deleted file mode 100644 index a5b5175105..0000000000 --- a/cpp/src/pdlp/distributed_pdlp/partition.cuh +++ /dev/null @@ -1,43 +0,0 @@ - - - -namespace cuopt::linear_programming::detail { - - -template -struct rank_data_t { - // === Ownership === - std::vector owned_var_indices; // global indices of variables in S_r - std::vector owned_constr_indices; // global indices of constraints in T_r - // === Send plan: per peer, LOCAL positions to gather + send === - std::vector> y_send_per_peer; // [peer] -> local positions in T_r to send - std::vector> x_send_per_peer; // [peer] -> local positions in S_r to send - // === Recv plan: per peer, contiguous slot in halo region === - std::vector y_recv_counts; // [peer] -> count - std::vector y_recv_offsets; // [peer] -> offset in dual halo region - std::vector x_recv_counts; - std::vector x_recv_offsets; -}; - - -template -class partition_t { - public: - // not sure, good luck hihi - partition_t(std::vector parts, std::vector A_row_offsets, std::vector A_indices, std::vector A_t_row_offsets, std::vector A_t_indices, ); - partition_t(const problem_t& op_problem); - void export_to_file(const std::string& partition_file) const; - - size_t nb_parts; - - std::vector raw_parts; - std::vector cstr_parts; - std::vector var_parts; - std::vector> rank_data; // [rank] -> partition data for this rank - - 
private: - void fill_data(); - void validate() const; - -}; -} // namespace cuopt::linear_programming::detail \ No newline at end of file diff --git a/cpp/src/pdlp/distributed_pdlp/partition_loader.hpp b/cpp/src/pdlp/distributed_pdlp/partition_loader.hpp new file mode 100644 index 0000000000..139597f9cb --- /dev/null +++ b/cpp/src/pdlp/distributed_pdlp/partition_loader.hpp @@ -0,0 +1,2 @@ + + diff --git a/cpp/src/pdlp/distributed_pdlp/rank_data.hpp b/cpp/src/pdlp/distributed_pdlp/rank_data.hpp new file mode 100644 index 0000000000..ee107f5cf1 --- /dev/null +++ b/cpp/src/pdlp/distributed_pdlp/rank_data.hpp @@ -0,0 +1,52 @@ +#pragma once + +#include +#include + +namespace cuopt::linear_programming::detail { +template +struct rank_data_t { + rank_data_t(std::size_t nb_parts) + : var_send_per_peer(nb_parts), + cstr_send_per_peer(nb_parts), + var_recv_counts(nb_parts, 0), + var_recv_offsets(nb_parts, 0), + cstr_recv_counts(nb_parts, 0), + cstr_recv_offsets(nb_parts, 0) {} + + i_t owned_var_size{0}; + i_t total_var_size{0}; + i_t owned_cstr_size{0}; + i_t total_cstr_size{0}; + + // === Ownership === + std::vector owned_var_indices; + std::vector owned_cstr_indices; + + // === Send plan: per peer, indices to gather + send === + std::vector> var_send_per_peer; + std::vector> cstr_send_per_peer; + + // === Recv plan: per peer, contiguous slot in halo region === + std::vector var_recv_counts; + std::vector var_recv_offsets; + std::vector cstr_recv_counts; + std::vector cstr_recv_offsets; + + // === Mappings === + std::unordered_map global_to_local_var; + std::unordered_map global_to_local_cstr; + std::vector local_to_global_var; + std::vector local_to_global_cstr; + + // === Local host CSR matrices === + // A + std::vector h_A_row_offsets; + std::vector h_A_col_indices; + std::vector h_A_values; + // A_t + std::vector h_A_t_row_offsets; + std::vector h_A_t_col_indices; + std::vector h_A_t_values; + }; +} // namespace cuopt::linear_programming::detail \ No newline at end of 
file diff --git a/cpp/src/pdlp/distributed_pdlp/shard.cu b/cpp/src/pdlp/distributed_pdlp/shard.cu index 6de93ad3b8..b7e176c3ee 100644 --- a/cpp/src/pdlp/distributed_pdlp/shard.cu +++ b/cpp/src/pdlp/distributed_pdlp/shard.cu @@ -1,41 +1,19 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + */ +#include +#include +namespace cuopt::linear_programming::detail { +// This must be done in .cu file because the pdlp_solver_t is not already complete in the hpp file +template +pdlp_shard_t::~pdlp_shard_t() = default; -void pre_SpMV_communication(bool is_A_x){ - // Prepare the send_buffers - for (auto& shard: shards){ - comm_planner_t& plan = is_A_x ? shard.x_plan : shard.y_plan; - raft::device_setter guard(shard.device_id); - for (size_t peer = 0; peer < partition.nb_parts; peer++){ - if (peer == shard.rank) continue; - thrust::gather( - shard.handle.get_thrust_policy(), // TODO what exactly do we put here - plan.send_indices_per_peer[peer].begin(), - plan.send_indices_per_peer[peer].end(), - plan.full_local.begin(), - plan.send_buf_per_peer[peer].begin()); - } - } - // Will merge them if it works - ncclgroupstart(); - // Send all the data current shard has to send - for (auto& shard: shards){ - comm_planner_t& plan = is_A_x ? shard.x_plan : shard.y_plan; - raft::device_setter guard(shard.device_id); - for (size_t peer = 0; peer < partition.nb_parts; peer++){ - if (peer == shard.rank) continue; - ncclSend(plan.send_buf_per_peer[peer].data(), plan.nb_elt_send_per_peer[peer], peer) - } - } - // Receive all the data current shard has to receive - for (auto& shard: shards){ - comm_planner_t& plan = is_A_x ? 
shard.x_plan : shard.y_plan; - raft::device_setter guard(shard.device_id); - for (size_t peer = 0; peer < partition.nb_parts; peer++){ - if (peer == shard.rank) continue; - f_t* recv_buff = &plan.full_local[offset_per_peer[peer]]; - ncclRecv(recv_buff, plan.nb_elt_recv_per_peer[peer], peer); - } - } - ncclgroupend() -} \ No newline at end of file + + + +template struct pdlp_shard_t; +//template struct pdlp_shard_t; +} // namespace cuopt::linear_programming::detail diff --git a/cpp/src/pdlp/distributed_pdlp/shard.cuh b/cpp/src/pdlp/distributed_pdlp/shard.cuh deleted file mode 100644 index 127cc496f1..0000000000 --- a/cpp/src/pdlp/distributed_pdlp/shard.cuh +++ /dev/null @@ -1,56 +0,0 @@ - - - -template -struct comm_planner_t { - // The indices of the data we have to send to the others - // Maybe could merge evrything if it gives a speedup but a bit harder to read - std::vector> send_indices_per_peer; - std::vector nb_elt_send_per_peer; - std::vector> send_buf_per_peer; - - // Where to start writing in full_local for each peer - std::vector offset_per_peer; - std::vector nb_elt_recv_per_peer; - rmm::device_uvector full_local; // The full var/cstr vector containing all local data then all remote data -}; - -template -struct pdlp_shard_t { - - // Specific multi-GPU data - int device_id; - ncclComm_t comm; - comm_planner_t x_plan, y_plan; - - // Local per-rank PDLP data - raft::handle_t handle; // owned: the actual handle for this shard's device/stream - problem_t local_problem; // owned: holds handle_ptr = &handle (back-ref) - saddle_point_state_t saddle_point; // owned: per-iter state, sized to local - cusparse_view_t cusparse_view; // owned: descriptors bound to local_problem + saddle_point - - rmm::device_uvector tmp_primal; - rmm::device_uvector tmp_dual; - rmm::device_uvector potential_next_primal; - rmm::device_uvector potential_next_dual; - rmm::device_uvector dual_slack; - rmm::device_uvector reflected_primal; // x, so it has primal_size + halo - 
rmm::device_uvector reflected_dual; // y, so it has dual_size + halo - - rmm::device_scalar reusable_one; // = 1.0 - rmm::device_scalar reusable_zero; // = 0.0 - rmm::device_scalar reusable_neg_one; // = -1.0 - - // ===== Missing for cuPDLP+ Halpern update ===== - rmm::device_uvector initial_primal; // snapshot at start of restart epoch - rmm::device_uvector initial_dual; - - i_t primal_size_h; - i_t dual_size_h; - i_t primal_halo_size; - i_t dual_halo_size; - i_t full_primal_size_h;// = primal_size_h + primal_halo_size - i_t full_dual_size_h; // = dual_size_h + dual_halo_size -}; - - diff --git a/cpp/src/pdlp/distributed_pdlp/shard.hpp b/cpp/src/pdlp/distributed_pdlp/shard.hpp new file mode 100644 index 0000000000..0fe57be974 --- /dev/null +++ b/cpp/src/pdlp/distributed_pdlp/shard.hpp @@ -0,0 +1,31 @@ +#pragma once +#include +#include +#include +#include +namespace cuopt::linear_programming::detail { + +template +class pdlp_solver_t; + +template +class pdlp_shard_t { + // Declaration only, will be set as default in shard.cu . Needed to manage cyclic include of pdlp_solver_t. + public: + ~pdlp_shard_t(); + pdlp_shard_t(int device_id, + rank_data_t&& rd, + ncclComm_t comm + /* ???????? */); + + pdlp_shard_t(const pdlp_shard_t&) = delete; + pdlp_shard_t& operator=(const pdlp_shard_t&) = delete; // Specific multi-GPU data + int device_id; + raft::handle_t handle; + ncclComm_t comm; + rank_data_t rank_data; + + std::unique_ptr> sub_pdlp; +}; + +} diff --git a/cpp/src/pdlp/pdlp.cuh b/cpp/src/pdlp/pdlp.cuh index d03430f150..5cb267730f 100644 --- a/cpp/src/pdlp/pdlp.cuh +++ b/cpp/src/pdlp/pdlp.cuh @@ -12,6 +12,7 @@ #include #include +#include #include #include #include @@ -32,6 +33,7 @@ #include #include +#include "distributed_pdlp/multi_gpu_engine.hpp" namespace cuopt::linear_programming::detail { /** @@ -237,6 +239,8 @@ class pdlp_solver_t { primal_quality_adapter_t best_primal_quality_so_far_; // Flag to indicate if solver is being called from MIP. 
No logging is done in this case. bool inside_mip_{false}; + + multi_gpu_engine_t multi_gpu_engine; }; } // namespace cuopt::linear_programming::detail From d89c85a9af1303ae12641a868a9cb83d64c32aee Mon Sep 17 00:00:00 2001 From: Bulle Mostovoi Date: Tue, 19 May 2026 13:49:37 +0200 Subject: [PATCH 06/67] still wip but going well --- .../pdlp/pdlp_hyper_params.cuh | 1 + cpp/src/pdlp/CMakeLists.txt | 3 + .../pdlp/distributed_pdlp/multi_gpu_engine.cu | 73 +++++++ .../distributed_pdlp/multi_gpu_engine.hpp | 61 ++++-- .../pdlp/distributed_pdlp/partition_loader.cu | 178 ++++++++++++++++++ .../distributed_pdlp/partition_loader.hpp | 14 ++ cpp/src/pdlp/distributed_pdlp/shard.cu | 101 +++++++++- cpp/src/pdlp/distributed_pdlp/shard.hpp | 21 ++- cpp/src/pdlp/pdlp.cu | 97 +++++++++- cpp/src/pdlp/pdlp.cuh | 10 +- cpp/src/pdlp/solve.cu | 11 ++ 11 files changed, 551 insertions(+), 19 deletions(-) create mode 100644 cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.cu create mode 100644 cpp/src/pdlp/distributed_pdlp/partition_loader.cu diff --git a/cpp/include/cuopt/linear_programming/pdlp/pdlp_hyper_params.cuh b/cpp/include/cuopt/linear_programming/pdlp/pdlp_hyper_params.cuh index 282e91d7ef..962f06ee4a 100644 --- a/cpp/include/cuopt/linear_programming/pdlp/pdlp_hyper_params.cuh +++ b/cpp/include/cuopt/linear_programming/pdlp/pdlp_hyper_params.cuh @@ -47,6 +47,7 @@ struct pdlp_hyper_params_t { bool bound_objective_rescaling = true; bool use_reflected_primal_dual = true; bool use_fixed_point_error = true; + bool use_distributed_pdlp = false; double reflection_coefficient = 1.0; double restart_k_p = 0.99; double restart_k_i = 0.01; diff --git a/cpp/src/pdlp/CMakeLists.txt b/cpp/src/pdlp/CMakeLists.txt index f5f26837b6..2bc2771c91 100644 --- a/cpp/src/pdlp/CMakeLists.txt +++ b/cpp/src/pdlp/CMakeLists.txt @@ -29,6 +29,9 @@ set(LP_CORE_FILES ${CMAKE_CURRENT_SOURCE_DIR}/termination_strategy/convergence_information.cu 
${CMAKE_CURRENT_SOURCE_DIR}/optimal_batch_size_handler/optimal_batch_size_handler.cu ${CMAKE_CURRENT_SOURCE_DIR}/utilities/ping_pong_graph.cu + ${CMAKE_CURRENT_SOURCE_DIR}/distributed_pdlp/shard.cu + ${CMAKE_CURRENT_SOURCE_DIR}/distributed_pdlp/multi_gpu_engine.cu + ${CMAKE_CURRENT_SOURCE_DIR}/distributed_pdlp/partition_loader.cu ) # C and Python adapter files diff --git a/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.cu b/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.cu new file mode 100644 index 0000000000..c7307c46ee --- /dev/null +++ b/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.cu @@ -0,0 +1,73 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + */ + + #include + + #include + + #include + + #include + + #include + + namespace cuopt::linear_programming::detail { + + template + multi_gpu_engine_t::multi_gpu_engine_t( + std::vector>&& rank_data, + std::vector const& h_global_obj, + std::vector const& h_global_var_lower, + std::vector const& h_global_var_upper, + std::vector const& h_global_cstr_lower, + std::vector const& h_global_cstr_upper, + bool maximize, + f_t objective_offset, + f_t objective_scaling_factor, + pdlp_solver_settings_t const& sub_solver_settings) + : stream() + { + const int nb_parts = static_cast(rank_data.size()); + cuopt_expects(nb_parts > 0, + error_type_t::ValidationError, + "multi_gpu_engine_t: rank_data must be non-empty"); + + shards.reserve(nb_parts); + + // 1:1 rank -> device mapping. (Matches metis_tests; refine later if needed.) + std::vector devices(nb_parts); + std::iota(devices.begin(), devices.end(), 0); + + // 2. Collectively bootstrap NCCL communicators across all devices. + // Must be done together; each comm is then handed to one shard, + // which wraps it in a unique_ptr with the device-aware deleter. 
+ std::vector raw_comms(nb_parts); + cuopt_expects(ncclCommInitAll(raw_comms.data(), nb_parts, devices.data()) == ncclSuccess, + error_type_t::RuntimeError, + "ncclCommInitAll failed"); + + // 3. Construct one shard per rank, pinned to its device. + for (int r = 0; r < nb_parts; ++r) { + raft::device_setter guard(devices[r]); // shard ctor asserts current device + shards.emplace_back(std::make_unique>( + devices[r], + std::move(rank_data[r]), + raw_comms[r], + h_global_obj, + h_global_var_lower, + h_global_var_upper, + h_global_cstr_lower, + h_global_cstr_upper, + maximize, + objective_offset, + objective_scaling_factor, + sub_solver_settings)); + } + } + + template struct multi_gpu_engine_t; + // template struct multi_gpu_engine_t; + + } // namespace cuopt::linear_programming::detail \ No newline at end of file diff --git a/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.hpp b/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.hpp index 13ded70009..6142c938e3 100644 --- a/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.hpp +++ b/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.hpp @@ -1,14 +1,49 @@ -#pragma once +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + */ + #pragma once -#include - -#include - -namespace cuopt::linear_programming::detail { - -template -struct multi_gpu_engine_t { - std::vector> shards; -}; - -} // namespace cuopt::linear_programming::detail + #include + #include + + #include + + #include + + #include + #include + + namespace cuopt::linear_programming::detail { + + template + struct multi_gpu_engine_t { + // Constructs one shard per partition. Caller is responsible for: + // - rank_data[i] being correctly populated for rank i + // - the host vectors holding the (already scaled) global problem data + // - sub_solver_settings being the per-shard PDLP config (num_gpus=1, + // multi_gpu_partition_file="", scaling disabled). 
+ multi_gpu_engine_t( + std::vector>&& rank_data, + std::vector const& h_global_obj, + std::vector const& h_global_var_lower, + std::vector const& h_global_var_upper, + std::vector const& h_global_cstr_lower, + std::vector const& h_global_cstr_upper, + bool maximize, + f_t objective_offset, + f_t objective_scaling_factor, + pdlp_solver_settings_t const& sub_solver_settings); + + multi_gpu_engine_t(const multi_gpu_engine_t&) = delete; + multi_gpu_engine_t& operator=(const multi_gpu_engine_t&) = delete; + + // Engine-level stream for fork/join orchestration (master side). + rmm::cuda_stream stream; + + // Shards stored by unique_ptr because pdlp_shard_t is immovable + // (owns device-affine resources: handle, NCCL comm, RMM buffers). + std::vector>> shards; + }; + + } // namespace cuopt::linear_programming::detail \ No newline at end of file diff --git a/cpp/src/pdlp/distributed_pdlp/partition_loader.cu b/cpp/src/pdlp/distributed_pdlp/partition_loader.cu new file mode 100644 index 0000000000..449e8640ab --- /dev/null +++ b/cpp/src/pdlp/distributed_pdlp/partition_loader.cu @@ -0,0 +1,178 @@ +static std::vector parse_distributed_pdlp_partition_file(std::string file){ + //returns a vector with all the values separated by a \n +} + +std::vector create_rank_data_from_parts(const std::vector& parts, + const std::vector& A_row_offsets, + const std::vector& A_col_indices, + const std::vector& A_values, + const std::vector& A_t_row_offsets, + const std::vector& A_t_col_indices, + const std::vector& A_t_values, + i_t nb_parts, + i_t nb_cstr, + i_t nb_vars, + i_t nnz) +{ +std::vector rank_data(nb_parts, rank_data_t(nb_parts)); +std::vector cstr_parts(parts.begin(), parts.begin() + nb_cstr); +std::vector var_parts(parts.begin() + nb_cstr, parts.begin() + nb_cstr + nb_vars); + +// 1. 
Compute ownership +for (i_t i = 0; i < nb_cstr; i++) { +rank_data[cstr_parts[i]].owned_cstr_indices.push_back(i); +} +for (i_t i = 0; i < nb_vars; i++) { +rank_data[var_parts[i]].owned_var_indices.push_back(i); +} + +// 2. Compute local matrices and rank_data +for (i_t rank = 0; rank < nb_parts; rank++) { +auto& rd = rank_data[rank]; +rd.owned_var_size = rd.owned_var_indices.size(); +rd.owned_cstr_size = rd.owned_cstr_indices.size(); +// ---- A side ---- +std::vector local_A_row_offsets; +std::vector local_A_col_indices; +std::vector local_A_values; + +i_t local_A_nnz = 0; +local_A_row_offsets.push_back(local_A_nnz); + +// For each owned constraint, build local matrix A +for (auto owned_cstr : rd.owned_cstr_indices) { +i_t cstr_len = A_row_offsets[owned_cstr + 1] - A_row_offsets[owned_cstr]; +i_t row_start = A_row_offsets[owned_cstr]; +for (i_t v = 0; v < cstr_len; v++) { +local_A_col_indices.push_back(A_col_indices[row_start + v]); +local_A_values.push_back(A_values[row_start + v]); +} +local_A_nnz += cstr_len; +local_A_row_offsets.push_back(local_A_nnz); +} + +std::set needed_vars; +for (auto indice : local_A_col_indices) { +if (var_parts[indice] != rank) +needed_vars.insert(indice); +} + +for (i_t peer = 0; peer < nb_parts; peer++) { +std::vector needed_var_from_peer; +for (auto needed_var : needed_vars) { +if (var_parts[needed_var] == peer) +needed_var_from_peer.push_back(needed_var); +} +i_t nb_recv_from_peer = needed_var_from_peer.size(); +rd.var_recv_counts[peer] = nb_recv_from_peer; +rd.var_recv_offsets[peer] = +peer == 0 +? 
0 +: rd.var_recv_offsets[peer - 1] + rd.var_recv_counts[peer - 1]; +rank_data[peer].var_send_per_peer[rank] = std::move(needed_var_from_peer); +} + +rd.h_A_row_offsets = std::move(local_A_row_offsets); +rd.h_A_col_indices = std::move(local_A_col_indices); +rd.h_A_values = std::move(local_A_values); + +// ---- A_t side ---- +std::vector local_A_t_row_offsets; +std::vector local_A_t_col_indices; +std::vector local_A_t_values; +i_t local_A_t_nnz = 0; +local_A_t_row_offsets.push_back(local_A_t_nnz); + +for (auto owned_var : rd.owned_var_indices) { +i_t var_len = A_t_row_offsets[owned_var + 1] - A_t_row_offsets[owned_var]; +i_t row_start = A_t_row_offsets[owned_var]; +for (i_t v = 0; v < var_len; v++) { +local_A_t_col_indices.push_back(A_t_col_indices[row_start + v]); +local_A_t_values.push_back(A_t_values[row_start + v]); +} +local_A_t_nnz += var_len; +local_A_t_row_offsets.push_back(local_A_t_nnz); +} + +std::set needed_cstrs; +for (auto indice : local_A_t_col_indices) { +if (cstr_parts[indice] != rank) +needed_cstrs.insert(indice); +} + +for (i_t peer = 0; peer < nb_parts; peer++) { +std::vector needed_cstr_from_peer; +for (auto needed_cstr : needed_cstrs) { +if (cstr_parts[needed_cstr] == peer) +needed_cstr_from_peer.push_back(needed_cstr); +} +i_t nb_recv_from_peer = needed_cstr_from_peer.size(); +rd.cstr_recv_counts[peer] = nb_recv_from_peer; +rd.cstr_recv_offsets[peer] = +peer == 0 +? 0 +: rd.cstr_recv_offsets[peer - 1] + rd.cstr_recv_counts[peer - 1]; +rank_data[peer].cstr_send_per_peer[rank] = std::move(needed_cstr_from_peer); +} + +rd.h_A_t_row_offsets = std::move(local_A_t_row_offsets); +rd.h_A_t_col_indices = std::move(local_A_t_col_indices); +rd.h_A_t_values = std::move(local_A_t_values); + +rd.total_var_size = rd.owned_var_size + needed_vars.size(); +rd.total_cstr_size = rd.owned_cstr_size + needed_cstrs.size(); +} + +// 3. 
Generate local indices for contiguous [[self], [peer1], ..., [peer_k]] +// Build scatter_gather_maps +for (i_t rank = 0; rank < nb_parts; rank++) { +auto& rd = rank_data[rank]; + +i_t curr_id = 0; +for (auto owned_cstr : rd.owned_cstr_indices) { +rd.global_to_local_cstr[owned_cstr] = curr_id; +rd.local_to_global_cstr.push_back(owned_cstr); +curr_id++; +} +for (i_t peer = 0; peer < nb_parts; peer++) { +if (peer == rank) continue; +for (auto recv_cstr : rank_data[peer].cstr_send_per_peer[rank]) { +rd.global_to_local_cstr[recv_cstr] = curr_id; +// rd.local_to_global_cstr.push_back(recv_cstr); // Not needed, we only do local_to_global on owned side +curr_id++; +} +} + +curr_id = 0; +for (auto owned_var : rd.owned_var_indices) { +rd.global_to_local_var[owned_var] = curr_id; +rd.local_to_global_var.push_back(owned_var); +curr_id++; +} +for (i_t peer = 0; peer < nb_parts; peer++) { +if (peer == rank) continue; +for (auto recv_var : rank_data[peer].var_send_per_peer[rank]) { +rd.global_to_local_var[recv_var] = curr_id; +// rd.local_to_global_var.push_back(recv_var); // same as over +curr_id++; +} +} +} + +// 4. 
Remap global -> local everywhere +for (i_t rank = 0; rank < nb_parts; rank++) { +auto& rd = rank_data[rank]; + +for (auto& send_vec : rd.var_send_per_peer) { +for (auto& v : send_vec) v = rd.global_to_local_var.at(v); +} +for (auto& send_vec : rd.cstr_send_per_peer) { +for (auto& v : send_vec) v = rd.global_to_local_cstr.at(v); +} + +for (auto& v : rd.h_A_col_indices) v = rd.global_to_local_var.at(v); +for (auto& v : rd.h_A_t_col_indices) v = rd.global_to_local_cstr.at(v); +} + +return rank_data; +} diff --git a/cpp/src/pdlp/distributed_pdlp/partition_loader.hpp b/cpp/src/pdlp/distributed_pdlp/partition_loader.hpp index 139597f9cb..4d66d4445c 100644 --- a/cpp/src/pdlp/distributed_pdlp/partition_loader.hpp +++ b/cpp/src/pdlp/distributed_pdlp/partition_loader.hpp @@ -1,2 +1,16 @@ +partition_loader_t { + static std::vector parse_distributed_pdlp_partition_file(std::string file); + std::vector create_rank_data_from_parts(const std::vector& parts, + const std::vector& A_row_offsets, + const std::vector& A_col_indices, + const std::vector& A_values, + const std::vector& A_t_row_offsets, + const std::vector& A_t_col_indices, + const std::vector& A_t_values, + i_t nb_parts, + i_t nb_cstr, + i_t nb_vars, + i_t nnz); +} \ No newline at end of file diff --git a/cpp/src/pdlp/distributed_pdlp/shard.cu b/cpp/src/pdlp/distributed_pdlp/shard.cu index b7e176c3ee..d5e795bb61 100644 --- a/cpp/src/pdlp/distributed_pdlp/shard.cu +++ b/cpp/src/pdlp/distributed_pdlp/shard.cu @@ -5,15 +5,114 @@ #include #include + +#include +#include + +#include +#include + namespace cuopt::linear_programming::detail { // This must be done in .cu file because the pdlp_solver_t is not already complete in the hpp file template pdlp_shard_t::~pdlp_shard_t() = default; +template +pdlp_shard_t::pdlp_shard_t( + int device_id, + rank_data_t&& rd, + ncclComm_t raw_comm, + std::vector const& h_global_obj, + std::vector const& h_global_var_lower, + std::vector const& h_global_var_upper, + std::vector const& 
h_global_cstr_lower, + std::vector const& h_global_cstr_upper, + bool maximize, + f_t objective_offset, + f_t objective_scaling_factor, + pdlp_solver_settings_t const& settings) + : device_id(device_id), + stream(), + handle(stream.view()), + comm(raw_comm, nccl_comm_deleter_t{device_id}), + rank_data(std::move(rd)), + opt_problem(std::nullopt), + sub_problem(std::nullopt), + sub_pdlp(nullptr) +{ + assert(raft::device_setter::get_current_device() == device_id && "Right device must be set before building the shard"); + + // ---- 1. Gather per-shard host slices using rank_data's index maps. ---- + // All vectors are sized to TOTAL (owned + halo). Owned slots get real + // values; halo slots keep neutral defaults so they are no-ops even if + // accidentally touched before `owned_*_size_` plumbing is in place. + std::vector h_obj (rank_data.total_var_size, f_t{0}); + std::vector h_var_lower (rank_data.total_var_size, -std::numeric_limits::infinity()); + std::vector h_var_upper (rank_data.total_var_size, std::numeric_limits::infinity()); + std::vector h_cstr_lower(rank_data.total_cstr_size, -std::numeric_limits::infinity()); + std::vector h_cstr_upper(rank_data.total_cstr_size, std::numeric_limits::infinity()); + + for (i_t i = 0; i < rank_data.owned_var_size; ++i) { + const auto g = rank_data.local_to_global_var[i]; + h_obj[i] = h_global_obj[g]; + h_var_lower[i] = h_global_var_lower[g]; + h_var_upper[i] = h_global_var_upper[g]; + } + for (i_t i = 0; i < rank_data.owned_cstr_size; ++i) { + const auto g = rank_data.local_to_global_cstr[i]; + h_cstr_lower[i] = h_global_cstr_lower[g]; + h_cstr_upper[i] = h_global_cstr_upper[g]; + } + // ---- 2. Build optimization_problem_t on this shard's device. 
---- + opt_problem.emplace(&handle); + opt_problem->set_csr_constraint_matrix( + rank_data.h_A_values .data(), static_cast(rank_data.h_A_values .size()), + rank_data.h_A_col_indices.data(), static_cast(rank_data.h_A_col_indices.size()), + rank_data.h_A_row_offsets.data(), static_cast(rank_data.h_A_row_offsets.size())); + // Primal axis: TOTAL (owned + halo). Halo slots have neutral defaults. + opt_problem->set_objective_coefficients(h_obj .data(), rank_data.total_var_size); + opt_problem->set_variable_lower_bounds (h_var_lower.data(), rank_data.total_var_size); + opt_problem->set_variable_upper_bounds (h_var_upper.data(), rank_data.total_var_size); + + // Dual axis: TOTAL (owned + halo). Halo slots have ±inf so trivially satisfied. + opt_problem->set_constraint_lower_bounds(h_cstr_lower.data(), rank_data.total_cstr_size); + opt_problem->set_constraint_upper_bounds(h_cstr_upper.data(), rank_data.total_cstr_size); + + opt_problem->set_maximize(maximize); + opt_problem->set_objective_offset(objective_offset); + opt_problem->set_objective_scaling_factor(objective_scaling_factor); + opt_problem->set_problem_category(problem_category_t::LP); + + // ---- 3. Build problem_t from opt_problem. ---- + sub_problem.emplace(*opt_problem); + + // ---- 4. Override reverse_* with the real local A_T from rank_data. ---- + // problem_t's ctor computes the transpose of the LOCAL A, which is wrong + // in multi-GPU: A_local is owned_cstr x total_var, and A_t_local is the + // pre-sliced owned_var x total_cstr matrix we built during partitioning. 
+ auto stream_view = handle.get_stream(); + sub_problem->reverse_offsets .resize(rank_data.h_A_t_row_offsets.size(), stream_view); + sub_problem->reverse_constraints .resize(rank_data.h_A_t_col_indices.size(), stream_view); + sub_problem->reverse_coefficients.resize(rank_data.h_A_t_values .size(), stream_view); + raft::copy(sub_problem->reverse_offsets.data(), + rank_data.h_A_t_row_offsets.data(), + rank_data.h_A_t_row_offsets.size(), stream_view); + raft::copy(sub_problem->reverse_constraints.data(), + rank_data.h_A_t_col_indices.data(), + rank_data.h_A_t_col_indices.size(), stream_view); + raft::copy(sub_problem->reverse_coefficients.data(), + rank_data.h_A_t_values.data(), + rank_data.h_A_t_values.size(), stream_view); + handle.sync_stream(stream_view); + + // ---- 5. Build sub_pdlp (single-GPU mode; multi_gpu flags cleared by caller). ---- + sub_pdlp = std::make_unique>(*sub_problem, settings, /*batch=*/false); +} template struct pdlp_shard_t; -//template struct pdlp_shard_t; +// template struct pdlp_shard_t; + } // namespace cuopt::linear_programming::detail diff --git a/cpp/src/pdlp/distributed_pdlp/shard.hpp b/cpp/src/pdlp/distributed_pdlp/shard.hpp index 0fe57be974..7528c35dec 100644 --- a/cpp/src/pdlp/distributed_pdlp/shard.hpp +++ b/cpp/src/pdlp/distributed_pdlp/shard.hpp @@ -8,6 +8,18 @@ namespace cuopt::linear_programming::detail { template class pdlp_solver_t; +struct nccl_comm_deleter_t { + int device_id{-1}; + void operator()(ncclComm* comm) const noexcept + { + raft::device_setter guard(device_id); + if (comm != nullptr) { + ncclCommDestroy(comm); + } + } +}; +using nccl_comm_unique_ptr_t = std::unique_ptr; + template class pdlp_shard_t { // Declaration only, will be set as default in shard.cu . Needed to manage cyclic include of pdlp_solver_t. @@ -19,12 +31,15 @@ class pdlp_shard_t { /* ???????? 
*/); pdlp_shard_t(const pdlp_shard_t&) = delete; - pdlp_shard_t& operator=(const pdlp_shard_t&) = delete; // Specific multi-GPU data + pdlp_shard_t& operator=(const pdlp_shard_t&) = delete; + // Specific multi-GPU data int device_id; + rmm::cuda_stream stream; raft::handle_t handle; - ncclComm_t comm; + nccl_comm_unique_ptr_t comm; rank_data_t rank_data; - + optimization_problem_t opt_problem; + problem_t sub_problem; std::unique_ptr> sub_pdlp; }; diff --git a/cpp/src/pdlp/pdlp.cu b/cpp/src/pdlp/pdlp.cu index a759887fc5..a58ae4f210 100644 --- a/cpp/src/pdlp/pdlp.cu +++ b/cpp/src/pdlp/pdlp.cu @@ -11,12 +11,14 @@ #include #include +#include #include #include #include #include #include "cuopt/linear_programming/pdlp/solver_solution.hpp" +#include "distributed_pdlp/multi_gpu_engine.hpp" #include #include @@ -314,6 +316,95 @@ pdlp_solver_t::pdlp_solver_t(problem_t& op_problem, } } +template +pdlp_solver_t::pdlp_solver_t(problem_t& op_problem, + pdlp_solver_settings_t const& settings, + int num_gpus) + // 1. Delegate to single-GPU ctor to bring up all the per-master state + // (problem_ptr, op_problem_scaled_, pdhg_solver_, strategies, etc.). + : pdlp_solver_t(op_problem, settings, false) +{ + cuopt_expects(num_gpus == settings.num_gpus && settings.num_gpus > 1, + error_type_t::ValidationError, + "This constructor should only be used for distributed PDLP (num_gpus > 1)"); + // 2. Load partition + std::vector parts; + if (!settings.multi_gpu_partition_file.empty()) { + parts = partition_loader_t::parse_distributed_pdlp_partition_file( + settings.multi_gpu_partition_file); + } else { + cuopt_expects(false, error_type_t::NotImplemented, + "Metis partitioning inside cuopt not implemented yet; " + "provide a --parts file via settings.multi_gpu_partition_file"); + } + // 3. Scale now before copying to children + initial_scaling_strategy_.scale_problem(); + + // 4. Copy the scaled global problem from device -> host. 
+ auto const stream = op_problem_scaled_.handle_ptr->get_stream(); + i_t const n_cstr = op_problem_scaled_.n_constraints; + i_t const n_vars = op_problem_scaled_.n_variables; + i_t const nnz = op_problem_scaled_.nnz; + // CSRs (A and A_t). + std::vector h_A_row_offsets (n_cstr + 1); + std::vector h_A_col_indices (nnz); + std::vector h_A_values (nnz); + std::vector h_A_t_row_offsets(n_vars + 1); + std::vector h_A_t_col_indices(nnz); + std::vector h_A_t_values (nnz); + raft::copy(h_A_row_offsets .data(), op_problem_scaled_.offsets .data(), n_cstr + 1, stream); + raft::copy(h_A_col_indices .data(), op_problem_scaled_.variables .data(), nnz, stream); + raft::copy(h_A_values .data(), op_problem_scaled_.coefficients .data(), nnz, stream); + raft::copy(h_A_t_row_offsets.data(), op_problem_scaled_.reverse_offsets .data(), n_vars + 1, stream); + raft::copy(h_A_t_col_indices.data(), op_problem_scaled_.reverse_constraints .data(), nnz, stream); + raft::copy(h_A_t_values .data(), op_problem_scaled_.reverse_coefficients.data(), nnz, stream); + // Objective coefficients. + std::vector h_obj(n_vars); + raft::copy(h_obj.data(), op_problem_scaled_.objective_coefficients.data(), n_vars, stream); + // Variable bounds: stored interleaved as f_t2 {lower, upper}. Unpack into two host vectors. + using f_t2 = typename type_2::type; + std::vector h_var_bounds_packed(n_vars); + raft::copy(h_var_bounds_packed.data(), + op_problem_scaled_.variable_bounds.data(), n_vars, stream); + // Constraint bounds. 
+ std::vector h_cstr_lower(n_cstr); + std::vector h_cstr_upper(n_cstr); + raft::copy(h_cstr_lower.data(), op_problem_scaled_.constraint_lower_bounds.data(), n_cstr, stream); + raft::copy(h_cstr_upper.data(), op_problem_scaled_.constraint_upper_bounds.data(), n_cstr, stream); + op_problem_scaled_.handle_ptr->sync_stream(stream); + + std::vector h_var_lower(n_vars), h_var_upper(n_vars); + for (i_t i = 0; i < n_vars; ++i) { + h_var_lower[i] = h_var_bounds_packed[i].x; + h_var_upper[i] = h_var_bounds_packed[i].y; + } + // 5. Build per-rank data and meta-data + std::vector> sub_pdlp_rank_data = + partition_loader_t::create_rank_data_from_parts( + parts, + h_A_row_offsets, h_A_col_indices, h_A_values, + h_A_t_row_offsets, h_A_t_col_indices, h_A_t_values, + settings.num_gpus, n_cstr, n_vars, nnz); + // 6. Build the per-shard PDLP settings: + // - single-GPU mode (num_gpus=1, no partition file) so sub-solvers don't recurse; + // - disable scaling (master already scaled the data we're handing out). + pdlp_solver_settings_t sub_pdlp_settings = settings; + sub_pdlp_settings.num_gpus = 1; + sub_pdlp_settings.multi_gpu_partition_file = ""; + sub_pdlp_settings.hyper_params.default_l_inf_ruiz_iterations = 0; + sub_pdlp_settings.hyper_params.default_alpha_pock_chambolle_rescaling = 0.0; + + // 7. Construct the engine — this collectively bootstraps NCCL across all GPUs + // and constructs one shard per partition with the right slice of host data. 
+ multi_gpu_engine.emplace( + std::move(sub_pdlp_rank_data), + h_obj, h_var_lower, h_var_upper, h_cstr_lower, h_cstr_upper, + op_problem_scaled_.maximize, + op_problem_scaled_.objective_offset, + op_problem_scaled_.presolve_data.objective_scaling_factor, + sub_pdlp_settings); +} + template void pdlp_solver_t::set_initial_primal_weight(f_t initial_primal_weight) { @@ -2258,7 +2349,11 @@ optimization_problem_solution_t pdlp_solver_t::run_solver(co !settings_.get_initial_primal_weight().has_value()) compute_initial_primal_weight(); - initial_scaling_strategy_.scale_problem(); + // In multi-GPU mode the master scaled op_problem_scaled_ in its ctor before + // distributing data to the shards, so skip the second scaling pass here. + if (!multi_gpu_engine.has_value()) { + initial_scaling_strategy_.scale_problem(); + } // Update FP32 matrix copies for mixed precision SpMV after scaling pdhg_solver_.get_cusparse_view().update_mixed_precision_matrices(); diff --git a/cpp/src/pdlp/pdlp.cuh b/cpp/src/pdlp/pdlp.cuh index 5cb267730f..ef992d2a9e 100644 --- a/cpp/src/pdlp/pdlp.cuh +++ b/cpp/src/pdlp/pdlp.cuh @@ -63,6 +63,11 @@ class pdlp_solver_t { pdlp_solver_t(problem_t& op_problem, pdlp_solver_settings_t const& settings, bool is_batch_mode = false); + + // Distributed Solver Constructor + pdlp_solver_t(problem_t& op_problem, + pdlp_solver_settings_t const& settings, + int num_gpus); optimization_problem_solution_t run_solver(const timer_t& timer); @@ -240,7 +245,10 @@ class pdlp_solver_t { // Flag to indicate if solver is being called from MIP. No logging is done in this case. bool inside_mip_{false}; - multi_gpu_engine_t multi_gpu_engine; + // std::optional because multi_gpu_engine_t is non-default-constructible + // (collectively bootstraps NCCL, owns RMM resources). Stays nullopt in + // single-GPU mode; emplaced by the multi-GPU ctor. 
+ std::optional> multi_gpu_engine; }; } // namespace cuopt::linear_programming::detail diff --git a/cpp/src/pdlp/solve.cu b/cpp/src/pdlp/solve.cu index 59f1a4517f..6057f1cb83 100644 --- a/cpp/src/pdlp/solve.cu +++ b/cpp/src/pdlp/solve.cu @@ -709,6 +709,17 @@ static optimization_problem_solution_t run_pdlp_solver( } } #endif + if (settings.hyper_params.use_distributed_pdlp) { + cuopt_expects(settings.num_gpus > 1, + error_type_t::ValidationError, + "use_distributed_pdlp requires settings.num_gpus > 1"); + cuopt_expects(!is_batch_mode, + error_type_t::ValidationError, + "Distributed PDLP does not support batch mode"); + // Multi-GPU ctor; dispatched by 3rd-arg TYPE (int num_gpus, not bool batch). + detail::pdlp_solver_t solver(problem, settings, settings.num_gpus); + return solver.run_solver(timer); + } detail::pdlp_solver_t solver(problem, settings, is_batch_mode); if (settings.inside_mip) { solver.set_inside_mip(true); } return solver.run_solver(timer); From 5534ff049bca7c32da24fd0dc755f5c17c5a0611 Mon Sep 17 00:00:00 2001 From: Bulle Mostovoi Date: Tue, 19 May 2026 13:54:58 +0200 Subject: [PATCH 07/67] cursor broke everything grrr --- .../pdlp/distributed_pdlp/partition_loader.cu | 371 ++++++++++-------- .../distributed_pdlp/partition_loader.hpp | 45 ++- cpp/src/pdlp/distributed_pdlp/shard.hpp | 122 +++--- 3 files changed, 305 insertions(+), 233 deletions(-) diff --git a/cpp/src/pdlp/distributed_pdlp/partition_loader.cu b/cpp/src/pdlp/distributed_pdlp/partition_loader.cu index 449e8640ab..a9df158601 100644 --- a/cpp/src/pdlp/distributed_pdlp/partition_loader.cu +++ b/cpp/src/pdlp/distributed_pdlp/partition_loader.cu @@ -1,178 +1,201 @@ -static std::vector parse_distributed_pdlp_partition_file(std::string file){ - //returns a vector with all the values separated by a \n -} +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+ * SPDX-License-Identifier: Apache-2.0 + */ -std::vector create_rank_data_from_parts(const std::vector& parts, - const std::vector& A_row_offsets, - const std::vector& A_col_indices, - const std::vector& A_values, - const std::vector& A_t_row_offsets, - const std::vector& A_t_col_indices, - const std::vector& A_t_values, - i_t nb_parts, - i_t nb_cstr, - i_t nb_vars, - i_t nnz) -{ -std::vector rank_data(nb_parts, rank_data_t(nb_parts)); -std::vector cstr_parts(parts.begin(), parts.begin() + nb_cstr); -std::vector var_parts(parts.begin() + nb_cstr, parts.begin() + nb_cstr + nb_vars); +#include -// 1. Compute ownership -for (i_t i = 0; i < nb_cstr; i++) { -rank_data[cstr_parts[i]].owned_cstr_indices.push_back(i); -} -for (i_t i = 0; i < nb_vars; i++) { -rank_data[var_parts[i]].owned_var_indices.push_back(i); -} +#include +#include -// 2. Compute local matrices and rank_data -for (i_t rank = 0; rank < nb_parts; rank++) { -auto& rd = rank_data[rank]; -rd.owned_var_size = rd.owned_var_indices.size(); -rd.owned_cstr_size = rd.owned_cstr_indices.size(); -// ---- A side ---- -std::vector local_A_row_offsets; -std::vector local_A_col_indices; -std::vector local_A_values; - -i_t local_A_nnz = 0; -local_A_row_offsets.push_back(local_A_nnz); - -// For each owned constraint, build local matrix A -for (auto owned_cstr : rd.owned_cstr_indices) { -i_t cstr_len = A_row_offsets[owned_cstr + 1] - A_row_offsets[owned_cstr]; -i_t row_start = A_row_offsets[owned_cstr]; -for (i_t v = 0; v < cstr_len; v++) { -local_A_col_indices.push_back(A_col_indices[row_start + v]); -local_A_values.push_back(A_values[row_start + v]); -} -local_A_nnz += cstr_len; -local_A_row_offsets.push_back(local_A_nnz); -} - -std::set needed_vars; -for (auto indice : local_A_col_indices) { -if (var_parts[indice] != rank) -needed_vars.insert(indice); -} - -for (i_t peer = 0; peer < nb_parts; peer++) { -std::vector needed_var_from_peer; -for (auto needed_var : needed_vars) { -if (var_parts[needed_var] == peer) 
-needed_var_from_peer.push_back(needed_var); -} -i_t nb_recv_from_peer = needed_var_from_peer.size(); -rd.var_recv_counts[peer] = nb_recv_from_peer; -rd.var_recv_offsets[peer] = -peer == 0 -? 0 -: rd.var_recv_offsets[peer - 1] + rd.var_recv_counts[peer - 1]; -rank_data[peer].var_send_per_peer[rank] = std::move(needed_var_from_peer); -} +namespace cuopt::linear_programming::detail { -rd.h_A_row_offsets = std::move(local_A_row_offsets); -rd.h_A_col_indices = std::move(local_A_col_indices); -rd.h_A_values = std::move(local_A_values); - -// ---- A_t side ---- -std::vector local_A_t_row_offsets; -std::vector local_A_t_col_indices; -std::vector local_A_t_values; -i_t local_A_t_nnz = 0; -local_A_t_row_offsets.push_back(local_A_t_nnz); - -for (auto owned_var : rd.owned_var_indices) { -i_t var_len = A_t_row_offsets[owned_var + 1] - A_t_row_offsets[owned_var]; -i_t row_start = A_t_row_offsets[owned_var]; -for (i_t v = 0; v < var_len; v++) { -local_A_t_col_indices.push_back(A_t_col_indices[row_start + v]); -local_A_t_values.push_back(A_t_values[row_start + v]); -} -local_A_t_nnz += var_len; -local_A_t_row_offsets.push_back(local_A_t_nnz); -} - -std::set needed_cstrs; -for (auto indice : local_A_t_col_indices) { -if (cstr_parts[indice] != rank) -needed_cstrs.insert(indice); -} - -for (i_t peer = 0; peer < nb_parts; peer++) { -std::vector needed_cstr_from_peer; -for (auto needed_cstr : needed_cstrs) { -if (cstr_parts[needed_cstr] == peer) -needed_cstr_from_peer.push_back(needed_cstr); -} -i_t nb_recv_from_peer = needed_cstr_from_peer.size(); -rd.cstr_recv_counts[peer] = nb_recv_from_peer; -rd.cstr_recv_offsets[peer] = -peer == 0 -? 
0 -: rd.cstr_recv_offsets[peer - 1] + rd.cstr_recv_counts[peer - 1]; -rank_data[peer].cstr_send_per_peer[rank] = std::move(needed_cstr_from_peer); -} - -rd.h_A_t_row_offsets = std::move(local_A_t_row_offsets); -rd.h_A_t_col_indices = std::move(local_A_t_col_indices); -rd.h_A_t_values = std::move(local_A_t_values); - -rd.total_var_size = rd.owned_var_size + needed_vars.size(); -rd.total_cstr_size = rd.owned_cstr_size + needed_cstrs.size(); -} - -// 3. Generate local indices for contiguous [[self], [peer1], ..., [peer_k]] -// Build scatter_gather_maps -for (i_t rank = 0; rank < nb_parts; rank++) { -auto& rd = rank_data[rank]; - -i_t curr_id = 0; -for (auto owned_cstr : rd.owned_cstr_indices) { -rd.global_to_local_cstr[owned_cstr] = curr_id; -rd.local_to_global_cstr.push_back(owned_cstr); -curr_id++; -} -for (i_t peer = 0; peer < nb_parts; peer++) { -if (peer == rank) continue; -for (auto recv_cstr : rank_data[peer].cstr_send_per_peer[rank]) { -rd.global_to_local_cstr[recv_cstr] = curr_id; -// rd.local_to_global_cstr.push_back(recv_cstr); // Not needed, we only do local_to_global on owned side -curr_id++; -} -} - -curr_id = 0; -for (auto owned_var : rd.owned_var_indices) { -rd.global_to_local_var[owned_var] = curr_id; -rd.local_to_global_var.push_back(owned_var); -curr_id++; -} -for (i_t peer = 0; peer < nb_parts; peer++) { -if (peer == rank) continue; -for (auto recv_var : rank_data[peer].var_send_per_peer[rank]) { -rd.global_to_local_var[recv_var] = curr_id; -// rd.local_to_global_var.push_back(recv_var); // same as over -curr_id++; -} -} -} - -// 4. 
Remap global -> local everywhere -for (i_t rank = 0; rank < nb_parts; rank++) { -auto& rd = rank_data[rank]; - -for (auto& send_vec : rd.var_send_per_peer) { -for (auto& v : send_vec) v = rd.global_to_local_var.at(v); -} -for (auto& send_vec : rd.cstr_send_per_peer) { -for (auto& v : send_vec) v = rd.global_to_local_cstr.at(v); -} - -for (auto& v : rd.h_A_col_indices) v = rd.global_to_local_var.at(v); -for (auto& v : rd.h_A_t_col_indices) v = rd.global_to_local_cstr.at(v); -} - -return rank_data; -} +template +std::vector partition_loader_t::parse_distributed_pdlp_partition_file( + std::string file) +{ + // returns a vector with all the values separated by a \n + return {}; // TODO: implement +} + +template +std::vector> +partition_loader_t::create_rank_data_from_parts( + const std::vector& parts, + const std::vector& A_row_offsets, + const std::vector& A_col_indices, + const std::vector& A_values, + const std::vector& A_t_row_offsets, + const std::vector& A_t_col_indices, + const std::vector& A_t_values, + i_t nb_parts, + i_t nb_cstr, + i_t nb_vars, + i_t nnz) +{ + std::vector> rank_data(nb_parts, rank_data_t(nb_parts)); + std::vector cstr_parts(parts.begin(), parts.begin() + nb_cstr); + std::vector var_parts(parts.begin() + nb_cstr, parts.begin() + nb_cstr + nb_vars); + + // 1. Compute ownership + for (i_t i = 0; i < nb_cstr; i++) { + rank_data[cstr_parts[i]].owned_cstr_indices.push_back(i); + } + for (i_t i = 0; i < nb_vars; i++) { + rank_data[var_parts[i]].owned_var_indices.push_back(i); + } + + // 2. 
Compute local matrices and rank_data + for (i_t rank = 0; rank < nb_parts; rank++) { + auto& rd = rank_data[rank]; + rd.owned_var_size = rd.owned_var_indices.size(); + rd.owned_cstr_size = rd.owned_cstr_indices.size(); + // ---- A side ---- + std::vector local_A_row_offsets; + std::vector local_A_col_indices; + std::vector local_A_values; + + i_t local_A_nnz = 0; + local_A_row_offsets.push_back(local_A_nnz); + + // For each owned constraint, build local matrix A + for (auto owned_cstr : rd.owned_cstr_indices) { + i_t cstr_len = A_row_offsets[owned_cstr + 1] - A_row_offsets[owned_cstr]; + i_t row_start = A_row_offsets[owned_cstr]; + for (i_t v = 0; v < cstr_len; v++) { + local_A_col_indices.push_back(A_col_indices[row_start + v]); + local_A_values.push_back(A_values[row_start + v]); + } + local_A_nnz += cstr_len; + local_A_row_offsets.push_back(local_A_nnz); + } + + std::set needed_vars; + for (auto indice : local_A_col_indices) { + if (var_parts[indice] != rank) + needed_vars.insert(indice); + } + + for (i_t peer = 0; peer < nb_parts; peer++) { + std::vector needed_var_from_peer; + for (auto needed_var : needed_vars) { + if (var_parts[needed_var] == peer) + needed_var_from_peer.push_back(needed_var); + } + i_t nb_recv_from_peer = needed_var_from_peer.size(); + rd.var_recv_counts[peer] = nb_recv_from_peer; + rd.var_recv_offsets[peer] = + peer == 0 + ? 
0 + : rd.var_recv_offsets[peer - 1] + rd.var_recv_counts[peer - 1]; + rank_data[peer].var_send_per_peer[rank] = std::move(needed_var_from_peer); + } + + rd.h_A_row_offsets = std::move(local_A_row_offsets); + rd.h_A_col_indices = std::move(local_A_col_indices); + rd.h_A_values = std::move(local_A_values); + + // ---- A_t side ---- + std::vector local_A_t_row_offsets; + std::vector local_A_t_col_indices; + std::vector local_A_t_values; + i_t local_A_t_nnz = 0; + local_A_t_row_offsets.push_back(local_A_t_nnz); + + for (auto owned_var : rd.owned_var_indices) { + i_t var_len = A_t_row_offsets[owned_var + 1] - A_t_row_offsets[owned_var]; + i_t row_start = A_t_row_offsets[owned_var]; + for (i_t v = 0; v < var_len; v++) { + local_A_t_col_indices.push_back(A_t_col_indices[row_start + v]); + local_A_t_values.push_back(A_t_values[row_start + v]); + } + local_A_t_nnz += var_len; + local_A_t_row_offsets.push_back(local_A_t_nnz); + } + + std::set needed_cstrs; + for (auto indice : local_A_t_col_indices) { + if (cstr_parts[indice] != rank) + needed_cstrs.insert(indice); + } + + for (i_t peer = 0; peer < nb_parts; peer++) { + std::vector needed_cstr_from_peer; + for (auto needed_cstr : needed_cstrs) { + if (cstr_parts[needed_cstr] == peer) + needed_cstr_from_peer.push_back(needed_cstr); + } + i_t nb_recv_from_peer = needed_cstr_from_peer.size(); + rd.cstr_recv_counts[peer] = nb_recv_from_peer; + rd.cstr_recv_offsets[peer] = + peer == 0 + ? 0 + : rd.cstr_recv_offsets[peer - 1] + rd.cstr_recv_counts[peer - 1]; + rank_data[peer].cstr_send_per_peer[rank] = std::move(needed_cstr_from_peer); + } + + rd.h_A_t_row_offsets = std::move(local_A_t_row_offsets); + rd.h_A_t_col_indices = std::move(local_A_t_col_indices); + rd.h_A_t_values = std::move(local_A_t_values); + + rd.total_var_size = rd.owned_var_size + needed_vars.size(); + rd.total_cstr_size = rd.owned_cstr_size + needed_cstrs.size(); + } + + // 3. 
Generate local indices for contiguous [[self], [peer1], ..., [peer_k]] + // Build scatter_gather_maps + for (i_t rank = 0; rank < nb_parts; rank++) { + auto& rd = rank_data[rank]; + + i_t curr_id = 0; + for (auto owned_cstr : rd.owned_cstr_indices) { + rd.global_to_local_cstr[owned_cstr] = curr_id; + rd.local_to_global_cstr.push_back(owned_cstr); + curr_id++; + } + for (i_t peer = 0; peer < nb_parts; peer++) { + if (peer == rank) continue; + for (auto recv_cstr : rank_data[peer].cstr_send_per_peer[rank]) { + rd.global_to_local_cstr[recv_cstr] = curr_id; + // rd.local_to_global_cstr.push_back(recv_cstr); // Not needed, we only do local_to_global on owned side + curr_id++; + } + } + + curr_id = 0; + for (auto owned_var : rd.owned_var_indices) { + rd.global_to_local_var[owned_var] = curr_id; + rd.local_to_global_var.push_back(owned_var); + curr_id++; + } + for (i_t peer = 0; peer < nb_parts; peer++) { + if (peer == rank) continue; + for (auto recv_var : rank_data[peer].var_send_per_peer[rank]) { + rd.global_to_local_var[recv_var] = curr_id; + // rd.local_to_global_var.push_back(recv_var); // same as over + curr_id++; + } + } + } + + // 4. 
Remap global -> local everywhere + for (i_t rank = 0; rank < nb_parts; rank++) { + auto& rd = rank_data[rank]; + + for (auto& send_vec : rd.var_send_per_peer) { + for (auto& v : send_vec) v = rd.global_to_local_var.at(v); + } + for (auto& send_vec : rd.cstr_send_per_peer) { + for (auto& v : send_vec) v = rd.global_to_local_cstr.at(v); + } + + for (auto& v : rd.h_A_col_indices) v = rd.global_to_local_var.at(v); + for (auto& v : rd.h_A_t_col_indices) v = rd.global_to_local_cstr.at(v); + } + + return rank_data; +} + +template struct partition_loader_t; + +} // namespace cuopt::linear_programming::detail diff --git a/cpp/src/pdlp/distributed_pdlp/partition_loader.hpp b/cpp/src/pdlp/distributed_pdlp/partition_loader.hpp index 4d66d4445c..efdfd0ba0e 100644 --- a/cpp/src/pdlp/distributed_pdlp/partition_loader.hpp +++ b/cpp/src/pdlp/distributed_pdlp/partition_loader.hpp @@ -1,16 +1,33 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + */ +#pragma once -partition_loader_t { - static std::vector parse_distributed_pdlp_partition_file(std::string file); - std::vector create_rank_data_from_parts(const std::vector& parts, - const std::vector& A_row_offsets, - const std::vector& A_col_indices, - const std::vector& A_values, - const std::vector& A_t_row_offsets, - const std::vector& A_t_col_indices, - const std::vector& A_t_values, - i_t nb_parts, - i_t nb_cstr, - i_t nb_vars, - i_t nnz); -} \ No newline at end of file +#include + +#include +#include + +namespace cuopt::linear_programming::detail { + +template +struct partition_loader_t { + static std::vector parse_distributed_pdlp_partition_file(std::string file); + + static std::vector> create_rank_data_from_parts( + const std::vector& parts, + const std::vector& A_row_offsets, + const std::vector& A_col_indices, + const std::vector& A_values, + const std::vector& A_t_row_offsets, + const std::vector& A_t_col_indices, + const 
std::vector& A_t_values, + i_t nb_parts, + i_t nb_cstr, + i_t nb_vars, + i_t nnz); +}; + +} // namespace cuopt::linear_programming::detail diff --git a/cpp/src/pdlp/distributed_pdlp/shard.hpp b/cpp/src/pdlp/distributed_pdlp/shard.hpp index 7528c35dec..a33477edf1 100644 --- a/cpp/src/pdlp/distributed_pdlp/shard.hpp +++ b/cpp/src/pdlp/distributed_pdlp/shard.hpp @@ -1,46 +1,78 @@ -#pragma once -#include -#include -#include -#include -namespace cuopt::linear_programming::detail { +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + */ + #pragma once -template -class pdlp_solver_t; - -struct nccl_comm_deleter_t { - int device_id{-1}; - void operator()(ncclComm* comm) const noexcept - { - raft::device_setter guard(device_id); - if (comm != nullptr) { - ncclCommDestroy(comm); - } - } -}; -using nccl_comm_unique_ptr_t = std::unique_ptr; - -template -class pdlp_shard_t { - // Declaration only, will be set as default in shard.cu . Needed to manage cyclic include of pdlp_solver_t. - public: - ~pdlp_shard_t(); - pdlp_shard_t(int device_id, - rank_data_t&& rd, - ncclComm_t comm - /* ???????? */); - - pdlp_shard_t(const pdlp_shard_t&) = delete; - pdlp_shard_t& operator=(const pdlp_shard_t&) = delete; - // Specific multi-GPU data - int device_id; - rmm::cuda_stream stream; - raft::handle_t handle; - nccl_comm_unique_ptr_t comm; - rank_data_t rank_data; - optimization_problem_t opt_problem; - problem_t sub_problem; - std::unique_ptr> sub_pdlp; -}; - -} + #include + + #include + #include + #include + + #include + #include + #include + + #include + + #include + #include + #include + + namespace cuopt::linear_programming::detail { + + // Forward-declare to break the cyclic include with pdlp.cuh + // (pdlp.cuh -> multi_gpu_engine.hpp -> shard.hpp -> pdlp.cuh). + // Definitions of out-of-line members live in shard.cu, which includes pdlp.cuh. 
+ template + class pdlp_solver_t; + + // RAII deleter for ncclComm_t; sets the right device before destroy. + struct nccl_comm_deleter_t { + int device_id{-1}; + void operator()(ncclComm* comm) const noexcept + { + if (comm == nullptr) return; + raft::device_setter guard(device_id); + ncclCommDestroy(comm); + } + }; + using nccl_comm_unique_ptr_t = std::unique_ptr; + + template + struct pdlp_shard_t { + // Out-of-line (in shard.cu) because pdlp_solver_t is incomplete here. + ~pdlp_shard_t(); + + pdlp_shard_t(int device_id, + rank_data_t&& rd, + ncclComm_t raw_comm, + std::vector const& h_global_obj, + std::vector const& h_global_var_lower, + std::vector const& h_global_var_upper, + std::vector const& h_global_cstr_lower, + std::vector const& h_global_cstr_upper, + bool maximize, + f_t objective_offset, + f_t objective_scaling_factor, + pdlp_solver_settings_t const& settings); + + pdlp_shard_t(const pdlp_shard_t&) = delete; + pdlp_shard_t& operator=(const pdlp_shard_t&) = delete; + // Move ops are implicitly deleted (user-declared dtor + deleted copy). + // Intentional: shard owns device-affine resources and must never move. + // Store as std::unique_ptr in any container. 
+ + int device_id; + rmm::cuda_stream stream; + raft::handle_t handle; + nccl_comm_unique_ptr_t comm; + rank_data_t rank_data; + std::optional> opt_problem; + std::optional> sub_problem; + std::unique_ptr> sub_pdlp; + }; + + } // namespace cuopt::linear_programming::detail + \ No newline at end of file From dd935c5307a312918121b53a27674bb4656fd291 Mon Sep 17 00:00:00 2001 From: Bulle Mostovoi Date: Tue, 19 May 2026 14:26:31 +0200 Subject: [PATCH 08/67] partition loader now partition loads --- .../pdlp/distributed_pdlp/partition_loader.cu | 28 ++++++++++++++++--- .../distributed_pdlp/partition_loader.hpp | 5 +++- 2 files changed, 28 insertions(+), 5 deletions(-) diff --git a/cpp/src/pdlp/distributed_pdlp/partition_loader.cu b/cpp/src/pdlp/distributed_pdlp/partition_loader.cu index a9df158601..0e122cefc0 100644 --- a/cpp/src/pdlp/distributed_pdlp/partition_loader.cu +++ b/cpp/src/pdlp/distributed_pdlp/partition_loader.cu @@ -5,17 +5,37 @@ #include +#include + +#include #include #include namespace cuopt::linear_programming::detail { template -std::vector partition_loader_t::parse_distributed_pdlp_partition_file( - std::string file) +std::vector partition_loader_t::parse_distributed_pdlp_partition_file( + std::string const& file) { - // returns a vector with all the values separated by a \n - return {}; // TODO: implement + std::ifstream part_file(file); + cuopt_expects(part_file.is_open(), + error_type_t::ValidationError, + "Failed to open partition file: " + file); + + // One integer per line; operator>> skips whitespace so blank lines and + // trailing newlines are tolerated. + std::vector parts; + i_t part = 0; + while (part_file >> part) { + parts.push_back(part); + } + + // We must have hit EOF cleanly; any other state means a malformed token. 
+ cuopt_expects(part_file.eof(), + error_type_t::ValidationError, + "Malformed partition file (expected one integer per line): " + file); + + return parts; } template diff --git a/cpp/src/pdlp/distributed_pdlp/partition_loader.hpp b/cpp/src/pdlp/distributed_pdlp/partition_loader.hpp index efdfd0ba0e..25560cdbfd 100644 --- a/cpp/src/pdlp/distributed_pdlp/partition_loader.hpp +++ b/cpp/src/pdlp/distributed_pdlp/partition_loader.hpp @@ -14,7 +14,10 @@ namespace cuopt::linear_programming::detail { template struct partition_loader_t { - static std::vector parse_distributed_pdlp_partition_file(std::string file); + // Read a Metis-style partition file: one part-id per line (whitespace-tolerant), + // ASCII integers in [0, nb_parts). Returns a flat vector of length + // nb_cstr + nb_vars, indexed as in create_rank_data_from_parts (cstrs first, then vars). + static std::vector parse_distributed_pdlp_partition_file(std::string const& file); static std::vector> create_rank_data_from_parts( const std::vector& parts, From 09eb20b7701df0079309ab6932a5a03a9fd6595e Mon Sep 17 00:00:00 2001 From: Bulle Mostovoi Date: Tue, 19 May 2026 19:44:15 +0200 Subject: [PATCH 09/67] big advancements ayo ! We can soon start working on imlementing the solver !!! 
--- .../pdlp/solver_settings.hpp | 2 + .../pdlp/distributed_pdlp/multi_gpu_engine.cu | 116 ++++++++------ .../distributed_pdlp/multi_gpu_engine.hpp | 41 ++--- .../pdlp/distributed_pdlp/partition_loader.cu | 38 +++-- .../distributed_pdlp/partition_loader.hpp | 3 + cpp/src/pdlp/distributed_pdlp/rank_data.hpp | 2 + cpp/src/pdlp/distributed_pdlp/shard.cu | 102 ++++++++++-- cpp/src/pdlp/distributed_pdlp/shard.hpp | 35 +++-- .../initial_scaling.cu | 36 +++++ .../initial_scaling.cuh | 7 + cpp/src/pdlp/pdlp.cu | 145 ++++++++++++------ cpp/src/pdlp/pdlp.cuh | 8 +- 12 files changed, 382 insertions(+), 153 deletions(-) diff --git a/cpp/include/cuopt/linear_programming/pdlp/solver_settings.hpp b/cpp/include/cuopt/linear_programming/pdlp/solver_settings.hpp index 4585b9d1cf..2a18b8060f 100644 --- a/cpp/include/cuopt/linear_programming/pdlp/solver_settings.hpp +++ b/cpp/include/cuopt/linear_programming/pdlp/solver_settings.hpp @@ -287,6 +287,8 @@ class pdlp_solver_settings_t { bool dual_postsolve{true}; int num_gpus{1}; std::string multi_gpu_partition_file{""}; + // Set to true inside the shards + bool is_distributed_sub_pdlp{false}; method_t method{method_t::Concurrent}; bool inside_mip{false}; // For concurrent termination diff --git a/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.cu b/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.cu index c7307c46ee..9b404bbd53 100644 --- a/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.cu +++ b/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.cu @@ -15,57 +15,71 @@ namespace cuopt::linear_programming::detail { - template - multi_gpu_engine_t::multi_gpu_engine_t( - std::vector>&& rank_data, - std::vector const& h_global_obj, - std::vector const& h_global_var_lower, - std::vector const& h_global_var_upper, - std::vector const& h_global_cstr_lower, - std::vector const& h_global_cstr_upper, - bool maximize, - f_t objective_offset, - f_t objective_scaling_factor, - pdlp_solver_settings_t const& sub_solver_settings) - : stream() - { - const int 
nb_parts = static_cast(rank_data.size()); - cuopt_expects(nb_parts > 0, - error_type_t::ValidationError, - "multi_gpu_engine_t: rank_data must be non-empty"); - - shards.reserve(nb_parts); - - // 1:1 rank -> device mapping. (Matches metis_tests; refine later if needed.) - std::vector devices(nb_parts); - std::iota(devices.begin(), devices.end(), 0); - - // 2. Collectively bootstrap NCCL communicators across all devices. - // Must be done together; each comm is then handed to one shard, - // which wraps it in a unique_ptr with the device-aware deleter. - std::vector raw_comms(nb_parts); - cuopt_expects(ncclCommInitAll(raw_comms.data(), nb_parts, devices.data()) == ncclSuccess, - error_type_t::RuntimeError, - "ncclCommInitAll failed"); - - // 3. Construct one shard per rank, pinned to its device. - for (int r = 0; r < nb_parts; ++r) { - raft::device_setter guard(devices[r]); // shard ctor asserts current device - shards.emplace_back(std::make_unique>( - devices[r], - std::move(rank_data[r]), - raw_comms[r], - h_global_obj, - h_global_var_lower, - h_global_var_upper, - h_global_cstr_lower, - h_global_cstr_upper, - maximize, - objective_offset, - objective_scaling_factor, - sub_solver_settings)); - } - } +template +multi_gpu_engine_t::multi_gpu_engine_t( + std::vector>&& rank_data, + std::vector const& h_global_obj, + std::vector const& h_global_var_lower, + std::vector const& h_global_var_upper, + std::vector const& h_global_cstr_lower, + std::vector const& h_global_cstr_upper, + std::vector const& h_global_obj_scaled, + std::vector const& h_global_var_lower_scaled, + std::vector const& h_global_var_upper_scaled, + std::vector const& h_global_cstr_lower_scaled, + std::vector const& h_global_cstr_upper_scaled, + std::vector const& h_global_cummulative_cstr_scaling, + std::vector const& h_global_cummulative_var_scaling, + f_t h_bound_rescaling, + f_t h_objective_rescaling, + bool maximize, + f_t objective_offset, + f_t objective_scaling_factor, + pdlp_solver_settings_t 
const& sub_solver_settings) + : stream() +{ + const int nb_parts = static_cast(rank_data.size()); + cuopt_expects(nb_parts > 0, + error_type_t::ValidationError, + "multi_gpu_engine_t: rank_data must be non-empty"); + + shards.reserve(nb_parts); + std::vector devices(nb_parts); + std::iota(devices.begin(), devices.end(), 0); + + // Create NCCL Comms then let shards own them + std::vector raw_comms(nb_parts); + cuopt_expects(ncclCommInitAll(raw_comms.data(), nb_parts, devices.data()) == ncclSuccess, + error_type_t::RuntimeError, + "ncclCommInitAll failed"); + + // 3. Construct one shard per rank, pinned to its device. + for (int r = 0; r < nb_parts; ++r) { + raft::device_setter guard(devices[r]); // shard ctor needs device set + shards.emplace_back(std::make_unique>( + devices[r], + std::move(rank_data[r]), + raw_comms[r], + h_global_obj, + h_global_var_lower, + h_global_var_upper, + h_global_cstr_lower, + h_global_cstr_upper, + h_global_obj_scaled, + h_global_var_lower_scaled, + h_global_var_upper_scaled, + h_global_cstr_lower_scaled, + h_global_cstr_upper_scaled, + h_global_cummulative_cstr_scaling, + h_global_cummulative_var_scaling, + h_bound_rescaling, + h_objective_rescaling, + maximize, + objective_offset, + objective_scaling_factor, + sub_solver_settings)); + } +} template struct multi_gpu_engine_t; // template struct multi_gpu_engine_t; diff --git a/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.hpp b/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.hpp index 6142c938e3..d672e18197 100644 --- a/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.hpp +++ b/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.hpp @@ -16,24 +16,29 @@ namespace cuopt::linear_programming::detail { - template - struct multi_gpu_engine_t { - // Constructs one shard per partition. 
Caller is responsible for: - // - rank_data[i] being correctly populated for rank i - // - the host vectors holding the (already scaled) global problem data - // - sub_solver_settings being the per-shard PDLP config (num_gpus=1, - // multi_gpu_partition_file="", scaling disabled). - multi_gpu_engine_t( - std::vector>&& rank_data, - std::vector const& h_global_obj, - std::vector const& h_global_var_lower, - std::vector const& h_global_var_upper, - std::vector const& h_global_cstr_lower, - std::vector const& h_global_cstr_upper, - bool maximize, - f_t objective_offset, - f_t objective_scaling_factor, - pdlp_solver_settings_t const& sub_solver_settings); +template +struct multi_gpu_engine_t { + // Constructs shards from rank_data + multi_gpu_engine_t( + std::vector>&& rank_data, + std::vector const& h_global_obj, + std::vector const& h_global_var_lower, + std::vector const& h_global_var_upper, + std::vector const& h_global_cstr_lower, + std::vector const& h_global_cstr_upper, + std::vector const& h_global_obj_scaled, + std::vector const& h_global_var_lower_scaled, + std::vector const& h_global_var_upper_scaled, + std::vector const& h_global_cstr_lower_scaled, + std::vector const& h_global_cstr_upper_scaled, + std::vector const& h_global_cummulative_cstr_scaling, + std::vector const& h_global_cummulative_var_scaling, + f_t h_bound_rescaling, + f_t h_objective_rescaling, + bool maximize, + f_t objective_offset, + f_t objective_scaling_factor, + pdlp_solver_settings_t const& sub_solver_settings); multi_gpu_engine_t(const multi_gpu_engine_t&) = delete; multi_gpu_engine_t& operator=(const multi_gpu_engine_t&) = delete; diff --git a/cpp/src/pdlp/distributed_pdlp/partition_loader.cu b/cpp/src/pdlp/distributed_pdlp/partition_loader.cu index 0e122cefc0..047fb536d5 100644 --- a/cpp/src/pdlp/distributed_pdlp/partition_loader.cu +++ b/cpp/src/pdlp/distributed_pdlp/partition_loader.cu @@ -45,14 +45,23 @@ partition_loader_t::create_rank_data_from_parts( const std::vector& 
A_row_offsets, const std::vector& A_col_indices, const std::vector& A_values, + const std::vector& A_values_scaled, const std::vector& A_t_row_offsets, const std::vector& A_t_col_indices, const std::vector& A_t_values, + const std::vector& A_t_values_scaled, i_t nb_parts, i_t nb_cstr, i_t nb_vars, i_t nnz) { + cuopt_expects(A_values.size() == A_values_scaled.size(), + error_type_t::ValidationError, + "A_values and A_values_scaled must have the same length"); + cuopt_expects(A_t_values.size() == A_t_values_scaled.size(), + error_type_t::ValidationError, + "A_t_values and A_t_values_scaled must have the same length"); + std::vector> rank_data(nb_parts, rank_data_t(nb_parts)); std::vector cstr_parts(parts.begin(), parts.begin() + nb_cstr); std::vector var_parts(parts.begin() + nb_cstr, parts.begin() + nb_cstr + nb_vars); @@ -74,17 +83,22 @@ partition_loader_t::create_rank_data_from_parts( std::vector local_A_row_offsets; std::vector local_A_col_indices; std::vector local_A_values; + std::vector local_A_values_scaled; i_t local_A_nnz = 0; local_A_row_offsets.push_back(local_A_nnz); - // For each owned constraint, build local matrix A + // For each owned constraint, build local matrix A. We walk both the + // unscaled and scaled global value arrays in lockstep so the produced + // local arrays share identical (offsets, col_indices) and differ only + // in values. 
for (auto owned_cstr : rd.owned_cstr_indices) { i_t cstr_len = A_row_offsets[owned_cstr + 1] - A_row_offsets[owned_cstr]; i_t row_start = A_row_offsets[owned_cstr]; for (i_t v = 0; v < cstr_len; v++) { local_A_col_indices.push_back(A_col_indices[row_start + v]); - local_A_values.push_back(A_values[row_start + v]); + local_A_values .push_back(A_values [row_start + v]); + local_A_values_scaled.push_back(A_values_scaled[row_start + v]); } local_A_nnz += cstr_len; local_A_row_offsets.push_back(local_A_nnz); @@ -111,14 +125,16 @@ partition_loader_t::create_rank_data_from_parts( rank_data[peer].var_send_per_peer[rank] = std::move(needed_var_from_peer); } - rd.h_A_row_offsets = std::move(local_A_row_offsets); - rd.h_A_col_indices = std::move(local_A_col_indices); - rd.h_A_values = std::move(local_A_values); + rd.h_A_row_offsets = std::move(local_A_row_offsets); + rd.h_A_col_indices = std::move(local_A_col_indices); + rd.h_A_values = std::move(local_A_values); + rd.h_A_values_scaled = std::move(local_A_values_scaled); // ---- A_t side ---- std::vector local_A_t_row_offsets; std::vector local_A_t_col_indices; std::vector local_A_t_values; + std::vector local_A_t_values_scaled; i_t local_A_t_nnz = 0; local_A_t_row_offsets.push_back(local_A_t_nnz); @@ -126,8 +142,9 @@ partition_loader_t::create_rank_data_from_parts( i_t var_len = A_t_row_offsets[owned_var + 1] - A_t_row_offsets[owned_var]; i_t row_start = A_t_row_offsets[owned_var]; for (i_t v = 0; v < var_len; v++) { - local_A_t_col_indices.push_back(A_t_col_indices[row_start + v]); - local_A_t_values.push_back(A_t_values[row_start + v]); + local_A_t_col_indices .push_back(A_t_col_indices [row_start + v]); + local_A_t_values .push_back(A_t_values [row_start + v]); + local_A_t_values_scaled.push_back(A_t_values_scaled[row_start + v]); } local_A_t_nnz += var_len; local_A_t_row_offsets.push_back(local_A_t_nnz); @@ -154,9 +171,10 @@ partition_loader_t::create_rank_data_from_parts( rank_data[peer].cstr_send_per_peer[rank] = 
std::move(needed_cstr_from_peer); } - rd.h_A_t_row_offsets = std::move(local_A_t_row_offsets); - rd.h_A_t_col_indices = std::move(local_A_t_col_indices); - rd.h_A_t_values = std::move(local_A_t_values); + rd.h_A_t_row_offsets = std::move(local_A_t_row_offsets); + rd.h_A_t_col_indices = std::move(local_A_t_col_indices); + rd.h_A_t_values = std::move(local_A_t_values); + rd.h_A_t_values_scaled = std::move(local_A_t_values_scaled); rd.total_var_size = rd.owned_var_size + needed_vars.size(); rd.total_cstr_size = rd.owned_cstr_size + needed_cstrs.size(); diff --git a/cpp/src/pdlp/distributed_pdlp/partition_loader.hpp b/cpp/src/pdlp/distributed_pdlp/partition_loader.hpp index 25560cdbfd..915c24a828 100644 --- a/cpp/src/pdlp/distributed_pdlp/partition_loader.hpp +++ b/cpp/src/pdlp/distributed_pdlp/partition_loader.hpp @@ -19,14 +19,17 @@ struct partition_loader_t { // nb_cstr + nb_vars, indexed as in create_rank_data_from_parts (cstrs first, then vars). static std::vector parse_distributed_pdlp_partition_file(std::string const& file); + // Slices the data to prepare a split from metis partitionning with halo communication static std::vector> create_rank_data_from_parts( const std::vector& parts, const std::vector& A_row_offsets, const std::vector& A_col_indices, const std::vector& A_values, + const std::vector& A_values_scaled, const std::vector& A_t_row_offsets, const std::vector& A_t_col_indices, const std::vector& A_t_values, + const std::vector& A_t_values_scaled, i_t nb_parts, i_t nb_cstr, i_t nb_vars, diff --git a/cpp/src/pdlp/distributed_pdlp/rank_data.hpp b/cpp/src/pdlp/distributed_pdlp/rank_data.hpp index ee107f5cf1..29d76ae110 100644 --- a/cpp/src/pdlp/distributed_pdlp/rank_data.hpp +++ b/cpp/src/pdlp/distributed_pdlp/rank_data.hpp @@ -44,9 +44,11 @@ struct rank_data_t { std::vector h_A_row_offsets; std::vector h_A_col_indices; std::vector h_A_values; + std::vector h_A_values_scaled; // A_t std::vector h_A_t_row_offsets; std::vector h_A_t_col_indices; 
std::vector h_A_t_values; + std::vector h_A_t_values_scaled; }; } // namespace cuopt::linear_programming::detail \ No newline at end of file diff --git a/cpp/src/pdlp/distributed_pdlp/shard.cu b/cpp/src/pdlp/distributed_pdlp/shard.cu index d5e795bb61..41f74086ab 100644 --- a/cpp/src/pdlp/distributed_pdlp/shard.cu +++ b/cpp/src/pdlp/distributed_pdlp/shard.cu @@ -5,6 +5,9 @@ #include #include +#include + +#include #include #include @@ -28,6 +31,15 @@ pdlp_shard_t::pdlp_shard_t( std::vector const& h_global_var_upper, std::vector const& h_global_cstr_lower, std::vector const& h_global_cstr_upper, + std::vector const& h_global_obj_scaled, + std::vector const& h_global_var_lower_scaled, + std::vector const& h_global_var_upper_scaled, + std::vector const& h_global_cstr_lower_scaled, + std::vector const& h_global_cstr_upper_scaled, + std::vector const& h_global_cummulative_cstr_scaling, + std::vector const& h_global_cummulative_var_scaling, + f_t h_bound_rescaling, + f_t h_objective_rescaling, bool maximize, f_t objective_offset, f_t objective_scaling_factor, @@ -45,27 +57,47 @@ pdlp_shard_t::pdlp_shard_t( // ---- 1. Gather per-shard host slices using rank_data's index maps. ---- // All vectors are sized to TOTAL (owned + halo). Owned slots get real - // values; halo slots keep neutral defaults so they are no-ops even if - // accidentally touched before `owned_*_size_` plumbing is in place. 
- std::vector h_obj (rank_data.total_var_size, f_t{0}); - std::vector h_var_lower (rank_data.total_var_size, -std::numeric_limits::infinity()); - std::vector h_var_upper (rank_data.total_var_size, std::numeric_limits::infinity()); - std::vector h_cstr_lower(rank_data.total_cstr_size, -std::numeric_limits::infinity()); - std::vector h_cstr_upper(rank_data.total_cstr_size, std::numeric_limits::infinity()); + // values; halo slots keep defaults because they should not be accessed + std::vector h_obj (rank_data.total_var_size, f_t{0}); + std::vector h_var_lower (rank_data.total_var_size, -std::numeric_limits::infinity()); + std::vector h_var_upper (rank_data.total_var_size, std::numeric_limits::infinity()); + std::vector h_cstr_lower (rank_data.total_cstr_size, -std::numeric_limits::infinity()); + std::vector h_cstr_upper (rank_data.total_cstr_size, std::numeric_limits::infinity()); + + std::vector h_obj_scaled (rank_data.total_var_size, f_t{0}); + std::vector h_var_lower_scaled (rank_data.total_var_size, -std::numeric_limits::infinity()); + std::vector h_var_upper_scaled (rank_data.total_var_size, std::numeric_limits::infinity()); + std::vector h_cstr_lower_scaled(rank_data.total_cstr_size, -std::numeric_limits::infinity()); + std::vector h_cstr_upper_scaled(rank_data.total_cstr_size, std::numeric_limits::infinity()); for (i_t i = 0; i < rank_data.owned_var_size; ++i) { - const auto g = rank_data.local_to_global_var[i]; - h_obj[i] = h_global_obj[g]; - h_var_lower[i] = h_global_var_lower[g]; - h_var_upper[i] = h_global_var_upper[g]; + const auto g = rank_data.local_to_global_var[i]; + h_obj[i] = h_global_obj[g]; + h_var_lower[i] = h_global_var_lower[g]; + h_var_upper[i] = h_global_var_upper[g]; + h_obj_scaled[i] = h_global_obj_scaled[g]; + h_var_lower_scaled[i] = h_global_var_lower_scaled[g]; + h_var_upper_scaled[i] = h_global_var_upper_scaled[g]; + } + for (i_t i = 0; i < rank_data.owned_cstr_size; ++i) { + const auto g = rank_data.local_to_global_cstr[i]; + 
h_cstr_lower[i] = h_global_cstr_lower[g]; + h_cstr_upper[i] = h_global_cstr_upper[g]; + h_cstr_lower_scaled[i] = h_global_cstr_lower_scaled[g]; + h_cstr_upper_scaled[i] = h_global_cstr_upper_scaled[g]; } + + // Get local scaling factors + std::vector h_cstr_scaling_local(rank_data.total_cstr_size, f_t{1}); + std::vector h_var_scaling_local (rank_data.total_var_size, f_t{1}); for (i_t i = 0; i < rank_data.owned_cstr_size; ++i) { - const auto g = rank_data.local_to_global_cstr[i]; - h_cstr_lower[i] = h_global_cstr_lower[g]; - h_cstr_upper[i] = h_global_cstr_upper[g]; + h_cstr_scaling_local[i] = h_global_cummulative_cstr_scaling[rank_data.local_to_global_cstr[i]]; + } + for (i_t i = 0; i < rank_data.owned_var_size; ++i) { + h_var_scaling_local[i] = h_global_cummulative_var_scaling[rank_data.local_to_global_var[i]]; } - // ---- 2. Build optimization_problem_t on this shard's device. ---- + // ---- 2. Build optimization_problem_t on this shard's device (UNSCALED). ---- opt_problem.emplace(&handle); opt_problem->set_csr_constraint_matrix( rank_data.h_A_values .data(), static_cast(rank_data.h_A_values .size()), @@ -86,7 +118,7 @@ pdlp_shard_t::pdlp_shard_t( opt_problem->set_objective_scaling_factor(objective_scaling_factor); opt_problem->set_problem_category(problem_category_t::LP); - // ---- 3. Build problem_t from opt_problem. ---- + // ---- 3. Build problem_t from opt_problem (still UNSCALED). ---- sub_problem.emplace(*opt_problem); // ---- 4. Override reverse_* with the real local A_T from rank_data. ---- @@ -109,7 +141,45 @@ pdlp_shard_t::pdlp_shard_t( handle.sync_stream(stream_view); // ---- 5. Build sub_pdlp (single-GPU mode; multi_gpu flags cleared by caller). ---- + // At this point sub_pdlp.op_problem_scaled_ is an unscaled copy + // of sub_problem and sub_pdlp.initial_scaling_strategy_ has + // unit cumulative factors (sub-settings disable Ruiz / PC iters). 
sub_pdlp = std::make_unique>(*sub_problem, settings, /*batch=*/false); + + // Inject master-scaled buffers inside sub_pdlp + auto& scaled = sub_pdlp->get_op_problem_scaled(); + raft::copy(scaled.coefficients.data(), + rank_data.h_A_values_scaled.data(), + rank_data.h_A_values_scaled.size(), stream_view); + raft::copy(scaled.reverse_coefficients.data(), + rank_data.h_A_t_values_scaled.data(), + rank_data.h_A_t_values_scaled.size(), stream_view); + raft::copy(scaled.objective_coefficients.data(), + h_obj_scaled.data(), h_obj_scaled.size(), stream_view); + raft::copy(scaled.constraint_lower_bounds.data(), + h_cstr_lower_scaled.data(), h_cstr_lower_scaled.size(), stream_view); + raft::copy(scaled.constraint_upper_bounds.data(), + h_cstr_upper_scaled.data(), h_cstr_upper_scaled.size(), stream_view); + + using f_t2 = typename type_2::type; + std::vector h_var_bounds_scaled_packed(rank_data.total_var_size); + for (i_t i = 0; i < rank_data.total_var_size; ++i) { + h_var_bounds_scaled_packed[i].x = h_var_lower_scaled[i]; + h_var_bounds_scaled_packed[i].y = h_var_upper_scaled[i]; + } + raft::copy(scaled.variable_bounds.data(), + h_var_bounds_scaled_packed.data(), + h_var_bounds_scaled_packed.size(), stream_view); + + combine_constraint_bounds(scaled, scaled.combined_bounds); + + // Inject master-scaled buffers inside sub_pdlp.initil_strategy + auto& scaling = sub_pdlp->get_initial_scaling_strategy(); + scaling.set_cummulative_scaling(h_cstr_scaling_local, h_var_scaling_local); + scaling.set_h_bound_rescaling (h_bound_rescaling); + scaling.set_h_objective_rescaling(h_objective_rescaling); + + handle.sync_stream(stream_view); } template struct pdlp_shard_t; diff --git a/cpp/src/pdlp/distributed_pdlp/shard.hpp b/cpp/src/pdlp/distributed_pdlp/shard.hpp index a33477edf1..3c10a90f90 100644 --- a/cpp/src/pdlp/distributed_pdlp/shard.hpp +++ b/cpp/src/pdlp/distributed_pdlp/shard.hpp @@ -45,18 +45,29 @@ // Out-of-line (in shard.cu) because pdlp_solver_t is incomplete here. 
~pdlp_shard_t(); - pdlp_shard_t(int device_id, - rank_data_t&& rd, - ncclComm_t raw_comm, - std::vector const& h_global_obj, - std::vector const& h_global_var_lower, - std::vector const& h_global_var_upper, - std::vector const& h_global_cstr_lower, - std::vector const& h_global_cstr_upper, - bool maximize, - f_t objective_offset, - f_t objective_scaling_factor, - pdlp_solver_settings_t const& settings); + // sub worker for distributed pdlp. Owns its own view on scaled problem and unscaled problem + // Owns necessary multi-gpu data (rank_data, device_id, nccl_comm) + pdlp_shard_t(int device_id, + rank_data_t&& rd, + ncclComm_t raw_comm, + std::vector const& h_global_obj, + std::vector const& h_global_var_lower, + std::vector const& h_global_var_upper, + std::vector const& h_global_cstr_lower, + std::vector const& h_global_cstr_upper, + std::vector const& h_global_obj_scaled, + std::vector const& h_global_var_lower_scaled, + std::vector const& h_global_var_upper_scaled, + std::vector const& h_global_cstr_lower_scaled, + std::vector const& h_global_cstr_upper_scaled, + std::vector const& h_global_cummulative_cstr_scaling, + std::vector const& h_global_cummulative_var_scaling, + f_t h_bound_rescaling, + f_t h_objective_rescaling, + bool maximize, + f_t objective_offset, + f_t objective_scaling_factor, + pdlp_solver_settings_t const& settings); pdlp_shard_t(const pdlp_shard_t&) = delete; pdlp_shard_t& operator=(const pdlp_shard_t&) = delete; diff --git a/cpp/src/pdlp/initial_scaling_strategy/initial_scaling.cu b/cpp/src/pdlp/initial_scaling_strategy/initial_scaling.cu index a76b1773f9..a94064d0af 100644 --- a/cpp/src/pdlp/initial_scaling_strategy/initial_scaling.cu +++ b/cpp/src/pdlp/initial_scaling_strategy/initial_scaling.cu @@ -809,6 +809,42 @@ pdlp_initial_scaling_strategy_t::get_variable_scaling_vector() const return cummulative_variable_scaling_; } +template +void pdlp_initial_scaling_strategy_t::set_cummulative_scaling( + const std::vector& 
h_cummulative_constraint_matrix_scaling, + const std::vector& h_cummulative_variable_scaling) +{ + cuopt_expects(static_cast(h_cummulative_constraint_matrix_scaling.size()) == dual_size_h_, + error_type_t::ValidationError, + "set_cummulative_scaling: host constraint scaling vector size mismatch"); + cuopt_expects(static_cast(h_cummulative_variable_scaling.size()) == primal_size_h_, + error_type_t::ValidationError, + "set_cummulative_scaling: host variable scaling vector size mismatch"); + + raft::copy(cummulative_constraint_matrix_scaling_.data(), + h_cummulative_constraint_matrix_scaling.data(), + h_cummulative_constraint_matrix_scaling.size(), + stream_view_); + raft::copy(cummulative_variable_scaling_.data(), + h_cummulative_variable_scaling.data(), + h_cummulative_variable_scaling.size(), + stream_view_); +} + +template +void pdlp_initial_scaling_strategy_t::set_h_bound_rescaling(f_t value) +{ + h_bound_rescaling = value; + bound_rescaling_.set_value_async(value, stream_view_); +} + +template +void pdlp_initial_scaling_strategy_t::set_h_objective_rescaling(f_t value) +{ + h_objective_rescaling = value; + objective_rescaling_.set_value_async(value, stream_view_); +} + template typename pdlp_initial_scaling_strategy_t::view_t pdlp_initial_scaling_strategy_t::view() diff --git a/cpp/src/pdlp/initial_scaling_strategy/initial_scaling.cuh b/cpp/src/pdlp/initial_scaling_strategy/initial_scaling.cuh index 5a3dcfaca2..ed5f8b1851 100644 --- a/cpp/src/pdlp/initial_scaling_strategy/initial_scaling.cuh +++ b/cpp/src/pdlp/initial_scaling_strategy/initial_scaling.cuh @@ -76,6 +76,13 @@ class pdlp_initial_scaling_strategy_t { f_t get_h_bound_rescaling() const; f_t get_h_objective_rescaling() const; + // Inject scaling state computed by another pdlp_initial_scaling_strategy_t + // Needed by distributed PDLP + void set_cummulative_scaling(const std::vector& h_cummulative_constraint_matrix_scaling, + const std::vector& h_cummulative_variable_scaling); + void 
set_h_bound_rescaling(f_t value); + void set_h_objective_rescaling(f_t value); + void bound_objective_rescaling(); /** diff --git a/cpp/src/pdlp/pdlp.cu b/cpp/src/pdlp/pdlp.cu index a58ae4f210..612eb676ec 100644 --- a/cpp/src/pdlp/pdlp.cu +++ b/cpp/src/pdlp/pdlp.cu @@ -337,68 +337,119 @@ pdlp_solver_t::pdlp_solver_t(problem_t& op_problem, "Metis partitioning inside cuopt not implemented yet; " "provide a --parts file via settings.multi_gpu_partition_file"); } - // 3. Scale now before copying to children + + // always compute the initial primal weight before scaling and the initial step size after scaling, to match cuPDLPx + assert(settings_.hyper_params.compute_initial_primal_weight_before_scaling && "compute_initial_primal_weight_before_scaling must be true in distributed mode"); + assert(!settings_.hyper_params.compute_initial_step_size_before_scaling && "compute_initial_step_size_before_scaling must be false in distributed mode"); + + compute_initial_primal_weight(); + + // scale globally before dispatching to shards initial_scaling_strategy_.scale_problem(); + + compute_initial_step_size(); - // 4. Copy the scaled global problem from device -> host. + const f_t initial_step_size_global = get_step_size_h(0); + const f_t initial_primal_weight_global = get_primal_weight_h(0); + + // 4. Copy both the scaled and the unscaled problem from device to host auto const stream = op_problem_scaled_.handle_ptr->get_stream(); i_t const n_cstr = op_problem_scaled_.n_constraints; i_t const n_vars = op_problem_scaled_.n_variables; i_t const nnz = op_problem_scaled_.nnz; - // CSRs (A and A_t). + + // Shared topology (taken from the scaled problem, but identical on both). 
std::vector h_A_row_offsets (n_cstr + 1); std::vector h_A_col_indices (nnz); - std::vector h_A_values (nnz); std::vector h_A_t_row_offsets(n_vars + 1); std::vector h_A_t_col_indices(nnz); - std::vector h_A_t_values (nnz); - raft::copy(h_A_row_offsets .data(), op_problem_scaled_.offsets .data(), n_cstr + 1, stream); - raft::copy(h_A_col_indices .data(), op_problem_scaled_.variables .data(), nnz, stream); - raft::copy(h_A_values .data(), op_problem_scaled_.coefficients .data(), nnz, stream); - raft::copy(h_A_t_row_offsets.data(), op_problem_scaled_.reverse_offsets .data(), n_vars + 1, stream); - raft::copy(h_A_t_col_indices.data(), op_problem_scaled_.reverse_constraints .data(), nnz, stream); - raft::copy(h_A_t_values .data(), op_problem_scaled_.reverse_coefficients.data(), nnz, stream); - // Objective coefficients. - std::vector h_obj(n_vars); - raft::copy(h_obj.data(), op_problem_scaled_.objective_coefficients.data(), n_vars, stream); - // Variable bounds: stored interleaved as f_t2 {lower, upper}. Unpack into two host vectors. + raft::copy(h_A_row_offsets .data(), op_problem_scaled_.offsets .data(), n_cstr + 1, stream); + raft::copy(h_A_col_indices .data(), op_problem_scaled_.variables .data(), nnz, stream); + raft::copy(h_A_t_row_offsets.data(), op_problem_scaled_.reverse_offsets .data(), n_vars + 1, stream); + raft::copy(h_A_t_col_indices.data(), op_problem_scaled_.reverse_constraints.data(), nnz, stream); + + // Paired value arrays for A and A_T. 
+ std::vector h_A_values (nnz); + std::vector h_A_values_scaled (nnz); + std::vector h_A_t_values (nnz); + std::vector h_A_t_values_scaled(nnz); + raft::copy(h_A_values .data(), problem_ptr->coefficients .data(), nnz, stream); + raft::copy(h_A_t_values .data(), problem_ptr->reverse_coefficients .data(), nnz, stream); + raft::copy(h_A_values_scaled .data(), op_problem_scaled_.coefficients .data(), nnz, stream); + raft::copy(h_A_t_values_scaled.data(), op_problem_scaled_.reverse_coefficients.data(), nnz, stream); + using f_t2 = typename type_2::type; - std::vector h_var_bounds_packed(n_vars); - raft::copy(h_var_bounds_packed.data(), - op_problem_scaled_.variable_bounds.data(), n_vars, stream); - // Constraint bounds. - std::vector h_cstr_lower(n_cstr); - std::vector h_cstr_upper(n_cstr); - raft::copy(h_cstr_lower.data(), op_problem_scaled_.constraint_lower_bounds.data(), n_cstr, stream); - raft::copy(h_cstr_upper.data(), op_problem_scaled_.constraint_upper_bounds.data(), n_cstr, stream); + + std::vector h_obj (n_vars); + std::vector h_obj_scaled (n_vars); + std::vector h_var_bounds_packed (n_vars); + std::vector h_var_bounds_scaled_packed(n_vars); + std::vector h_cstr_lower (n_cstr); + std::vector h_cstr_upper (n_cstr); + std::vector h_cstr_lower_scaled(n_cstr); + std::vector h_cstr_upper_scaled(n_cstr); + + raft::copy(h_obj .data(), problem_ptr->objective_coefficients.data(), n_vars, stream); + raft::copy(h_obj_scaled .data(), op_problem_scaled_.objective_coefficients.data(), n_vars, stream); + raft::copy(h_var_bounds_packed .data(), problem_ptr->variable_bounds.data(), n_vars, stream); + raft::copy(h_var_bounds_scaled_packed.data(), op_problem_scaled_.variable_bounds.data(), n_vars, stream); + raft::copy(h_cstr_lower .data(), problem_ptr->constraint_lower_bounds.data(), n_cstr, stream); + raft::copy(h_cstr_upper .data(), problem_ptr->constraint_upper_bounds.data(), n_cstr, stream); + raft::copy(h_cstr_lower_scaled .data(), 
op_problem_scaled_.constraint_lower_bounds.data(), n_cstr, stream); + raft::copy(h_cstr_upper_scaled .data(), op_problem_scaled_.constraint_upper_bounds.data(), n_cstr, stream); + + // 5. Get full scaling factors on host + std::vector h_cummulative_cstr_scaling(n_cstr); + std::vector h_cummulative_var_scaling (n_vars); + raft::copy(h_cummulative_cstr_scaling.data(), + initial_scaling_strategy_.get_constraint_matrix_scaling_vector().data(), + n_cstr, stream); + raft::copy(h_cummulative_var_scaling.data(), + initial_scaling_strategy_.get_variable_scaling_vector().data(), + n_vars, stream); + const f_t h_bound_rescaling = initial_scaling_strategy_.get_h_bound_rescaling(); + const f_t h_objective_rescaling = initial_scaling_strategy_.get_h_objective_rescaling(); + op_problem_scaled_.handle_ptr->sync_stream(stream); - - std::vector h_var_lower(n_vars), h_var_upper(n_vars); + + // Unpack interleaved {lower, upper} into separate vectors for both + // versions, so the shard ctor's slicing loop is uniform. + std::vector h_var_lower (n_vars), h_var_upper (n_vars); + std::vector h_var_lower_scaled(n_vars), h_var_upper_scaled(n_vars); for (i_t i = 0; i < n_vars; ++i) { - h_var_lower[i] = h_var_bounds_packed[i].x; - h_var_upper[i] = h_var_bounds_packed[i].y; + h_var_lower[i] = h_var_bounds_packed[i].x; + h_var_upper[i] = h_var_bounds_packed[i].y; + h_var_lower_scaled[i] = h_var_bounds_scaled_packed[i].x; + h_var_upper_scaled[i] = h_var_bounds_scaled_packed[i].y; } - // 5. Build per-rank data and meta-data + + // 6. Build per-rank data and meta-data. std::vector> sub_pdlp_rank_data = partition_loader_t::create_rank_data_from_parts( parts, - h_A_row_offsets, h_A_col_indices, h_A_values, - h_A_t_row_offsets, h_A_t_col_indices, h_A_t_values, + h_A_row_offsets, h_A_col_indices, + h_A_values, h_A_values_scaled, + h_A_t_row_offsets, h_A_t_col_indices, + h_A_t_values, h_A_t_values_scaled, settings.num_gpus, n_cstr, n_vars, nnz); - // 6. 
Build the per-shard PDLP settings: - // - single-GPU mode (num_gpus=1, no partition file) so sub-solvers don't recurse; - // - disable scaling (master already scaled the data we're handing out). - pdlp_solver_settings_t sub_pdlp_settings = settings; - sub_pdlp_settings.num_gpus = 1; - sub_pdlp_settings.multi_gpu_partition_file = ""; - sub_pdlp_settings.hyper_params.default_l_inf_ruiz_iterations = 0; - sub_pdlp_settings.hyper_params.default_alpha_pock_chambolle_rescaling = 0.0; - - // 7. Construct the engine — this collectively bootstraps NCCL across all GPUs - // and constructs one shard per partition with the right slice of host data. + + // 7. Build the per-shard PDLP settings: + pdlp_solver_settings_t sub_pdlp_settings = settings; + sub_pdlp_settings.num_gpus = 1; + sub_pdlp_settings.multi_gpu_partition_file = ""; + sub_pdlp_settings.is_distributed_sub_pdlp = true; + sub_pdlp_settings.hyper_params.default_l_inf_ruiz_iterations = 0; + sub_pdlp_settings.hyper_params.default_alpha_pock_chambolle_rescaling = 0.0; + sub_pdlp_settings.set_initial_step_size (initial_step_size_global); + sub_pdlp_settings.set_initial_primal_weight(initial_primal_weight_global); + + // 8. 
Construct the engine, creates NCCL comms and shards multi_gpu_engine.emplace( std::move(sub_pdlp_rank_data), - h_obj, h_var_lower, h_var_upper, h_cstr_lower, h_cstr_upper, + h_obj, h_var_lower, h_var_upper, h_cstr_lower, h_cstr_upper, + h_obj_scaled, h_var_lower_scaled, h_var_upper_scaled, h_cstr_lower_scaled, h_cstr_upper_scaled, + h_cummulative_cstr_scaling, h_cummulative_var_scaling, + h_bound_rescaling, h_objective_rescaling, op_problem_scaled_.maximize, op_problem_scaled_.objective_offset, op_problem_scaled_.presolve_data.objective_scaling_factor, @@ -2349,9 +2400,13 @@ optimization_problem_solution_t pdlp_solver_t::run_solver(co !settings_.get_initial_primal_weight().has_value()) compute_initial_primal_weight(); - // In multi-GPU mode the master scaled op_problem_scaled_ in its ctor before - // distributing data to the shards, so skip the second scaling pass here. - if (!multi_gpu_engine.has_value()) { + // Skip the in-loop scaling pass in both distributed roles: + // - The master pdlp_solver_t scaled op_problem_scaled_ in its multi-GPU + // ctor before shipping data to the shards (multi_gpu_engine present). + // - Each per-shard pdlp_solver_t received already-scaled + // op_problem_scaled_ + injected scaling state from the master, so it + // must not re-apply scale_problem() (is_distributed_sub_pdlp set). 
+ if (!multi_gpu_engine.has_value() && !settings_.is_distributed_sub_pdlp) { initial_scaling_strategy_.scale_problem(); } diff --git a/cpp/src/pdlp/pdlp.cuh b/cpp/src/pdlp/pdlp.cuh index ef992d2a9e..532f038fbf 100644 --- a/cpp/src/pdlp/pdlp.cuh +++ b/cpp/src/pdlp/pdlp.cuh @@ -33,7 +33,6 @@ #include #include -#include "distributed_pdlp/multi_gpu_engine.hpp" namespace cuopt::linear_programming::detail { /** @@ -108,6 +107,13 @@ class pdlp_solver_t { void compute_initial_step_size(); void compute_initial_primal_weight(); + // Needed by multi-GPU to mutate them + problem_t& get_op_problem_scaled() { return op_problem_scaled_; } + detail::pdlp_initial_scaling_strategy_t& get_initial_scaling_strategy() + { + return initial_scaling_strategy_; + } + private: void print_termination_criteria(const timer_t& timer, bool is_average = false); void print_final_termination_criteria( From b5ebfd2a757e1f35bcb70af97559b1d2082c3451 Mon Sep 17 00:00:00 2001 From: Bulle Mostovoi Date: Wed, 20 May 2026 15:41:59 +0200 Subject: [PATCH 10/67] added pre loop setup need to manage boxing + style too --- .../pdlp/distributed_pdlp/multi_gpu_engine.cu | 118 ++-- .../distributed_pdlp/multi_gpu_engine.hpp | 93 ++-- .../pdlp/distributed_pdlp/partition_loader.cu | 77 ++- cpp/src/pdlp/distributed_pdlp/rank_data.hpp | 101 ++-- cpp/src/pdlp/distributed_pdlp/shard.cu | 128 +++-- cpp/src/pdlp/distributed_pdlp/shard.hpp | 125 +++-- cpp/src/pdlp/pdlp.cu | 521 ++++++++++-------- cpp/src/pdlp/pdlp.cuh | 6 +- 8 files changed, 607 insertions(+), 562 deletions(-) diff --git a/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.cu b/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.cu index 9b404bbd53..fe95b1e5ff 100644 --- a/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.cu +++ b/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.cu @@ -3,45 +3,44 @@ * SPDX-License-Identifier: Apache-2.0 */ - #include +#include + +#include + +#include + +#include + +#include + +namespace cuopt::linear_programming::detail { - #include - - 
#include - - #include - - #include - - namespace cuopt::linear_programming::detail { - template multi_gpu_engine_t::multi_gpu_engine_t( - std::vector>&& rank_data, - std::vector const& h_global_obj, - std::vector const& h_global_var_lower, - std::vector const& h_global_var_upper, - std::vector const& h_global_cstr_lower, - std::vector const& h_global_cstr_upper, - std::vector const& h_global_obj_scaled, - std::vector const& h_global_var_lower_scaled, - std::vector const& h_global_var_upper_scaled, - std::vector const& h_global_cstr_lower_scaled, - std::vector const& h_global_cstr_upper_scaled, - std::vector const& h_global_cummulative_cstr_scaling, - std::vector const& h_global_cummulative_var_scaling, - f_t h_bound_rescaling, - f_t h_objective_rescaling, - bool maximize, - f_t objective_offset, - f_t objective_scaling_factor, - pdlp_solver_settings_t const& sub_solver_settings) + std::vector>&& rank_data, + std::vector const& h_global_obj, + std::vector const& h_global_var_lower, + std::vector const& h_global_var_upper, + std::vector const& h_global_cstr_lower, + std::vector const& h_global_cstr_upper, + std::vector const& h_global_obj_scaled, + std::vector const& h_global_var_lower_scaled, + std::vector const& h_global_var_upper_scaled, + std::vector const& h_global_cstr_lower_scaled, + std::vector const& h_global_cstr_upper_scaled, + std::vector const& h_global_cummulative_cstr_scaling, + std::vector const& h_global_cummulative_var_scaling, + f_t h_bound_rescaling, + f_t h_objective_rescaling, + bool maximize, + f_t objective_offset, + f_t objective_scaling_factor, + pdlp_solver_settings_t const& sub_solver_settings) : stream() { const int nb_parts = static_cast(rank_data.size()); - cuopt_expects(nb_parts > 0, - error_type_t::ValidationError, - "multi_gpu_engine_t: rank_data must be non-empty"); + cuopt_expects( + nb_parts > 0, error_type_t::ValidationError, "multi_gpu_engine_t: rank_data must be non-empty"); shards.reserve(nb_parts); std::vector 
devices(nb_parts); @@ -56,32 +55,31 @@ multi_gpu_engine_t::multi_gpu_engine_t( // 3. Construct one shard per rank, pinned to its device. for (int r = 0; r < nb_parts; ++r) { raft::device_setter guard(devices[r]); // shard ctor needs device set - shards.emplace_back(std::make_unique>( - devices[r], - std::move(rank_data[r]), - raw_comms[r], - h_global_obj, - h_global_var_lower, - h_global_var_upper, - h_global_cstr_lower, - h_global_cstr_upper, - h_global_obj_scaled, - h_global_var_lower_scaled, - h_global_var_upper_scaled, - h_global_cstr_lower_scaled, - h_global_cstr_upper_scaled, - h_global_cummulative_cstr_scaling, - h_global_cummulative_var_scaling, - h_bound_rescaling, - h_objective_rescaling, - maximize, - objective_offset, - objective_scaling_factor, - sub_solver_settings)); + shards.emplace_back(std::make_unique>(devices[r], + std::move(rank_data[r]), + raw_comms[r], + h_global_obj, + h_global_var_lower, + h_global_var_upper, + h_global_cstr_lower, + h_global_cstr_upper, + h_global_obj_scaled, + h_global_var_lower_scaled, + h_global_var_upper_scaled, + h_global_cstr_lower_scaled, + h_global_cstr_upper_scaled, + h_global_cummulative_cstr_scaling, + h_global_cummulative_var_scaling, + h_bound_rescaling, + h_objective_rescaling, + maximize, + objective_offset, + objective_scaling_factor, + sub_solver_settings)); } } - - template struct multi_gpu_engine_t; - // template struct multi_gpu_engine_t; - - } // namespace cuopt::linear_programming::detail \ No newline at end of file + +template struct multi_gpu_engine_t; +// template struct multi_gpu_engine_t; + +} // namespace cuopt::linear_programming::detail diff --git a/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.hpp b/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.hpp index d672e18197..e191a89d60 100644 --- a/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.hpp +++ b/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.hpp @@ -2,53 +2,52 @@ * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. 
All rights reserved. * SPDX-License-Identifier: Apache-2.0 */ - #pragma once - - #include - #include - - #include - - #include - - #include - #include - - namespace cuopt::linear_programming::detail { - +#pragma once + +#include +#include + +#include + +#include + +#include +#include + +namespace cuopt::linear_programming::detail { + template struct multi_gpu_engine_t { // Constructs shards from rank_data - multi_gpu_engine_t( - std::vector>&& rank_data, - std::vector const& h_global_obj, - std::vector const& h_global_var_lower, - std::vector const& h_global_var_upper, - std::vector const& h_global_cstr_lower, - std::vector const& h_global_cstr_upper, - std::vector const& h_global_obj_scaled, - std::vector const& h_global_var_lower_scaled, - std::vector const& h_global_var_upper_scaled, - std::vector const& h_global_cstr_lower_scaled, - std::vector const& h_global_cstr_upper_scaled, - std::vector const& h_global_cummulative_cstr_scaling, - std::vector const& h_global_cummulative_var_scaling, - f_t h_bound_rescaling, - f_t h_objective_rescaling, - bool maximize, - f_t objective_offset, - f_t objective_scaling_factor, - pdlp_solver_settings_t const& sub_solver_settings); - - multi_gpu_engine_t(const multi_gpu_engine_t&) = delete; - multi_gpu_engine_t& operator=(const multi_gpu_engine_t&) = delete; - - // Engine-level stream for fork/join orchestration (master side). - rmm::cuda_stream stream; - - // Shards stored by unique_ptr because pdlp_shard_t is immovable - // (owns device-affine resources: handle, NCCL comm, RMM buffers). 
- std::vector>> shards; - }; - - } // namespace cuopt::linear_programming::detail \ No newline at end of file + multi_gpu_engine_t(std::vector>&& rank_data, + std::vector const& h_global_obj, + std::vector const& h_global_var_lower, + std::vector const& h_global_var_upper, + std::vector const& h_global_cstr_lower, + std::vector const& h_global_cstr_upper, + std::vector const& h_global_obj_scaled, + std::vector const& h_global_var_lower_scaled, + std::vector const& h_global_var_upper_scaled, + std::vector const& h_global_cstr_lower_scaled, + std::vector const& h_global_cstr_upper_scaled, + std::vector const& h_global_cummulative_cstr_scaling, + std::vector const& h_global_cummulative_var_scaling, + f_t h_bound_rescaling, + f_t h_objective_rescaling, + bool maximize, + f_t objective_offset, + f_t objective_scaling_factor, + pdlp_solver_settings_t const& sub_solver_settings); + + multi_gpu_engine_t(const multi_gpu_engine_t&) = delete; + multi_gpu_engine_t& operator=(const multi_gpu_engine_t&) = delete; + + // Engine-level stream for fork/join orchestration (master side). + rmm::cuda_stream stream; + + // Shards stored by unique_ptr because pdlp_shard_t is immovable + // (owns device-affine resources: handle, NCCL comm, RMM buffers). 
+ std::vector>> shards; +}; + +} // namespace cuopt::linear_programming::detail diff --git a/cpp/src/pdlp/distributed_pdlp/partition_loader.cu b/cpp/src/pdlp/distributed_pdlp/partition_loader.cu index 047fb536d5..6c96e0b63d 100644 --- a/cpp/src/pdlp/distributed_pdlp/partition_loader.cu +++ b/cpp/src/pdlp/distributed_pdlp/partition_loader.cu @@ -18,9 +18,8 @@ std::vector partition_loader_t::parse_distributed_pdlp_partition_ std::string const& file) { std::ifstream part_file(file); - cuopt_expects(part_file.is_open(), - error_type_t::ValidationError, - "Failed to open partition file: " + file); + cuopt_expects( + part_file.is_open(), error_type_t::ValidationError, "Failed to open partition file: " + file); // One integer per line; operator>> skips whitespace so blank lines and // trailing newlines are tolerated. @@ -39,8 +38,7 @@ std::vector partition_loader_t::parse_distributed_pdlp_partition_ } template -std::vector> -partition_loader_t::create_rank_data_from_parts( +std::vector> partition_loader_t::create_rank_data_from_parts( const std::vector& parts, const std::vector& A_row_offsets, const std::vector& A_col_indices, @@ -76,7 +74,7 @@ partition_loader_t::create_rank_data_from_parts( // 2. Compute local matrices and rank_data for (i_t rank = 0; rank < nb_parts; rank++) { - auto& rd = rank_data[rank]; + auto& rd = rank_data[rank]; rd.owned_var_size = rd.owned_var_indices.size(); rd.owned_cstr_size = rd.owned_cstr_indices.size(); // ---- A side ---- @@ -93,11 +91,11 @@ partition_loader_t::create_rank_data_from_parts( // local arrays share identical (offsets, col_indices) and differ only // in values. 
for (auto owned_cstr : rd.owned_cstr_indices) { - i_t cstr_len = A_row_offsets[owned_cstr + 1] - A_row_offsets[owned_cstr]; + i_t cstr_len = A_row_offsets[owned_cstr + 1] - A_row_offsets[owned_cstr]; i_t row_start = A_row_offsets[owned_cstr]; for (i_t v = 0; v < cstr_len; v++) { local_A_col_indices.push_back(A_col_indices[row_start + v]); - local_A_values .push_back(A_values [row_start + v]); + local_A_values.push_back(A_values[row_start + v]); local_A_values_scaled.push_back(A_values_scaled[row_start + v]); } local_A_nnz += cstr_len; @@ -106,29 +104,25 @@ partition_loader_t::create_rank_data_from_parts( std::set needed_vars; for (auto indice : local_A_col_indices) { - if (var_parts[indice] != rank) - needed_vars.insert(indice); + if (var_parts[indice] != rank) needed_vars.insert(indice); } for (i_t peer = 0; peer < nb_parts; peer++) { std::vector needed_var_from_peer; for (auto needed_var : needed_vars) { - if (var_parts[needed_var] == peer) - needed_var_from_peer.push_back(needed_var); + if (var_parts[needed_var] == peer) needed_var_from_peer.push_back(needed_var); } - i_t nb_recv_from_peer = needed_var_from_peer.size(); + i_t nb_recv_from_peer = needed_var_from_peer.size(); rd.var_recv_counts[peer] = nb_recv_from_peer; rd.var_recv_offsets[peer] = - peer == 0 - ? 0 - : rd.var_recv_offsets[peer - 1] + rd.var_recv_counts[peer - 1]; + peer == 0 ? 
0 : rd.var_recv_offsets[peer - 1] + rd.var_recv_counts[peer - 1]; rank_data[peer].var_send_per_peer[rank] = std::move(needed_var_from_peer); } - rd.h_A_row_offsets = std::move(local_A_row_offsets); - rd.h_A_col_indices = std::move(local_A_col_indices); - rd.h_A_values = std::move(local_A_values); - rd.h_A_values_scaled = std::move(local_A_values_scaled); + rd.h_A_row_offsets = std::move(local_A_row_offsets); + rd.h_A_col_indices = std::move(local_A_col_indices); + rd.h_A_values = std::move(local_A_values); + rd.h_A_values_scaled = std::move(local_A_values_scaled); // ---- A_t side ---- std::vector local_A_t_row_offsets; @@ -139,11 +133,11 @@ partition_loader_t::create_rank_data_from_parts( local_A_t_row_offsets.push_back(local_A_t_nnz); for (auto owned_var : rd.owned_var_indices) { - i_t var_len = A_t_row_offsets[owned_var + 1] - A_t_row_offsets[owned_var]; + i_t var_len = A_t_row_offsets[owned_var + 1] - A_t_row_offsets[owned_var]; i_t row_start = A_t_row_offsets[owned_var]; for (i_t v = 0; v < var_len; v++) { - local_A_t_col_indices .push_back(A_t_col_indices [row_start + v]); - local_A_t_values .push_back(A_t_values [row_start + v]); + local_A_t_col_indices.push_back(A_t_col_indices[row_start + v]); + local_A_t_values.push_back(A_t_values[row_start + v]); local_A_t_values_scaled.push_back(A_t_values_scaled[row_start + v]); } local_A_t_nnz += var_len; @@ -152,31 +146,27 @@ partition_loader_t::create_rank_data_from_parts( std::set needed_cstrs; for (auto indice : local_A_t_col_indices) { - if (cstr_parts[indice] != rank) - needed_cstrs.insert(indice); + if (cstr_parts[indice] != rank) needed_cstrs.insert(indice); } for (i_t peer = 0; peer < nb_parts; peer++) { std::vector needed_cstr_from_peer; for (auto needed_cstr : needed_cstrs) { - if (cstr_parts[needed_cstr] == peer) - needed_cstr_from_peer.push_back(needed_cstr); + if (cstr_parts[needed_cstr] == peer) needed_cstr_from_peer.push_back(needed_cstr); } - i_t nb_recv_from_peer = needed_cstr_from_peer.size(); + 
i_t nb_recv_from_peer = needed_cstr_from_peer.size(); rd.cstr_recv_counts[peer] = nb_recv_from_peer; rd.cstr_recv_offsets[peer] = - peer == 0 - ? 0 - : rd.cstr_recv_offsets[peer - 1] + rd.cstr_recv_counts[peer - 1]; + peer == 0 ? 0 : rd.cstr_recv_offsets[peer - 1] + rd.cstr_recv_counts[peer - 1]; rank_data[peer].cstr_send_per_peer[rank] = std::move(needed_cstr_from_peer); } - rd.h_A_t_row_offsets = std::move(local_A_t_row_offsets); - rd.h_A_t_col_indices = std::move(local_A_t_col_indices); - rd.h_A_t_values = std::move(local_A_t_values); - rd.h_A_t_values_scaled = std::move(local_A_t_values_scaled); + rd.h_A_t_row_offsets = std::move(local_A_t_row_offsets); + rd.h_A_t_col_indices = std::move(local_A_t_col_indices); + rd.h_A_t_values = std::move(local_A_t_values); + rd.h_A_t_values_scaled = std::move(local_A_t_values_scaled); - rd.total_var_size = rd.owned_var_size + needed_vars.size(); + rd.total_var_size = rd.owned_var_size + needed_vars.size(); rd.total_cstr_size = rd.owned_cstr_size + needed_cstrs.size(); } @@ -195,7 +185,8 @@ partition_loader_t::create_rank_data_from_parts( if (peer == rank) continue; for (auto recv_cstr : rank_data[peer].cstr_send_per_peer[rank]) { rd.global_to_local_cstr[recv_cstr] = curr_id; - // rd.local_to_global_cstr.push_back(recv_cstr); // Not needed, we only do local_to_global on owned side + // rd.local_to_global_cstr.push_back(recv_cstr); // Not needed, we only do local_to_global + // on owned side curr_id++; } } @@ -221,14 +212,18 @@ partition_loader_t::create_rank_data_from_parts( auto& rd = rank_data[rank]; for (auto& send_vec : rd.var_send_per_peer) { - for (auto& v : send_vec) v = rd.global_to_local_var.at(v); + for (auto& v : send_vec) + v = rd.global_to_local_var.at(v); } for (auto& send_vec : rd.cstr_send_per_peer) { - for (auto& v : send_vec) v = rd.global_to_local_cstr.at(v); + for (auto& v : send_vec) + v = rd.global_to_local_cstr.at(v); } - for (auto& v : rd.h_A_col_indices) v = rd.global_to_local_var.at(v); - for (auto& 
v : rd.h_A_t_col_indices) v = rd.global_to_local_cstr.at(v); + for (auto& v : rd.h_A_col_indices) + v = rd.global_to_local_var.at(v); + for (auto& v : rd.h_A_t_col_indices) + v = rd.global_to_local_cstr.at(v); } return rank_data; diff --git a/cpp/src/pdlp/distributed_pdlp/rank_data.hpp b/cpp/src/pdlp/distributed_pdlp/rank_data.hpp index 29d76ae110..d52d277116 100644 --- a/cpp/src/pdlp/distributed_pdlp/rank_data.hpp +++ b/cpp/src/pdlp/distributed_pdlp/rank_data.hpp @@ -1,54 +1,61 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + */ + #pragma once -#include #include +#include namespace cuopt::linear_programming::detail { template struct rank_data_t { - rank_data_t(std::size_t nb_parts) - : var_send_per_peer(nb_parts), - cstr_send_per_peer(nb_parts), - var_recv_counts(nb_parts, 0), - var_recv_offsets(nb_parts, 0), - cstr_recv_counts(nb_parts, 0), - cstr_recv_offsets(nb_parts, 0) {} - - i_t owned_var_size{0}; - i_t total_var_size{0}; - i_t owned_cstr_size{0}; - i_t total_cstr_size{0}; - - // === Ownership === - std::vector owned_var_indices; - std::vector owned_cstr_indices; - - // === Send plan: per peer, indices to gather + send === - std::vector> var_send_per_peer; - std::vector> cstr_send_per_peer; - - // === Recv plan: per peer, contiguous slot in halo region === - std::vector var_recv_counts; - std::vector var_recv_offsets; - std::vector cstr_recv_counts; - std::vector cstr_recv_offsets; - - // === Mappings === - std::unordered_map global_to_local_var; - std::unordered_map global_to_local_cstr; - std::vector local_to_global_var; - std::vector local_to_global_cstr; - - // === Local host CSR matrices === - // A - std::vector h_A_row_offsets; - std::vector h_A_col_indices; - std::vector h_A_values; - std::vector h_A_values_scaled; - // A_t - std::vector h_A_t_row_offsets; - std::vector h_A_t_col_indices; - std::vector h_A_t_values; - std::vector h_A_t_values_scaled; 
- }; -} // namespace cuopt::linear_programming::detail \ No newline at end of file + rank_data_t(std::size_t nb_parts) + : var_send_per_peer(nb_parts), + cstr_send_per_peer(nb_parts), + var_recv_counts(nb_parts, 0), + var_recv_offsets(nb_parts, 0), + cstr_recv_counts(nb_parts, 0), + cstr_recv_offsets(nb_parts, 0) + { + } + + i_t owned_var_size{0}; + i_t total_var_size{0}; + i_t owned_cstr_size{0}; + i_t total_cstr_size{0}; + + // === Ownership === + std::vector owned_var_indices; + std::vector owned_cstr_indices; + + // === Send plan: per peer, indices to gather + send === + std::vector> var_send_per_peer; + std::vector> cstr_send_per_peer; + + // === Recv plan: per peer, contiguous slot in halo region === + std::vector var_recv_counts; + std::vector var_recv_offsets; + std::vector cstr_recv_counts; + std::vector cstr_recv_offsets; + + // === Mappings === + std::unordered_map global_to_local_var; + std::unordered_map global_to_local_cstr; + std::vector local_to_global_var; + std::vector local_to_global_cstr; + + // === Local host CSR matrices === + // A + std::vector h_A_row_offsets; + std::vector h_A_col_indices; + std::vector h_A_values; + std::vector h_A_values_scaled; + // A_t + std::vector h_A_t_row_offsets; + std::vector h_A_t_col_indices; + std::vector h_A_t_values; + std::vector h_A_t_values_scaled; +}; +} // namespace cuopt::linear_programming::detail diff --git a/cpp/src/pdlp/distributed_pdlp/shard.cu b/cpp/src/pdlp/distributed_pdlp/shard.cu index 41f74086ab..596a08a3dc 100644 --- a/cpp/src/pdlp/distributed_pdlp/shard.cu +++ b/cpp/src/pdlp/distributed_pdlp/shard.cu @@ -22,28 +22,27 @@ template pdlp_shard_t::~pdlp_shard_t() = default; template -pdlp_shard_t::pdlp_shard_t( - int device_id, - rank_data_t&& rd, - ncclComm_t raw_comm, - std::vector const& h_global_obj, - std::vector const& h_global_var_lower, - std::vector const& h_global_var_upper, - std::vector const& h_global_cstr_lower, - std::vector const& h_global_cstr_upper, - std::vector const& 
h_global_obj_scaled, - std::vector const& h_global_var_lower_scaled, - std::vector const& h_global_var_upper_scaled, - std::vector const& h_global_cstr_lower_scaled, - std::vector const& h_global_cstr_upper_scaled, - std::vector const& h_global_cummulative_cstr_scaling, - std::vector const& h_global_cummulative_var_scaling, - f_t h_bound_rescaling, - f_t h_objective_rescaling, - bool maximize, - f_t objective_offset, - f_t objective_scaling_factor, - pdlp_solver_settings_t const& settings) +pdlp_shard_t::pdlp_shard_t(int device_id, + rank_data_t&& rd, + ncclComm_t raw_comm, + std::vector const& h_global_obj, + std::vector const& h_global_var_lower, + std::vector const& h_global_var_upper, + std::vector const& h_global_cstr_lower, + std::vector const& h_global_cstr_upper, + std::vector const& h_global_obj_scaled, + std::vector const& h_global_var_lower_scaled, + std::vector const& h_global_var_upper_scaled, + std::vector const& h_global_cstr_lower_scaled, + std::vector const& h_global_cstr_upper_scaled, + std::vector const& h_global_cummulative_cstr_scaling, + std::vector const& h_global_cummulative_var_scaling, + f_t h_bound_rescaling, + f_t h_objective_rescaling, + bool maximize, + f_t objective_offset, + f_t objective_scaling_factor, + pdlp_solver_settings_t const& settings) : device_id(device_id), stream(), handle(stream.view()), @@ -53,22 +52,27 @@ pdlp_shard_t::pdlp_shard_t( sub_problem(std::nullopt), sub_pdlp(nullptr) { - assert(raft::device_setter::get_current_device() == device_id && "Right device must be set before building the shard"); + assert(raft::device_setter::get_current_device() == device_id && + "Right device must be set before building the shard"); // ---- 1. Gather per-shard host slices using rank_data's index maps. ---- // All vectors are sized to TOTAL (owned + halo). 
Owned slots get real // values; halo slots keep defaults because they should not be accessed - std::vector h_obj (rank_data.total_var_size, f_t{0}); - std::vector h_var_lower (rank_data.total_var_size, -std::numeric_limits::infinity()); - std::vector h_var_upper (rank_data.total_var_size, std::numeric_limits::infinity()); - std::vector h_cstr_lower (rank_data.total_cstr_size, -std::numeric_limits::infinity()); - std::vector h_cstr_upper (rank_data.total_cstr_size, std::numeric_limits::infinity()); - - std::vector h_obj_scaled (rank_data.total_var_size, f_t{0}); - std::vector h_var_lower_scaled (rank_data.total_var_size, -std::numeric_limits::infinity()); - std::vector h_var_upper_scaled (rank_data.total_var_size, std::numeric_limits::infinity()); - std::vector h_cstr_lower_scaled(rank_data.total_cstr_size, -std::numeric_limits::infinity()); - std::vector h_cstr_upper_scaled(rank_data.total_cstr_size, std::numeric_limits::infinity()); + std::vector h_obj(rank_data.total_var_size, f_t{0}); + std::vector h_var_lower(rank_data.total_var_size, -std::numeric_limits::infinity()); + std::vector h_var_upper(rank_data.total_var_size, std::numeric_limits::infinity()); + std::vector h_cstr_lower(rank_data.total_cstr_size, -std::numeric_limits::infinity()); + std::vector h_cstr_upper(rank_data.total_cstr_size, std::numeric_limits::infinity()); + + std::vector h_obj_scaled(rank_data.total_var_size, f_t{0}); + std::vector h_var_lower_scaled(rank_data.total_var_size, + -std::numeric_limits::infinity()); + std::vector h_var_upper_scaled(rank_data.total_var_size, + std::numeric_limits::infinity()); + std::vector h_cstr_lower_scaled(rank_data.total_cstr_size, + -std::numeric_limits::infinity()); + std::vector h_cstr_upper_scaled(rank_data.total_cstr_size, + std::numeric_limits::infinity()); for (i_t i = 0; i < rank_data.owned_var_size; ++i) { const auto g = rank_data.local_to_global_var[i]; @@ -89,7 +93,7 @@ pdlp_shard_t::pdlp_shard_t( // Get local scaling factors std::vector 
h_cstr_scaling_local(rank_data.total_cstr_size, f_t{1}); - std::vector h_var_scaling_local (rank_data.total_var_size, f_t{1}); + std::vector h_var_scaling_local(rank_data.total_var_size, f_t{1}); for (i_t i = 0; i < rank_data.owned_cstr_size; ++i) { h_cstr_scaling_local[i] = h_global_cummulative_cstr_scaling[rank_data.local_to_global_cstr[i]]; } @@ -99,15 +103,17 @@ pdlp_shard_t::pdlp_shard_t( // ---- 2. Build optimization_problem_t on this shard's device (UNSCALED). ---- opt_problem.emplace(&handle); - opt_problem->set_csr_constraint_matrix( - rank_data.h_A_values .data(), static_cast(rank_data.h_A_values .size()), - rank_data.h_A_col_indices.data(), static_cast(rank_data.h_A_col_indices.size()), - rank_data.h_A_row_offsets.data(), static_cast(rank_data.h_A_row_offsets.size())); + opt_problem->set_csr_constraint_matrix(rank_data.h_A_values.data(), + static_cast(rank_data.h_A_values.size()), + rank_data.h_A_col_indices.data(), + static_cast(rank_data.h_A_col_indices.size()), + rank_data.h_A_row_offsets.data(), + static_cast(rank_data.h_A_row_offsets.size())); // Primal axis: TOTAL (owned + halo). Halo slots have neutral defaults. - opt_problem->set_objective_coefficients(h_obj .data(), rank_data.total_var_size); - opt_problem->set_variable_lower_bounds (h_var_lower.data(), rank_data.total_var_size); - opt_problem->set_variable_upper_bounds (h_var_upper.data(), rank_data.total_var_size); + opt_problem->set_objective_coefficients(h_obj.data(), rank_data.total_var_size); + opt_problem->set_variable_lower_bounds(h_var_lower.data(), rank_data.total_var_size); + opt_problem->set_variable_upper_bounds(h_var_upper.data(), rank_data.total_var_size); // Dual axis: TOTAL (owned + halo). Halo slots have ±inf so trivially satisfied. 
opt_problem->set_constraint_lower_bounds(h_cstr_lower.data(), rank_data.total_cstr_size); @@ -126,18 +132,21 @@ pdlp_shard_t::pdlp_shard_t( // in multi-GPU: A_local is owned_cstr x total_var, and A_t_local is the // pre-sliced owned_var x total_cstr matrix we built during partitioning. auto stream_view = handle.get_stream(); - sub_problem->reverse_offsets .resize(rank_data.h_A_t_row_offsets.size(), stream_view); - sub_problem->reverse_constraints .resize(rank_data.h_A_t_col_indices.size(), stream_view); - sub_problem->reverse_coefficients.resize(rank_data.h_A_t_values .size(), stream_view); + sub_problem->reverse_offsets.resize(rank_data.h_A_t_row_offsets.size(), stream_view); + sub_problem->reverse_constraints.resize(rank_data.h_A_t_col_indices.size(), stream_view); + sub_problem->reverse_coefficients.resize(rank_data.h_A_t_values.size(), stream_view); raft::copy(sub_problem->reverse_offsets.data(), rank_data.h_A_t_row_offsets.data(), - rank_data.h_A_t_row_offsets.size(), stream_view); + rank_data.h_A_t_row_offsets.size(), + stream_view); raft::copy(sub_problem->reverse_constraints.data(), rank_data.h_A_t_col_indices.data(), - rank_data.h_A_t_col_indices.size(), stream_view); + rank_data.h_A_t_col_indices.size(), + stream_view); raft::copy(sub_problem->reverse_coefficients.data(), rank_data.h_A_t_values.data(), - rank_data.h_A_t_values.size(), stream_view); + rank_data.h_A_t_values.size(), + stream_view); handle.sync_stream(stream_view); // ---- 5. Build sub_pdlp (single-GPU mode; multi_gpu flags cleared by caller). 
---- @@ -150,16 +159,22 @@ pdlp_shard_t::pdlp_shard_t( auto& scaled = sub_pdlp->get_op_problem_scaled(); raft::copy(scaled.coefficients.data(), rank_data.h_A_values_scaled.data(), - rank_data.h_A_values_scaled.size(), stream_view); + rank_data.h_A_values_scaled.size(), + stream_view); raft::copy(scaled.reverse_coefficients.data(), rank_data.h_A_t_values_scaled.data(), - rank_data.h_A_t_values_scaled.size(), stream_view); - raft::copy(scaled.objective_coefficients.data(), - h_obj_scaled.data(), h_obj_scaled.size(), stream_view); + rank_data.h_A_t_values_scaled.size(), + stream_view); + raft::copy( + scaled.objective_coefficients.data(), h_obj_scaled.data(), h_obj_scaled.size(), stream_view); raft::copy(scaled.constraint_lower_bounds.data(), - h_cstr_lower_scaled.data(), h_cstr_lower_scaled.size(), stream_view); + h_cstr_lower_scaled.data(), + h_cstr_lower_scaled.size(), + stream_view); raft::copy(scaled.constraint_upper_bounds.data(), - h_cstr_upper_scaled.data(), h_cstr_upper_scaled.size(), stream_view); + h_cstr_upper_scaled.data(), + h_cstr_upper_scaled.size(), + stream_view); using f_t2 = typename type_2::type; std::vector h_var_bounds_scaled_packed(rank_data.total_var_size); @@ -169,14 +184,15 @@ pdlp_shard_t::pdlp_shard_t( } raft::copy(scaled.variable_bounds.data(), h_var_bounds_scaled_packed.data(), - h_var_bounds_scaled_packed.size(), stream_view); + h_var_bounds_scaled_packed.size(), + stream_view); combine_constraint_bounds(scaled, scaled.combined_bounds); // Inject master-scaled buffers inside sub_pdlp.initil_strategy auto& scaling = sub_pdlp->get_initial_scaling_strategy(); scaling.set_cummulative_scaling(h_cstr_scaling_local, h_var_scaling_local); - scaling.set_h_bound_rescaling (h_bound_rescaling); + scaling.set_h_bound_rescaling(h_bound_rescaling); scaling.set_h_objective_rescaling(h_objective_rescaling); handle.sync_stream(stream_view); diff --git a/cpp/src/pdlp/distributed_pdlp/shard.hpp b/cpp/src/pdlp/distributed_pdlp/shard.hpp index 
3c10a90f90..a5ff89c5c4 100644 --- a/cpp/src/pdlp/distributed_pdlp/shard.hpp +++ b/cpp/src/pdlp/distributed_pdlp/shard.hpp @@ -2,49 +2,49 @@ * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: Apache-2.0 */ - #pragma once +#pragma once + +#include + +#include +#include +#include + +#include +#include +#include + +#include + +#include +#include +#include + +namespace cuopt::linear_programming::detail { + +// Forward-declare to break the cyclic include with pdlp.cuh +// (pdlp.cuh -> multi_gpu_engine.hpp -> shard.hpp -> pdlp.cuh). +// Definitions of out-of-line members live in shard.cu, which includes pdlp.cuh. +template +class pdlp_solver_t; + +// RAII deleter for ncclComm_t; sets the right device before destroy. +struct nccl_comm_deleter_t { + int device_id{-1}; + void operator()(ncclComm* comm) const noexcept + { + if (comm == nullptr) return; + raft::device_setter guard(device_id); + ncclCommDestroy(comm); + } +}; +using nccl_comm_unique_ptr_t = std::unique_ptr; + +template +struct pdlp_shard_t { + // Out-of-line (in shard.cu) because pdlp_solver_t is incomplete here. + ~pdlp_shard_t(); - #include - - #include - #include - #include - - #include - #include - #include - - #include - - #include - #include - #include - - namespace cuopt::linear_programming::detail { - - // Forward-declare to break the cyclic include with pdlp.cuh - // (pdlp.cuh -> multi_gpu_engine.hpp -> shard.hpp -> pdlp.cuh). - // Definitions of out-of-line members live in shard.cu, which includes pdlp.cuh. - template - class pdlp_solver_t; - - // RAII deleter for ncclComm_t; sets the right device before destroy. 
- struct nccl_comm_deleter_t { - int device_id{-1}; - void operator()(ncclComm* comm) const noexcept - { - if (comm == nullptr) return; - raft::device_setter guard(device_id); - ncclCommDestroy(comm); - } - }; - using nccl_comm_unique_ptr_t = std::unique_ptr; - - template - struct pdlp_shard_t { - // Out-of-line (in shard.cu) because pdlp_solver_t is incomplete here. - ~pdlp_shard_t(); - // sub worker for distributed pdlp. Owns its own view on scaled problem and unscaled problem // Owns necessary multi-gpu data (rank_data, device_id, nccl_comm) pdlp_shard_t(int device_id, @@ -65,25 +65,24 @@ f_t h_bound_rescaling, f_t h_objective_rescaling, bool maximize, - f_t objective_offset, - f_t objective_scaling_factor, + f_t objective_offset, + f_t objective_scaling_factor, pdlp_solver_settings_t const& settings); - - pdlp_shard_t(const pdlp_shard_t&) = delete; - pdlp_shard_t& operator=(const pdlp_shard_t&) = delete; - // Move ops are implicitly deleted (user-declared dtor + deleted copy). - // Intentional: shard owns device-affine resources and must never move. - // Store as std::unique_ptr in any container. - - int device_id; - rmm::cuda_stream stream; - raft::handle_t handle; - nccl_comm_unique_ptr_t comm; - rank_data_t rank_data; - std::optional> opt_problem; - std::optional> sub_problem; - std::unique_ptr> sub_pdlp; - }; - - } // namespace cuopt::linear_programming::detail - \ No newline at end of file + + pdlp_shard_t(const pdlp_shard_t&) = delete; + pdlp_shard_t& operator=(const pdlp_shard_t&) = delete; + // Move ops are implicitly deleted (user-declared dtor + deleted copy). + // Intentional: shard owns device-affine resources and must never move. + // Store as std::unique_ptr in any container. 
+ + int device_id; + rmm::cuda_stream stream; + raft::handle_t handle; + nccl_comm_unique_ptr_t comm; + rank_data_t rank_data; + std::optional> opt_problem; + std::optional> sub_problem; + std::unique_ptr> sub_pdlp; +}; + +} // namespace cuopt::linear_programming::detail diff --git a/cpp/src/pdlp/pdlp.cu b/cpp/src/pdlp/pdlp.cu index 612eb676ec..2a36c160fd 100644 --- a/cpp/src/pdlp/pdlp.cu +++ b/cpp/src/pdlp/pdlp.cu @@ -333,23 +333,28 @@ pdlp_solver_t::pdlp_solver_t(problem_t& op_problem, parts = partition_loader_t::parse_distributed_pdlp_partition_file( settings.multi_gpu_partition_file); } else { - cuopt_expects(false, error_type_t::NotImplemented, + cuopt_expects(false, + error_type_t::NotImplemented, "Metis partitioning inside cuopt not implemented yet; " "provide a --parts file via settings.multi_gpu_partition_file"); } - // always compute initial step size before scaling and primal_weight after scaling to do like cuPDLPx - assert(settings_.hyper_params.compute_initial_primal_weight_before_scaling && "compute_initial_primal_weight_before_scaling must be true in distributed mode"); - assert(!settings_.hyper_params.compute_initial_step_size_before_scaling && "compute_initial_step_size_before_scaling must be false in distributed mode"); - + // always compute initial step size before scaling and primal_weight after scaling to do like + // cuPDLPx + assert(settings_.hyper_params.compute_initial_primal_weight_before_scaling && + "compute_initial_primal_weight_before_scaling must be true in distributed mode"); + assert(!settings_.hyper_params.compute_initial_step_size_before_scaling && + "compute_initial_step_size_before_scaling must be false in distributed mode"); + compute_initial_primal_weight(); - + // scale globally before dispatching to shards initial_scaling_strategy_.scale_problem(); - + compute_initial_step_size(); + step_size_strategy_.get_primal_and_dual_stepsizes(primal_step_size_, dual_step_size_); - const f_t initial_step_size_global = 
get_step_size_h(0); + const f_t initial_step_size_global = get_step_size_h(0); const f_t initial_primal_weight_global = get_primal_weight_h(0); // 4. Copy both scaled and unscaled pb @@ -359,54 +364,61 @@ pdlp_solver_t::pdlp_solver_t(problem_t& op_problem, i_t const nnz = op_problem_scaled_.nnz; // Shared topology (taken from the scaled problem, but identical on both). - std::vector h_A_row_offsets (n_cstr + 1); - std::vector h_A_col_indices (nnz); + std::vector h_A_row_offsets(n_cstr + 1); + std::vector h_A_col_indices(nnz); std::vector h_A_t_row_offsets(n_vars + 1); std::vector h_A_t_col_indices(nnz); - raft::copy(h_A_row_offsets .data(), op_problem_scaled_.offsets .data(), n_cstr + 1, stream); - raft::copy(h_A_col_indices .data(), op_problem_scaled_.variables .data(), nnz, stream); - raft::copy(h_A_t_row_offsets.data(), op_problem_scaled_.reverse_offsets .data(), n_vars + 1, stream); - raft::copy(h_A_t_col_indices.data(), op_problem_scaled_.reverse_constraints.data(), nnz, stream); + raft::copy(h_A_row_offsets.data(), op_problem_scaled_.offsets.data(), n_cstr + 1, stream); + raft::copy(h_A_col_indices.data(), op_problem_scaled_.variables.data(), nnz, stream); + raft::copy( + h_A_t_row_offsets.data(), op_problem_scaled_.reverse_offsets.data(), n_vars + 1, stream); + raft::copy(h_A_t_col_indices.data(), op_problem_scaled_.reverse_constraints.data(), nnz, stream); // Paired value arrays for A and A_T. 
- std::vector h_A_values (nnz); - std::vector h_A_values_scaled (nnz); - std::vector h_A_t_values (nnz); + std::vector h_A_values(nnz); + std::vector h_A_values_scaled(nnz); + std::vector h_A_t_values(nnz); std::vector h_A_t_values_scaled(nnz); - raft::copy(h_A_values .data(), problem_ptr->coefficients .data(), nnz, stream); - raft::copy(h_A_t_values .data(), problem_ptr->reverse_coefficients .data(), nnz, stream); - raft::copy(h_A_values_scaled .data(), op_problem_scaled_.coefficients .data(), nnz, stream); - raft::copy(h_A_t_values_scaled.data(), op_problem_scaled_.reverse_coefficients.data(), nnz, stream); + raft::copy(h_A_values.data(), problem_ptr->coefficients.data(), nnz, stream); + raft::copy(h_A_t_values.data(), problem_ptr->reverse_coefficients.data(), nnz, stream); + raft::copy(h_A_values_scaled.data(), op_problem_scaled_.coefficients.data(), nnz, stream); + raft::copy( + h_A_t_values_scaled.data(), op_problem_scaled_.reverse_coefficients.data(), nnz, stream); using f_t2 = typename type_2::type; - std::vector h_obj (n_vars); - std::vector h_obj_scaled (n_vars); - std::vector h_var_bounds_packed (n_vars); + std::vector h_obj(n_vars); + std::vector h_obj_scaled(n_vars); + std::vector h_var_bounds_packed(n_vars); std::vector h_var_bounds_scaled_packed(n_vars); - std::vector h_cstr_lower (n_cstr); - std::vector h_cstr_upper (n_cstr); - std::vector h_cstr_lower_scaled(n_cstr); - std::vector h_cstr_upper_scaled(n_cstr); - - raft::copy(h_obj .data(), problem_ptr->objective_coefficients.data(), n_vars, stream); - raft::copy(h_obj_scaled .data(), op_problem_scaled_.objective_coefficients.data(), n_vars, stream); - raft::copy(h_var_bounds_packed .data(), problem_ptr->variable_bounds.data(), n_vars, stream); - raft::copy(h_var_bounds_scaled_packed.data(), op_problem_scaled_.variable_bounds.data(), n_vars, stream); - raft::copy(h_cstr_lower .data(), problem_ptr->constraint_lower_bounds.data(), n_cstr, stream); - raft::copy(h_cstr_upper .data(), 
problem_ptr->constraint_upper_bounds.data(), n_cstr, stream); - raft::copy(h_cstr_lower_scaled .data(), op_problem_scaled_.constraint_lower_bounds.data(), n_cstr, stream); - raft::copy(h_cstr_upper_scaled .data(), op_problem_scaled_.constraint_upper_bounds.data(), n_cstr, stream); + std::vector h_cstr_lower(n_cstr); + std::vector h_cstr_upper(n_cstr); + std::vector h_cstr_lower_scaled(n_cstr); + std::vector h_cstr_upper_scaled(n_cstr); + + raft::copy(h_obj.data(), problem_ptr->objective_coefficients.data(), n_vars, stream); + raft::copy(h_obj_scaled.data(), op_problem_scaled_.objective_coefficients.data(), n_vars, stream); + raft::copy(h_var_bounds_packed.data(), problem_ptr->variable_bounds.data(), n_vars, stream); + raft::copy( + h_var_bounds_scaled_packed.data(), op_problem_scaled_.variable_bounds.data(), n_vars, stream); + raft::copy(h_cstr_lower.data(), problem_ptr->constraint_lower_bounds.data(), n_cstr, stream); + raft::copy(h_cstr_upper.data(), problem_ptr->constraint_upper_bounds.data(), n_cstr, stream); + raft::copy( + h_cstr_lower_scaled.data(), op_problem_scaled_.constraint_lower_bounds.data(), n_cstr, stream); + raft::copy( + h_cstr_upper_scaled.data(), op_problem_scaled_.constraint_upper_bounds.data(), n_cstr, stream); // 5. 
Get full scaling factors on host std::vector h_cummulative_cstr_scaling(n_cstr); - std::vector h_cummulative_var_scaling (n_vars); + std::vector h_cummulative_var_scaling(n_vars); raft::copy(h_cummulative_cstr_scaling.data(), initial_scaling_strategy_.get_constraint_matrix_scaling_vector().data(), - n_cstr, stream); + n_cstr, + stream); raft::copy(h_cummulative_var_scaling.data(), initial_scaling_strategy_.get_variable_scaling_vector().data(), - n_vars, stream); + n_vars, + stream); const f_t h_bound_rescaling = initial_scaling_strategy_.get_h_bound_rescaling(); const f_t h_objective_rescaling = initial_scaling_strategy_.get_h_objective_rescaling(); @@ -414,7 +426,7 @@ pdlp_solver_t::pdlp_solver_t(problem_t& op_problem, // Unpack interleaved {lower, upper} into separate vectors for both // versions, so the shard ctor's slicing loop is uniform. - std::vector h_var_lower (n_vars), h_var_upper (n_vars); + std::vector h_var_lower(n_vars), h_var_upper(n_vars); std::vector h_var_lower_scaled(n_vars), h_var_upper_scaled(n_vars); for (i_t i = 0; i < n_vars; ++i) { h_var_lower[i] = h_var_bounds_packed[i].x; @@ -425,35 +437,58 @@ pdlp_solver_t::pdlp_solver_t(problem_t& op_problem, // 6. Build per-rank data and meta-data. std::vector> sub_pdlp_rank_data = - partition_loader_t::create_rank_data_from_parts( - parts, - h_A_row_offsets, h_A_col_indices, - h_A_values, h_A_values_scaled, - h_A_t_row_offsets, h_A_t_col_indices, - h_A_t_values, h_A_t_values_scaled, - settings.num_gpus, n_cstr, n_vars, nnz); + partition_loader_t::create_rank_data_from_parts(parts, + h_A_row_offsets, + h_A_col_indices, + h_A_values, + h_A_values_scaled, + h_A_t_row_offsets, + h_A_t_col_indices, + h_A_t_values, + h_A_t_values_scaled, + settings.num_gpus, + n_cstr, + n_vars, + nnz); // 7. 
Build the per-shard PDLP settings: - pdlp_solver_settings_t sub_pdlp_settings = settings; - sub_pdlp_settings.num_gpus = 1; - sub_pdlp_settings.multi_gpu_partition_file = ""; - sub_pdlp_settings.is_distributed_sub_pdlp = true; - sub_pdlp_settings.hyper_params.default_l_inf_ruiz_iterations = 0; - sub_pdlp_settings.hyper_params.default_alpha_pock_chambolle_rescaling = 0.0; - sub_pdlp_settings.set_initial_step_size (initial_step_size_global); - sub_pdlp_settings.set_initial_primal_weight(initial_primal_weight_global); + pdlp_solver_settings_t sub_pdlp_settings = settings; + sub_pdlp_settings.num_gpus = 1; + sub_pdlp_settings.multi_gpu_partition_file = ""; + sub_pdlp_settings.is_distributed_sub_pdlp = true; + sub_pdlp_settings.hyper_params.default_l_inf_ruiz_iterations = 0; + sub_pdlp_settings.hyper_params.default_alpha_pock_chambolle_rescaling = 0.0; // 8. Construct the engine, creates NCCL comms and shards - multi_gpu_engine.emplace( - std::move(sub_pdlp_rank_data), - h_obj, h_var_lower, h_var_upper, h_cstr_lower, h_cstr_upper, - h_obj_scaled, h_var_lower_scaled, h_var_upper_scaled, h_cstr_lower_scaled, h_cstr_upper_scaled, - h_cummulative_cstr_scaling, h_cummulative_var_scaling, - h_bound_rescaling, h_objective_rescaling, - op_problem_scaled_.maximize, - op_problem_scaled_.objective_offset, - op_problem_scaled_.presolve_data.objective_scaling_factor, - sub_pdlp_settings); + multi_gpu_engine.emplace(std::move(sub_pdlp_rank_data), + h_obj, + h_var_lower, + h_var_upper, + h_cstr_lower, + h_cstr_upper, + h_obj_scaled, + h_var_lower_scaled, + h_var_upper_scaled, + h_cstr_lower_scaled, + h_cstr_upper_scaled, + h_cummulative_cstr_scaling, + h_cummulative_var_scaling, + h_bound_rescaling, + h_objective_rescaling, + op_problem_scaled_.maximize, + op_problem_scaled_.objective_offset, + op_problem_scaled_.presolve_data.objective_scaling_factor, + sub_pdlp_settings); + + for (auto& shard : multi_gpu_engine.shards) { + raft::device_setter guard(shard->device_id); + auto& sub = 
*shard->sub_pdlp; + raft::copy(sub.step_size_.data(), step_size_.data(), 1, shard->stream); + raft::copy(sub.primal_weight_.data(), primal_weight_.data(), 1, shard->stream); + raft::copy(sub.best_primal_weight_.data(), best_primal_weight_.data(), 1, shard->stream); + raft::copy(sub.primal_step_size_.data(), primal_step_size_.data(), 1, shard->stream); + raft::copy(sub.dual_step_size_.data(), dual_step_size_.data(), 1, shard->stream); + } } template @@ -2392,219 +2427,215 @@ optimization_problem_solution_t pdlp_solver_t::run_solver(co std::cout << "Starting PDLP loop:" << std::endl; #endif - // TODO handle that properly - if (settings_.hyper_params.compute_initial_step_size_before_scaling && - !settings_.get_initial_step_size().has_value()) - compute_initial_step_size(); - if (settings_.hyper_params.compute_initial_primal_weight_before_scaling && - !settings_.get_initial_primal_weight().has_value()) - compute_initial_primal_weight(); - - // Skip the in-loop scaling pass in both distributed roles: - // - The master pdlp_solver_t scaled op_problem_scaled_ in its multi-GPU - // ctor before shipping data to the shards (multi_gpu_engine present). - // - Each per-shard pdlp_solver_t received already-scaled - // op_problem_scaled_ + injected scaling state from the master, so it - // must not re-apply scale_problem() (is_distributed_sub_pdlp set). 
- if (!multi_gpu_engine.has_value() && !settings_.is_distributed_sub_pdlp) { + // In distributed mode, skip all setup, it is done before + if (!settings_.hyper_params.use_distributed_pdlp) { + // TODO handle that properly + if (settings_.hyper_params.compute_initial_step_size_before_scaling && + !settings_.get_initial_step_size().has_value()) + compute_initial_step_size(); + if (settings_.hyper_params.compute_initial_primal_weight_before_scaling && + !settings_.get_initial_primal_weight().has_value()) + compute_initial_primal_weight(); + initial_scaling_strategy_.scale_problem(); - } - // Update FP32 matrix copies for mixed precision SpMV after scaling - pdhg_solver_.get_cusparse_view().update_mixed_precision_matrices(); + // Update FP32 matrix copies for mixed precision SpMV after scaling + pdhg_solver_.get_cusparse_view().update_mixed_precision_matrices(); - if (!settings_.hyper_params.compute_initial_step_size_before_scaling && - !settings_.get_initial_step_size().has_value()) - compute_initial_step_size(); - if (!settings_.hyper_params.compute_initial_primal_weight_before_scaling && - !settings_.get_initial_primal_weight().has_value()) - compute_initial_primal_weight(); + if (!settings_.hyper_params.compute_initial_step_size_before_scaling && + !settings_.get_initial_step_size().has_value()) + compute_initial_step_size(); + if (!settings_.hyper_params.compute_initial_primal_weight_before_scaling && + !settings_.get_initial_primal_weight().has_value()) + compute_initial_primal_weight(); #ifdef PDLP_DEBUG_MODE - std::cout << "Initial Scaling done" << std::endl; + std::cout << "Initial Scaling done" << std::endl; #endif - - // Needs to be performed here before the below line to make sure the initial primal_weight / step - // size are used as previous point when potentially updating them in this next call - if (settings_.get_initial_step_size().has_value() || initial_step_size_.has_value()) { - if (initial_step_size_.has_value()) - 
thrust::uninitialized_fill(handle_ptr_->get_thrust_policy(), - step_size_.begin(), - step_size_.end(), - initial_step_size_.value()); - else - thrust::uninitialized_fill(handle_ptr_->get_thrust_policy(), - step_size_.begin(), - step_size_.end(), - settings_.get_initial_step_size().value()); - } - if (settings_.get_initial_primal_weight().has_value() || initial_primal_weight_.has_value()) { - if (initial_primal_weight_.has_value()) { - thrust::uninitialized_fill(handle_ptr_->get_thrust_policy(), - primal_weight_.begin(), - primal_weight_.end(), - initial_primal_weight_.value()); - if (is_cupdlpx_restart(settings_.hyper_params)) + // Needs to be performed here before the below line to make sure the initial primal_weight / + // step size are used as previous point when potentially updating them in this next call + if (settings_.get_initial_step_size().has_value() || initial_step_size_.has_value()) { + if (initial_step_size_.has_value()) + thrust::uninitialized_fill(handle_ptr_->get_thrust_policy(), + step_size_.begin(), + step_size_.end(), + initial_step_size_.value()); + else + thrust::uninitialized_fill(handle_ptr_->get_thrust_policy(), + step_size_.begin(), + step_size_.end(), + settings_.get_initial_step_size().value()); + } + if (settings_.get_initial_primal_weight().has_value() || initial_primal_weight_.has_value()) { + if (initial_primal_weight_.has_value()) { thrust::uninitialized_fill(handle_ptr_->get_thrust_policy(), - best_primal_weight_.begin(), - best_primal_weight_.end(), + primal_weight_.begin(), + primal_weight_.end(), initial_primal_weight_.value()); - } else { - thrust::uninitialized_fill(handle_ptr_->get_thrust_policy(), - primal_weight_.begin(), - primal_weight_.end(), - settings_.get_initial_primal_weight().value()); - if (is_cupdlpx_restart(settings_.hyper_params)) + if (is_cupdlpx_restart(settings_.hyper_params)) + thrust::uninitialized_fill(handle_ptr_->get_thrust_policy(), + best_primal_weight_.begin(), + best_primal_weight_.end(), + 
initial_primal_weight_.value()); + } else { thrust::uninitialized_fill(handle_ptr_->get_thrust_policy(), - best_primal_weight_.begin(), - best_primal_weight_.end(), + primal_weight_.begin(), + primal_weight_.end(), settings_.get_initial_primal_weight().value()); + if (is_cupdlpx_restart(settings_.hyper_params)) + thrust::uninitialized_fill(handle_ptr_->get_thrust_policy(), + best_primal_weight_.begin(), + best_primal_weight_.end(), + settings_.get_initial_primal_weight().value()); + } + } + if (initial_k_.has_value()) { + pdhg_solver_.total_pdhg_iterations_ = initial_k_.value(); + pdhg_solver_.get_d_total_pdhg_iterations().set_value_async(initial_k_.value(), stream_view_); + } + if (settings_.get_initial_pdlp_iteration().has_value()) { + total_pdlp_iterations_ = settings_.get_initial_pdlp_iteration().value(); + // This is meaningless in batch mode since pdhg step is never used, set it just to avoid + // assertions + pdhg_solver_.get_d_total_pdhg_iterations().set_value_async(total_pdlp_iterations_, + stream_view_); + pdhg_solver_.total_pdhg_iterations_ = total_pdlp_iterations_; + // Reset the fixed point error since at this pdlp iteration it is expected to already be + // initialized to some value + std::fill(restart_strategy_.initial_fixed_point_error_.begin(), + restart_strategy_.initial_fixed_point_error_.end(), + f_t(0.0)); + std::fill(restart_strategy_.fixed_point_error_.begin(), + restart_strategy_.fixed_point_error_.end(), + f_t(0.0)); } - } - if (initial_k_.has_value()) { - pdhg_solver_.total_pdhg_iterations_ = initial_k_.value(); - pdhg_solver_.get_d_total_pdhg_iterations().set_value_async(initial_k_.value(), stream_view_); - } - if (settings_.get_initial_pdlp_iteration().has_value()) { - total_pdlp_iterations_ = settings_.get_initial_pdlp_iteration().value(); - // This is meaningless in batch mode since pdhg step is never used, set it just to avoid - // assertions - pdhg_solver_.get_d_total_pdhg_iterations().set_value_async(total_pdlp_iterations_, - 
stream_view_); - pdhg_solver_.total_pdhg_iterations_ = total_pdlp_iterations_; - // Reset the fixed point error since at this pdlp iteration it is expected to already be - // initialized to some value - std::fill(restart_strategy_.initial_fixed_point_error_.begin(), - restart_strategy_.initial_fixed_point_error_.end(), - f_t(0.0)); - std::fill(restart_strategy_.fixed_point_error_.begin(), - restart_strategy_.fixed_point_error_.end(), - f_t(0.0)); - } - // Only the primal_weight_ and step_size_ variables are initialized during the initial phase - // The associated primal/dual step_size (computed using the two firstly mentionned) are not - // initialized. This calls ensures the latter - // In the event of a given primal and dual solutions and if the option is toggled, calling the - // update primal_weight and step_size will also update the associated primal_step_size_, - // dual_step_size_. - // In summary: the below call is only mandatory at the beginning when - // computing/setting the initial primal weight and step size and if they are not recomputed later. - step_size_strategy_.get_primal_and_dual_stepsizes(primal_step_size_, dual_step_size_); + // Only the primal_weight_ and step_size_ variables are initialized during the initial phase + // The associated primal/dual step_size (computed using the two firstly mentionned) are not + // initialized. This calls ensures the latter + // In the event of a given primal and dual solutions and if the option is toggled, calling the + // update primal_weight and step_size will also update the associated primal_step_size_, + // dual_step_size_. + // In summary: the below call is only mandatory at the beginning when + // computing/setting the initial primal weight and step size and if they are not recomputed + // later. 
+ step_size_strategy_.get_primal_and_dual_stepsizes(primal_step_size_, dual_step_size_); #ifdef CUPDLP_DEBUG_MODE - if (initial_primal_.size() != 0 || initial_dual_.size() != 0) { - std::cout << "Initial primal and dual solution before scaling" << std::endl; - if (initial_primal_.size() != 0) { print("initial_primal_", initial_primal_); } - if (initial_dual_.size() != 0) { print("initial_dual_", initial_dual_); } - } + if (initial_primal_.size() != 0 || initial_dual_.size() != 0) { + std::cout << "Initial primal and dual solution before scaling" << std::endl; + if (initial_primal_.size() != 0) { print("initial_primal_", initial_primal_); } + if (initial_dual_.size() != 0) { print("initial_dual_", initial_dual_); } + } #endif - // If there is an initial primal or dual we should update the restart info as if there was a step - // that has happend - if (initial_primal_.size() != 0 || initial_dual_.size() != 0) { - update_primal_dual_solutions( - (initial_primal_.size() != 0) ? std::make_optional(&initial_primal_) : std::nullopt, - (initial_dual_.size() != 0) ? std::make_optional(&initial_dual_) : std::nullopt); - } + // If there is an initial primal or dual we should update the restart info as if there was a + // step that has happend + if (initial_primal_.size() != 0 || initial_dual_.size() != 0) { + update_primal_dual_solutions( + (initial_primal_.size() != 0) ? std::make_optional(&initial_primal_) : std::nullopt, + (initial_dual_.size() != 0) ? 
std::make_optional(&initial_dual_) : std::nullopt); + } #ifdef CUPDLP_DEBUG_MODE - std::cout << "Solution before projection" << std::endl; - print("pdhg_solver_.get_primal_solution()", pdhg_solver_.get_primal_solution()); - print("pdhg_solver_.get_dual_solution()", pdhg_solver_.get_dual_solution()); - print("pdhg_solver_.get_potential_next_primal_solution()", - pdhg_solver_.get_potential_next_primal_solution()); - print("pdhg_solver_.get_potential_next_dual_solution()", - pdhg_solver_.get_potential_next_dual_solution()); - print("restart_strategy_.last_restart_duality_gap_.primal_solution_", - restart_strategy_.last_restart_duality_gap_.primal_solution_); - print("restart_strategy_.last_restart_duality_gap_.dual_solution_", - restart_strategy_.last_restart_duality_gap_.dual_solution_); + std::cout << "Solution before projection" << std::endl; + print("pdhg_solver_.get_primal_solution()", pdhg_solver_.get_primal_solution()); + print("pdhg_solver_.get_dual_solution()", pdhg_solver_.get_dual_solution()); + print("pdhg_solver_.get_potential_next_primal_solution()", + pdhg_solver_.get_potential_next_primal_solution()); + print("pdhg_solver_.get_potential_next_dual_solution()", + pdhg_solver_.get_potential_next_dual_solution()); + print("restart_strategy_.last_restart_duality_gap_.primal_solution_", + restart_strategy_.last_restart_duality_gap_.primal_solution_); + print("restart_strategy_.last_restart_duality_gap_.dual_solution_", + restart_strategy_.last_restart_duality_gap_.dual_solution_); #endif - // Project initial primal solution - if (settings_.hyper_params.project_initial_primal) { - using f_t2 = typename type_2::type; - cub::DeviceTransform::Transform( - cuda::std::make_tuple(pdhg_solver_.get_primal_solution().data(), - problem_wrap_container(op_problem_scaled_.variable_bounds)), - pdhg_solver_.get_primal_solution().data(), - pdhg_solver_.get_primal_solution().size(), - clamp(), - stream_view_.value()); - - pdhg_solver_.refine_initial_primal_projection(); - - 
if (!settings_.hyper_params.never_restart_to_average) { - cuopt_expects(!batch_mode_, - cuopt::error_type_t::ValidationError, - "Restart to average not supported in batch mode"); + // Project initial primal solution + if (settings_.hyper_params.project_initial_primal) { + using f_t2 = typename type_2::type; cub::DeviceTransform::Transform( - cuda::std::make_tuple(unscaled_primal_avg_solution_.data(), - op_problem_scaled_.variable_bounds.data()), - unscaled_primal_avg_solution_.data(), - primal_size_h_, + cuda::std::make_tuple(pdhg_solver_.get_primal_solution().data(), + problem_wrap_container(op_problem_scaled_.variable_bounds)), + pdhg_solver_.get_primal_solution().data(), + pdhg_solver_.get_primal_solution().size(), clamp(), stream_view_.value()); + + pdhg_solver_.refine_initial_primal_projection(); + + if (!settings_.hyper_params.never_restart_to_average) { + cuopt_expects(!batch_mode_, + cuopt::error_type_t::ValidationError, + "Restart to average not supported in batch mode"); + cub::DeviceTransform::Transform( + cuda::std::make_tuple(unscaled_primal_avg_solution_.data(), + op_problem_scaled_.variable_bounds.data()), + unscaled_primal_avg_solution_.data(), + primal_size_h_, + clamp(), + stream_view_.value()); + } } - } #ifdef CUPDLP_DEBUG_MODE - std::cout << "Solution after projection" << std::endl; - print("pdhg_solver_.get_primal_solution()", pdhg_solver_.get_primal_solution()); - print("pdhg_solver_.get_dual_solution()", pdhg_solver_.get_dual_solution()); - print("pdhg_solver_.get_potential_next_primal_solution()", - pdhg_solver_.get_potential_next_primal_solution()); - print("pdhg_solver_.get_potential_next_dual_solution()", - pdhg_solver_.get_potential_next_dual_solution()); - print("restart_strategy_.last_restart_duality_gap_.primal_solution_", - restart_strategy_.last_restart_duality_gap_.primal_solution_); - print("restart_strategy_.last_restart_duality_gap_.dual_solution_", - restart_strategy_.last_restart_duality_gap_.dual_solution_); + std::cout << 
"Solution after projection" << std::endl; + print("pdhg_solver_.get_primal_solution()", pdhg_solver_.get_primal_solution()); + print("pdhg_solver_.get_dual_solution()", pdhg_solver_.get_dual_solution()); + print("pdhg_solver_.get_potential_next_primal_solution()", + pdhg_solver_.get_potential_next_primal_solution()); + print("pdhg_solver_.get_potential_next_dual_solution()", + pdhg_solver_.get_potential_next_dual_solution()); + print("restart_strategy_.last_restart_duality_gap_.primal_solution_", + restart_strategy_.last_restart_duality_gap_.primal_solution_); + print("restart_strategy_.last_restart_duality_gap_.dual_solution_", + restart_strategy_.last_restart_duality_gap_.dual_solution_); #endif - // Need to to tranpose primal solution to row format as there might be initial values or clamping - // Value may not be all 0 - if (batch_mode_) { - rmm::device_uvector dummy(0, stream_view_); - transpose_primal_dual_to_row( - pdhg_solver_.get_primal_solution(), pdhg_solver_.get_dual_solution(), dummy); - if (settings_.hyper_params.use_reflected_primal_dual) { - transpose_primal_dual_to_row(pdhg_solver_.get_potential_next_primal_solution(), - pdhg_solver_.get_potential_next_dual_solution(), - dummy); - transpose_primal_dual_to_row(restart_strategy_.last_restart_duality_gap_.primal_solution_, - restart_strategy_.last_restart_duality_gap_.dual_solution_, - dummy); + // Need to to tranpose primal solution to row format as there might be initial values or + // clamping Value may not be all 0 + if (batch_mode_) { + rmm::device_uvector dummy(0, stream_view_); + transpose_primal_dual_to_row( + pdhg_solver_.get_primal_solution(), pdhg_solver_.get_dual_solution(), dummy); + if (settings_.hyper_params.use_reflected_primal_dual) { + transpose_primal_dual_to_row(pdhg_solver_.get_potential_next_primal_solution(), + pdhg_solver_.get_potential_next_dual_solution(), + dummy); + transpose_primal_dual_to_row(restart_strategy_.last_restart_duality_gap_.primal_solution_, + 
restart_strategy_.last_restart_duality_gap_.dual_solution_, + dummy); + } } - } - if (verbose) { - std::cout << "primal_size_h_ " << primal_size_h_ << " dual_size_h_ " << dual_size_h_ << " nnz " - << problem_ptr->nnz << std::endl; - std::cout << "Problem before scaling" << std::endl; - print_problem_info( - problem_ptr->coefficients, problem_ptr->objective_coefficients, problem_ptr->combined_bounds); - std::cout << "Problem after scaling" << std::endl; - print_problem_info(op_problem_scaled_.coefficients, - op_problem_scaled_.objective_coefficients, - op_problem_scaled_.combined_bounds); - raft::print_device_vector("Initial step_size", step_size_.data(), 1, std::cout); - raft::print_device_vector("Initial primal_weight", primal_weight_.data(), 1, std::cout); - raft::print_device_vector("Initial primal_step_size", primal_step_size_.data(), 1, std::cout); - raft::print_device_vector("Initial dual_step_size", dual_step_size_.data(), 1, std::cout); - } + if (verbose) { + std::cout << "primal_size_h_ " << primal_size_h_ << " dual_size_h_ " << dual_size_h_ + << " nnz " << problem_ptr->nnz << std::endl; + std::cout << "Problem before scaling" << std::endl; + print_problem_info(problem_ptr->coefficients, + problem_ptr->objective_coefficients, + problem_ptr->combined_bounds); + std::cout << "Problem after scaling" << std::endl; + print_problem_info(op_problem_scaled_.coefficients, + op_problem_scaled_.objective_coefficients, + op_problem_scaled_.combined_bounds); + raft::print_device_vector("Initial step_size", step_size_.data(), 1, std::cout); + raft::print_device_vector("Initial primal_weight", primal_weight_.data(), 1, std::cout); + raft::print_device_vector("Initial primal_step_size", primal_step_size_.data(), 1, std::cout); + raft::print_device_vector("Initial dual_step_size", dual_step_size_.data(), 1, std::cout); + } #ifdef CUPDLP_DEBUG_MODE - raft::print_device_vector("Initial step_size", step_size_.data(), step_size_.size(), std::cout); - raft::print_device_vector( 
- "Initial primal_weight", primal_weight_.data(), primal_weight_.size(), std::cout); + raft::print_device_vector("Initial step_size", step_size_.data(), step_size_.size(), std::cout); + raft::print_device_vector( + "Initial primal_weight", primal_weight_.data(), primal_weight_.size(), std::cout); #endif - bool warm_start_was_given = settings_.get_pdlp_warm_start_data().is_populated(); + bool warm_start_was_given = settings_.get_pdlp_warm_start_data().is_populated(); - if (!inside_mip_) { - CUOPT_LOG_INFO( - " Iter Primal Obj. Dual Obj. Gap Primal Res. Dual Res. Time"); + if (!inside_mip_) { + CUOPT_LOG_INFO( + " Iter Primal Obj. Dual Obj. Gap Primal Res. Dual Res. Time"); + } } while (true) { #ifdef CUPDLP_DEBUG_MODE diff --git a/cpp/src/pdlp/pdlp.cuh b/cpp/src/pdlp/pdlp.cuh index 532f038fbf..598d93ec33 100644 --- a/cpp/src/pdlp/pdlp.cuh +++ b/cpp/src/pdlp/pdlp.cuh @@ -62,11 +62,11 @@ class pdlp_solver_t { pdlp_solver_t(problem_t& op_problem, pdlp_solver_settings_t const& settings, bool is_batch_mode = false); - + // Distributed Solver Constructor pdlp_solver_t(problem_t& op_problem, - pdlp_solver_settings_t const& settings, - int num_gpus); + pdlp_solver_settings_t const& settings, + int num_gpus); optimization_problem_solution_t run_solver(const timer_t& timer); From 0965a60dd6300638174761e686936303aef97030 Mon Sep 17 00:00:00 2001 From: Bulle Mostovoi Date: Wed, 20 May 2026 16:41:07 +0200 Subject: [PATCH 11/67] added distributed transform --- .../distributed_pdlp/multi_gpu_engine.hpp | 47 +++++++++++++++++++ cpp/src/pdlp/pdlp.cu | 13 +++++ 2 files changed, 60 insertions(+) diff --git a/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.hpp b/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.hpp index e191a89d60..94f6b8584a 100644 --- a/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.hpp +++ b/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.hpp @@ -42,6 +42,53 @@ struct multi_gpu_engine_t { multi_gpu_engine_t(const multi_gpu_engine_t&) = delete; multi_gpu_engine_t& 
operator=(const multi_gpu_engine_t&) = delete; + + + template + void for_each_shard(Fn&& fn) + { + for (auto& s : shards) { + raft::device_setter guard(s->device_id); + fn(*s); + } + } + + template + void distributed_transform(std::tuple in_accessors, + OutAccess out, + SizeAccess sz, + Op op) + { + for_each_shard([&](auto& shard) { + auto& sub = *shard.sub_pdlp; + // turns the Tuple of lambdas into a tuple of rmm::device_uvector + auto cub_inputs = std::apply( + [&sub](auto&... acc) { return cuda::std::make_tuple(acc(sub)...); }, + in_accessors); + + cub::DeviceTransform::Transform(cub_inputs, + out(sub), + sz(sub), + op, + shard.stream.view()); + }); + } + // --- 2) convenience: single input accessor (delegates) --- + template + void distributed_transform(InAccess in, + OutAccess out, + SizeAccess sz, + Op op) + { + distributed_transform(std::make_tuple(in), out, sz, op); + } + // Engine-level stream for fork/join orchestration (master side). rmm::cuda_stream stream; diff --git a/cpp/src/pdlp/pdlp.cu b/cpp/src/pdlp/pdlp.cu index 2a36c160fd..12717ce45b 100644 --- a/cpp/src/pdlp/pdlp.cu +++ b/cpp/src/pdlp/pdlp.cu @@ -489,6 +489,19 @@ pdlp_solver_t::pdlp_solver_t(problem_t& op_problem, raft::copy(sub.primal_step_size_.data(), primal_step_size_.data(), 1, shard->stream); raft::copy(sub.dual_step_size_.data(), dual_step_size_.data(), 1, shard->stream); } + + // Project initial primal solution + if (settings_.hyper_params.project_initial_primal) { + using f_t2 = typename type_2::type; + + multi_gpu_engine->distributed_transform( + std::make_tuple( + [](auto& s) -> auto& { return s.pdhg_solver_.get_primal_solution().data();}, + [](auto& s) -> auto& { return s.get_op_problem_scaled().variable_bounds.data();}), + [](auto& s) -> auto& { return s.pdhg_solver_.get_primal_solution().data(); }, + [](auto& s) -> auto { return s.pdhg_solver_.get_primal_solution().size(); }, + clamp() + ) } template From d4d1cab460a8b06163a749d7496099c185baa2de Mon Sep 17 00:00:00 2001 From: 
Bulle Mostovoi Date: Wed, 20 May 2026 16:45:59 +0200 Subject: [PATCH 12/67] added semicolon and existing runtime error enum --- cpp/src/pdlp/pdlp.cu | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cpp/src/pdlp/pdlp.cu b/cpp/src/pdlp/pdlp.cu index 12717ce45b..6ef586a8b8 100644 --- a/cpp/src/pdlp/pdlp.cu +++ b/cpp/src/pdlp/pdlp.cu @@ -334,9 +334,9 @@ pdlp_solver_t::pdlp_solver_t(problem_t& op_problem, settings.multi_gpu_partition_file); } else { cuopt_expects(false, - error_type_t::NotImplemented, - "Metis partitioning inside cuopt not implemented yet; " - "provide a --parts file via settings.multi_gpu_partition_file"); + error_type_t::RuntimeError, + "Metis partitioning inside cuopt not implemented yet; " + "provide a --parts file via settings.multi_gpu_partition_file"); } // always compute initial step size before scaling and primal_weight after scaling to do like @@ -501,7 +501,7 @@ pdlp_solver_t::pdlp_solver_t(problem_t& op_problem, [](auto& s) -> auto& { return s.pdhg_solver_.get_primal_solution().data(); }, [](auto& s) -> auto { return s.pdhg_solver_.get_primal_solution().size(); }, clamp() - ) + ); } template From 6659dd9768a9db5504c3ef480bacf148f66f8f33 Mon Sep 17 00:00:00 2001 From: Bulle Mostovoi Date: Wed, 20 May 2026 16:49:05 +0200 Subject: [PATCH 13/67] added } and fixed cuot_expects in partition loader --- cpp/src/pdlp/distributed_pdlp/partition_loader.cu | 11 +++++++---- cpp/src/pdlp/pdlp.cu | 1 + 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/cpp/src/pdlp/distributed_pdlp/partition_loader.cu b/cpp/src/pdlp/distributed_pdlp/partition_loader.cu index 6c96e0b63d..007df4ce1c 100644 --- a/cpp/src/pdlp/distributed_pdlp/partition_loader.cu +++ b/cpp/src/pdlp/distributed_pdlp/partition_loader.cu @@ -18,8 +18,10 @@ std::vector partition_loader_t::parse_distributed_pdlp_partition_ std::string const& file) { std::ifstream part_file(file); - cuopt_expects( - part_file.is_open(), error_type_t::ValidationError, "Failed to 
open partition file: " + file); + cuopt_expects(part_file.is_open(), + error_type_t::ValidationError, + "Failed to open partition file: %s", + file.c_str()); // One integer per line; operator>> skips whitespace so blank lines and // trailing newlines are tolerated. @@ -31,8 +33,9 @@ std::vector partition_loader_t::parse_distributed_pdlp_partition_ // We must have hit EOF cleanly; any other state means a malformed token. cuopt_expects(part_file.eof(), - error_type_t::ValidationError, - "Malformed partition file (expected one integer per line): " + file); + error_type_t::ValidationError, + "Malformed partition file (expected one integer per line): %s", + file.c_str()); return parts; } diff --git a/cpp/src/pdlp/pdlp.cu b/cpp/src/pdlp/pdlp.cu index 6ef586a8b8..ee91c874a4 100644 --- a/cpp/src/pdlp/pdlp.cu +++ b/cpp/src/pdlp/pdlp.cu @@ -502,6 +502,7 @@ pdlp_solver_t::pdlp_solver_t(problem_t& op_problem, [](auto& s) -> auto { return s.pdhg_solver_.get_primal_solution().size(); }, clamp() ); + } } template From b2ed271234f8ad3ec9483aff4fdb9f3aa5d6b21f Mon Sep 17 00:00:00 2001 From: Bulle Mostovoi Date: Wed, 20 May 2026 16:58:37 +0200 Subject: [PATCH 14/67] small bug fixes --- cpp/src/pdlp/pdlp.cu | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/cpp/src/pdlp/pdlp.cu b/cpp/src/pdlp/pdlp.cu index ee91c874a4..a2aa14a78a 100644 --- a/cpp/src/pdlp/pdlp.cu +++ b/cpp/src/pdlp/pdlp.cu @@ -480,7 +480,7 @@ pdlp_solver_t::pdlp_solver_t(problem_t& op_problem, op_problem_scaled_.presolve_data.objective_scaling_factor, sub_pdlp_settings); - for (auto& shard : multi_gpu_engine.shards) { + for (auto& shard : multi_gpu_engine->shards) { raft::device_setter guard(shard->device_id); auto& sub = *shard->sub_pdlp; raft::copy(sub.step_size_.data(), step_size_.data(), 1, shard->stream); @@ -2441,6 +2441,8 @@ optimization_problem_solution_t pdlp_solver_t::run_solver(co std::cout << "Starting PDLP loop:" << std::endl; #endif + bool warm_start_was_given = 
settings_.get_pdlp_warm_start_data().is_populated(); + // In distributed mode, skip all setup, it is done before if (!settings_.hyper_params.use_distributed_pdlp) { // TODO handle that properly @@ -2644,7 +2646,6 @@ optimization_problem_solution_t pdlp_solver_t::run_solver(co "Initial primal_weight", primal_weight_.data(), primal_weight_.size(), std::cout); #endif - bool warm_start_was_given = settings_.get_pdlp_warm_start_data().is_populated(); if (!inside_mip_) { CUOPT_LOG_INFO( From 50d16ce7d5abb8a06bf69f4be3e96b4bb19c3f0d Mon Sep 17 00:00:00 2001 From: Bulle Mostovoi Date: Wed, 20 May 2026 17:04:18 +0200 Subject: [PATCH 15/67] =?UTF-8?q?a=20version=20that=20compiles=20#heheha?= =?UTF-8?q?=20=F0=9F=98=8E=F0=9F=98=8E=F0=9F=98=8E=F0=9F=98=8E?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- cpp/src/pdlp/pdlp.cu | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cpp/src/pdlp/pdlp.cu b/cpp/src/pdlp/pdlp.cu index a2aa14a78a..1cc987291f 100644 --- a/cpp/src/pdlp/pdlp.cu +++ b/cpp/src/pdlp/pdlp.cu @@ -496,10 +496,10 @@ pdlp_solver_t::pdlp_solver_t(problem_t& op_problem, multi_gpu_engine->distributed_transform( std::make_tuple( - [](auto& s) -> auto& { return s.pdhg_solver_.get_primal_solution().data();}, - [](auto& s) -> auto& { return s.get_op_problem_scaled().variable_bounds.data();}), - [](auto& s) -> auto& { return s.pdhg_solver_.get_primal_solution().data(); }, - [](auto& s) -> auto { return s.pdhg_solver_.get_primal_solution().size(); }, + [](auto& s) { return s.pdhg_solver_.get_primal_solution().data();}, + [](auto& s) { return s.get_op_problem_scaled().variable_bounds.data();}), + [](auto& s) { return s.pdhg_solver_.get_primal_solution().data(); }, + [](auto& s) { return s.pdhg_solver_.get_primal_solution().size(); }, clamp() ); } From 359d9f49693afb5a22ea767114dd8e3b20414c9a Mon Sep 17 00:00:00 2001 From: Bulle Mostovoi Date: Thu, 21 May 2026 10:51:26 +0200 Subject: [PATCH 16/67] removed use 
of engine:transaform --- cpp/src/pdlp/pdlp.cu | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/cpp/src/pdlp/pdlp.cu b/cpp/src/pdlp/pdlp.cu index 1cc987291f..d5422c9d5f 100644 --- a/cpp/src/pdlp/pdlp.cu +++ b/cpp/src/pdlp/pdlp.cu @@ -493,15 +493,16 @@ pdlp_solver_t::pdlp_solver_t(problem_t& op_problem, // Project initial primal solution if (settings_.hyper_params.project_initial_primal) { using f_t2 = typename type_2::type; - - multi_gpu_engine->distributed_transform( - std::make_tuple( - [](auto& s) { return s.pdhg_solver_.get_primal_solution().data();}, - [](auto& s) { return s.get_op_problem_scaled().variable_bounds.data();}), - [](auto& s) { return s.pdhg_solver_.get_primal_solution().data(); }, - [](auto& s) { return s.pdhg_solver_.get_primal_solution().size(); }, - clamp() - ); + for (auto& shard : multi_gpu_engine->shards) { + raft::device_setter guard(shard->device_id); + auto& sub = *shard->sub_pdlp; + cub::DeviceTransform::Transform( + std::make_tuple(sub.pdhg_solver_.get_primal_solution().data(), + sub.get_op_problem_scaled().variable_bounds.data()), + sub.pdhg_solver_.get_primal_solution().data(), + sub.pdhg_solver_.get_primal_solution().size(), + clamp(), shard->stream); + } } } From 910a49ab4346d08f8c70f4fa3ee9523becf3d9a5 Mon Sep 17 00:00:00 2001 From: Bulle Mostovoi Date: Fri, 22 May 2026 11:08:52 +0200 Subject: [PATCH 17/67] added multi-gpu SpMV #heheha --- .../pdlp/distributed_pdlp/multi_gpu_engine.cu | 4 + .../distributed_pdlp/multi_gpu_engine.hpp | 147 ++++++++++++++++++ cpp/src/pdlp/distributed_pdlp/shard.cu | 24 +++ cpp/src/pdlp/distributed_pdlp/shard.hpp | 11 ++ cpp/src/pdlp/pdhg.cu | 23 +++ cpp/src/pdlp/pdhg.hpp | 27 +++- cpp/src/pdlp/pdlp.cu | 4 + 7 files changed, 238 insertions(+), 2 deletions(-) diff --git a/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.cu b/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.cu index fe95b1e5ff..a0b3f5dcc3 100644 --- a/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.cu +++ 
b/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.cu @@ -4,6 +4,10 @@ */ #include +// compute_A_x() / compute_At_y() (defined inline in the engine header) call +// shard.sub_pdlp->pdhg_solver_.compute_* — pdlp_solver_t must be complete at +// the explicit instantiation point below. +#include #include diff --git a/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.hpp b/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.hpp index 94f6b8584a..9ea007947e 100644 --- a/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.hpp +++ b/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.hpp @@ -6,12 +6,23 @@ #include #include +#include #include +#include + #include +#include + +#include +#include +#include + +#include #include +#include #include namespace cuopt::linear_programming::detail { @@ -89,6 +100,142 @@ struct multi_gpu_engine_t { distributed_transform(std::make_tuple(in), out, sz, op); } + // -------- Halo exchange (variables / x) --------------------------------- + // Fills the halo slice [owned_var_size, total_var_size) of the per-shard + // reflected_primal vector (the buffer A @ x reads). Step 1: thrust::gather + // per-peer outgoing values into staging buffers. Step 2: a single NCCL + // group with matched ncclSend / ncclRecv across all (rank, peer) pairs. + void halo_exchange_var() + { + const int nb = static_cast(shards.size()); + + // Step 1: gather owned values that each peer needs into per-peer staging. + for (int r = 0; r < nb; ++r) { + auto& s = *shards[r]; + raft::device_setter guard(s.device_id); + auto& x = s.sub_pdlp->pdhg_solver_.get_reflected_primal(); + for (int peer = 0; peer < nb; ++peer) { + if (peer == r) continue; + if (s.var_send_indices_d[peer].size() == 0) continue; + thrust::gather(rmm::exec_policy_nosync(s.stream.view()), + s.var_send_indices_d[peer].begin(), + s.var_send_indices_d[peer].end(), + x.begin(), + s.var_send_buf_d[peer].begin()); + } + } + + // Step 2: matched send / recv across the whole topology in one NCCL group. 
+ ncclGroupStart(); + for (int r = 0; r < nb; ++r) { + auto& s = *shards[r]; + raft::device_setter guard(s.device_id); + for (int peer = 0; peer < nb; ++peer) { + if (peer == r) continue; + ncclSend(s.var_send_buf_d[peer].data(), + s.var_send_buf_d[peer].size(), + ncclFloat64, + peer, + s.comm.get(), + s.stream.view().value()); + } + } + for (int r = 0; r < nb; ++r) { + auto& s = *shards[r]; + auto& rd = s.rank_data; + raft::device_setter guard(s.device_id); + auto& x = s.sub_pdlp->pdhg_solver_.get_reflected_primal(); + for (int peer = 0; peer < nb; ++peer) { + if (peer == r) continue; + f_t* recv_ptr = x.data() + rd.owned_var_size + rd.var_recv_offsets[peer]; + ncclRecv(recv_ptr, + static_cast(rd.var_recv_counts[peer]), + ncclFloat64, + peer, + s.comm.get(), + s.stream.view().value()); + } + } + ncclGroupEnd(); + } + + // -------- Halo exchange (constraints / y) ------------------------------- + // Same as halo_exchange_var but for the per-shard dual solution (the buffer + // A_T @ y reads) and constraint halos. 
+ void halo_exchange_cstr() + { + const int nb = static_cast(shards.size()); + + for (int r = 0; r < nb; ++r) { + auto& s = *shards[r]; + raft::device_setter guard(s.device_id); + auto& y = s.sub_pdlp->pdhg_solver_.get_dual_solution(); + for (int peer = 0; peer < nb; ++peer) { + if (peer == r) continue; + if (s.cstr_send_indices_d[peer].size() == 0) continue; + thrust::gather(rmm::exec_policy_nosync(s.stream.view()), + s.cstr_send_indices_d[peer].begin(), + s.cstr_send_indices_d[peer].end(), + y.begin(), + s.cstr_send_buf_d[peer].begin()); + } + } + + ncclGroupStart(); + for (int r = 0; r < nb; ++r) { + auto& s = *shards[r]; + raft::device_setter guard(s.device_id); + for (int peer = 0; peer < nb; ++peer) { + if (peer == r) continue; + ncclSend(s.cstr_send_buf_d[peer].data(), + s.cstr_send_buf_d[peer].size(), + ncclFloat64, + peer, + s.comm.get(), + s.stream.view().value()); + } + } + for (int r = 0; r < nb; ++r) { + auto& s = *shards[r]; + auto& rd = s.rank_data; + raft::device_setter guard(s.device_id); + auto& y = s.sub_pdlp->pdhg_solver_.get_dual_solution(); + for (int peer = 0; peer < nb; ++peer) { + if (peer == r) continue; + f_t* recv_ptr = y.data() + rd.owned_cstr_size + rd.cstr_recv_offsets[peer]; + ncclRecv(recv_ptr, + static_cast(rd.cstr_recv_counts[peer]), + ncclFloat64, + peer, + s.comm.get(), + s.stream.view().value()); + } + } + ncclGroupEnd(); + } + + // -------- High-level: A @ x and A_T @ y --------------------------------- + // A @ x: halo-update the reflected_primal vector, then per-shard SpMV. + // Named distributed_* (rather than compute_*) to make call sites in pdhg.cu + // self-documenting and to avoid name collision with pdhg_solver_t's own + // compute_A_x / compute_At_y, which the engine dispatches into per shard. + void distributed_compute_A_x() + { + halo_exchange_var(); + for_each_shard([&](auto& shard) { + shard.sub_pdlp->pdhg_solver_.compute_A_x(); + }); + } + + // A_T @ y: halo-update the dual solution vector, then per-shard SpMV. 
+ void distributed_compute_At_y() + { + halo_exchange_cstr(); + for_each_shard([&](auto& shard) { + shard.sub_pdlp->pdhg_solver_.compute_At_y(); + }); + } + // Engine-level stream for fork/join orchestration (master side). rmm::cuda_stream stream; diff --git a/cpp/src/pdlp/distributed_pdlp/shard.cu b/cpp/src/pdlp/distributed_pdlp/shard.cu index 596a08a3dc..bbc02559cf 100644 --- a/cpp/src/pdlp/distributed_pdlp/shard.cu +++ b/cpp/src/pdlp/distributed_pdlp/shard.cu @@ -195,6 +195,30 @@ pdlp_shard_t::pdlp_shard_t(int device_id, scaling.set_h_bound_rescaling(h_bound_rescaling); scaling.set_h_objective_rescaling(h_objective_rescaling); + // ---- 6. Build per-peer halo-exchange plans (ported from metis_tests). ---- + // For each peer p, we precompute: + // send_indices_d[p] : local indices to gather (uploaded from host send plan) + // send_buf_d[p] : f_t staging buffer sized to match + // Self-peer slot is present but empty (size 0). Used in engine halo exchange. + auto build_send_plan = [&](auto const& send_per_peer, + auto& indices_d, + auto& buf_d) { + const std::size_t n_peers = send_per_peer.size(); + indices_d.reserve(n_peers); + buf_d.reserve(n_peers); + for (auto const& send_to_peer : send_per_peer) { + rmm::device_uvector idx(send_to_peer.size(), stream_view); + rmm::device_uvector buf(send_to_peer.size(), stream_view); + if (!send_to_peer.empty()) { + raft::copy(idx.data(), send_to_peer.data(), send_to_peer.size(), stream_view); + } + indices_d.emplace_back(std::move(idx)); + buf_d.emplace_back(std::move(buf)); + } + }; + build_send_plan(rank_data.var_send_per_peer, var_send_indices_d, var_send_buf_d); + build_send_plan(rank_data.cstr_send_per_peer, cstr_send_indices_d, cstr_send_buf_d); + handle.sync_stream(stream_view); } diff --git a/cpp/src/pdlp/distributed_pdlp/shard.hpp b/cpp/src/pdlp/distributed_pdlp/shard.hpp index a5ff89c5c4..35babc12db 100644 --- a/cpp/src/pdlp/distributed_pdlp/shard.hpp +++ b/cpp/src/pdlp/distributed_pdlp/shard.hpp @@ -13,6 +13,7 @@ 
#include #include #include +#include #include @@ -83,6 +84,16 @@ struct pdlp_shard_t { std::optional> opt_problem; std::optional> sub_problem; std::unique_ptr> sub_pdlp; + + // Per-peer halo-exchange state. Inner index = peer rank. + // Slot for self (peer == this rank) is present but unused (size 0). + // var_send_indices_d[peer] : local indices into primal vector to gather and ncclSend + // var_send_buf_d [peer] : staging buffer for outgoing variable values + // cstr_send_indices_d/cstr_send_buf_d : same, for dual vector + std::vector> var_send_indices_d; + std::vector> var_send_buf_d; + std::vector> cstr_send_indices_d; + std::vector> cstr_send_buf_d; }; } // namespace cuopt::linear_programming::detail diff --git a/cpp/src/pdlp/pdhg.cu b/cpp/src/pdlp/pdhg.cu index cb16c9d662..9cf2087c8b 100644 --- a/cpp/src/pdlp/pdhg.cu +++ b/cpp/src/pdlp/pdhg.cu @@ -5,6 +5,11 @@ */ /* clang-format on */ #include +// pdlp.cuh defines pdlp_solver_t which the engine's compute_A_x/compute_At_y +// template bodies dereference via shard.sub_pdlp->pdhg_solver_. Must be a +// complete type at the point of template instantiation below. +#include +#include #include #include #include @@ -306,6 +311,15 @@ void pdhg_solver_t::compute_At_y() { // A_t @ y + // Multi-GPU dispatch: when the master pdhg has an engine, drive halo + // exchange + per-shard SpMV via the engine. Shards' pdhg_solver_ have no + // engine pointer set, so their compute_At_y falls through to the cusparse + // path below on each shard's local A_t. + if (mgpu_engine_ != nullptr) { + mgpu_engine_->distributed_compute_At_y(); + return; + } + if (!batch_mode_) { if constexpr (std::is_same_v) { if (cusparse_view_.mixed_precision_enabled_) { @@ -354,6 +368,15 @@ template void pdhg_solver_t::compute_A_x() { // A @ x + + // Multi-GPU dispatch: see compute_At_y. The engine halo-updates the + // reflected_primal vector (the buffer this SpMV reads) and then drives + // per-shard local cusparse SpMV. 
+ if (mgpu_engine_ != nullptr) { + mgpu_engine_->distributed_compute_A_x(); + return; + } + if (!batch_mode_) { if constexpr (std::is_same_v) { if (cusparse_view_.mixed_precision_enabled_) { diff --git a/cpp/src/pdlp/pdhg.hpp b/cpp/src/pdlp/pdhg.hpp index 0a64e49efb..d258afb091 100644 --- a/cpp/src/pdlp/pdhg.hpp +++ b/cpp/src/pdlp/pdhg.hpp @@ -21,6 +21,12 @@ #include namespace cuopt::linear_programming::detail { + +// Forward-declared to avoid include cycle: multi_gpu_engine.hpp itself includes pdhg.hpp +// (engine calls per-shard pdhg compute_*). pdhg.cu does the full include. +template +struct multi_gpu_engine_t; + template class pdhg_solver_t { public: @@ -69,6 +75,21 @@ class pdhg_solver_t { void update_solution(cusparse_view_t& current_op_problem_evaluation_cusparse_view_); void refine_initial_primal_projection(); + // SpMV primitives. Public so the multi-GPU engine can drive them per-shard + // after halo-exchanging the relevant vector. Single-GPU PDLP still calls + // them internally via take_step / compute_next_*. + // + // If set_multi_gpu_engine() has been called, these dispatch to the engine + // (halo exchange + per-shard SpMV). Otherwise they run the single-GPU + // cusparse path on the local matrix. + void compute_At_y(); + void compute_A_x(); + + // Master PDLP wires up the engine pointer here after the engine is built. + // Shards' pdhg_solver_ leaves this null so each shard runs single-GPU SpMV + // on its local matrix. 
+ void set_multi_gpu_engine(multi_gpu_engine_t* engine) { mgpu_engine_ = engine; } + i_t total_pdhg_iterations_; private: @@ -84,8 +105,6 @@ class pdhg_solver_t { void compute_primal_projection_with_gradient(rmm::device_uvector& primal_step_size); void compute_primal_projection(rmm::device_uvector& primal_step_size); - void compute_At_y(); - void compute_A_x(); bool batch_mode_{false}; raft::handle_t const* handle_ptr_{nullptr}; @@ -132,6 +151,10 @@ class pdhg_solver_t { rmm::device_uvector new_bounds_lower_; rmm::device_uvector new_bounds_upper_; cuda::fast_mod_div batch_size_divisor_; + + // Non-owning. Set on the master pdhg_solver_ in distributed mode; null + // (default) means single-GPU path. See compute_At_y / compute_A_x. + multi_gpu_engine_t* mgpu_engine_{nullptr}; }; } // namespace cuopt::linear_programming::detail diff --git a/cpp/src/pdlp/pdlp.cu b/cpp/src/pdlp/pdlp.cu index d5422c9d5f..348d41a512 100644 --- a/cpp/src/pdlp/pdlp.cu +++ b/cpp/src/pdlp/pdlp.cu @@ -490,6 +490,10 @@ pdlp_solver_t::pdlp_solver_t(problem_t& op_problem, raft::copy(sub.dual_step_size_.data(), dual_step_size_.data(), 1, shard->stream); } + // Wire the engine into the master pdhg_solver_. Shards' pdhg_solver_ keep + // mgpu_engine_ == nullptr so they run plain single-GPU SpMV on local A. + pdhg_solver_.set_multi_gpu_engine(&*multi_gpu_engine); + // Project initial primal solution if (settings_.hyper_params.project_initial_primal) { using f_t2 = typename type_2::type; From 76c0b3f50b96647d23534729a68c6b2f5702848d Mon Sep 17 00:00:00 2001 From: Bulle Mostovoi Date: Fri, 22 May 2026 11:48:51 +0200 Subject: [PATCH 18/67] transformed a transform. 
it compiles hehe --- cpp/src/pdlp/pdhg.cu | 40 +++++++++++++++++++++++++++++----------- cpp/src/pdlp/pdhg.hpp | 7 +++++++ cpp/src/pdlp/pdlp.cu | 7 ++++--- cpp/src/pdlp/pdlp.cuh | 6 ++++++ 4 files changed, 46 insertions(+), 14 deletions(-) diff --git a/cpp/src/pdlp/pdhg.cu b/cpp/src/pdlp/pdhg.cu index 9cf2087c8b..09d439cc0e 100644 --- a/cpp/src/pdlp/pdhg.cu +++ b/cpp/src/pdlp/pdhg.cu @@ -521,6 +521,26 @@ struct primal_reflected_major_projection { const f_t* scalar_; }; +// Pure cub-transform extract — body byte-identical to the non-batch inline +// path in compute_next_primal_dual_solution_reflected. The platform dispatch +// (single-GPU vs per-shard fan-out) lives at the call site, not here. +// Placed after primal_reflected_major_projection so the functor is visible. +template +void pdhg_solver_t::primal_reflected_major_projection_transform( + rmm::device_uvector& primal_step_size) +{ + cub::DeviceTransform::Transform( + cuda::std::make_tuple(current_saddle_point_state_.get_primal_solution().data(), + problem_ptr->objective_coefficients.data(), + current_saddle_point_state_.get_current_AtY().data(), + problem_ptr->variable_bounds.data()), + thrust::make_zip_iterator( + potential_next_primal_solution_.data(), dual_slack_.data(), reflected_primal_.data()), + primal_size_h_, + primal_reflected_major_projection(primal_step_size.data()), + stream_view_.value()); +} + template struct primal_reflected_major_projection_batch { using f_t2 = typename type_2::type; @@ -910,17 +930,15 @@ void pdhg_solver_t::compute_next_primal_dual_solution_reflected( graph_all.start_capture(should_major); compute_At_y(); - if (!batch_mode_) { - cub::DeviceTransform::Transform( - cuda::std::make_tuple(current_saddle_point_state_.get_primal_solution().data(), - problem_ptr->objective_coefficients.data(), - current_saddle_point_state_.get_current_AtY().data(), - problem_ptr->variable_bounds.data()), - thrust::make_zip_iterator( - potential_next_primal_solution_.data(), dual_slack_.data(), 
reflected_primal_.data()), - primal_size_h_, - primal_reflected_major_projection(primal_step_size.data()), - stream_view_.value()); + if (mgpu_engine_ != nullptr) { + for (auto& shard : mgpu_engine_->shards) { + raft::device_setter guard(shard->device_id); + auto& sub_pdlp = *shard->sub_pdlp; + sub_pdlp.pdhg_solver_.primal_reflected_major_projection_transform( + sub_pdlp.get_primal_step_size()); + } + } else if (!batch_mode_) { + primal_reflected_major_projection_transform(primal_step_size); } else { cub::DeviceFor::Bulk(potential_next_primal_solution_.size(), primal_reflected_major_projection_bulk_op{ diff --git a/cpp/src/pdlp/pdhg.hpp b/cpp/src/pdlp/pdhg.hpp index d258afb091..3a1795ce6f 100644 --- a/cpp/src/pdlp/pdhg.hpp +++ b/cpp/src/pdlp/pdhg.hpp @@ -85,6 +85,13 @@ class pdhg_solver_t { void compute_At_y(); void compute_A_x(); + // Pure cub-transform extractions. Each one is byte-identical to the inline + // cub call it replaces — no platform dispatch inside. Callers handle the + // single-GPU vs per-shard branching at the call site (see the + // "if (mgpu_engine_) for shard..." blocks in compute_next_*). + void primal_reflected_major_projection_transform( + rmm::device_uvector& primal_step_size); + // Master PDLP wires up the engine pointer here after the engine is built. // Shards' pdhg_solver_ leaves this null so each shard runs single-GPU SpMV // on its local matrix. 
diff --git a/cpp/src/pdlp/pdlp.cu b/cpp/src/pdlp/pdlp.cu index 348d41a512..168f997724 100644 --- a/cpp/src/pdlp/pdlp.cu +++ b/cpp/src/pdlp/pdlp.cu @@ -501,11 +501,12 @@ pdlp_solver_t::pdlp_solver_t(problem_t& op_problem, raft::device_setter guard(shard->device_id); auto& sub = *shard->sub_pdlp; cub::DeviceTransform::Transform( - std::make_tuple(sub.pdhg_solver_.get_primal_solution().data(), - sub.get_op_problem_scaled().variable_bounds.data()), + cuda::std::make_tuple(sub.pdhg_solver_.get_primal_solution().data(), + sub.get_op_problem_scaled().variable_bounds.data()), sub.pdhg_solver_.get_primal_solution().data(), sub.pdhg_solver_.get_primal_solution().size(), - clamp(), shard->stream); + clamp(), + shard->stream.view()); } } } diff --git a/cpp/src/pdlp/pdlp.cuh b/cpp/src/pdlp/pdlp.cuh index 598d93ec33..6b2bc35a24 100644 --- a/cpp/src/pdlp/pdlp.cuh +++ b/cpp/src/pdlp/pdlp.cuh @@ -114,6 +114,12 @@ class pdlp_solver_t { return initial_scaling_strategy_; } + // Per-shard primal/dual step sizes are private state on pdlp_solver_t but + // are needed inside the multi-GPU dispatch paths that fan out a master cub + // call across all shards' pdhg_solver_t::*_transform methods. + rmm::device_uvector& get_primal_step_size() { return primal_step_size_; } + rmm::device_uvector& get_dual_step_size() { return dual_step_size_; } + private: void print_termination_criteria(const timer_t& timer, bool is_average = false); void print_final_termination_criteria( From 5ec713842159df18170a2c6798f8c92344c789e6 Mon Sep 17 00:00:00 2001 From: Bulle Mostovoi Date: Fri, 22 May 2026 12:55:29 +0200 Subject: [PATCH 19/67] updated take step for distributed. compiles but doesnt run. 
will check on main --- cpp/CMakeLists.txt | 29 ++++++++++++ cpp/src/pdlp/pdhg.cu | 102 +++++++++++++++++++++++++++++------------- cpp/src/pdlp/pdhg.hpp | 3 ++ cpp/src/pdlp/pdlp.cu | 15 +++++++ 4 files changed, 119 insertions(+), 30 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index e7b4693547..627e086343 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -288,6 +288,34 @@ create_logger_macros(CUOPT "cuopt::default_logger()" include/cuopt) find_package(CUDSS REQUIRED) +# ################################################################################################## +# - NCCL (multi-GPU distributed PDLP) ------------------------------------------------------------- +# NCCL is shipped via the conda env; no canonical CMake config target, so look it +# up by name in the standard lib paths (plus CONDA_PREFIX as a hint). +set(NCCL_HINT_PREFIXES "") +if (DEFINED ENV{CONDA_PREFIX} AND NOT "$ENV{CONDA_PREFIX}" STREQUAL "") + list(APPEND NCCL_HINT_PREFIXES "$ENV{CONDA_PREFIX}") +endif () +find_path(NCCL_INCLUDE_DIR + NAMES nccl.h + HINTS ${NCCL_HINT_PREFIXES} + PATH_SUFFIXES include +) +find_library(NCCL_LIBRARY + NAMES nccl + HINTS ${NCCL_HINT_PREFIXES} + PATH_SUFFIXES lib lib64 +) +if (NOT NCCL_INCLUDE_DIR OR NOT NCCL_LIBRARY) + message(FATAL_ERROR "NCCL not found. Looked in ${NCCL_HINT_PREFIXES}. 
Install nccl-dev / libnccl-dev in the active env.") +endif () +add_library(nccl_external UNKNOWN IMPORTED GLOBAL) +set_target_properties(nccl_external PROPERTIES + IMPORTED_LOCATION "${NCCL_LIBRARY}" + INTERFACE_INCLUDE_DIRECTORIES "${NCCL_INCLUDE_DIR}" +) +message(STATUS "Using NCCL: ${NCCL_LIBRARY}") + # ################################################################################################## # - gRPC and Protobuf setup ----------------------------------------------------------------------- @@ -549,6 +577,7 @@ target_link_libraries(cuopt ${CUDSS_LIB_FILE} PRIVATE ${CUOPT_PRIVATE_CUDA_LIBS} + nccl_external $<$:protobuf::libprotobuf> $<$:gRPC::grpc++> ) diff --git a/cpp/src/pdlp/pdhg.cu b/cpp/src/pdlp/pdhg.cu index 09d439cc0e..eb60a43603 100644 --- a/cpp/src/pdlp/pdhg.cu +++ b/cpp/src/pdlp/pdhg.cu @@ -569,6 +569,21 @@ struct primal_reflected_projection { const f_t* scalar_; }; +template +void pdhg_solver_t::primal_reflected_projection_transform( + rmm::device_uvector& primal_step_size) +{ + cub::DeviceTransform::Transform( + cuda::std::make_tuple(current_saddle_point_state_.get_primal_solution().data(), + problem_ptr->objective_coefficients.data(), + current_saddle_point_state_.get_current_AtY().data(), + problem_ptr->variable_bounds.data()), + reflected_primal_.data(), + primal_size_h_, + primal_reflected_projection(primal_step_size.data()), + stream_view_.value()); +} + template struct primal_reflected_projection_batch { using f_t2 = typename type_2::type; @@ -598,6 +613,21 @@ struct dual_reflected_major_projection { const f_t* scalar_; }; +template +void pdhg_solver_t::dual_reflected_major_projection_transform( + rmm::device_uvector& dual_step_size) +{ + cub::DeviceTransform::Transform( + cuda::std::make_tuple(current_saddle_point_state_.get_dual_solution().data(), + current_saddle_point_state_.get_dual_gradient().data(), + problem_ptr->constraint_lower_bounds.data(), + problem_ptr->constraint_upper_bounds.data()), + 
thrust::make_zip_iterator(potential_next_dual_solution_.data(), reflected_dual_.data()), + dual_size_h_, + dual_reflected_major_projection(dual_step_size.data()), + stream_view_.value()); +} + template struct dual_reflected_major_projection_batch { HDI thrust::tuple operator()( @@ -626,6 +656,21 @@ struct dual_reflected_projection { const f_t* scalar_; }; +template +void pdhg_solver_t::dual_reflected_projection_transform( + rmm::device_uvector& dual_step_size) +{ + cub::DeviceTransform::Transform( + cuda::std::make_tuple(current_saddle_point_state_.get_dual_solution().data(), + current_saddle_point_state_.get_dual_gradient().data(), + problem_ptr->constraint_lower_bounds.data(), + problem_ptr->constraint_upper_bounds.data()), + reflected_dual_.data(), + dual_size_h_, + dual_reflected_projection(dual_step_size.data()), + stream_view_.value()); +} + template struct dual_reflected_projection_batch { HDI f_t @@ -989,16 +1034,15 @@ void pdhg_solver_t::compute_next_primal_dual_solution_reflected( // Compute next dual compute_A_x(); - if (!batch_mode_) { - cub::DeviceTransform::Transform( - cuda::std::make_tuple(current_saddle_point_state_.get_dual_solution().data(), - current_saddle_point_state_.get_dual_gradient().data(), - problem_ptr->constraint_lower_bounds.data(), - problem_ptr->constraint_upper_bounds.data()), - thrust::make_zip_iterator(potential_next_dual_solution_.data(), reflected_dual_.data()), - dual_size_h_, - dual_reflected_major_projection(dual_step_size.data()), - stream_view_.value()); + if (mgpu_engine_ != nullptr) { + for (auto& shard : mgpu_engine_->shards) { + raft::device_setter guard(shard->device_id); + auto& sub_pdlp = *shard->sub_pdlp; + sub_pdlp.pdhg_solver_.dual_reflected_major_projection_transform( + sub_pdlp.get_dual_step_size()); + } + } else if (!batch_mode_) { + dual_reflected_major_projection_transform(dual_step_size); } else { cub::DeviceFor::Bulk(potential_next_dual_solution_.size(), dual_reflected_major_projection_bulk_op{ @@ -1036,16 
+1080,15 @@ void pdhg_solver_t::compute_next_primal_dual_solution_reflected( current_saddle_point_state_.get_current_AtY()); #endif - if (!batch_mode_) { - cub::DeviceTransform::Transform( - cuda::std::make_tuple(current_saddle_point_state_.get_primal_solution().data(), - problem_ptr->objective_coefficients.data(), - current_saddle_point_state_.get_current_AtY().data(), - problem_ptr->variable_bounds.data()), - reflected_primal_.data(), - primal_size_h_, - primal_reflected_projection(primal_step_size.data()), - stream_view_.value()); + if (mgpu_engine_ != nullptr) { + for (auto& shard : mgpu_engine_->shards) { + raft::device_setter guard(shard->device_id); + auto& sub_pdlp = *shard->sub_pdlp; + sub_pdlp.pdhg_solver_.primal_reflected_projection_transform( + sub_pdlp.get_primal_step_size()); + } + } else if (!batch_mode_) { + primal_reflected_projection_transform(primal_step_size); } else { cub::DeviceFor::Bulk(reflected_primal_.size(), primal_reflected_projection_bulk_op{ @@ -1097,16 +1140,15 @@ void pdhg_solver_t::compute_next_primal_dual_solution_reflected( // Compute next dual compute_A_x(); - if (!batch_mode_) { - cub::DeviceTransform::Transform( - cuda::std::make_tuple(current_saddle_point_state_.get_dual_solution().data(), - current_saddle_point_state_.get_dual_gradient().data(), - problem_ptr->constraint_lower_bounds.data(), - problem_ptr->constraint_upper_bounds.data()), - reflected_dual_.data(), - dual_size_h_, - dual_reflected_projection(dual_step_size.data()), - stream_view_.value()); + if (mgpu_engine_ != nullptr) { + for (auto& shard : mgpu_engine_->shards) { + raft::device_setter guard(shard->device_id); + auto& sub_pdlp = *shard->sub_pdlp; + sub_pdlp.pdhg_solver_.dual_reflected_projection_transform( + sub_pdlp.get_dual_step_size()); + } + } else if (!batch_mode_) { + dual_reflected_projection_transform(dual_step_size); } else { cub::DeviceFor::Bulk(reflected_dual_.size(), dual_reflected_projection_bulk_op{ diff --git a/cpp/src/pdlp/pdhg.hpp 
b/cpp/src/pdlp/pdhg.hpp index 3a1795ce6f..628c3897e2 100644 --- a/cpp/src/pdlp/pdhg.hpp +++ b/cpp/src/pdlp/pdhg.hpp @@ -91,6 +91,9 @@ class pdhg_solver_t { // "if (mgpu_engine_) for shard..." blocks in compute_next_*). void primal_reflected_major_projection_transform( rmm::device_uvector& primal_step_size); + void dual_reflected_major_projection_transform(rmm::device_uvector& dual_step_size); + void primal_reflected_projection_transform(rmm::device_uvector& primal_step_size); + void dual_reflected_projection_transform(rmm::device_uvector& dual_step_size); // Master PDLP wires up the engine pointer here after the engine is built. // Shards' pdhg_solver_ leaves this null so each shard runs single-GPU SpMV diff --git a/cpp/src/pdlp/pdlp.cu b/cpp/src/pdlp/pdlp.cu index 168f997724..37de2d8537 100644 --- a/cpp/src/pdlp/pdlp.cu +++ b/cpp/src/pdlp/pdlp.cu @@ -42,6 +42,7 @@ #include #include +#include #include namespace cuopt::linear_programming::detail { @@ -327,6 +328,19 @@ pdlp_solver_t::pdlp_solver_t(problem_t& op_problem, cuopt_expects(num_gpus == settings.num_gpus && settings.num_gpus > 1, error_type_t::ValidationError, "This constructor should only be used for distributed PDLP (num_gpus > 1)"); + + // Distributed PDLP is currently double-only. The body is guarded with + // `if constexpr` so the float instantiation never references the + // multi_gpu_engine_t / partition_loader_t symbols + // (those are intentionally not instantiated in their .cu files), keeping + // the link clean. Trying to use distributed PDLP with f_t = float will + // throw at runtime instead. + if constexpr (!std::is_same_v) { + cuopt_expects(false, + error_type_t::ValidationError, + "Distributed PDLP (num_gpus > 1) currently requires double precision"); + return; + } else { // 2. 
Load partition std::vector parts; if (!settings.multi_gpu_partition_file.empty()) { @@ -509,6 +523,7 @@ pdlp_solver_t::pdlp_solver_t(problem_t& op_problem, shard->stream.view()); } } + } // end if constexpr (std::is_same_v) } template From de19f38f5e771e8470d0ae8e711676fb47912c4f Mon Sep 17 00:00:00 2001 From: Bulle Mostovoi Date: Fri, 22 May 2026 13:40:31 +0200 Subject: [PATCH 20/67] support spmvop on multi-gpu --- cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.hpp | 4 ++-- cpp/src/pdlp/distributed_pdlp/shard.cu | 2 ++ 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.hpp b/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.hpp index 9ea007947e..e9f48b9666 100644 --- a/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.hpp +++ b/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.hpp @@ -223,7 +223,7 @@ struct multi_gpu_engine_t { { halo_exchange_var(); for_each_shard([&](auto& shard) { - shard.sub_pdlp->pdhg_solver_.compute_A_x(); + shard.sub_pdlp->pdhg_solver_.spmvop_A_x(); }); } @@ -232,7 +232,7 @@ struct multi_gpu_engine_t { { halo_exchange_cstr(); for_each_shard([&](auto& shard) { - shard.sub_pdlp->pdhg_solver_.compute_At_y(); + shard.sub_pdlp->pdhg_solver_.spmvop_At_y(); }); } diff --git a/cpp/src/pdlp/distributed_pdlp/shard.cu b/cpp/src/pdlp/distributed_pdlp/shard.cu index bbc02559cf..06c6f8c8de 100644 --- a/cpp/src/pdlp/distributed_pdlp/shard.cu +++ b/cpp/src/pdlp/distributed_pdlp/shard.cu @@ -195,6 +195,8 @@ pdlp_shard_t::pdlp_shard_t(int device_id, scaling.set_h_bound_rescaling(h_bound_rescaling); scaling.set_h_objective_rescaling(h_objective_rescaling); + sub_pdlp->pdhg_solver_.get_cusparse_view().create_spmv_op_plans( + /* is_reflected */ true); // ---- 6. Build per-peer halo-exchange plans (ported from metis_tests). 
---- // For each peer p, we precompute: // send_indices_d[p] : local indices to gather (uploaded from host send plan) From 0030a6c5d7b3f9e22c3da791c25a09869679f3e0 Mon Sep 17 00:00:00 2001 From: Bulle Mostovoi Date: Fri, 22 May 2026 14:06:47 +0200 Subject: [PATCH 21/67] compile ready --- .../initial_scaling_strategy/initial_scaling.cu | 14 ++++++++++---- cpp/src/pdlp/pdhg.hpp | 6 +++--- cpp/src/pdlp/pdlp.cu | 4 +++- 3 files changed, 16 insertions(+), 8 deletions(-) diff --git a/cpp/src/pdlp/initial_scaling_strategy/initial_scaling.cu b/cpp/src/pdlp/initial_scaling_strategy/initial_scaling.cu index eb1bae2e95..fd6e02079e 100644 --- a/cpp/src/pdlp/initial_scaling_strategy/initial_scaling.cu +++ b/cpp/src/pdlp/initial_scaling_strategy/initial_scaling.cu @@ -938,15 +938,21 @@ void pdlp_initial_scaling_strategy_t::set_cummulative_scaling( template void pdlp_initial_scaling_strategy_t::set_h_bound_rescaling(f_t value) { - h_bound_rescaling = value; - bound_rescaling_.set_value_async(value, stream_view_); + std::fill(h_bound_rescaling_.begin(), h_bound_rescaling_.end(), value); + thrust::fill(handle_ptr_->get_thrust_policy(), + bound_rescaling_.begin(), + bound_rescaling_.end(), + value); } template void pdlp_initial_scaling_strategy_t::set_h_objective_rescaling(f_t value) { - h_objective_rescaling = value; - objective_rescaling_.set_value_async(value, stream_view_); + std::fill(h_objective_rescaling_.begin(), h_objective_rescaling_.end(), value); + thrust::fill(handle_ptr_->get_thrust_policy(), + objective_rescaling_.begin(), + objective_rescaling_.end(), + value); } template diff --git a/cpp/src/pdlp/pdhg.hpp b/cpp/src/pdlp/pdhg.hpp index 7b2e606864..8226d2cecc 100644 --- a/cpp/src/pdlp/pdhg.hpp +++ b/cpp/src/pdlp/pdhg.hpp @@ -91,7 +91,9 @@ class pdhg_solver_t { // cusparse path on the local matrix. void compute_At_y(); void compute_A_x(); - + void spmvop_At_y(); + void spmvop_A_x(); + // Pure cub-transform extractions. 
Each one is byte-identical to the inline // cub call it replaces — no platform dispatch inside. Callers handle the // single-GPU vs per-shard branching at the call site (see the @@ -124,8 +126,6 @@ class pdhg_solver_t { void compute_primal_projection_with_gradient(rmm::device_uvector& primal_step_size); void compute_primal_projection(rmm::device_uvector& primal_step_size); - void spmvop_At_y(); - void spmvop_A_x(); bool batch_mode_{false}; raft::handle_t const* handle_ptr_{nullptr}; diff --git a/cpp/src/pdlp/pdlp.cu b/cpp/src/pdlp/pdlp.cu index 8ceb712aff..ec7ed16c30 100644 --- a/cpp/src/pdlp/pdlp.cu +++ b/cpp/src/pdlp/pdlp.cu @@ -567,6 +567,7 @@ pdlp_solver_t::pdlp_solver_t(problem_t& op_problem, // Project initial primal solution if (settings_.hyper_params.project_initial_primal) { + // Use refine_initial_primal_projection ??? using f_t2 = typename type_2::type; for (auto& shard : multi_gpu_engine->shards) { raft::device_setter guard(shard->device_id); @@ -2672,7 +2673,8 @@ optimization_problem_solution_t pdlp_solver_t::run_solver(co clamp(), stream_view_.value()); - pdhg_solver_.refine_initial_primal_projection(); + pdhg_solver_.refine_initial_primal_projection( + initial_scaling_strategy_.get_bound_rescaling_vector()); if (!settings_.hyper_params.never_restart_to_average) { cuopt_expects(!batch_mode_, From 172ebc29da1eb892da5a2fe22d2df1f57d93f773 Mon Sep 17 00:00:00 2001 From: Bulle Mostovoi Date: Fri, 22 May 2026 14:14:10 +0200 Subject: [PATCH 22/67] can run now --- cpp/src/pdlp/distributed_pdlp/shard.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/pdlp/distributed_pdlp/shard.cu b/cpp/src/pdlp/distributed_pdlp/shard.cu index 06c6f8c8de..c66b03755e 100644 --- a/cpp/src/pdlp/distributed_pdlp/shard.cu +++ b/cpp/src/pdlp/distributed_pdlp/shard.cu @@ -225,6 +225,6 @@ pdlp_shard_t::pdlp_shard_t(int device_id, } template struct pdlp_shard_t; -// template struct pdlp_shard_t; +template struct pdlp_shard_t; } // namespace 
cuopt::linear_programming::detail From 23d07981d0dd7b84d953437aafd921292f4db4d8 Mon Sep 17 00:00:00 2001 From: Bulle Mostovoi Date: Fri, 22 May 2026 14:57:51 +0200 Subject: [PATCH 23/67] passing all tests, good merge --- cpp/src/pdlp/pdlp.cu | 33 ++++++++++++++++++++++++++------- 1 file changed, 26 insertions(+), 7 deletions(-) diff --git a/cpp/src/pdlp/pdlp.cu b/cpp/src/pdlp/pdlp.cu index ec7ed16c30..b5fe5ad6ca 100644 --- a/cpp/src/pdlp/pdlp.cu +++ b/cpp/src/pdlp/pdlp.cu @@ -2665,13 +2665,31 @@ optimization_problem_solution_t pdlp_solver_t::run_solver(co // Project initial primal solution if (settings_.hyper_params.project_initial_primal) { using f_t2 = typename type_2::type; - cub::DeviceTransform::Transform( - cuda::std::make_tuple(pdhg_solver_.get_primal_solution().data(), - problem_wrap_container(op_problem_scaled_.variable_bounds)), - pdhg_solver_.get_primal_solution().data(), - pdhg_solver_.get_primal_solution().size(), - clamp(), - stream_view_.value()); + if (batch_mode_) { + // In batch mode variable_bounds are shared and only the bound rescaling is per climber. + // Apply it here too so the initial point is projected into the correct scaled space. 
+ cub::DeviceTransform::Transform( + cuda::std::make_tuple( + pdhg_solver_.get_primal_solution().data(), + thrust::make_transform_iterator( + thrust::make_zip_iterator( + problem_wrap_container(op_problem_scaled_.variable_bounds), + batch_wrapped_container(initial_scaling_strategy_.get_bound_rescaling_vector(), + primal_size_h_)), + scale_bounds_by_scalar_op{})), + pdhg_solver_.get_primal_solution().data(), + pdhg_solver_.get_primal_solution().size(), + clamp(), + stream_view_.value()); + } else { + cub::DeviceTransform::Transform( + cuda::std::make_tuple(pdhg_solver_.get_primal_solution().data(), + problem_wrap_container(op_problem_scaled_.variable_bounds)), + pdhg_solver_.get_primal_solution().data(), + pdhg_solver_.get_primal_solution().size(), + clamp(), + stream_view_.value()); + } pdhg_solver_.refine_initial_primal_projection( initial_scaling_strategy_.get_bound_rescaling_vector()); @@ -2718,6 +2736,7 @@ optimization_problem_solution_t pdlp_solver_t::run_solver(co restart_strategy_.last_restart_duality_gap_.dual_solution_, dummy); } + transpose_problem_fields(/*to_row=*/true); } if (verbose) { From 30881ce2393292d2d4b7422f682857df074798c7 Mon Sep 17 00:00:00 2001 From: Bulle Mostovoi Date: Fri, 22 May 2026 16:24:24 +0200 Subject: [PATCH 24/67] fixed the errors hihi, finished distributed part for compte_fixed_error --- .../distributed_pdlp/multi_gpu_engine.hpp | 99 ++++++++++--- cpp/src/pdlp/pdhg.cu | 45 ++++++ cpp/src/pdlp/pdhg.hpp | 9 ++ cpp/src/pdlp/pdlp.cu | 133 ++++++++++++++---- .../adaptive_step_size_strategy.cu | 20 +++ .../adaptive_step_size_strategy.hpp | 7 + 6 files changed, 266 insertions(+), 47 deletions(-) diff --git a/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.hpp b/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.hpp index e9f48b9666..6d9cf9d3a3 100644 --- a/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.hpp +++ b/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.hpp @@ -102,10 +102,12 @@ struct multi_gpu_engine_t { // -------- Halo exchange 
(variables / x) --------------------------------- // Fills the halo slice [owned_var_size, total_var_size) of the per-shard - // reflected_primal vector (the buffer A @ x reads). Step 1: thrust::gather - // per-peer outgoing values into staging buffers. Step 2: a single NCCL - // group with matched ncclSend / ncclRecv across all (rank, peer) pairs. - void halo_exchange_var() + // input buffer returned by `buf_access(pdhg)` (the buffer A @ x will read). + // Step 1: thrust::gather per-peer outgoing values into staging buffers. + // Step 2: a single NCCL group with matched ncclSend / ncclRecv across all + // (rank, peer) pairs. + template + void halo_exchange_var(BufAccess&& buf_access) { const int nb = static_cast(shards.size()); @@ -113,7 +115,7 @@ struct multi_gpu_engine_t { for (int r = 0; r < nb; ++r) { auto& s = *shards[r]; raft::device_setter guard(s.device_id); - auto& x = s.sub_pdlp->pdhg_solver_.get_reflected_primal(); + auto& x = buf_access(s.sub_pdlp->pdhg_solver_); for (int peer = 0; peer < nb; ++peer) { if (peer == r) continue; if (s.var_send_indices_d[peer].size() == 0) continue; @@ -144,7 +146,7 @@ struct multi_gpu_engine_t { auto& s = *shards[r]; auto& rd = s.rank_data; raft::device_setter guard(s.device_id); - auto& x = s.sub_pdlp->pdhg_solver_.get_reflected_primal(); + auto& x = buf_access(s.sub_pdlp->pdhg_solver_); for (int peer = 0; peer < nb; ++peer) { if (peer == r) continue; f_t* recv_ptr = x.data() + rd.owned_var_size + rd.var_recv_offsets[peer]; @@ -160,16 +162,17 @@ struct multi_gpu_engine_t { } // -------- Halo exchange (constraints / y) ------------------------------- - // Same as halo_exchange_var but for the per-shard dual solution (the buffer - // A_T @ y reads) and constraint halos. - void halo_exchange_cstr() + // Same as halo_exchange_var but for a constraint-shaped buffer (the input + // A_T @ y will read) and constraint halos. 
+ template + void halo_exchange_cstr(BufAccess&& buf_access) { const int nb = static_cast(shards.size()); for (int r = 0; r < nb; ++r) { auto& s = *shards[r]; raft::device_setter guard(s.device_id); - auto& y = s.sub_pdlp->pdhg_solver_.get_dual_solution(); + auto& y = buf_access(s.sub_pdlp->pdhg_solver_); for (int peer = 0; peer < nb; ++peer) { if (peer == r) continue; if (s.cstr_send_indices_d[peer].size() == 0) continue; @@ -199,7 +202,7 @@ struct multi_gpu_engine_t { auto& s = *shards[r]; auto& rd = s.rank_data; raft::device_setter guard(s.device_id); - auto& y = s.sub_pdlp->pdhg_solver_.get_dual_solution(); + auto& y = buf_access(s.sub_pdlp->pdhg_solver_); for (int peer = 0; peer < nb; ++peer) { if (peer == r) continue; f_t* recv_ptr = y.data() + rd.owned_cstr_size + rd.cstr_recv_offsets[peer]; @@ -214,28 +217,78 @@ struct multi_gpu_engine_t { ncclGroupEnd(); } - // -------- High-level: A @ x and A_T @ y --------------------------------- - // A @ x: halo-update the reflected_primal vector, then per-shard SpMV. - // Named distributed_* (rather than compute_*) to make call sites in pdhg.cu - // self-documenting and to avoid name collision with pdhg_solver_t's own - // compute_A_x / compute_At_y, which the engine dispatches into per shard. - void distributed_compute_A_x() + // -------- NCCL allreduce (sum, in place) -------------------------------- + // Per-shard in-place sum-allreduce. Each shard's stream issues an + // ncclAllReduce(buf, buf, count, ncclFloat64, ncclSum, ...) inside a single + // group. After this returns, every shard's buffer holds the global sum. + // + // PtrAccess: pdlp_solver_t& -> f_t* (e.g. into step_size_strategy_). 
+ template + void allreduce_sum_inplace(PtrAccess&& ptr_access, size_t count = 1) + { + ncclGroupStart(); + for (auto& s : shards) { + raft::device_setter guard(s->device_id); + f_t* buf = ptr_access(*s->sub_pdlp); + ncclAllReduce(buf, + buf, + count, + ncclFloat64, + ncclSum, + s->comm.get(), + s->stream.view().value()); + } + ncclGroupEnd(); + } + + // -------- Generic distributed SpMVs ------------------------------------- + // distributed_spmv_A : halo-update the var-shaped input buffer returned by + // `in_buf(pdhg)`, then per-shard A @ in_buf -> out_desc. + // distributed_spmv_At: halo-update the cstr-shaped input buffer returned by + // `in_buf(pdhg)`, then per-shard A_T @ in_buf -> out_desc. + // + // Accessor signatures: + // in_buf (pdhg_solver_t&) -> rmm::device_uvector& + // out_desc(pdhg_solver_t&) -> cusparseDnVecDescr_t + template + void distributed_spmv_A(InBufAccess&& in_buf, OutDescAccess&& out_desc) { - halo_exchange_var(); + halo_exchange_var(in_buf); for_each_shard([&](auto& shard) { - shard.sub_pdlp->pdhg_solver_.spmvop_A_x(); + auto& sub_pdhg = shard.sub_pdlp->pdhg_solver_; + sub_pdhg.spmv_A_into(in_buf(sub_pdhg), out_desc(sub_pdhg)); }); } - // A_T @ y: halo-update the dual solution vector, then per-shard SpMV. - void distributed_compute_At_y() + template + void distributed_spmv_At(InBufAccess&& in_buf, OutDescAccess&& out_desc) { - halo_exchange_cstr(); + halo_exchange_cstr(in_buf); for_each_shard([&](auto& shard) { - shard.sub_pdlp->pdhg_solver_.spmvop_At_y(); + auto& sub_pdhg = shard.sub_pdlp->pdhg_solver_; + sub_pdhg.spmv_At_into(in_buf(sub_pdhg), out_desc(sub_pdhg)); }); } + // -------- High-level: A @ x and A_T @ y --------------------------------- + // Thin wrappers used from pdhg_solver_t::compute_A_x / compute_At_y when an + // engine is wired in. They use the canonical PDHG buffers/descriptors so the + // result lands where single-GPU PDHG would have put it (dual_gradient for A, + // current_AtY for A_T). 
+ void distributed_compute_A_x() + { + distributed_spmv_A( + [](auto& pdhg) -> rmm::device_uvector& { return pdhg.get_reflected_primal(); }, + [](auto& pdhg) -> cusparseDnVecDescr_t { return pdhg.get_cusparse_view().dual_gradient; }); + } + + void distributed_compute_At_y() + { + distributed_spmv_At( + [](auto& pdhg) -> rmm::device_uvector& { return pdhg.get_dual_solution(); }, + [](auto& pdhg) -> cusparseDnVecDescr_t { return pdhg.get_cusparse_view().current_AtY; }); + } + // Engine-level stream for fork/join orchestration (master side). rmm::cuda_stream stream; diff --git a/cpp/src/pdlp/pdhg.cu b/cpp/src/pdlp/pdhg.cu index fb0fc9b611..56c61aedda 100644 --- a/cpp/src/pdlp/pdhg.cu +++ b/cpp/src/pdlp/pdhg.cu @@ -623,6 +623,51 @@ void pdhg_solver_t::compute_A_x() } } +template +void pdhg_solver_t::spmv_At_into(rmm::device_uvector& in_buf, + cusparseDnVecDescr_t out_desc) +{ + RAFT_CUSPARSE_TRY(cusparseDnVecSetValues(cusparse_view_.dual_solution, in_buf.data())); + RAFT_CUSPARSE_TRY( + raft::sparse::detail::cusparsespmv(handle_ptr_->get_cusparse_handle(), + CUSPARSE_OPERATION_NON_TRANSPOSE, + reusable_device_scalar_value_1_.data(), + cusparse_view_.A_T, + cusparse_view_.dual_solution, + reusable_device_scalar_value_0_.data(), + out_desc, + CUSPARSE_SPMV_CSR_ALG2, + (f_t*)cusparse_view_.buffer_transpose.data(), + stream_view_)); + // Restore the canonical binding so subsequent code on this shard that reads + // cv.dual_solution sees the dual_solution_ buffer it was constructed with. 
+ RAFT_CUSPARSE_TRY(cusparseDnVecSetValues( + cusparse_view_.dual_solution, current_saddle_point_state_.get_dual_solution().data())); +} + +template +void pdhg_solver_t::spmv_A_into(rmm::device_uvector& in_buf, + cusparseDnVecDescr_t out_desc) +{ + RAFT_CUSPARSE_TRY( + cusparseDnVecSetValues(cusparse_view_.reflected_primal_solution, in_buf.data())); + RAFT_CUSPARSE_TRY( + raft::sparse::detail::cusparsespmv(handle_ptr_->get_cusparse_handle(), + CUSPARSE_OPERATION_NON_TRANSPOSE, + reusable_device_scalar_value_1_.data(), + cusparse_view_.A, + cusparse_view_.reflected_primal_solution, + reusable_device_scalar_value_0_.data(), + out_desc, + CUSPARSE_SPMV_CSR_ALG2, + (f_t*)cusparse_view_.buffer_non_transpose.data(), + stream_view_)); + // Restore the canonical binding so subsequent code on this shard that reads + // cv.reflected_primal_solution sees the reflected_primal_ buffer. + RAFT_CUSPARSE_TRY( + cusparseDnVecSetValues(cusparse_view_.reflected_primal_solution, reflected_primal_.data())); +} + template void pdhg_solver_t::compute_primal_projection_with_gradient( rmm::device_uvector& primal_step_size) diff --git a/cpp/src/pdlp/pdhg.hpp b/cpp/src/pdlp/pdhg.hpp index 8226d2cecc..8fbee24e71 100644 --- a/cpp/src/pdlp/pdhg.hpp +++ b/cpp/src/pdlp/pdhg.hpp @@ -93,6 +93,15 @@ class pdhg_solver_t { void compute_A_x(); void spmvop_At_y(); void spmvop_A_x(); + + // Parameterized SpMVs used by the multi-GPU engine. + // Both temporarily hijack a canonical input descriptor in cusparse_view_ + // (cv.dual_solution for At, cv.reflected_primal_solution for A) to point at + // `in_buf.data()`, run the local SpMV into `out_desc`, then restore the + // descriptor to its original buffer so other code on this shard is unaffected. + // No multi-GPU dispatch inside — the engine is the orchestrator. + void spmv_At_into(rmm::device_uvector& in_buf, cusparseDnVecDescr_t out_desc); + void spmv_A_into(rmm::device_uvector& in_buf, cusparseDnVecDescr_t out_desc); // Pure cub-transform extractions. 
Each one is byte-identical to the inline // cub call it replaces — no platform dispatch inside. Callers handle the diff --git a/cpp/src/pdlp/pdlp.cu b/cpp/src/pdlp/pdlp.cu index b5fe5ad6ca..7203c11a42 100644 --- a/cpp/src/pdlp/pdlp.cu +++ b/cpp/src/pdlp/pdlp.cu @@ -2221,34 +2221,118 @@ void pdlp_solver_t::compute_fixed_error(std::vector& has_restarte // Computing the deltas // TODO batch mdoe: this only works if everyone restarts - cub::DeviceTransform::Transform(cuda::std::make_tuple(pdhg_solver_.get_reflected_primal().data(), - pdhg_solver_.get_primal_solution().data()), - pdhg_solver_.get_saddle_point_state().get_delta_primal().data(), - pdhg_solver_.get_primal_solution().size(), - cuda::std::minus{}, - stream_view_.value()); - cub::DeviceTransform::Transform(cuda::std::make_tuple(pdhg_solver_.get_reflected_dual().data(), - pdhg_solver_.get_dual_solution().data()), - pdhg_solver_.get_saddle_point_state().get_delta_dual().data(), - pdhg_solver_.get_dual_solution().size(), - cuda::std::minus{}, - stream_view_.value()); + if (multi_gpu_engine) { + // Go faire une fonction compute_delta_primal, compute_delta primal ? 
+ for (auto& shard : multi_gpu_engine->shards) { + raft::device_setter guard(shard->device_id); + auto& sub_pdhg = shard->sub_pdlp->pdhg_solver_; + cub::DeviceTransform::Transform( + cuda::std::make_tuple(sub_pdhg.get_reflected_primal().data(), + sub_pdhg.get_primal_solution().data()), + sub_pdhg.get_saddle_point_state().get_delta_primal().data(), + sub_pdhg.get_primal_solution().size(), + cuda::std::minus{}, + shard->stream.view()); + cub::DeviceTransform::Transform( + cuda::std::make_tuple(sub_pdhg.get_reflected_dual().data(), + sub_pdhg.get_dual_solution().data()), + sub_pdhg.get_saddle_point_state().get_delta_dual().data(), + sub_pdhg.get_dual_solution().size(), + cuda::std::minus{}, + shard->stream.view()); + } + } else { + cub::DeviceTransform::Transform( + cuda::std::make_tuple(pdhg_solver_.get_reflected_primal().data(), + pdhg_solver_.get_primal_solution().data()), + pdhg_solver_.get_saddle_point_state().get_delta_primal().data(), + pdhg_solver_.get_primal_solution().size(), + cuda::std::minus{}, + stream_view_.value()); + cub::DeviceTransform::Transform( + cuda::std::make_tuple(pdhg_solver_.get_reflected_dual().data(), + pdhg_solver_.get_dual_solution().data()), + pdhg_solver_.get_saddle_point_state().get_delta_dual().data(), + pdhg_solver_.get_dual_solution().size(), + cuda::std::minus{}, + stream_view_.value()); + } auto& cusparse_view = pdhg_solver_.get_cusparse_view(); - // Sync to make sure all previous cuSparse operations are finished before setting the - // potential_next_dual_solution - RAFT_CUDA_TRY(cudaStreamSynchronize(stream_view_)); - // Make potential_next_dual_solution point towards reflected dual solution to reuse the code - RAFT_CUSPARSE_TRY(cusparseDnVecSetValues(cusparse_view.potential_next_dual_solution, - (void*)pdhg_solver_.get_reflected_dual().data())); + if (multi_gpu_engine) { - if (batch_mode_) - RAFT_CUSPARSE_TRY(cusparseDnMatSetValues(cusparse_view.batch_potential_next_dual_solution, + // SpMV is the first operation in 
compute_interaction_and_movement so we can do halo before and call it naturally + // we then reduce the local dot products + multi_gpu_engine->halo_exchange_cstr( + [](auto& pdhg) -> rmm::device_uvector& { return pdhg.get_reflected_dual(); }); + + for (auto& shard : multi_gpu_engine->shards) { + raft::device_setter guard(shard->device_id); + auto& sub_pdlp = *shard->sub_pdlp; + auto& sub_cv = sub_pdlp.pdhg_solver_.get_cusparse_view(); + + RAFT_CUSPARSE_TRY( + cusparseDnVecSetValues(sub_cv.potential_next_dual_solution, + (void*)sub_pdlp.pdhg_solver_.get_reflected_dual().data())); + + sub_pdlp.step_size_strategy_.compute_interaction_and_movement( + sub_pdlp.pdhg_solver_.get_primal_tmp_resource(), + sub_cv, + sub_pdlp.pdhg_solver_.get_saddle_point_state()); + + RAFT_CUSPARSE_TRY(cusparseDnVecSetValues( + sub_cv.potential_next_dual_solution, + (void*)sub_pdlp.pdhg_solver_.get_potential_next_dual_solution().data())); + } + + multi_gpu_engine->allreduce_sum_inplace( + [](auto& sp) -> f_t* { return sp.step_size_strategy_.get_interaction().data(); }, 1); + multi_gpu_engine->allreduce_sum_inplace( + [](auto& sp) -> f_t* { + return sp.step_size_strategy_.get_norm_squared_delta_primal().data(); + }, + 1); + multi_gpu_engine->allreduce_sum_inplace( + [](auto& sp) -> f_t* { + return sp.step_size_strategy_.get_norm_squared_delta_dual().data(); + }, + 1); + + auto& s0 = *multi_gpu_engine->shards[0]; + { + raft::device_setter guard(s0.device_id); + RAFT_CUDA_TRY(cudaStreamSynchronize(s0.stream.view().value())); + } + auto& src_sp = s0.sub_pdlp->step_size_strategy_; + raft::copy(step_size_strategy_.get_interaction().data(), + src_sp.get_interaction().data(), + 1, + stream_view_); + raft::copy(step_size_strategy_.get_norm_squared_delta_primal().data(), + src_sp.get_norm_squared_delta_primal().data(), + 1, + stream_view_); + raft::copy(step_size_strategy_.get_norm_squared_delta_dual().data(), + src_sp.get_norm_squared_delta_dual().data(), + 1, + stream_view_); + } else { + // Sync to 
make sure all previous cuSparse operations are finished before setting the + // potential_next_dual_solution + RAFT_CUDA_TRY(cudaStreamSynchronize(stream_view_)); + + // Make potential_next_dual_solution point towards reflected dual solution to reuse the code + RAFT_CUSPARSE_TRY(cusparseDnVecSetValues(cusparse_view.potential_next_dual_solution, (void*)pdhg_solver_.get_reflected_dual().data())); - step_size_strategy_.compute_interaction_and_movement( - pdhg_solver_.get_primal_tmp_resource(), cusparse_view, pdhg_solver_.get_saddle_point_state()); + if (batch_mode_) + RAFT_CUSPARSE_TRY(cusparseDnMatSetValues(cusparse_view.batch_potential_next_dual_solution, + (void*)pdhg_solver_.get_reflected_dual().data())); + + step_size_strategy_.compute_interaction_and_movement( + pdhg_solver_.get_primal_tmp_resource(), cusparse_view, pdhg_solver_.get_saddle_point_state()); + } if (batch_mode_) { const auto [grid_size, block_size] = kernel_config_from_batch_size(climber_strategies_.size()); @@ -2279,11 +2363,12 @@ void pdlp_solver_t::compute_fixed_error(std::vector& has_restarte // potential_next_dual_solution RAFT_CUDA_TRY(cudaStreamSynchronize(stream_view_)); - // Put back + // Put back, already done in multi-gpu side + if (!multi_gpu_engine) { RAFT_CUSPARSE_TRY( cusparseDnVecSetValues(cusparse_view.potential_next_dual_solution, (void*)pdhg_solver_.get_potential_next_dual_solution().data())); - + } if (batch_mode_) { RAFT_CUSPARSE_TRY( cusparseDnMatSetValues(cusparse_view.batch_potential_next_dual_solution, diff --git a/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.cu b/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.cu index 1f137dc9ea..fb85be4280 100644 --- a/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.cu +++ b/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.cu @@ -309,6 +309,26 @@ adaptive_step_size_strategy_t::get_norm_squared_delta_dual() const return norm_squared_delta_dual_; } +template +rmm::device_uvector& 
adaptive_step_size_strategy_t::get_interaction() +{ + return interaction_; +} + +template +rmm::device_uvector& +adaptive_step_size_strategy_t::get_norm_squared_delta_primal() +{ + return norm_squared_delta_primal_; +} + +template +rmm::device_uvector& +adaptive_step_size_strategy_t::get_norm_squared_delta_dual() +{ + return norm_squared_delta_dual_; +} + template void adaptive_step_size_strategy_t::set_valid_step_size(i_t valid) { diff --git a/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.hpp b/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.hpp index 1e969150e7..896c6fa24e 100644 --- a/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.hpp +++ b/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.hpp @@ -81,6 +81,13 @@ class adaptive_step_size_strategy_t { const rmm::device_uvector& get_norm_squared_delta_primal() const; const rmm::device_uvector& get_norm_squared_delta_dual() const; + // Mutable overloads — used by the multi-GPU path to NCCL-allreduce the + // per-shard scalar contributions in place and to mirror them back to the + // master step_size_strategy_. 
+ rmm::device_uvector& get_interaction(); + rmm::device_uvector& get_norm_squared_delta_primal(); + rmm::device_uvector& get_norm_squared_delta_dual(); + void compute_interaction_and_movement(rmm::device_uvector& tmp_primal, cusparse_view_t& cusparse_view, saddle_point_state_t& current_saddle_point_state); From c33faf2d4d0ce0b00390553bae0c9c6e70b0c03d Mon Sep 17 00:00:00 2001 From: Bulle Mostovoi Date: Fri, 22 May 2026 16:27:00 +0200 Subject: [PATCH 25/67] style --- .../distributed_pdlp/multi_gpu_engine.hpp | 60 +-- .../pdlp/distributed_pdlp/partition_loader.cu | 12 +- cpp/src/pdlp/distributed_pdlp/shard.cu | 6 +- .../initial_scaling.cu | 6 +- cpp/src/pdlp/pdhg.cu | 30 +- cpp/src/pdlp/pdhg.hpp | 5 +- cpp/src/pdlp/pdlp.cu | 424 +++++++++--------- .../adaptive_step_size_strategy.cu | 6 +- 8 files changed, 259 insertions(+), 290 deletions(-) diff --git a/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.hpp b/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.hpp index 6d9cf9d3a3..001f9b760e 100644 --- a/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.hpp +++ b/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.hpp @@ -15,9 +15,9 @@ #include #include +#include #include #include -#include #include @@ -53,51 +53,35 @@ struct multi_gpu_engine_t { multi_gpu_engine_t(const multi_gpu_engine_t&) = delete; multi_gpu_engine_t& operator=(const multi_gpu_engine_t&) = delete; - - template void for_each_shard(Fn&& fn) { for (auto& s : shards) { - raft::device_setter guard(s->device_id); - fn(*s); + raft::device_setter guard(s->device_id); + fn(*s); } } - template + template void distributed_transform(std::tuple in_accessors, - OutAccess out, - SizeAccess sz, - Op op) + OutAccess out, + SizeAccess sz, + Op op) { for_each_shard([&](auto& shard) { auto& sub = *shard.sub_pdlp; // turns the Tuple of lambdas into a tuple of rmm::device_uvector auto cub_inputs = std::apply( - [&sub](auto&... acc) { return cuda::std::make_tuple(acc(sub)...); }, - in_accessors); + [&sub](auto&... 
acc) { return cuda::std::make_tuple(acc(sub)...); }, in_accessors); - cub::DeviceTransform::Transform(cub_inputs, - out(sub), - sz(sub), - op, - shard.stream.view()); + cub::DeviceTransform::Transform(cub_inputs, out(sub), sz(sub), op, shard.stream.view()); }); } // --- 2) convenience: single input accessor (delegates) --- - template - void distributed_transform(InAccess in, - OutAccess out, - SizeAccess sz, - Op op) + template + void distributed_transform(InAccess in, OutAccess out, SizeAccess sz, Op op) { - distributed_transform(std::make_tuple(in), out, sz, op); + distributed_transform(std::make_tuple(in), out, sz, op); } // -------- Halo exchange (variables / x) --------------------------------- @@ -143,10 +127,10 @@ struct multi_gpu_engine_t { } } for (int r = 0; r < nb; ++r) { - auto& s = *shards[r]; - auto& rd = s.rank_data; + auto& s = *shards[r]; + auto& rd = s.rank_data; raft::device_setter guard(s.device_id); - auto& x = buf_access(s.sub_pdlp->pdhg_solver_); + auto& x = buf_access(s.sub_pdlp->pdhg_solver_); for (int peer = 0; peer < nb; ++peer) { if (peer == r) continue; f_t* recv_ptr = x.data() + rd.owned_var_size + rd.var_recv_offsets[peer]; @@ -199,10 +183,10 @@ struct multi_gpu_engine_t { } } for (int r = 0; r < nb; ++r) { - auto& s = *shards[r]; - auto& rd = s.rank_data; + auto& s = *shards[r]; + auto& rd = s.rank_data; raft::device_setter guard(s.device_id); - auto& y = buf_access(s.sub_pdlp->pdhg_solver_); + auto& y = buf_access(s.sub_pdlp->pdhg_solver_); for (int peer = 0; peer < nb; ++peer) { if (peer == r) continue; f_t* recv_ptr = y.data() + rd.owned_cstr_size + rd.cstr_recv_offsets[peer]; @@ -230,13 +214,7 @@ struct multi_gpu_engine_t { for (auto& s : shards) { raft::device_setter guard(s->device_id); f_t* buf = ptr_access(*s->sub_pdlp); - ncclAllReduce(buf, - buf, - count, - ncclFloat64, - ncclSum, - s->comm.get(), - s->stream.view().value()); + ncclAllReduce(buf, buf, count, ncclFloat64, ncclSum, s->comm.get(), s->stream.view().value()); } 
ncclGroupEnd(); } diff --git a/cpp/src/pdlp/distributed_pdlp/partition_loader.cu b/cpp/src/pdlp/distributed_pdlp/partition_loader.cu index 007df4ce1c..b9bc71ae9e 100644 --- a/cpp/src/pdlp/distributed_pdlp/partition_loader.cu +++ b/cpp/src/pdlp/distributed_pdlp/partition_loader.cu @@ -19,9 +19,9 @@ std::vector partition_loader_t::parse_distributed_pdlp_partition_ { std::ifstream part_file(file); cuopt_expects(part_file.is_open(), - error_type_t::ValidationError, - "Failed to open partition file: %s", - file.c_str()); + error_type_t::ValidationError, + "Failed to open partition file: %s", + file.c_str()); // One integer per line; operator>> skips whitespace so blank lines and // trailing newlines are tolerated. @@ -33,9 +33,9 @@ std::vector partition_loader_t::parse_distributed_pdlp_partition_ // We must have hit EOF cleanly; any other state means a malformed token. cuopt_expects(part_file.eof(), - error_type_t::ValidationError, - "Malformed partition file (expected one integer per line): %s", - file.c_str()); + error_type_t::ValidationError, + "Malformed partition file (expected one integer per line): %s", + file.c_str()); return parts; } diff --git a/cpp/src/pdlp/distributed_pdlp/shard.cu b/cpp/src/pdlp/distributed_pdlp/shard.cu index c66b03755e..33aac38103 100644 --- a/cpp/src/pdlp/distributed_pdlp/shard.cu +++ b/cpp/src/pdlp/distributed_pdlp/shard.cu @@ -202,9 +202,7 @@ pdlp_shard_t::pdlp_shard_t(int device_id, // send_indices_d[p] : local indices to gather (uploaded from host send plan) // send_buf_d[p] : f_t staging buffer sized to match // Self-peer slot is present but empty (size 0). Used in engine halo exchange. 
- auto build_send_plan = [&](auto const& send_per_peer, - auto& indices_d, - auto& buf_d) { + auto build_send_plan = [&](auto const& send_per_peer, auto& indices_d, auto& buf_d) { const std::size_t n_peers = send_per_peer.size(); indices_d.reserve(n_peers); buf_d.reserve(n_peers); @@ -218,7 +216,7 @@ pdlp_shard_t::pdlp_shard_t(int device_id, buf_d.emplace_back(std::move(buf)); } }; - build_send_plan(rank_data.var_send_per_peer, var_send_indices_d, var_send_buf_d); + build_send_plan(rank_data.var_send_per_peer, var_send_indices_d, var_send_buf_d); build_send_plan(rank_data.cstr_send_per_peer, cstr_send_indices_d, cstr_send_buf_d); handle.sync_stream(stream_view); diff --git a/cpp/src/pdlp/initial_scaling_strategy/initial_scaling.cu b/cpp/src/pdlp/initial_scaling_strategy/initial_scaling.cu index fd6e02079e..478753e9d9 100644 --- a/cpp/src/pdlp/initial_scaling_strategy/initial_scaling.cu +++ b/cpp/src/pdlp/initial_scaling_strategy/initial_scaling.cu @@ -939,10 +939,8 @@ template void pdlp_initial_scaling_strategy_t::set_h_bound_rescaling(f_t value) { std::fill(h_bound_rescaling_.begin(), h_bound_rescaling_.end(), value); - thrust::fill(handle_ptr_->get_thrust_policy(), - bound_rescaling_.begin(), - bound_rescaling_.end(), - value); + thrust::fill( + handle_ptr_->get_thrust_policy(), bound_rescaling_.begin(), bound_rescaling_.end(), value); } template diff --git a/cpp/src/pdlp/pdhg.cu b/cpp/src/pdlp/pdhg.cu index 56c61aedda..969f5d0d30 100644 --- a/cpp/src/pdlp/pdhg.cu +++ b/cpp/src/pdlp/pdhg.cu @@ -8,8 +8,8 @@ // pdlp.cuh defines pdlp_solver_t which the engine's compute_A_x/compute_At_y // template bodies dereference via shard.sub_pdlp->pdhg_solver_. Must be a // complete type at the point of template instantiation below. 
-#include #include +#include #include #include #include @@ -628,21 +628,20 @@ void pdhg_solver_t::spmv_At_into(rmm::device_uvector& in_buf, cusparseDnVecDescr_t out_desc) { RAFT_CUSPARSE_TRY(cusparseDnVecSetValues(cusparse_view_.dual_solution, in_buf.data())); - RAFT_CUSPARSE_TRY( - raft::sparse::detail::cusparsespmv(handle_ptr_->get_cusparse_handle(), - CUSPARSE_OPERATION_NON_TRANSPOSE, - reusable_device_scalar_value_1_.data(), - cusparse_view_.A_T, - cusparse_view_.dual_solution, - reusable_device_scalar_value_0_.data(), - out_desc, - CUSPARSE_SPMV_CSR_ALG2, - (f_t*)cusparse_view_.buffer_transpose.data(), - stream_view_)); + RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsespmv(handle_ptr_->get_cusparse_handle(), + CUSPARSE_OPERATION_NON_TRANSPOSE, + reusable_device_scalar_value_1_.data(), + cusparse_view_.A_T, + cusparse_view_.dual_solution, + reusable_device_scalar_value_0_.data(), + out_desc, + CUSPARSE_SPMV_CSR_ALG2, + (f_t*)cusparse_view_.buffer_transpose.data(), + stream_view_)); // Restore the canonical binding so subsequent code on this shard that reads // cv.dual_solution sees the dual_solution_ buffer it was constructed with. 
- RAFT_CUSPARSE_TRY(cusparseDnVecSetValues( - cusparse_view_.dual_solution, current_saddle_point_state_.get_dual_solution().data())); + RAFT_CUSPARSE_TRY(cusparseDnVecSetValues(cusparse_view_.dual_solution, + current_saddle_point_state_.get_dual_solution().data())); } template @@ -1434,8 +1433,7 @@ void pdhg_solver_t::compute_next_primal_dual_solution_reflected( for (auto& shard : mgpu_engine_->shards) { raft::device_setter guard(shard->device_id); auto& sub_pdlp = *shard->sub_pdlp; - sub_pdlp.pdhg_solver_.dual_reflected_projection_transform( - sub_pdlp.get_dual_step_size()); + sub_pdlp.pdhg_solver_.dual_reflected_projection_transform(sub_pdlp.get_dual_step_size()); } } else if (!batch_mode_) { dual_reflected_projection_transform(dual_step_size); diff --git a/cpp/src/pdlp/pdhg.hpp b/cpp/src/pdlp/pdhg.hpp index 8fbee24e71..e38ea9389c 100644 --- a/cpp/src/pdlp/pdhg.hpp +++ b/cpp/src/pdlp/pdhg.hpp @@ -102,13 +102,12 @@ class pdhg_solver_t { // No multi-GPU dispatch inside — the engine is the orchestrator. void spmv_At_into(rmm::device_uvector& in_buf, cusparseDnVecDescr_t out_desc); void spmv_A_into(rmm::device_uvector& in_buf, cusparseDnVecDescr_t out_desc); - + // Pure cub-transform extractions. Each one is byte-identical to the inline // cub call it replaces — no platform dispatch inside. Callers handle the // single-GPU vs per-shard branching at the call site (see the // "if (mgpu_engine_) for shard..." blocks in compute_next_*). 
- void primal_reflected_major_projection_transform( - rmm::device_uvector& primal_step_size); + void primal_reflected_major_projection_transform(rmm::device_uvector& primal_step_size); void dual_reflected_major_projection_transform(rmm::device_uvector& dual_step_size); void primal_reflected_projection_transform(rmm::device_uvector& primal_step_size); void dual_reflected_projection_transform(rmm::device_uvector& dual_step_size); diff --git a/cpp/src/pdlp/pdlp.cu b/cpp/src/pdlp/pdlp.cu index 7203c11a42..302f62e56a 100644 --- a/cpp/src/pdlp/pdlp.cu +++ b/cpp/src/pdlp/pdlp.cu @@ -44,8 +44,8 @@ #include #include #include -#include #include +#include #include namespace cuopt::linear_programming::detail { @@ -398,189 +398,195 @@ pdlp_solver_t::pdlp_solver_t(problem_t& op_problem, "Distributed PDLP (num_gpus > 1) currently requires double precision"); return; } else { - // 2. Load partition - std::vector parts; - if (!settings.multi_gpu_partition_file.empty()) { - parts = partition_loader_t::parse_distributed_pdlp_partition_file( - settings.multi_gpu_partition_file); - } else { - cuopt_expects(false, - error_type_t::RuntimeError, - "Metis partitioning inside cuopt not implemented yet; " - "provide a --parts file via settings.multi_gpu_partition_file"); - } - - // always compute initial step size before scaling and primal_weight after scaling to do like - // cuPDLPx - assert(settings_.hyper_params.compute_initial_primal_weight_before_scaling && - "compute_initial_primal_weight_before_scaling must be true in distributed mode"); - assert(!settings_.hyper_params.compute_initial_step_size_before_scaling && - "compute_initial_step_size_before_scaling must be false in distributed mode"); - - compute_initial_primal_weight(); - - // scale globally before dispatching to shards - initial_scaling_strategy_.scale_problem(); - - compute_initial_step_size(); - step_size_strategy_.get_primal_and_dual_stepsizes(primal_step_size_, dual_step_size_); - - const f_t initial_step_size_global = 
get_step_size_h(0); - const f_t initial_primal_weight_global = get_primal_weight_h(0); - - // 4. Copy both scaled and unscaled pb - auto const stream = op_problem_scaled_.handle_ptr->get_stream(); - i_t const n_cstr = op_problem_scaled_.n_constraints; - i_t const n_vars = op_problem_scaled_.n_variables; - i_t const nnz = op_problem_scaled_.nnz; - - // Shared topology (taken from the scaled problem, but identical on both). - std::vector h_A_row_offsets(n_cstr + 1); - std::vector h_A_col_indices(nnz); - std::vector h_A_t_row_offsets(n_vars + 1); - std::vector h_A_t_col_indices(nnz); - raft::copy(h_A_row_offsets.data(), op_problem_scaled_.offsets.data(), n_cstr + 1, stream); - raft::copy(h_A_col_indices.data(), op_problem_scaled_.variables.data(), nnz, stream); - raft::copy( - h_A_t_row_offsets.data(), op_problem_scaled_.reverse_offsets.data(), n_vars + 1, stream); - raft::copy(h_A_t_col_indices.data(), op_problem_scaled_.reverse_constraints.data(), nnz, stream); - - // Paired value arrays for A and A_T. - std::vector h_A_values(nnz); - std::vector h_A_values_scaled(nnz); - std::vector h_A_t_values(nnz); - std::vector h_A_t_values_scaled(nnz); - raft::copy(h_A_values.data(), problem_ptr->coefficients.data(), nnz, stream); - raft::copy(h_A_t_values.data(), problem_ptr->reverse_coefficients.data(), nnz, stream); - raft::copy(h_A_values_scaled.data(), op_problem_scaled_.coefficients.data(), nnz, stream); - raft::copy( - h_A_t_values_scaled.data(), op_problem_scaled_.reverse_coefficients.data(), nnz, stream); + // 2. 
Load partition + std::vector parts; + if (!settings.multi_gpu_partition_file.empty()) { + parts = partition_loader_t::parse_distributed_pdlp_partition_file( + settings.multi_gpu_partition_file); + } else { + cuopt_expects(false, + error_type_t::RuntimeError, + "Metis partitioning inside cuopt not implemented yet; " + "provide a --parts file via settings.multi_gpu_partition_file"); + } - using f_t2 = typename type_2::type; + // always compute initial step size before scaling and primal_weight after scaling to do like + // cuPDLPx + assert(settings_.hyper_params.compute_initial_primal_weight_before_scaling && + "compute_initial_primal_weight_before_scaling must be true in distributed mode"); + assert(!settings_.hyper_params.compute_initial_step_size_before_scaling && + "compute_initial_step_size_before_scaling must be false in distributed mode"); + + compute_initial_primal_weight(); + + // scale globally before dispatching to shards + initial_scaling_strategy_.scale_problem(); + + compute_initial_step_size(); + step_size_strategy_.get_primal_and_dual_stepsizes(primal_step_size_, dual_step_size_); + + const f_t initial_step_size_global = get_step_size_h(0); + const f_t initial_primal_weight_global = get_primal_weight_h(0); + + // 4. Copy both scaled and unscaled pb + auto const stream = op_problem_scaled_.handle_ptr->get_stream(); + i_t const n_cstr = op_problem_scaled_.n_constraints; + i_t const n_vars = op_problem_scaled_.n_variables; + i_t const nnz = op_problem_scaled_.nnz; + + // Shared topology (taken from the scaled problem, but identical on both). 
+ std::vector h_A_row_offsets(n_cstr + 1); + std::vector h_A_col_indices(nnz); + std::vector h_A_t_row_offsets(n_vars + 1); + std::vector h_A_t_col_indices(nnz); + raft::copy(h_A_row_offsets.data(), op_problem_scaled_.offsets.data(), n_cstr + 1, stream); + raft::copy(h_A_col_indices.data(), op_problem_scaled_.variables.data(), nnz, stream); + raft::copy( + h_A_t_row_offsets.data(), op_problem_scaled_.reverse_offsets.data(), n_vars + 1, stream); + raft::copy( + h_A_t_col_indices.data(), op_problem_scaled_.reverse_constraints.data(), nnz, stream); + + // Paired value arrays for A and A_T. + std::vector h_A_values(nnz); + std::vector h_A_values_scaled(nnz); + std::vector h_A_t_values(nnz); + std::vector h_A_t_values_scaled(nnz); + raft::copy(h_A_values.data(), problem_ptr->coefficients.data(), nnz, stream); + raft::copy(h_A_t_values.data(), problem_ptr->reverse_coefficients.data(), nnz, stream); + raft::copy(h_A_values_scaled.data(), op_problem_scaled_.coefficients.data(), nnz, stream); + raft::copy( + h_A_t_values_scaled.data(), op_problem_scaled_.reverse_coefficients.data(), nnz, stream); - std::vector h_obj(n_vars); - std::vector h_obj_scaled(n_vars); - std::vector h_var_bounds_packed(n_vars); - std::vector h_var_bounds_scaled_packed(n_vars); - std::vector h_cstr_lower(n_cstr); - std::vector h_cstr_upper(n_cstr); - std::vector h_cstr_lower_scaled(n_cstr); - std::vector h_cstr_upper_scaled(n_cstr); - - raft::copy(h_obj.data(), problem_ptr->objective_coefficients.data(), n_vars, stream); - raft::copy(h_obj_scaled.data(), op_problem_scaled_.objective_coefficients.data(), n_vars, stream); - raft::copy(h_var_bounds_packed.data(), problem_ptr->variable_bounds.data(), n_vars, stream); - raft::copy( - h_var_bounds_scaled_packed.data(), op_problem_scaled_.variable_bounds.data(), n_vars, stream); - raft::copy(h_cstr_lower.data(), problem_ptr->constraint_lower_bounds.data(), n_cstr, stream); - raft::copy(h_cstr_upper.data(), problem_ptr->constraint_upper_bounds.data(), 
n_cstr, stream); - raft::copy( - h_cstr_lower_scaled.data(), op_problem_scaled_.constraint_lower_bounds.data(), n_cstr, stream); - raft::copy( - h_cstr_upper_scaled.data(), op_problem_scaled_.constraint_upper_bounds.data(), n_cstr, stream); - - // 5. Get full scaling factors on host - std::vector h_cummulative_cstr_scaling(n_cstr); - std::vector h_cummulative_var_scaling(n_vars); - raft::copy(h_cummulative_cstr_scaling.data(), - initial_scaling_strategy_.get_constraint_matrix_scaling_vector().data(), - n_cstr, - stream); - raft::copy(h_cummulative_var_scaling.data(), - initial_scaling_strategy_.get_variable_scaling_vector().data(), - n_vars, - stream); - const f_t h_bound_rescaling = initial_scaling_strategy_.get_h_bound_rescaling(); - const f_t h_objective_rescaling = initial_scaling_strategy_.get_h_objective_rescaling(); - - op_problem_scaled_.handle_ptr->sync_stream(stream); - - // Unpack interleaved {lower, upper} into separate vectors for both - // versions, so the shard ctor's slicing loop is uniform. - std::vector h_var_lower(n_vars), h_var_upper(n_vars); - std::vector h_var_lower_scaled(n_vars), h_var_upper_scaled(n_vars); - for (i_t i = 0; i < n_vars; ++i) { - h_var_lower[i] = h_var_bounds_packed[i].x; - h_var_upper[i] = h_var_bounds_packed[i].y; - h_var_lower_scaled[i] = h_var_bounds_scaled_packed[i].x; - h_var_upper_scaled[i] = h_var_bounds_scaled_packed[i].y; - } - - // 6. Build per-rank data and meta-data. - std::vector> sub_pdlp_rank_data = - partition_loader_t::create_rank_data_from_parts(parts, - h_A_row_offsets, - h_A_col_indices, - h_A_values, - h_A_values_scaled, - h_A_t_row_offsets, - h_A_t_col_indices, - h_A_t_values, - h_A_t_values_scaled, - settings.num_gpus, - n_cstr, - n_vars, - nnz); - - // 7. 
Build the per-shard PDLP settings: - pdlp_solver_settings_t sub_pdlp_settings = settings; - sub_pdlp_settings.num_gpus = 1; - sub_pdlp_settings.multi_gpu_partition_file = ""; - sub_pdlp_settings.is_distributed_sub_pdlp = true; - sub_pdlp_settings.hyper_params.default_l_inf_ruiz_iterations = 0; - sub_pdlp_settings.hyper_params.default_alpha_pock_chambolle_rescaling = 0.0; - - // 8. Construct the engine, creates NCCL comms and shards - multi_gpu_engine.emplace(std::move(sub_pdlp_rank_data), - h_obj, - h_var_lower, - h_var_upper, - h_cstr_lower, - h_cstr_upper, - h_obj_scaled, - h_var_lower_scaled, - h_var_upper_scaled, - h_cstr_lower_scaled, - h_cstr_upper_scaled, - h_cummulative_cstr_scaling, - h_cummulative_var_scaling, - h_bound_rescaling, - h_objective_rescaling, - op_problem_scaled_.maximize, - op_problem_scaled_.objective_offset, - op_problem_scaled_.presolve_data.objective_scaling_factor, - sub_pdlp_settings); - - for (auto& shard : multi_gpu_engine->shards) { - raft::device_setter guard(shard->device_id); - auto& sub = *shard->sub_pdlp; - raft::copy(sub.step_size_.data(), step_size_.data(), 1, shard->stream); - raft::copy(sub.primal_weight_.data(), primal_weight_.data(), 1, shard->stream); - raft::copy(sub.best_primal_weight_.data(), best_primal_weight_.data(), 1, shard->stream); - raft::copy(sub.primal_step_size_.data(), primal_step_size_.data(), 1, shard->stream); - raft::copy(sub.dual_step_size_.data(), dual_step_size_.data(), 1, shard->stream); - } - - // Wire the engine into the master pdhg_solver_. Shards' pdhg_solver_ keep - // mgpu_engine_ == nullptr so they run plain single-GPU SpMV on local A. - pdhg_solver_.set_multi_gpu_engine(&*multi_gpu_engine); - - // Project initial primal solution - if (settings_.hyper_params.project_initial_primal) { - // Use refine_initial_primal_projection ??? 
using f_t2 = typename type_2::type; + + std::vector h_obj(n_vars); + std::vector h_obj_scaled(n_vars); + std::vector h_var_bounds_packed(n_vars); + std::vector h_var_bounds_scaled_packed(n_vars); + std::vector h_cstr_lower(n_cstr); + std::vector h_cstr_upper(n_cstr); + std::vector h_cstr_lower_scaled(n_cstr); + std::vector h_cstr_upper_scaled(n_cstr); + + raft::copy(h_obj.data(), problem_ptr->objective_coefficients.data(), n_vars, stream); + raft::copy( + h_obj_scaled.data(), op_problem_scaled_.objective_coefficients.data(), n_vars, stream); + raft::copy(h_var_bounds_packed.data(), problem_ptr->variable_bounds.data(), n_vars, stream); + raft::copy( + h_var_bounds_scaled_packed.data(), op_problem_scaled_.variable_bounds.data(), n_vars, stream); + raft::copy(h_cstr_lower.data(), problem_ptr->constraint_lower_bounds.data(), n_cstr, stream); + raft::copy(h_cstr_upper.data(), problem_ptr->constraint_upper_bounds.data(), n_cstr, stream); + raft::copy(h_cstr_lower_scaled.data(), + op_problem_scaled_.constraint_lower_bounds.data(), + n_cstr, + stream); + raft::copy(h_cstr_upper_scaled.data(), + op_problem_scaled_.constraint_upper_bounds.data(), + n_cstr, + stream); + + // 5. Get full scaling factors on host + std::vector h_cummulative_cstr_scaling(n_cstr); + std::vector h_cummulative_var_scaling(n_vars); + raft::copy(h_cummulative_cstr_scaling.data(), + initial_scaling_strategy_.get_constraint_matrix_scaling_vector().data(), + n_cstr, + stream); + raft::copy(h_cummulative_var_scaling.data(), + initial_scaling_strategy_.get_variable_scaling_vector().data(), + n_vars, + stream); + const f_t h_bound_rescaling = initial_scaling_strategy_.get_h_bound_rescaling(); + const f_t h_objective_rescaling = initial_scaling_strategy_.get_h_objective_rescaling(); + + op_problem_scaled_.handle_ptr->sync_stream(stream); + + // Unpack interleaved {lower, upper} into separate vectors for both + // versions, so the shard ctor's slicing loop is uniform. 
+ std::vector h_var_lower(n_vars), h_var_upper(n_vars); + std::vector h_var_lower_scaled(n_vars), h_var_upper_scaled(n_vars); + for (i_t i = 0; i < n_vars; ++i) { + h_var_lower[i] = h_var_bounds_packed[i].x; + h_var_upper[i] = h_var_bounds_packed[i].y; + h_var_lower_scaled[i] = h_var_bounds_scaled_packed[i].x; + h_var_upper_scaled[i] = h_var_bounds_scaled_packed[i].y; + } + + // 6. Build per-rank data and meta-data. + std::vector> sub_pdlp_rank_data = + partition_loader_t::create_rank_data_from_parts(parts, + h_A_row_offsets, + h_A_col_indices, + h_A_values, + h_A_values_scaled, + h_A_t_row_offsets, + h_A_t_col_indices, + h_A_t_values, + h_A_t_values_scaled, + settings.num_gpus, + n_cstr, + n_vars, + nnz); + + // 7. Build the per-shard PDLP settings: + pdlp_solver_settings_t sub_pdlp_settings = settings; + sub_pdlp_settings.num_gpus = 1; + sub_pdlp_settings.multi_gpu_partition_file = ""; + sub_pdlp_settings.is_distributed_sub_pdlp = true; + sub_pdlp_settings.hyper_params.default_l_inf_ruiz_iterations = 0; + sub_pdlp_settings.hyper_params.default_alpha_pock_chambolle_rescaling = 0.0; + + // 8. 
Construct the engine, creates NCCL comms and shards + multi_gpu_engine.emplace(std::move(sub_pdlp_rank_data), + h_obj, + h_var_lower, + h_var_upper, + h_cstr_lower, + h_cstr_upper, + h_obj_scaled, + h_var_lower_scaled, + h_var_upper_scaled, + h_cstr_lower_scaled, + h_cstr_upper_scaled, + h_cummulative_cstr_scaling, + h_cummulative_var_scaling, + h_bound_rescaling, + h_objective_rescaling, + op_problem_scaled_.maximize, + op_problem_scaled_.objective_offset, + op_problem_scaled_.presolve_data.objective_scaling_factor, + sub_pdlp_settings); + for (auto& shard : multi_gpu_engine->shards) { raft::device_setter guard(shard->device_id); auto& sub = *shard->sub_pdlp; - cub::DeviceTransform::Transform( - cuda::std::make_tuple(sub.pdhg_solver_.get_primal_solution().data(), - sub.get_op_problem_scaled().variable_bounds.data()), - sub.pdhg_solver_.get_primal_solution().data(), - sub.pdhg_solver_.get_primal_solution().size(), - clamp(), - shard->stream.view()); + raft::copy(sub.step_size_.data(), step_size_.data(), 1, shard->stream); + raft::copy(sub.primal_weight_.data(), primal_weight_.data(), 1, shard->stream); + raft::copy(sub.best_primal_weight_.data(), best_primal_weight_.data(), 1, shard->stream); + raft::copy(sub.primal_step_size_.data(), primal_step_size_.data(), 1, shard->stream); + raft::copy(sub.dual_step_size_.data(), dual_step_size_.data(), 1, shard->stream); + } + + // Wire the engine into the master pdhg_solver_. Shards' pdhg_solver_ keep + // mgpu_engine_ == nullptr so they run plain single-GPU SpMV on local A. + pdhg_solver_.set_multi_gpu_engine(&*multi_gpu_engine); + + // Project initial primal solution + if (settings_.hyper_params.project_initial_primal) { + // Use refine_initial_primal_projection ??? 
+ using f_t2 = typename type_2::type; + for (auto& shard : multi_gpu_engine->shards) { + raft::device_setter guard(shard->device_id); + auto& sub = *shard->sub_pdlp; + cub::DeviceTransform::Transform( + cuda::std::make_tuple(sub.pdhg_solver_.get_primal_solution().data(), + sub.get_op_problem_scaled().variable_bounds.data()), + sub.pdhg_solver_.get_primal_solution().data(), + sub.pdhg_solver_.get_primal_solution().size(), + clamp(), + shard->stream.view()); + } } - } } // end if constexpr (std::is_same_v) } @@ -2222,24 +2228,22 @@ void pdlp_solver_t::compute_fixed_error(std::vector& has_restarte // Computing the deltas // TODO batch mdoe: this only works if everyone restarts if (multi_gpu_engine) { - // Go faire une fonction compute_delta_primal, compute_delta primal ? + // Go faire une fonction compute_delta_primal, compute_delta primal ? for (auto& shard : multi_gpu_engine->shards) { raft::device_setter guard(shard->device_id); auto& sub_pdhg = shard->sub_pdlp->pdhg_solver_; - cub::DeviceTransform::Transform( - cuda::std::make_tuple(sub_pdhg.get_reflected_primal().data(), - sub_pdhg.get_primal_solution().data()), - sub_pdhg.get_saddle_point_state().get_delta_primal().data(), - sub_pdhg.get_primal_solution().size(), - cuda::std::minus{}, - shard->stream.view()); - cub::DeviceTransform::Transform( - cuda::std::make_tuple(sub_pdhg.get_reflected_dual().data(), - sub_pdhg.get_dual_solution().data()), - sub_pdhg.get_saddle_point_state().get_delta_dual().data(), - sub_pdhg.get_dual_solution().size(), - cuda::std::minus{}, - shard->stream.view()); + cub::DeviceTransform::Transform(cuda::std::make_tuple(sub_pdhg.get_reflected_primal().data(), + sub_pdhg.get_primal_solution().data()), + sub_pdhg.get_saddle_point_state().get_delta_primal().data(), + sub_pdhg.get_primal_solution().size(), + cuda::std::minus{}, + shard->stream.view()); + cub::DeviceTransform::Transform(cuda::std::make_tuple(sub_pdhg.get_reflected_dual().data(), + sub_pdhg.get_dual_solution().data()), + 
sub_pdhg.get_saddle_point_state().get_delta_dual().data(), + sub_pdhg.get_dual_solution().size(), + cuda::std::minus{}, + shard->stream.view()); } } else { cub::DeviceTransform::Transform( @@ -2249,21 +2253,19 @@ void pdlp_solver_t::compute_fixed_error(std::vector& has_restarte pdhg_solver_.get_primal_solution().size(), cuda::std::minus{}, stream_view_.value()); - cub::DeviceTransform::Transform( - cuda::std::make_tuple(pdhg_solver_.get_reflected_dual().data(), - pdhg_solver_.get_dual_solution().data()), - pdhg_solver_.get_saddle_point_state().get_delta_dual().data(), - pdhg_solver_.get_dual_solution().size(), - cuda::std::minus{}, - stream_view_.value()); + cub::DeviceTransform::Transform(cuda::std::make_tuple(pdhg_solver_.get_reflected_dual().data(), + pdhg_solver_.get_dual_solution().data()), + pdhg_solver_.get_saddle_point_state().get_delta_dual().data(), + pdhg_solver_.get_dual_solution().size(), + cuda::std::minus{}, + stream_view_.value()); } auto& cusparse_view = pdhg_solver_.get_cusparse_view(); if (multi_gpu_engine) { - - // SpMV is the first operation in compute_interaction_and_movement so we can do halo before and call it naturally - // we then reduce the local dot products + // SpMV is the first operation in compute_interaction_and_movement so we can do halo before and + // call it naturally we then reduce the local dot products multi_gpu_engine->halo_exchange_cstr( [](auto& pdhg) -> rmm::device_uvector& { return pdhg.get_reflected_dual(); }); @@ -2294,9 +2296,7 @@ void pdlp_solver_t::compute_fixed_error(std::vector& has_restarte }, 1); multi_gpu_engine->allreduce_sum_inplace( - [](auto& sp) -> f_t* { - return sp.step_size_strategy_.get_norm_squared_delta_dual().data(); - }, + [](auto& sp) -> f_t* { return sp.step_size_strategy_.get_norm_squared_delta_dual().data(); }, 1); auto& s0 = *multi_gpu_engine->shards[0]; @@ -2365,10 +2365,10 @@ void pdlp_solver_t::compute_fixed_error(std::vector& has_restarte // Put back, already done in multi-gpu side if 
(!multi_gpu_engine) { - RAFT_CUSPARSE_TRY( - cusparseDnVecSetValues(cusparse_view.potential_next_dual_solution, - (void*)pdhg_solver_.get_potential_next_dual_solution().data())); - } + RAFT_CUSPARSE_TRY( + cusparseDnVecSetValues(cusparse_view.potential_next_dual_solution, + (void*)pdhg_solver_.get_potential_next_dual_solution().data())); + } if (batch_mode_) { RAFT_CUSPARSE_TRY( cusparseDnMatSetValues(cusparse_view.batch_potential_next_dual_solution, @@ -2630,8 +2630,9 @@ optimization_problem_solution_t pdlp_solver_t::run_solver(co // Update FP32 matrix copies for mixed precision SpMV after scaling pdhg_solver_.get_cusparse_view().update_mixed_precision_matrices(); - // Redirect cuSPARSE descriptors to use the original problem's structural data (offsets, indices), - // then free the duplicated structural vectors from the scaled copy to save device memory. + // Redirect cuSPARSE descriptors to use the original problem's structural data (offsets, + // indices), then free the duplicated structural vectors from the scaled copy to save device + // memory. pdhg_solver_.get_cusparse_view().redirect_cusparse_csr_structure_pointers(*problem_ptr); op_problem_scaled_.variables.resize(0, stream_view_); op_problem_scaled_.offsets.resize(0, stream_view_); @@ -2846,7 +2847,6 @@ optimization_problem_solution_t pdlp_solver_t::run_solver(co "Initial primal_weight", primal_weight_.data(), primal_weight_.size(), std::cout); #endif - if (!inside_mip_) { CUOPT_LOG_INFO( " Iter Primal Obj. Dual Obj. Gap Primal Res. Dual Res. 
Time"); diff --git a/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.cu b/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.cu index fb85be4280..2cb843ae86 100644 --- a/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.cu +++ b/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.cu @@ -316,15 +316,13 @@ rmm::device_uvector& adaptive_step_size_strategy_t::get_interacti } template -rmm::device_uvector& -adaptive_step_size_strategy_t::get_norm_squared_delta_primal() +rmm::device_uvector& adaptive_step_size_strategy_t::get_norm_squared_delta_primal() { return norm_squared_delta_primal_; } template -rmm::device_uvector& -adaptive_step_size_strategy_t::get_norm_squared_delta_dual() +rmm::device_uvector& adaptive_step_size_strategy_t::get_norm_squared_delta_dual() { return norm_squared_delta_dual_; } From 98e0ce68d67f3b9701c7b196d490754401c18a31 Mon Sep 17 00:00:00 2001 From: Bulle Mostovoi Date: Tue, 26 May 2026 11:06:24 +0200 Subject: [PATCH 26/67] now manage halpern update in multi-gpu pdlp --- cpp/src/pdlp/pdlp.cu | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/cpp/src/pdlp/pdlp.cu b/cpp/src/pdlp/pdlp.cu index 302f62e56a..b69ceccae5 100644 --- a/cpp/src/pdlp/pdlp.cu +++ b/cpp/src/pdlp/pdlp.cu @@ -3085,13 +3085,22 @@ optimization_problem_solution_t pdlp_solver_t::run_solver(co transpose_problem_fields(/*to_row=*/true); } } - halpern_update(); + if (multi_gpu_engine_) { + multi_gpu_engine_->for_each_shard([&](auto& shard) { shard.sub_pdlp->halpern_update(); }); + } else { + halpern_update(); + } } ++total_pdlp_iterations_; ++internal_solver_iterations_; - if (settings_.hyper_params.never_restart_to_average) - restart_strategy_.increment_iteration_since_last_restart(); + if (settings_.hyper_params.never_restart_to_average) { + if (multi_gpu_engine_) { + multi_gpu_engine_->for_each_shard([&](auto& shard) { shard.sub_pdlp->restart_strategy_.increment_iteration_since_last_restart(); }); + } else { + 
restart_strategy_.increment_iteration_since_last_restart(); + } + } } return optimization_problem_solution_t{pdlp_termination_status_t::NumericalError, stream_view_}; From 84128bf809348932fb6b540ae93d893feb7c4756 Mon Sep 17 00:00:00 2001 From: Bulle Mostovoi Date: Tue, 26 May 2026 11:46:46 +0200 Subject: [PATCH 27/67] small fix to calls of multi_gpu_engine_ and scale/unscale solutions. compiles and runs --- cpp/src/pdlp/pdlp.cu | 47 ++++++++++++++++++++++++++++++++++++-------- 1 file changed, 39 insertions(+), 8 deletions(-) diff --git a/cpp/src/pdlp/pdlp.cu b/cpp/src/pdlp/pdlp.cu index b69ceccae5..36ba854439 100644 --- a/cpp/src/pdlp/pdlp.cu +++ b/cpp/src/pdlp/pdlp.cu @@ -2896,6 +2896,9 @@ optimization_problem_solution_t pdlp_solver_t::run_solver(co // 1. At the very beginning of the solver, when no steps have been taken yet // 2. After a single step, since average of one step is the same step if (internal_solver_iterations_ <= 1) { + if (multi_gpu_engine) { + assert(false && "Not implemented"); + } raft::copy(unscaled_primal_avg_solution_.data(), pdhg_solver_.get_primal_solution().data(), primal_size_h_, @@ -2946,8 +2949,22 @@ optimization_problem_solution_t pdlp_solver_t::run_solver(co unscaled_dual_avg_solution_); } if (settings_.hyper_params.use_adaptive_step_size_strategy) { - initial_scaling_strategy_.unscale_solutions(pdhg_solver_.get_primal_solution(), - pdhg_solver_.get_dual_solution()); + if (multi_gpu_engine) { + // Master's pdhg_solver_.{primal,dual}_solution_ is stale in mGPU mode + // (live state lives on shards). Unscale in place on each shard with + // the shard's own initial_scaling_strategy_, which already holds the + // global cumulative scaling factors for its owned slice (set up in + // shard.cu via set_cummulative_scaling). Halo slots have unit scaling + // so unscaling is a no-op there (their values are junk anyway). 
+ multi_gpu_engine->for_each_shard([&](auto& shard) { + auto& sub = *shard.sub_pdlp; + sub.get_initial_scaling_strategy().unscale_solutions( + sub.pdhg_solver_.get_primal_solution(), sub.pdhg_solver_.get_dual_solution()); + }); + } else { + initial_scaling_strategy_.unscale_solutions(pdhg_solver_.get_primal_solution(), + pdhg_solver_.get_dual_solution()); + } } else { initial_scaling_strategy_.unscale_solutions( pdhg_solver_.get_potential_next_primal_solution(), @@ -2981,8 +2998,20 @@ optimization_problem_solution_t pdlp_solver_t::run_solver(co unscaled_dual_avg_solution_); } if (settings_.hyper_params.use_adaptive_step_size_strategy) { - initial_scaling_strategy_.scale_solutions(pdhg_solver_.get_primal_solution(), - pdhg_solver_.get_dual_solution()); + if (multi_gpu_engine) { + // Symmetric to the unscale dispatch above. Live state lives on + // shards; each shard's initial_scaling_strategy_ holds the global + // cumulative scaling factors for its owned slice (halo slots have + // unit scaling, so they're no-ops). Scale in place per shard. 
+ multi_gpu_engine->for_each_shard([&](auto& shard) { + auto& sub = *shard.sub_pdlp; + sub.get_initial_scaling_strategy().scale_solutions( + sub.pdhg_solver_.get_primal_solution(), sub.pdhg_solver_.get_dual_solution()); + }); + } else { + initial_scaling_strategy_.scale_solutions(pdhg_solver_.get_primal_solution(), + pdhg_solver_.get_dual_solution()); + } } else { initial_scaling_strategy_.scale_solutions( pdhg_solver_.get_potential_next_primal_solution(), @@ -3085,8 +3114,8 @@ optimization_problem_solution_t pdlp_solver_t::run_solver(co transpose_problem_fields(/*to_row=*/true); } } - if (multi_gpu_engine_) { - multi_gpu_engine_->for_each_shard([&](auto& shard) { shard.sub_pdlp->halpern_update(); }); + if (multi_gpu_engine) { + multi_gpu_engine->for_each_shard([&](auto& shard) { shard.sub_pdlp->halpern_update(); }); } else { halpern_update(); } @@ -3095,8 +3124,10 @@ optimization_problem_solution_t pdlp_solver_t::run_solver(co ++total_pdlp_iterations_; ++internal_solver_iterations_; if (settings_.hyper_params.never_restart_to_average) { - if (multi_gpu_engine_) { - multi_gpu_engine_->for_each_shard([&](auto& shard) { shard.sub_pdlp->restart_strategy_.increment_iteration_since_last_restart(); }); + if (multi_gpu_engine) { + multi_gpu_engine->for_each_shard([&](auto& shard) { + shard.sub_pdlp->restart_strategy_.increment_iteration_since_last_restart(); + }); } else { restart_strategy_.increment_iteration_since_last_restart(); } From abe4dd23e41ee7cb7cdba4ca3ca7979874b39856 Mon Sep 17 00:00:00 2001 From: Bulle Mostovoi Date: Tue, 26 May 2026 11:54:45 +0200 Subject: [PATCH 28/67] comments --- cpp/src/pdlp/pdlp.cu | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/cpp/src/pdlp/pdlp.cu b/cpp/src/pdlp/pdlp.cu index 36ba854439..e2aeb3f08c 100644 --- a/cpp/src/pdlp/pdlp.cu +++ b/cpp/src/pdlp/pdlp.cu @@ -2950,12 +2950,7 @@ optimization_problem_solution_t pdlp_solver_t::run_solver(co } if (settings_.hyper_params.use_adaptive_step_size_strategy) { 
if (multi_gpu_engine) { - // Master's pdhg_solver_.{primal,dual}_solution_ is stale in mGPU mode - // (live state lives on shards). Unscale in place on each shard with - // the shard's own initial_scaling_strategy_, which already holds the - // global cumulative scaling factors for its owned slice (set up in - // shard.cu via set_cummulative_scaling). Halo slots have unit scaling - // so unscaling is a no-op there (their values are junk anyway). + // The only branch in cuPDLPx multi_gpu_engine->for_each_shard([&](auto& shard) { auto& sub = *shard.sub_pdlp; sub.get_initial_scaling_strategy().unscale_solutions( @@ -2999,10 +2994,7 @@ optimization_problem_solution_t pdlp_solver_t::run_solver(co } if (settings_.hyper_params.use_adaptive_step_size_strategy) { if (multi_gpu_engine) { - // Symmetric to the unscale dispatch above. Live state lives on - // shards; each shard's initial_scaling_strategy_ holds the global - // cumulative scaling factors for its owned slice (halo slots have - // unit scaling, so they're no-ops). Scale in place per shard. + // The only branch in cuPDLPx multi_gpu_engine->for_each_shard([&](auto& shard) { auto& sub = *shard.sub_pdlp; sub.get_initial_scaling_strategy().scale_solutions( From 5c41497080dd3950c378d485a5ada75c1658f31f Mon Sep 17 00:00:00 2001 From: Bulle Mostovoi Date: Tue, 26 May 2026 12:06:51 +0200 Subject: [PATCH 29/67] added is multi gpu to pdhg --- cpp/src/pdlp/distributed_pdlp/shard.cu | 2 ++ cpp/src/pdlp/pdhg.hpp | 16 ++++++++++++++-- 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/cpp/src/pdlp/distributed_pdlp/shard.cu b/cpp/src/pdlp/distributed_pdlp/shard.cu index 33aac38103..405e6fa05c 100644 --- a/cpp/src/pdlp/distributed_pdlp/shard.cu +++ b/cpp/src/pdlp/distributed_pdlp/shard.cu @@ -155,6 +155,8 @@ pdlp_shard_t::pdlp_shard_t(int device_id, // unit cumulative factors (sub-settings disable Ruiz / PC iters). 
sub_pdlp = std::make_unique>(*sub_problem, settings, /*batch=*/false); + sub_pdlp->pdhg_solver_.set_is_multi_gpu(true); + // Inject master-scaled buffers inside sub_pdlp auto& scaled = sub_pdlp->get_op_problem_scaled(); raft::copy(scaled.coefficients.data(), diff --git a/cpp/src/pdlp/pdhg.hpp b/cpp/src/pdlp/pdhg.hpp index e38ea9389c..2e230eaf86 100644 --- a/cpp/src/pdlp/pdhg.hpp +++ b/cpp/src/pdlp/pdhg.hpp @@ -114,8 +114,19 @@ class pdhg_solver_t { // Master PDLP wires up the engine pointer here after the engine is built. // Shards' pdhg_solver_ leaves this null so each shard runs single-GPU SpMV - // on its local matrix. - void set_multi_gpu_engine(multi_gpu_engine_t* engine) { mgpu_engine_ = engine; } + // on its local matrix. Also flips is_multi_gpu_ — convenience flag that any + // pdhg participating in a distributed run (master OR shard) carries true. + void set_multi_gpu_engine(multi_gpu_engine_t* engine) + { + mgpu_engine_ = engine; + is_multi_gpu_ = (engine != nullptr); + } + + // Mark a shard's pdhg_solver_ as part of a distributed run without giving it + // an engine (shards don't orchestrate; they only run local SpMV on owned + // rows). Called from shard.cu right after sub_pdlp is constructed. 
+ void set_is_multi_gpu(bool v) { is_multi_gpu_ = v; } + bool is_multi_gpu() const { return is_multi_gpu_; } i_t total_pdhg_iterations_; @@ -136,6 +147,7 @@ class pdhg_solver_t { void compute_primal_projection(rmm::device_uvector& primal_step_size); bool batch_mode_{false}; + bool is_multi_gpu_{false}; raft::handle_t const* handle_ptr_{nullptr}; rmm::cuda_stream_view stream_view_; From 37b1fdafab439c4b0be39b7d467b31d0f23110b5 Mon Sep 17 00:00:00 2001 From: Bulle Mostovoi Date: Tue, 26 May 2026 12:24:45 +0200 Subject: [PATCH 30/67] added pdhg get mgpu engine --- cpp/src/pdlp/pdhg.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp/src/pdlp/pdhg.hpp b/cpp/src/pdlp/pdhg.hpp index 2e230eaf86..e4d16360a7 100644 --- a/cpp/src/pdlp/pdhg.hpp +++ b/cpp/src/pdlp/pdhg.hpp @@ -127,6 +127,7 @@ class pdhg_solver_t { // rows). Called from shard.cu right after sub_pdlp is constructed. void set_is_multi_gpu(bool v) { is_multi_gpu_ = v; } bool is_multi_gpu() const { return is_multi_gpu_; } + multi_gpu_engine_t* get_mgpu_engine() const { return mgpu_engine_; } i_t total_pdhg_iterations_; From 57c70615337bd12fe803938d5f1bc44c4d9fa7f1 Mon Sep 17 00:00:00 2001 From: Bulle Mostovoi Date: Tue, 26 May 2026 12:25:25 +0200 Subject: [PATCH 31/67] added non const convergence information getter --- cpp/src/pdlp/termination_strategy/termination_strategy.cu | 7 +++++++ cpp/src/pdlp/termination_strategy/termination_strategy.hpp | 1 + 2 files changed, 8 insertions(+) diff --git a/cpp/src/pdlp/termination_strategy/termination_strategy.cu b/cpp/src/pdlp/termination_strategy/termination_strategy.cu index d1a88799d6..0320b420a8 100644 --- a/cpp/src/pdlp/termination_strategy/termination_strategy.cu +++ b/cpp/src/pdlp/termination_strategy/termination_strategy.cu @@ -195,6 +195,13 @@ pdlp_termination_strategy_t::get_convergence_information() const return convergence_information_; } +template +convergence_information_t& +pdlp_termination_strategy_t::get_convergence_information() +{ + return 
convergence_information_; +} + template const infeasibility_information_t& pdlp_termination_strategy_t::get_infeasibility_information() const diff --git a/cpp/src/pdlp/termination_strategy/termination_strategy.hpp b/cpp/src/pdlp/termination_strategy/termination_strategy.hpp index 5cd43d7be7..63b2e81ff4 100644 --- a/cpp/src/pdlp/termination_strategy/termination_strategy.hpp +++ b/cpp/src/pdlp/termination_strategy/termination_strategy.hpp @@ -187,6 +187,7 @@ class pdlp_termination_strategy_t { i_t get_optimal_solution_id() const; const convergence_information_t& get_convergence_information() const; + convergence_information_t& get_convergence_information(); const infeasibility_information_t& get_infeasibility_information() const; // Deep copy is used when save best primal so far is toggled From 9f78d0534c232055b7da4e425379e9f86a436e08 Mon Sep 17 00:00:00 2001 From: Bulle Mostovoi Date: Tue, 26 May 2026 14:29:36 +0200 Subject: [PATCH 32/67] compute_convergence_information is now on multi-gpu --- .../distributed_pdlp/multi_gpu_engine.hpp | 58 ++++ cpp/src/pdlp/pdlp.cu | 8 +- .../convergence_information.cu | 296 ++++++++++++++++-- .../convergence_information.hpp | 31 ++ 4 files changed, 360 insertions(+), 33 deletions(-) diff --git a/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.hpp b/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.hpp index 001f9b760e..438a878834 100644 --- a/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.hpp +++ b/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.hpp @@ -7,10 +7,12 @@ #include #include #include +#include #include #include +#include #include #include @@ -27,6 +29,16 @@ namespace cuopt::linear_programming::detail { +// Element-wise sqrt functor. 
Defined at namespace scope (not as a local +// extended HD lambda) because nvcc disallows extended __host__ __device__ +// lambdas appearing inside templates whose template arguments are +// themselves local lambda types (which happens when distributed_l2_norm is +// invoked with closure accessors). +template +struct sqrt_inplace_op_t { + __host__ __device__ f_t operator()(f_t x) const { return raft::sqrt(x); } +}; + template struct multi_gpu_engine_t { // Constructs shards from rank_data @@ -219,6 +231,52 @@ struct multi_gpu_engine_t { ncclGroupEnd(); } + // -------- Distributed L2 norm ------------------------------------------ + // Computes sqrt(Σ_k Σ_{i ∈ owned_k} buf_k[i]²) and writes the scalar into + // the buffer returned by `out_access` on EVERY shard. + // + // Algorithm: + // 1) per shard: out = cublasdot(buf[0:n_owned], buf[0:n_owned]) (partial Σ²) + // 2) NCCL allreduce SUM on out (count = 1) (global Σ²) + // 3) per shard: out = sqrt(out) + // + // The caller is responsible for clipping correctness via `size_access` + // (which picks `rank_data.owned_var_size` or `rank_data.owned_cstr_size` + // depending on the shape of the input buffer), and for mirroring the + // result back to master if downstream code needs it there. 
+ // + // BufAccess : pdlp_solver_t& -> rmm::device_uvector& + // OutAccess : pdlp_solver_t& -> f_t* (single scalar in shard memory) + // SizeAccess : pdlp_shard_t& -> i_t (owned slice length) + template + void distributed_l2_norm(BufAccess&& buf_access, + OutAccess&& out_access, + SizeAccess&& size_access) + { + for_each_shard([&](auto& shard) { + auto& sub = *shard.sub_pdlp; + auto& buf = buf_access(sub); + const i_t n = size_access(shard); + f_t* out = out_access(sub); + RAFT_CUBLAS_TRY(raft::linalg::detail::cublasdot(shard.handle.get_cublas_handle(), + static_cast(n), + buf.data(), + 1, + buf.data(), + 1, + out, + shard.stream.view().value())); + }); + + allreduce_sum_inplace(out_access, /*count=*/1); + + for_each_shard([&](auto& shard) { + f_t* out = out_access(*shard.sub_pdlp); + cub::DeviceTransform::Transform( + out, out, 1, sqrt_inplace_op_t{}, shard.stream.view().value()); + }); + } + // -------- Generic distributed SpMVs ------------------------------------- // distributed_spmv_A : halo-update the var-shaped input buffer returned by // `in_buf(pdhg)`, then per-shard A @ in_buf -> out_desc. 
diff --git a/cpp/src/pdlp/pdlp.cu b/cpp/src/pdlp/pdlp.cu index e2aeb3f08c..9522ae4065 100644 --- a/cpp/src/pdlp/pdlp.cu +++ b/cpp/src/pdlp/pdlp.cu @@ -2289,15 +2289,13 @@ void pdlp_solver_t::compute_fixed_error(std::vector& has_restarte } multi_gpu_engine->allreduce_sum_inplace( - [](auto& sp) -> f_t* { return sp.step_size_strategy_.get_interaction().data(); }, 1); + [](auto& sp) -> f_t* { return sp.step_size_strategy_.get_interaction().data(); }); multi_gpu_engine->allreduce_sum_inplace( [](auto& sp) -> f_t* { return sp.step_size_strategy_.get_norm_squared_delta_primal().data(); - }, - 1); + }); multi_gpu_engine->allreduce_sum_inplace( - [](auto& sp) -> f_t* { return sp.step_size_strategy_.get_norm_squared_delta_dual().data(); }, - 1); + [](auto& sp) -> f_t* { return sp.step_size_strategy_.get_norm_squared_delta_dual().data(); }); auto& s0 = *multi_gpu_engine->shards[0]; { diff --git a/cpp/src/pdlp/termination_strategy/convergence_information.cu b/cpp/src/pdlp/termination_strategy/convergence_information.cu index a6d6d14d96..28b33582ab 100644 --- a/cpp/src/pdlp/termination_strategy/convergence_information.cu +++ b/cpp/src/pdlp/termination_strategy/convergence_information.cu @@ -5,12 +5,16 @@ */ /* clang-format on */ +#include +#include #include #include #include #include #include +#include + #include #include @@ -416,17 +420,89 @@ void convergence_information_t::compute_convergence_information( print("dual_slack", dual_slack); #endif + if (current_pdhg_solver.is_multi_gpu()) + { + auto* engine = current_pdhg_solver.get_mgpu_engine(); + cuopt_assert(engine != nullptr, + "mGPU branch reached but current_pdhg_solver has no engine (shard pdhg?)"); + cuopt_expects(!settings.per_constraint_residual, + error_type_t::ValidationError, + "per_constraint_residual is not yet supported in multi-GPU mode"); + + // Prepares halo values in primal_solution + engine->halo_exchange_var( + [](pdhg_solver_t& pdhg) -> rmm::device_uvector& { + return pdhg.get_primal_solution(); + }); + 
+ // Compute the primal residual and objective on each shard + for (auto& shard : engine->shards) { + raft::device_setter guard(shard->device_id); + auto& sub_pdlp = *shard->sub_pdlp; + auto& sub_conv = sub_pdlp.get_current_termination_strategy().get_convergence_information(); + sub_conv.compute_primal_residual(sub_conv.op_problem_cusparse_view_, + sub_pdlp.pdhg_solver_.get_dual_tmp_resource(), + sub_pdlp.pdhg_solver_.get_dual_solution()); + sub_conv.compute_primal_objective_owned_partial(sub_pdlp.pdhg_solver_.get_primal_solution(), + shard->rank_data.owned_var_size); + } + + // Reduce all primal objectives across shards + cuopt_assert(!batch_mode_, "multi-GPU PDLP is not supported in batch mode"); + engine->allreduce_sum_inplace( + [](pdlp_solver_t& sp) -> f_t* { + return sp.get_current_termination_strategy() + .get_convergence_information() + .get_primal_objective() + .data(); + }); + + // Get the reduced primal objective from the shard[0] (arbitrary) + { + auto& s0 = *engine->shards[0]; + raft::device_setter guard(s0.device_id); + auto& s0_conv = + s0.sub_pdlp->get_current_termination_strategy().get_convergence_information(); + raft::copy(primal_objective_.data(), s0_conv.get_primal_objective().data(), 1, stream_view_); + } + apply_primal_objective_scaling_and_offset(); + } + else { compute_primal_residual( op_problem_cusparse_view_, current_pdhg_solver.get_dual_tmp_resource(), dual_iterate); - compute_primal_objective(primal_iterate); + compute_primal_objective(primal_iterate);} #ifdef CUPDLP_DEBUG_MODE print("Primal Residual", primal_residual_); #endif - if (!batch_mode_) + // L2 Norm + if (current_pdhg_solver.is_multi_gpu()) { + auto* engine = current_pdhg_solver.get_mgpu_engine(); + engine->distributed_l2_norm( + [](pdlp_solver_t& sp) -> rmm::device_uvector& { + return sp.get_current_termination_strategy() + .get_convergence_information() + .primal_residual_; + }, + [](pdlp_solver_t& sp) -> f_t* { + return sp.get_current_termination_strategy() + 
.get_convergence_information() + .l2_primal_residual_.data(); + }, + [](pdlp_shard_t& shard) -> i_t { return shard.rank_data.owned_cstr_size; }); + + auto& s0 = *engine->shards[0]; + raft::device_setter guard(s0.device_id); + raft::copy(l2_primal_residual_.data(), + s0.sub_pdlp->get_current_termination_strategy() + .get_convergence_information() + .l2_primal_residual_.data(), + 1, + stream_view_); + } else if (!batch_mode_) { my_l2_norm(primal_residual_, l2_primal_residual_, handle_ptr_); - else { + } else { segmented_sum_handler_.segmented_sum_helper( thrust::make_transform_iterator(primal_residual_.data(), power_two_func_t{}), l2_primal_residual_.data(), @@ -444,6 +520,7 @@ void convergence_information_t::compute_convergence_information( print("Absolute Primal Residual", l2_primal_residual_); #endif // If per_constraint_residual is false we still need to perform the l2 since it's used in kkt + // Not suported in mGPU if (settings.per_constraint_residual) { // Compute the linf of (residual_i - rel * b_i) if (settings.save_best_primal_so_far) { @@ -466,19 +543,98 @@ void convergence_information_t::compute_convergence_information( std::numeric_limits::lowest()); } - compute_dual_residual(op_problem_cusparse_view_, - current_pdhg_solver.get_primal_tmp_resource(), - primal_iterate, - dual_slack); - compute_dual_objective(dual_iterate, primal_iterate, dual_slack); + if (current_pdhg_solver.is_multi_gpu()) { + auto* engine = current_pdhg_solver.get_mgpu_engine(); + + // 1) Halo-exchange the dual solution on every shard so the upcoming + // A_T_shard @ dual SpMV inside compute_dual_residual reads correct + // values in the cstr halo region. + engine->halo_exchange_cstr( + [](pdhg_solver_t& pdhg) -> rmm::device_uvector& { + return pdhg.get_dual_solution(); + }); + + // 2-3) Per-shard: + // - compute_dual_residual: shard.dual_residual_ has owned-var entries + // correct, halo var entries garbage (their A_T row isn't on this + // shard). 
+ // - compute_dual_objective_owned_partial: writes a *partial* + // dot(slack[0:nv], x[0:nv]) + Σ primal_slack[0:nc] into + // shard.dual_objective_, with NO scaling/offset. Relies on + // primal_slack_ already populated by the per-shard + // compute_primal_residual above. + for (auto& shard : engine->shards) { + raft::device_setter guard(shard->device_id); + auto& sub_pdlp = *shard->sub_pdlp; + auto& sub_conv = sub_pdlp.get_current_termination_strategy().get_convergence_information(); + sub_conv.compute_dual_residual(sub_conv.op_problem_cusparse_view_, + sub_pdlp.pdhg_solver_.get_primal_tmp_resource(), + sub_pdlp.pdhg_solver_.get_primal_solution(), + sub_pdlp.pdhg_solver_.get_dual_slack()); + sub_conv.compute_dual_objective_owned_partial(sub_pdlp.pdhg_solver_.get_primal_solution(), + sub_pdlp.pdhg_solver_.get_dual_slack(), + shard->rank_data.owned_var_size, + shard->rank_data.owned_cstr_size); + } + + // 4) Allreduce dual_objective_ across shards (sum, in place). Same + // offset/scaling-after-allreduce reasoning as primal: applying offset + // per-shard would over-count it Nshards times. 
+ engine->allreduce_sum_inplace( + [](pdlp_solver_t& sp) -> f_t* { + return sp.get_current_termination_strategy() + .get_convergence_information() + .get_dual_objective() + .data(); + }); + + { + auto& s0 = *engine->shards[0]; + raft::device_setter guard(s0.device_id); + auto& s0_conv = + s0.sub_pdlp->get_current_termination_strategy().get_convergence_information(); + raft::copy(dual_objective_.data(), s0_conv.get_dual_objective().data(), 1, stream_view_); + } + apply_dual_objective_scaling_and_offset(); + } else { + compute_dual_residual(op_problem_cusparse_view_, + current_pdhg_solver.get_primal_tmp_resource(), + primal_iterate, + dual_slack); + compute_dual_objective(dual_iterate, primal_iterate, dual_slack); + } #ifdef CUPDLP_DEBUG_MODE print("Dual Residual", dual_residual_); #endif - if (!batch_mode_) + if (current_pdhg_solver.is_multi_gpu()) { + // Multi-GPU dual residual L2 norm: same pattern as the primal L2 above, + // but the dual residual is var-shaped so we clip to owned_var_size. 
+ auto* engine = current_pdhg_solver.get_mgpu_engine(); + engine->distributed_l2_norm( + [](pdlp_solver_t& sp) -> rmm::device_uvector& { + return sp.get_current_termination_strategy() + .get_convergence_information() + .dual_residual_; + }, + [](pdlp_solver_t& sp) -> f_t* { + return sp.get_current_termination_strategy() + .get_convergence_information() + .l2_dual_residual_.data(); + }, + [](pdlp_shard_t& shard) -> i_t { return shard.rank_data.owned_var_size; }); + auto& s0 = *engine->shards[0]; + raft::device_setter guard(s0.device_id); + raft::copy(l2_dual_residual_.data(), + s0.sub_pdlp->get_current_termination_strategy() + .get_convergence_information() + .l2_dual_residual_.data(), + 1, + stream_view_); + } else if (!batch_mode_) { my_l2_norm(dual_residual_, l2_dual_residual_, handle_ptr_); - else { + } else { segmented_sum_handler_.segmented_sum_helper( thrust::make_transform_iterator(dual_residual_.data(), power_two_func_t{}), l2_dual_residual_.data(), @@ -509,6 +665,7 @@ void convergence_information_t::compute_convergence_information( std::numeric_limits::lowest()); } + // In mGPU, full primal_objective and dual_objective already mirrored to master so no special behaviour const auto [grid_size, block_size] = kernel_config_from_batch_size(climber_strategies_.size()); compute_remaining_stats_kernel <<>>(this->view(), climber_strategies_.size()); @@ -615,6 +772,24 @@ __global__ void apply_objective_scaling_and_offset(raft::device_span object objective[idx] = objective_scaling_factor * (objective[idx] + objective_offsets[idx]); } +template +void convergence_information_t::compute_primal_objective_owned_partial( + rmm::device_uvector& primal_solution, i_t n_owned) +{ + raft::common::nvtx::range fun_scope("compute_primal_objective_owned_partial"); + cuopt_assert(!batch_mode_, "owned-partial primal objective is only used in non-batch mGPU mode"); + cuopt_assert(n_owned <= primal_size_h_, + "n_owned must be <= primal_size_h_ (owned slice is a prefix)"); + 
RAFT_CUBLAS_TRY(raft::linalg::detail::cublasdot(handle_ptr_->get_cublas_handle(), + static_cast(n_owned), + primal_solution.data(), + primal_stride, + problem_ptr->objective_coefficients.data(), + primal_stride, + primal_objective_.data(), + stream_view_)); +} + template void convergence_information_t::compute_primal_objective( rmm::device_uvector& primal_solution) @@ -643,21 +818,25 @@ void convergence_information_t::compute_primal_objective( // Apply per-climber objective scaling and offset. objective_offsets_ is always populated // (defaults to the scalar problem offset replicated, or user-specified per-climber offsets). - { - const auto [grid_size, block_size] = kernel_config_from_batch_size(climber_strategies_.size()); - apply_objective_scaling_and_offset<<>>( - make_span(primal_objective_), - problem_ptr->presolve_data.objective_scaling_factor, - make_span(objective_offsets_), - climber_strategies_.size()); - RAFT_CUDA_TRY(cudaPeekAtLastError()); - } + apply_primal_objective_scaling_and_offset(); #ifdef CUPDLP_DEBUG_MODE print("Primal objective", primal_objective_); #endif } +template +void convergence_information_t::apply_primal_objective_scaling_and_offset() +{ + const auto [grid_size, block_size] = kernel_config_from_batch_size(climber_strategies_.size()); + apply_objective_scaling_and_offset<<>>( + make_span(primal_objective_), + problem_ptr->presolve_data.objective_scaling_factor, + make_span(objective_offsets_), + climber_strategies_.size()); + RAFT_CUDA_TRY(cudaPeekAtLastError()); +} + template void convergence_information_t::compute_dual_residual( cusparse_view_t& cusparse_view, @@ -740,6 +919,51 @@ void convergence_information_t::compute_dual_residual( } } +template +void convergence_information_t::compute_dual_objective_owned_partial( + rmm::device_uvector& primal_solution, + rmm::device_uvector& dual_slack, + i_t n_owned_var, + i_t n_owned_cstr) +{ + raft::common::nvtx::range fun_scope("compute_dual_objective_owned_partial"); + 
cuopt_assert(!batch_mode_, "owned-partial dual objective is only used in non-batch mGPU mode"); + cuopt_assert(hyper_params_.use_reflected_primal_dual, + "owned-partial dual objective requires use_reflected_primal_dual"); + cuopt_assert(n_owned_var <= primal_size_h_, + "n_owned_var must be <= primal_size_h_ (owned slice is a prefix)"); + cuopt_assert(n_owned_cstr <= dual_size_h_, + "n_owned_cstr must be <= dual_size_h_ (owned slice is a prefix)"); + + // dual_dot_ = dot(dual_slack[0:n_owned_var], primal_solution[0:n_owned_var]) + RAFT_CUBLAS_TRY(raft::linalg::detail::cublasdot(handle_ptr_->get_cublas_handle(), + static_cast(n_owned_var), + dual_slack.data(), + primal_stride, + primal_solution.data(), + primal_stride, + dual_dot_.data(), + stream_view_)); + + // sum_primal_slack_ = Σ primal_slack_[0:n_owned_cstr] + // primal_slack_ is assumed populated for owned cstrs by a prior + // compute_primal_residual call on this same shard. + cub::DeviceReduce::Sum(rmm_tmp_buffer_.data(), + size_of_buffer_, + primal_slack_.data(), + sum_primal_slack_.data(), + static_cast(n_owned_cstr), + stream_view_); + + // dual_objective_ = dual_dot_ + sum_primal_slack_ (still a partial sum). + cub::DeviceTransform::Transform( + cuda::std::make_tuple(dual_dot_.data(), sum_primal_slack_.data()), + dual_objective_.data(), + 1, + cuda::std::plus<>{}, + stream_view_); +} + template void convergence_information_t::compute_dual_objective( rmm::device_uvector& dual_solution, @@ -821,21 +1045,25 @@ void convergence_information_t::compute_dual_objective( } // Apply per-climber objective scaling and offset. 
- { - const auto [grid_size, block_size] = kernel_config_from_batch_size(climber_strategies_.size()); - apply_objective_scaling_and_offset<<>>( - make_span(dual_objective_), - problem_ptr->presolve_data.objective_scaling_factor, - make_span(objective_offsets_), - climber_strategies_.size()); - RAFT_CUDA_TRY(cudaPeekAtLastError()); - } + apply_dual_objective_scaling_and_offset(); #ifdef CUPDLP_DEBUG_MODE print("Dual objective", dual_objective_); #endif } +template +void convergence_information_t::apply_dual_objective_scaling_and_offset() +{ + const auto [grid_size, block_size] = kernel_config_from_batch_size(climber_strategies_.size()); + apply_objective_scaling_and_offset<<>>( + make_span(dual_objective_), + problem_ptr->presolve_data.objective_scaling_factor, + make_span(objective_offsets_), + climber_strategies_.size()); + RAFT_CUDA_TRY(cudaPeekAtLastError()); +} + template void convergence_information_t::compute_reduced_cost_from_primal_gradient( const rmm::device_uvector& primal_gradient, const rmm::device_uvector& primal_solution) @@ -916,12 +1144,24 @@ const rmm::device_uvector& convergence_information_t::get_primal_ return primal_objective_; } +template +rmm::device_uvector& convergence_information_t::get_primal_objective() +{ + return primal_objective_; +} + template const rmm::device_uvector& convergence_information_t::get_dual_objective() const { return dual_objective_; } +template +rmm::device_uvector& convergence_information_t::get_dual_objective() +{ + return dual_objective_; +} + template const rmm::device_uvector& convergence_information_t::get_l2_dual_residual() const { diff --git a/cpp/src/pdlp/termination_strategy/convergence_information.hpp b/cpp/src/pdlp/termination_strategy/convergence_information.hpp index 2389a60fae..6325622a2b 100644 --- a/cpp/src/pdlp/termination_strategy/convergence_information.hpp +++ b/cpp/src/pdlp/termination_strategy/convergence_information.hpp @@ -52,7 +52,10 @@ class convergence_information_t { // Needed for kkt 
restart & debug prints const rmm::device_uvector& get_primal_objective() const; + // Non-const overload used by the multi-GPU branch to mirror / allreduce. + rmm::device_uvector& get_primal_objective(); const rmm::device_uvector& get_dual_objective() const; + rmm::device_uvector& get_dual_objective(); const rmm::device_uvector& get_l2_primal_residual() const; const rmm::device_uvector& get_l2_dual_residual() const; const rmm::device_uvector& get_relative_linf_primal_residual() const; @@ -123,12 +126,40 @@ class convergence_information_t { rmm::device_uvector& tmp_dual, [[maybe_unused]] const rmm::device_uvector& dual_iterate); + // Multi-GPU shard helper: writes a partial dot(c[0:n_owned], x[0:n_owned]) + // into primal_objective_ (no scaling, no offset). Master is responsible for + // allreduce SUM across shards and then applying scaling + offset once on the + // reduced value. n_owned must be <= primal_size_h_; pass owned_var_size on + // each shard. + void compute_primal_objective_owned_partial(rmm::device_uvector& primal_solution, + i_t n_owned); + + // Multi-GPU shard helper: writes a partial dual objective into + // dual_objective_ (no scaling, no offset). Computes + // dual_dot_ = dot(dual_slack[0:n_owned_var], primal_solution[0:n_owned_var]) + // sum_primal_slack_ = Σ primal_slack_[0:n_owned_cstr] + // dual_objective_ = dual_dot_ + sum_primal_slack_ + // primal_slack_ is assumed already populated by a prior per-shard + // compute_primal_residual call. Use only in the use_reflected_primal_dual + // path (the multi-GPU mode). + void compute_dual_objective_owned_partial(rmm::device_uvector& primal_solution, + rmm::device_uvector& dual_slack, + i_t n_owned_var, + i_t n_owned_cstr); + void swap_context(const thrust::universal_host_pinned_vector>& swap_pairs); void resize_context(i_t new_size); private: void compute_primal_objective(rmm::device_uvector& primal_solution); + // Applies per-climber objective scaling + offset to primal_objective_. 
+ // Single-GPU path: called from compute_primal_objective right after the dot. + // Multi-GPU path: called on master once after allreduce of partial sums. + void apply_primal_objective_scaling_and_offset(); + // Same as above but for dual_objective_. + void apply_dual_objective_scaling_and_offset(); + void compute_dual_residual(cusparse_view_t& cusparse_view, rmm::device_uvector& tmp_primal, rmm::device_uvector& primal_solution, From c484485d9debf9d5a1d7246dcf34f7f919d5f344 Mon Sep 17 00:00:00 2001 From: Bulle Mostovoi Date: Tue, 26 May 2026 16:14:44 +0200 Subject: [PATCH 33/67] fill_return_problem_solutionis now ready !! --- .../distributed_pdlp/multi_gpu_engine.hpp | 86 +++++++++++++++++++ cpp/src/pdlp/pdlp.cu | 9 ++ 2 files changed, 95 insertions(+) diff --git a/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.hpp b/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.hpp index 438a878834..e04f2e26eb 100644 --- a/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.hpp +++ b/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.hpp @@ -17,7 +17,9 @@ #include #include +#include #include +#include #include #include @@ -325,6 +327,90 @@ struct multi_gpu_engine_t { [](auto& pdhg) -> cusparseDnVecDescr_t { return pdhg.get_cusparse_view().current_AtY; }); } + // -------- Solution gather (shards -> master) ---------------------------- + // Assembles the global potential_next primal/dual solutions on the master + // pdhg_solver_ from the owned slices distributed across shards. Each shard's + // first owned_var_size (resp. owned_cstr_size) entries of its + // potential_next_primal_solution_ (resp. _dual_) are the live, up-to-date + // owned values; the master pdhg_solver_'s buffers are not updated during + // iterations and would otherwise return stale data. + // + // Used right before fill_return_problem_solution() at the return sites in + // pdlp_solver_t::check_termination() and pdlp_solver_t::check_limits(): the + // user-visible solution must contain gathered global values. 
+ // + // Mirrors the metis_tests engine::get_x_output / get_y_output pattern: + // per shard: alloc small host tmp, copy owned slice device->host, sync, + // host-scatter via rank_data.local_to_global_{var,cstr} into a contiguous + // host buffer. Then one host->device copy into the master pdhg buffer. + void gather_potential_next_solutions_to_master(pdhg_solver_t& master_pdhg) + { + const std::size_t total_vars = + master_pdhg.get_potential_next_primal_solution().size(); + const std::size_t total_cstrs = + master_pdhg.get_potential_next_dual_solution().size(); + + std::vector h_primal(total_vars); + std::vector h_dual(total_cstrs); + + for (auto& s_uptr : shards) { + auto& s = *s_uptr; + raft::device_setter guard(s.device_id); + const i_t nv = s.rank_data.owned_var_size; + const i_t nc = s.rank_data.owned_cstr_size; + + std::vector tmp_primal(nv); + std::vector tmp_dual(nc); + + if (nv > 0) { + RAFT_CUDA_TRY( + cudaMemcpyAsync(tmp_primal.data(), + s.sub_pdlp->pdhg_solver_.get_potential_next_primal_solution().data(), + static_cast(nv) * sizeof(f_t), + cudaMemcpyDeviceToHost, + s.stream.view().value())); + } + if (nc > 0) { + RAFT_CUDA_TRY( + cudaMemcpyAsync(tmp_dual.data(), + s.sub_pdlp->pdhg_solver_.get_potential_next_dual_solution().data(), + static_cast(nc) * sizeof(f_t), + cudaMemcpyDeviceToHost, + s.stream.view().value())); + } + RAFT_CUDA_TRY(cudaStreamSynchronize(s.stream.view().value())); + + if (nv > 0) { + thrust::scatter(thrust::host, + tmp_primal.begin(), + tmp_primal.end(), + s.rank_data.local_to_global_var.begin(), + h_primal.begin()); + } + if (nc > 0) { + thrust::scatter(thrust::host, + tmp_dual.begin(), + tmp_dual.end(), + s.rank_data.local_to_global_cstr.begin(), + h_dual.begin()); + } + } + + // Host -> master device. engine.stream lives on the master device + // (created at engine construction when master device was current). 
+ RAFT_CUDA_TRY(cudaMemcpyAsync(master_pdhg.get_potential_next_primal_solution().data(), + h_primal.data(), + total_vars * sizeof(f_t), + cudaMemcpyHostToDevice, + stream.view().value())); + RAFT_CUDA_TRY(cudaMemcpyAsync(master_pdhg.get_potential_next_dual_solution().data(), + h_dual.data(), + total_cstrs * sizeof(f_t), + cudaMemcpyHostToDevice, + stream.view().value())); + RAFT_CUDA_TRY(cudaStreamSynchronize(stream.view().value())); + } + // Engine-level stream for fork/join orchestration (master side). rmm::cuda_stream stream; diff --git a/cpp/src/pdlp/pdlp.cu b/cpp/src/pdlp/pdlp.cu index 9522ae4065..e9cf194d98 100644 --- a/cpp/src/pdlp/pdlp.cu +++ b/cpp/src/pdlp/pdlp.cu @@ -661,6 +661,9 @@ std::optional> pdlp_solver_t RAFT_CUDA_TRY(cudaDeviceSynchronize()); std::cout << "Time Limit reached, returning current solution" << std::endl; #endif + if (auto* engine = pdhg_solver_.get_mgpu_engine()) { + engine->gather_potential_next_solutions_to_master(pdhg_solver_); + } return current_termination_strategy_.fill_return_problem_solution( internal_solver_iterations_, pdhg_solver_, @@ -694,6 +697,9 @@ std::optional> pdlp_solver_t return finalize_batch_return_with_limit_reached(pdlp_termination_status_t::IterationLimit); } + if (auto* engine = pdhg_solver_.get_mgpu_engine()) { + engine->gather_potential_next_solutions_to_master(pdhg_solver_); + } return current_termination_strategy_.fill_return_problem_solution( internal_solver_iterations_, pdhg_solver_, @@ -1371,6 +1377,9 @@ std::optional> pdlp_solver_t #endif print_final_termination_criteria( timer, current_termination_strategy_.get_convergence_information(), termination_current); + if (auto* engine = pdhg_solver_.get_mgpu_engine()) { + engine->gather_potential_next_solutions_to_master(pdhg_solver_); + } return current_termination_strategy_.fill_return_problem_solution( internal_solver_iterations_, pdhg_solver_, From fc46080d24d7566729a1a505837cfc41e023997d Mon Sep 17 00:00:00 2001 From: Bulle Mostovoi Date: Tue, 26 May 
2026 16:39:26 +0200 Subject: [PATCH 34/67] added reduced cost in gathering of solution, builds and runs --- .../distributed_pdlp/multi_gpu_engine.hpp | 47 +++++++++++++++---- cpp/src/pdlp/pdlp.cu | 12 +++-- 2 files changed, 47 insertions(+), 12 deletions(-) diff --git a/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.hpp b/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.hpp index e04f2e26eb..d156e889af 100644 --- a/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.hpp +++ b/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.hpp @@ -328,22 +328,30 @@ struct multi_gpu_engine_t { } // -------- Solution gather (shards -> master) ---------------------------- - // Assembles the global potential_next primal/dual solutions on the master - // pdhg_solver_ from the owned slices distributed across shards. Each shard's - // first owned_var_size (resp. owned_cstr_size) entries of its - // potential_next_primal_solution_ (resp. _dual_) are the live, up-to-date - // owned values; the master pdhg_solver_'s buffers are not updated during - // iterations and would otherwise return stale data. + // Assembles the global potential_next primal/dual solutions and the + // reduced_cost on the master from the owned slices distributed across + // shards. Each shard's first owned_var_size (resp. owned_cstr_size) entries + // of its potential_next_primal_solution_ / reduced_cost_ (resp. + // potential_next_dual_solution_) are the live, up-to-date owned values; the + // master buffers are not updated during iterations and would otherwise + // return stale data. // // Used right before fill_return_problem_solution() at the return sites in // pdlp_solver_t::check_termination() and pdlp_solver_t::check_limits(): the - // user-visible solution must contain gathered global values. + // user-visible solution must contain gathered global values for primal, + // dual, and reduced_cost. 
// // Mirrors the metis_tests engine::get_x_output / get_y_output pattern: // per shard: alloc small host tmp, copy owned slice device->host, sync, // host-scatter via rank_data.local_to_global_{var,cstr} into a contiguous - // host buffer. Then one host->device copy into the master pdhg buffer. - void gather_potential_next_solutions_to_master(pdhg_solver_t& master_pdhg) + // host buffer. Then one host->device copy into the master buffer per field. + // + // master_pdhg : provides destinations for primal / dual. + // master_reduced_cost : destination for the reduced_cost (var-shaped, lives + // in the master pdlp_solver_t's termination strategy + // convergence_information_). + void gather_potential_next_solutions_to_master( + pdhg_solver_t& master_pdhg, rmm::device_uvector& master_reduced_cost) { const std::size_t total_vars = master_pdhg.get_potential_next_primal_solution().size(); @@ -352,6 +360,7 @@ struct multi_gpu_engine_t { std::vector h_primal(total_vars); std::vector h_dual(total_cstrs); + std::vector h_reduced_cost(total_vars); for (auto& s_uptr : shards) { auto& s = *s_uptr; @@ -361,6 +370,11 @@ struct multi_gpu_engine_t { std::vector tmp_primal(nv); std::vector tmp_dual(nc); + std::vector tmp_reduced_cost(nv); + + auto& sub_reduced_cost = s.sub_pdlp->get_current_termination_strategy() + .get_convergence_information() + .get_reduced_cost(); if (nv > 0) { RAFT_CUDA_TRY( @@ -369,6 +383,11 @@ struct multi_gpu_engine_t { static_cast(nv) * sizeof(f_t), cudaMemcpyDeviceToHost, s.stream.view().value())); + RAFT_CUDA_TRY(cudaMemcpyAsync(tmp_reduced_cost.data(), + sub_reduced_cost.data(), + static_cast(nv) * sizeof(f_t), + cudaMemcpyDeviceToHost, + s.stream.view().value())); } if (nc > 0) { RAFT_CUDA_TRY( @@ -386,6 +405,11 @@ struct multi_gpu_engine_t { tmp_primal.end(), s.rank_data.local_to_global_var.begin(), h_primal.begin()); + thrust::scatter(thrust::host, + tmp_reduced_cost.begin(), + tmp_reduced_cost.end(), + s.rank_data.local_to_global_var.begin(), + 
h_reduced_cost.begin()); } if (nc > 0) { thrust::scatter(thrust::host, @@ -408,6 +432,11 @@ struct multi_gpu_engine_t { total_cstrs * sizeof(f_t), cudaMemcpyHostToDevice, stream.view().value())); + RAFT_CUDA_TRY(cudaMemcpyAsync(master_reduced_cost.data(), + h_reduced_cost.data(), + total_vars * sizeof(f_t), + cudaMemcpyHostToDevice, + stream.view().value())); RAFT_CUDA_TRY(cudaStreamSynchronize(stream.view().value())); } diff --git a/cpp/src/pdlp/pdlp.cu b/cpp/src/pdlp/pdlp.cu index e9cf194d98..7bd6d34473 100644 --- a/cpp/src/pdlp/pdlp.cu +++ b/cpp/src/pdlp/pdlp.cu @@ -662,7 +662,9 @@ std::optional> pdlp_solver_t std::cout << "Time Limit reached, returning current solution" << std::endl; #endif if (auto* engine = pdhg_solver_.get_mgpu_engine()) { - engine->gather_potential_next_solutions_to_master(pdhg_solver_); + engine->gather_potential_next_solutions_to_master( + pdhg_solver_, + current_termination_strategy_.get_convergence_information().get_reduced_cost()); } return current_termination_strategy_.fill_return_problem_solution( internal_solver_iterations_, @@ -698,7 +700,9 @@ std::optional> pdlp_solver_t } if (auto* engine = pdhg_solver_.get_mgpu_engine()) { - engine->gather_potential_next_solutions_to_master(pdhg_solver_); + engine->gather_potential_next_solutions_to_master( + pdhg_solver_, + current_termination_strategy_.get_convergence_information().get_reduced_cost()); } return current_termination_strategy_.fill_return_problem_solution( internal_solver_iterations_, @@ -1378,7 +1382,9 @@ std::optional> pdlp_solver_t print_final_termination_criteria( timer, current_termination_strategy_.get_convergence_information(), termination_current); if (auto* engine = pdhg_solver_.get_mgpu_engine()) { - engine->gather_potential_next_solutions_to_master(pdhg_solver_); + engine->gather_potential_next_solutions_to_master( + pdhg_solver_, + current_termination_strategy_.get_convergence_information().get_reduced_cost()); } return 
current_termination_strategy_.fill_return_problem_solution( internal_solver_iterations_, From 6538382755ed20571b649e0366afbebd95493053 Mon Sep 17 00:00:00 2001 From: Bulle Mostovoi Date: Wed, 27 May 2026 13:41:27 +0200 Subject: [PATCH 35/67] updated mgpu scale/unscale logic --- cpp/src/pdlp/pdlp.cu | 40 ++++++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 18 deletions(-) diff --git a/cpp/src/pdlp/pdlp.cu b/cpp/src/pdlp/pdlp.cu index 7bd6d34473..c31c528d8d 100644 --- a/cpp/src/pdlp/pdlp.cu +++ b/cpp/src/pdlp/pdlp.cu @@ -2962,22 +2962,24 @@ optimization_problem_solution_t pdlp_solver_t::run_solver(co unscaled_dual_avg_solution_); } if (settings_.hyper_params.use_adaptive_step_size_strategy) { + initial_scaling_strategy_.unscale_solutions(pdhg_solver_.get_primal_solution(), + pdhg_solver_.get_dual_solution()); + } else { if (multi_gpu_engine) { - // The only branch in cuPDLPx + // The only branch in cuPDLPx (Stable3) multi_gpu_engine->for_each_shard([&](auto& shard) { auto& sub = *shard.sub_pdlp; sub.get_initial_scaling_strategy().unscale_solutions( - sub.pdhg_solver_.get_primal_solution(), sub.pdhg_solver_.get_dual_solution()); + sub.pdhg_solver_.get_potential_next_primal_solution(), + sub.pdhg_solver_.get_potential_next_dual_solution(), + sub.pdhg_solver_.get_dual_slack()); }); } else { - initial_scaling_strategy_.unscale_solutions(pdhg_solver_.get_primal_solution(), - pdhg_solver_.get_dual_solution()); + initial_scaling_strategy_.unscale_solutions( + pdhg_solver_.get_potential_next_primal_solution(), + pdhg_solver_.get_potential_next_dual_solution(), + pdhg_solver_.get_dual_slack()); } - } else { - initial_scaling_strategy_.unscale_solutions( - pdhg_solver_.get_potential_next_primal_solution(), - pdhg_solver_.get_potential_next_dual_solution(), - pdhg_solver_.get_dual_slack()); } #ifdef CUPDLP_DEBUG_MODE @@ -3006,22 +3008,24 @@ optimization_problem_solution_t pdlp_solver_t::run_solver(co unscaled_dual_avg_solution_); } if 
(settings_.hyper_params.use_adaptive_step_size_strategy) { + initial_scaling_strategy_.scale_solutions(pdhg_solver_.get_primal_solution(), + pdhg_solver_.get_dual_solution()); + } else { if (multi_gpu_engine) { - // The only branch in cuPDLPx + // The only branch in cuPDLPx (Stable3) multi_gpu_engine->for_each_shard([&](auto& shard) { auto& sub = *shard.sub_pdlp; sub.get_initial_scaling_strategy().scale_solutions( - sub.pdhg_solver_.get_primal_solution(), sub.pdhg_solver_.get_dual_solution()); + sub.pdhg_solver_.get_potential_next_primal_solution(), + sub.pdhg_solver_.get_potential_next_dual_solution(), + sub.pdhg_solver_.get_dual_slack()); }); } else { - initial_scaling_strategy_.scale_solutions(pdhg_solver_.get_primal_solution(), - pdhg_solver_.get_dual_solution()); + initial_scaling_strategy_.scale_solutions( + pdhg_solver_.get_potential_next_primal_solution(), + pdhg_solver_.get_potential_next_dual_solution(), + pdhg_solver_.get_dual_slack()); } - } else { - initial_scaling_strategy_.scale_solutions( - pdhg_solver_.get_potential_next_primal_solution(), - pdhg_solver_.get_potential_next_dual_solution(), - pdhg_solver_.get_dual_slack()); } } From a88285a9f8d4ed8ec568307d0fac1866ed404831 Mon Sep 17 00:00:00 2001 From: Bulle Mostovoi Date: Wed, 27 May 2026 14:09:56 +0200 Subject: [PATCH 36/67] wired mgpu restart --- cpp/src/pdlp/pdlp.cuh | 1 + .../restart_strategy/pdlp_restart_strategy.cu | 147 ++++++++++++++---- 2 files changed, 118 insertions(+), 30 deletions(-) diff --git a/cpp/src/pdlp/pdlp.cuh b/cpp/src/pdlp/pdlp.cuh index 63aef7b43a..17fb05080f 100644 --- a/cpp/src/pdlp/pdlp.cuh +++ b/cpp/src/pdlp/pdlp.cuh @@ -110,6 +110,7 @@ class pdlp_solver_t { { return initial_scaling_strategy_; } + detail::pdlp_restart_strategy_t& get_restart_strategy() { return restart_strategy_; } // Per-shard primal/dual step sizes are private state on pdlp_solver_t but // are needed inside the multi-GPU dispatch paths that fan out a master cub diff --git 
a/cpp/src/pdlp/restart_strategy/pdlp_restart_strategy.cu b/cpp/src/pdlp/restart_strategy/pdlp_restart_strategy.cu index 17c7abcac5..00c5b16c8b 100644 --- a/cpp/src/pdlp/restart_strategy/pdlp_restart_strategy.cu +++ b/cpp/src/pdlp/restart_strategy/pdlp_restart_strategy.cu @@ -8,6 +8,8 @@ #include #include +#include +#include #include #include #include @@ -892,20 +894,64 @@ void pdlp_restart_strategy_t::cupdlpx_restart( "If any, all should be true"); // Computing the deltas - distance_squared_moved_from_last_restart_period( - pdhg_solver.get_potential_next_primal_solution(), - last_restart_duality_gap_.primal_solution_, - pdhg_solver.get_primal_tmp_resource(), - primal_size_h_, - 1, - last_restart_duality_gap_.primal_distance_traveled_); - distance_squared_moved_from_last_restart_period( - pdhg_solver.get_potential_next_dual_solution(), - last_restart_duality_gap_.dual_solution_, - pdhg_solver.get_dual_tmp_resource(), - dual_size_h_, - 1, - last_restart_duality_gap_.dual_distance_traveled_); + if (auto* engine = pdhg_solver.get_mgpu_engine()) { + engine->for_each_shard([&](auto& shard) { + auto& sub = *shard.sub_pdlp; + auto& sub_rest = sub.get_restart_strategy(); + sub_rest.distance_squared_moved_from_last_restart_period( + sub.pdhg_solver_.get_potential_next_primal_solution(), + sub_rest.last_restart_duality_gap_.primal_solution_, + sub.pdhg_solver_.get_primal_tmp_resource(), + shard.rank_data.owned_var_size, + 1, + sub_rest.last_restart_duality_gap_.primal_distance_traveled_); + sub_rest.distance_squared_moved_from_last_restart_period( + sub.pdhg_solver_.get_potential_next_dual_solution(), + sub_rest.last_restart_duality_gap_.dual_solution_, + sub.pdhg_solver_.get_dual_tmp_resource(), + shard.rank_data.owned_cstr_size, + 1, + sub_rest.last_restart_duality_gap_.dual_distance_traveled_); + }); + + engine->allreduce_sum_inplace([](pdlp_solver_t& sp) -> f_t* { + return sp.get_restart_strategy().last_restart_duality_gap_.primal_distance_traveled_.data(); + }); + 
engine->allreduce_sum_inplace([](pdlp_solver_t& sp) -> f_t* { + return sp.get_restart_strategy().last_restart_duality_gap_.dual_distance_traveled_.data(); + }); + + auto& s0 = *engine->shards[0]; + { + raft::device_setter guard(s0.device_id); + RAFT_CUDA_TRY(cudaStreamSynchronize(s0.stream.view().value())); + } + raft::copy(last_restart_duality_gap_.primal_distance_traveled_.data(), + s0.sub_pdlp->get_restart_strategy() + .last_restart_duality_gap_.primal_distance_traveled_.data(), + 1, + stream_view_); + raft::copy(last_restart_duality_gap_.dual_distance_traveled_.data(), + s0.sub_pdlp->get_restart_strategy() + .last_restart_duality_gap_.dual_distance_traveled_.data(), + 1, + stream_view_); + } else { + distance_squared_moved_from_last_restart_period( + pdhg_solver.get_potential_next_primal_solution(), + last_restart_duality_gap_.primal_solution_, + pdhg_solver.get_primal_tmp_resource(), + primal_size_h_, + 1, + last_restart_duality_gap_.primal_distance_traveled_); + distance_squared_moved_from_last_restart_period( + pdhg_solver.get_potential_next_dual_solution(), + last_restart_duality_gap_.dual_solution_, + pdhg_solver.get_dual_tmp_resource(), + dual_size_h_, + 1, + last_restart_duality_gap_.dual_distance_traveled_); + } auto view = make_cupdlpx_restart_view(last_restart_duality_gap_.primal_distance_traveled_, last_restart_duality_gap_.dual_distance_traveled_, @@ -958,24 +1004,58 @@ void pdlp_restart_strategy_t::cupdlpx_restart( best_primal_weight.set_element_async(0, best_primal_weight_value, stream_view_); } + // Broadcast the primal and dual step sizes to all shards + if (auto* engine = pdhg_solver.get_mgpu_engine()) { + RAFT_CUDA_TRY(cudaStreamSynchronize(stream_view_)); + engine->for_each_shard([&](auto& shard) { + auto& sub = *shard.sub_pdlp; + raft::copy(sub.get_primal_step_size().data(), + primal_step_size.data(), 1, shard.stream.view()); + raft::copy(sub.get_dual_step_size().data(), + dual_step_size.data(), 1, shard.stream.view()); + }); + } // TODO 
later batch mode: remove if you have per climber restart - raft::copy(last_restart_duality_gap_.primal_solution_.data(), - pdhg_solver.get_potential_next_primal_solution().data(), - last_restart_duality_gap_.primal_solution_.size(), - stream_view_); - raft::copy(pdhg_solver.get_primal_solution().data(), - pdhg_solver.get_potential_next_primal_solution().data(), - last_restart_duality_gap_.primal_solution_.size(), - stream_view_); - raft::copy(last_restart_duality_gap_.dual_solution_.data(), - pdhg_solver.get_potential_next_dual_solution().data(), - last_restart_duality_gap_.dual_solution_.size(), - stream_view_); - raft::copy(pdhg_solver.get_dual_solution().data(), - pdhg_solver.get_potential_next_dual_solution().data(), - last_restart_duality_gap_.dual_solution_.size(), - stream_view_); + if (auto* engine = pdhg_solver.get_mgpu_engine()) { + engine->for_each_shard([&](auto& shard) { + auto& sub = *shard.sub_pdlp; + auto& sub_rest = sub.get_restart_strategy(); + raft::copy(sub_rest.last_restart_duality_gap_.primal_solution_.data(), + sub.pdhg_solver_.get_potential_next_primal_solution().data(), + sub_rest.last_restart_duality_gap_.primal_solution_.size(), + shard.stream.view()); + raft::copy(sub.pdhg_solver_.get_primal_solution().data(), + sub.pdhg_solver_.get_potential_next_primal_solution().data(), + sub.pdhg_solver_.get_primal_solution().size(), + shard.stream.view()); + raft::copy(sub_rest.last_restart_duality_gap_.dual_solution_.data(), + sub.pdhg_solver_.get_potential_next_dual_solution().data(), + sub_rest.last_restart_duality_gap_.dual_solution_.size(), + shard.stream.view()); + raft::copy(sub.pdhg_solver_.get_dual_solution().data(), + sub.pdhg_solver_.get_potential_next_dual_solution().data(), + sub.pdhg_solver_.get_dual_solution().size(), + shard.stream.view()); + }); + } else { + raft::copy(last_restart_duality_gap_.primal_solution_.data(), + pdhg_solver.get_potential_next_primal_solution().data(), + last_restart_duality_gap_.primal_solution_.size(), + 
stream_view_); + raft::copy(pdhg_solver.get_primal_solution().data(), + pdhg_solver.get_potential_next_primal_solution().data(), + last_restart_duality_gap_.primal_solution_.size(), + stream_view_); + raft::copy(last_restart_duality_gap_.dual_solution_.data(), + pdhg_solver.get_potential_next_dual_solution().data(), + last_restart_duality_gap_.dual_solution_.size(), + stream_view_); + raft::copy(pdhg_solver.get_dual_solution().data(), + pdhg_solver.get_potential_next_dual_solution().data(), + last_restart_duality_gap_.dual_solution_.size(), + stream_view_); + } #ifdef CUPDLP_DEBUG_MODE print("New last_restart_duality_gap_.primal_solution_", @@ -990,6 +1070,13 @@ void pdlp_restart_strategy_t::cupdlpx_restart( weighted_average_solution_.iterations_since_last_restart_ = 0; last_trial_fixed_point_error_[i] = std::numeric_limits::infinity(); } + + if (auto* engine = pdhg_solver.get_mgpu_engine()) { + engine->for_each_shard([&](auto& shard) { + shard.sub_pdlp->get_restart_strategy().weighted_average_solution_.iterations_since_last_restart_ = + 0; + }); + } } template From b34c5f6b286add3911c85fe8f747f2b8f6ccc9c2 Mon Sep 17 00:00:00 2001 From: Bulle Mostovoi Date: Wed, 27 May 2026 14:38:56 +0200 Subject: [PATCH 37/67] dummy version locally seems to work ????? 
--- .../cuopt/linear_programming/constants.h | 1 + cpp/src/math_optimization/solver_settings.cu | 1 + cpp/src/pdlp/pdlp.cu | 24 +++++++++++++------ cpp/src/pdlp/solve.cu | 4 +++- 4 files changed, 22 insertions(+), 8 deletions(-) diff --git a/cpp/include/cuopt/linear_programming/constants.h b/cpp/include/cuopt/linear_programming/constants.h index 39685251b6..26ef3653e0 100644 --- a/cpp/include/cuopt/linear_programming/constants.h +++ b/cpp/include/cuopt/linear_programming/constants.h @@ -84,6 +84,7 @@ #define CUOPT_NUM_CPU_THREADS "num_cpu_threads" #define CUOPT_NUM_GPUS "num_gpus" #define CUOPT_MULTI_GPU_PARTITION_FILE "multi_gpu_partition_file" +#define CUOPT_USE_DISTRIBUTED_PDLP "use_distributed_pdlp" #define CUOPT_USER_PROBLEM_FILE "user_problem_file" #define CUOPT_PRESOLVE_FILE "presolve_file" #define CUOPT_RANDOM_SEED "random_seed" diff --git a/cpp/src/math_optimization/solver_settings.cu b/cpp/src/math_optimization/solver_settings.cu index 6d7e7504e4..991b0d62c1 100644 --- a/cpp/src/math_optimization/solver_settings.cu +++ b/cpp/src/math_optimization/solver_settings.cu @@ -177,6 +177,7 @@ solver_settings_t::solver_settings_t() : pdlp_settings(), mip_settings {CUOPT_DUAL_POSTSOLVE, &pdlp_settings.dual_postsolve, true}, {CUOPT_BARRIER_ITERATIVE_REFINEMENT, &pdlp_settings.barrier_iterative_refinement, true}, {CUOPT_MIP_PROBING, &mip_settings.probing, true}, + {CUOPT_USE_DISTRIBUTED_PDLP, &pdlp_settings.hyper_params.use_distributed_pdlp, false}, }; // String parameters string_parameters = { diff --git a/cpp/src/pdlp/pdlp.cu b/cpp/src/pdlp/pdlp.cu index c31c528d8d..1e76fa4251 100644 --- a/cpp/src/pdlp/pdlp.cu +++ b/cpp/src/pdlp/pdlp.cu @@ -382,16 +382,14 @@ pdlp_solver_t::pdlp_solver_t(problem_t& op_problem, // (problem_ptr, op_problem_scaled_, pdhg_solver_, strategies, etc.). 
: pdlp_solver_t(op_problem, settings, false) { - cuopt_expects(num_gpus == settings.num_gpus && settings.num_gpus > 1, + if (num_gpus == 1) { + std::cout << "CAREFUL: num_gpus == 1, running dummy version" << std::endl; + } + cuopt_expects(num_gpus == settings.num_gpus /*&& settings.num_gpus > 1*/, error_type_t::ValidationError, "This constructor should only be used for distributed PDLP (num_gpus > 1)"); - // Distributed PDLP is currently double-only. The body is guarded with - // `if constexpr` so the float instantiation never references the - // multi_gpu_engine_t / partition_loader_t symbols - // (those are intentionally not instantiated in their .cu files), keeping - // the link clean. Trying to use distributed PDLP with f_t = float will - // throw at runtime instead. + // Distributed PDLP is currently double-only if constexpr (!std::is_same_v) { cuopt_expects(false, error_type_t::ValidationError, @@ -403,6 +401,18 @@ pdlp_solver_t::pdlp_solver_t(problem_t& op_problem, if (!settings.multi_gpu_partition_file.empty()) { parts = partition_loader_t::parse_distributed_pdlp_partition_file( settings.multi_gpu_partition_file); + } else if (num_gpus == 1) { + // Single-part dummy run: useful for exercising the mGPU code paths on a + // single physical GPU without a real Metis partition file. The downstream + // create_rank_data_from_parts expects a flat vector of length + // (n_constraints + n_variables) where each entry is the owning part-id + // (cstrs first, then vars). With nb_parts == 1, every entry is 0. 
+ std::cout << "CAREFUL: num_gpus == 1, running dummy version (single part covering " + << op_problem_scaled_.n_constraints << " cstrs + " + << op_problem_scaled_.n_variables << " vars)" << std::endl; + parts = std::vector( + static_cast(op_problem_scaled_.n_constraints + op_problem_scaled_.n_variables), + 0); } else { cuopt_expects(false, error_type_t::RuntimeError, diff --git a/cpp/src/pdlp/solve.cu b/cpp/src/pdlp/solve.cu index 479340810c..e401ab35b6 100644 --- a/cpp/src/pdlp/solve.cu +++ b/cpp/src/pdlp/solve.cu @@ -771,9 +771,11 @@ static optimization_problem_solution_t run_pdlp_solver( } #endif if (settings.hyper_params.use_distributed_pdlp) { + /* cuopt_expects(settings.num_gpus > 1, error_type_t::ValidationError, - "use_distributed_pdlp requires settings.num_gpus > 1"); + "use_distributed_pdlp requires settings.num_gpus > 1"); */ + if (settings.num_gpus == 1) {std::cout << "CAREFUL: use_distributed_pdlp requires settings.num_gpus > 1" << std::endl;} cuopt_expects(!is_batch_mode, error_type_t::ValidationError, "Distributed PDLP does not support batch mode"); From b784a441395c092f13a94b276b8caad03a7cac7e Mon Sep 17 00:00:00 2001 From: Bulle Mostovoi Date: Wed, 27 May 2026 06:12:59 -0700 Subject: [PATCH 38/67] added dummy partitionner --- cpp/src/pdlp/CMakeLists.txt | 1 + .../distributed_pdlp/metis_partitioner.hpp | 24 +++++ cpp/src/pdlp/distributed_pdlp/partitioner.cu | 87 +++++++++++++++++++ cpp/src/pdlp/distributed_pdlp/partitioner.hpp | 63 ++++++++++++++ cpp/src/pdlp/pdlp.cu | 39 +++++---- 5 files changed, 197 insertions(+), 17 deletions(-) create mode 100644 cpp/src/pdlp/distributed_pdlp/metis_partitioner.hpp create mode 100644 cpp/src/pdlp/distributed_pdlp/partitioner.cu create mode 100644 cpp/src/pdlp/distributed_pdlp/partitioner.hpp diff --git a/cpp/src/pdlp/CMakeLists.txt b/cpp/src/pdlp/CMakeLists.txt index 2bc2771c91..a6ef14e3ff 100644 --- a/cpp/src/pdlp/CMakeLists.txt +++ b/cpp/src/pdlp/CMakeLists.txt @@ -32,6 +32,7 @@ set(LP_CORE_FILES 
${CMAKE_CURRENT_SOURCE_DIR}/distributed_pdlp/shard.cu ${CMAKE_CURRENT_SOURCE_DIR}/distributed_pdlp/multi_gpu_engine.cu ${CMAKE_CURRENT_SOURCE_DIR}/distributed_pdlp/partition_loader.cu + ${CMAKE_CURRENT_SOURCE_DIR}/distributed_pdlp/partitioner.cu ) # C and Python adapter files diff --git a/cpp/src/pdlp/distributed_pdlp/metis_partitioner.hpp b/cpp/src/pdlp/distributed_pdlp/metis_partitioner.hpp new file mode 100644 index 0000000000..c4e37f57a9 --- /dev/null +++ b/cpp/src/pdlp/distributed_pdlp/metis_partitioner.hpp @@ -0,0 +1,24 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + */ + +#pragma once + +#include + +namespace cuopt::linear_programming::detail { + +// METIS k-way partitioner on the constraint/variable bipartite graph induced by A. +// Requires partitioner_input_t::A and A_t (or A row_offsets/col_indices only — the +// implementation builds the bipartite adjacency the same way as metis_tests: +// cstr nodes [0, nb_cstr), var nodes [nb_cstr, nb_cstr+nb_vars), edges from A and A_t). +// +// Wire into make_partitioner() once METIS is an optional cuOpt dependency. +template +class metis_partitioner_t : public partitioner_i { + public: + std::vector partition(partitioner_input_t const& input) const override; +}; + +} // namespace cuopt::linear_programming::detail diff --git a/cpp/src/pdlp/distributed_pdlp/partitioner.cu b/cpp/src/pdlp/distributed_pdlp/partitioner.cu new file mode 100644 index 0000000000..bdbfcacf06 --- /dev/null +++ b/cpp/src/pdlp/distributed_pdlp/partitioner.cu @@ -0,0 +1,87 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+ * SPDX-License-Identifier: Apache-2.0 + */ + +#include + +#include + +#include +#include + +namespace cuopt::linear_programming::detail { + +template +std::vector dummy_partitioner_t::partition( + partitioner_input_t const& input) const +{ + cuopt_expects(input.nb_parts > 0, + error_type_t::ValidationError, + "dummy_partitioner: nb_parts must be positive"); + cuopt_expects(input.nb_cstr >= 0 && input.nb_vars >= 0, + error_type_t::ValidationError, + "dummy_partitioner: invalid problem dimensions"); + + const std::size_t nvtx = + static_cast(input.nb_cstr) + static_cast(input.nb_vars); + std::vector parts(nvtx); + for (std::size_t i = 0; i < nvtx; ++i) { + parts[i] = static_cast(i % static_cast(input.nb_parts)); + } + validate_partition(parts, + static_cast(input.nb_cstr), + static_cast(input.nb_vars), + static_cast(input.nb_parts), + "dummy_partitioner"); + return parts; +} + +void validate_partition(std::vector const& parts, + int nb_cstr, + int nb_vars, + int nb_parts, + char const* context) +{ + const std::size_t expected = + static_cast(nb_cstr) + static_cast(nb_vars); + cuopt_expects(parts.size() == expected, + error_type_t::ValidationError, + "%s: expected %zu part entries (cstrs + vars), got %zu", + context, + expected, + parts.size()); + cuopt_expects(nb_parts > 0, + error_type_t::ValidationError, + "%s: nb_parts must be positive", + context); + if (parts.empty()) { return; } + const auto [min_it, max_it] = std::minmax_element(parts.begin(), parts.end()); + cuopt_expects(*min_it >= 0, + error_type_t::ValidationError, + "%s: partition ids must be non-negative (min=%d)", + context, + static_cast(*min_it)); + cuopt_expects(*max_it < nb_parts, + error_type_t::ValidationError, + "%s: partition ids must be in [0, %d) (max=%d)", + context, + static_cast(nb_parts), + static_cast(*max_it)); +} + +template +std::unique_ptr> make_partitioner(partitioner_kind_t kind) +{ + switch (kind) { + case partitioner_kind_t::Dummy: + return std::make_unique>(); + } + 
cuopt_expects(false, error_type_t::RuntimeError, "make_partitioner: unsupported partitioner kind"); + return nullptr; +} + +template class dummy_partitioner_t; +template std::unique_ptr> make_partitioner(partitioner_kind_t); + +} // namespace cuopt::linear_programming::detail diff --git a/cpp/src/pdlp/distributed_pdlp/partitioner.hpp b/cpp/src/pdlp/distributed_pdlp/partitioner.hpp new file mode 100644 index 0000000000..ee5798fd0b --- /dev/null +++ b/cpp/src/pdlp/distributed_pdlp/partitioner.hpp @@ -0,0 +1,63 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + */ + +#pragma once + +#include +#include +#include + +namespace cuopt::linear_programming::detail { + +// Non-owning view of a host CSR matrix (A or A_t). +template +struct csr_host_view_t { + std::vector const* row_offsets{nullptr}; + std::vector const* col_indices{nullptr}; + std::vector const* values{nullptr}; // optional; unused by topology-only partitioners + i_t num_rows{0}; + i_t num_cols{0}; +}; + +// Inputs shared by all distributed-PDLP partitioners. +// Returns a flat vector of length (nb_cstr + nb_vars): constraint part-ids first, +// then variable part-ids, each in [0, nb_parts). +template +struct partitioner_input_t { + i_t nb_cstr{0}; + i_t nb_vars{0}; + i_t nb_parts{0}; + // Constraint matrix A (rows = constraints, cols = variables). + csr_host_view_t A{}; + // Transpose A_t (rows = variables, cols = constraints). Optional for partitioners + // that build a bipartite graph (e.g. METIS); dummy partitioner ignores both matrices. 
+ csr_host_view_t A_t{}; +}; + +enum class partitioner_kind_t { Dummy /*, Metis */ }; + +template +class partitioner_i { + public: + virtual ~partitioner_i() = default; + virtual std::vector partition(partitioner_input_t const& input) const = 0; +}; + +template +class dummy_partitioner_t : public partitioner_i { + public: + std::vector partition(partitioner_input_t const& input) const override; +}; + +void validate_partition(std::vector const& parts, + int nb_cstr, + int nb_vars, + int nb_parts, + char const* context = "partition"); + +template +std::unique_ptr> make_partitioner(partitioner_kind_t kind); + +} // namespace cuopt::linear_programming::detail diff --git a/cpp/src/pdlp/pdlp.cu b/cpp/src/pdlp/pdlp.cu index 1e76fa4251..203547367b 100644 --- a/cpp/src/pdlp/pdlp.cu +++ b/cpp/src/pdlp/pdlp.cu @@ -12,6 +12,7 @@ #include #include +#include #include #include #include @@ -396,28 +397,32 @@ pdlp_solver_t::pdlp_solver_t(problem_t& op_problem, "Distributed PDLP (num_gpus > 1) currently requires double precision"); return; } else { - // 2. Load partition + // 2. Load or compute partition std::vector parts; if (!settings.multi_gpu_partition_file.empty()) { parts = partition_loader_t::parse_distributed_pdlp_partition_file( settings.multi_gpu_partition_file); - } else if (num_gpus == 1) { - // Single-part dummy run: useful for exercising the mGPU code paths on a - // single physical GPU without a real Metis partition file. The downstream - // create_rank_data_from_parts expects a flat vector of length - // (n_constraints + n_variables) where each entry is the owning part-id - // (cstrs first, then vars). With nb_parts == 1, every entry is 0. 
- std::cout << "CAREFUL: num_gpus == 1, running dummy version (single part covering " - << op_problem_scaled_.n_constraints << " cstrs + " - << op_problem_scaled_.n_variables << " vars)" << std::endl; - parts = std::vector( - static_cast(op_problem_scaled_.n_constraints + op_problem_scaled_.n_variables), - 0); + validate_partition(parts, + op_problem_scaled_.n_constraints, + op_problem_scaled_.n_variables, + num_gpus, + "partition file"); } else { - cuopt_expects(false, - error_type_t::RuntimeError, - "Metis partitioning inside cuopt not implemented yet; " - "provide a --parts file via settings.multi_gpu_partition_file"); + if (num_gpus == 1) { + // Single-part dummy run: useful for exercising the mGPU code paths on a + // single physical GPU without a real partition file. + std::cout << "CAREFUL: num_gpus == 1, running dummy version (single part covering " + << op_problem_scaled_.n_constraints << " cstrs + " + << op_problem_scaled_.n_variables << " vars)" << std::endl; + } + partitioner_input_t partition_input; + partition_input.nb_cstr = op_problem_scaled_.n_constraints; + partition_input.nb_vars = op_problem_scaled_.n_variables; + partition_input.nb_parts = num_gpus; + // Dummy partitioner ignores A / A_t for now; future METIS partitioners will + // fill these CSR views before calling partition(). 
+ auto partitioner = make_partitioner(partitioner_kind_t::Dummy); + parts = partitioner->partition(partition_input); } // always compute initial step size before scaling and primal_weight after scaling to do like From ca7d7a91b33b72c60c885a7314619097d82e19ad Mon Sep 17 00:00:00 2001 From: Bulle Mostovoi Date: Wed, 27 May 2026 15:57:37 +0200 Subject: [PATCH 39/67] added stream forking for cuda graph --- .../pdlp/distributed_pdlp/multi_gpu_engine.cu | 13 +++++ .../distributed_pdlp/multi_gpu_engine.hpp | 55 +++++++++++++++++++ cpp/src/pdlp/pdhg.cu | 17 ++++++ 3 files changed, 85 insertions(+) diff --git a/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.cu b/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.cu index a0b3f5dcc3..796153fd79 100644 --- a/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.cu +++ b/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.cu @@ -81,6 +81,19 @@ multi_gpu_engine_t::multi_gpu_engine_t( objective_scaling_factor, sub_solver_settings)); } + + // 4. Allocate fork/join events for cross-stream graph capture splicing. + // fork_event_ on the master device (whatever device is current when the + // engine is constructed -- pdlp_solver_t's mGPU ctor runs on master). + // join_events_[r] on shard r's device. event_handler_t uses the default + // cudaEventCreate (no flags), matching the rest of the codebase. + // Cleanup is automatic via event_handler_t's RAII destructor. 
+ fork_event_ = std::make_unique(); + join_events_.reserve(nb_parts); + for (int r = 0; r < nb_parts; ++r) { + raft::device_setter guard(devices[r]); + join_events_.emplace_back(std::make_unique()); + } } template struct multi_gpu_engine_t; diff --git a/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.hpp b/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.hpp index d156e889af..ade0da1c66 100644 --- a/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.hpp +++ b/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.hpp @@ -8,6 +8,7 @@ #include #include #include +#include #include @@ -446,6 +447,60 @@ struct multi_gpu_engine_t { // Shards stored by unique_ptr because pdlp_shard_t is immovable // (owns device-affine resources: handle, NCCL comm, RMM buffers). std::vector>> shards; + + // ===== Fork/join events for CUDA graph capture spanning shard streams ===== + // + // CUDA graph capture starts on the master pdhg stream (in pdhg_solver_t). + // The per-iteration work then dispatches kernels and NCCL collectives onto + // each shard's own stream. For these cross-stream operations to be + // recorded into the same captured graph (instead of escaping the capture + // and either invalidating it or being silently dropped), every shard + // stream must be "spliced" into the active capture via fork/join events. + // + // master_stream ──record(fork_event_)──┐ + // ├─> shard_0.stream (waits) ──┐ + // ├─> shard_1.stream (waits) ──┤ + // └─> shard_{n-1}.stream ──┘ + // (record join_events_[r]) + // master waits on each + // + // Pattern mirrors metis_tests/src/bench.cu. Events are reused across + // iterations (created once at engine construction) and cleaned up + // automatically by event_handler_t's RAII destructor. + // + // unique_ptr because event_handler_t is non-copyable and we need + // per-device construction (each join event must be created with its + // shard's device current). 
+ std::unique_ptr fork_event_; + std::vector> join_events_; + + // fork_to_shards: record fork_event_ on `master_stream`, then make every + // shard stream wait on it. Inside a graph capture, this splices every + // shard stream into the same captured graph. + void fork_to_shards(rmm::cuda_stream_view master_stream) + { + fork_event_->record(master_stream); + for (auto& s : shards) { + raft::device_setter guard(s->device_id); + fork_event_->stream_wait(s->stream.view()); + } + } + + // join_from_shards: each shard records its join event on its own stream, + // then `master_stream` waits on every join event. Closes the captured + // sub-graph back into the master stream so cudaStreamEndCapture can + // produce a single graph spanning all streams. + void join_from_shards(rmm::cuda_stream_view master_stream) + { + const int nb = static_cast(shards.size()); + for (int r = 0; r < nb; ++r) { + raft::device_setter guard(shards[r]->device_id); + join_events_[r]->record(shards[r]->stream.view()); + } + for (auto& e : join_events_) { + e->stream_wait(master_stream); + } + } }; } // namespace cuopt::linear_programming::detail diff --git a/cpp/src/pdlp/pdhg.cu b/cpp/src/pdlp/pdhg.cu index 969f5d0d30..df183dc7e6 100644 --- a/cpp/src/pdlp/pdhg.cu +++ b/cpp/src/pdlp/pdhg.cu @@ -1249,6 +1249,14 @@ void pdhg_solver_t::compute_next_primal_dual_solution_reflected( if (should_major) { graph_all.run(should_major, [&]() { + // Multi-GPU: splice shard streams into the capture so their kernels and + // NCCL collectives are recorded into the same graph. Without this, work + // issued on shard.stream from inside this lambda would either invalidate + // the capture or run outside the graph, leaving the captured graph + // empty (or broken) -- which produces the cycling/stall behavior we + // observed on larger problems. Mirrors metis_tests bench.cu fork/join. 
+ if (mgpu_engine_ != nullptr) { mgpu_engine_->fork_to_shards(stream_view_); } + compute_At_y(); if (mgpu_engine_ != nullptr) { for (auto& shard : mgpu_engine_->shards) { @@ -1346,10 +1354,17 @@ void pdhg_solver_t::compute_next_primal_dual_solution_reflected( print("potential_next_dual_solution_", potential_next_dual_solution_); print("reflected_dual_", reflected_dual_); #endif + + // Multi-GPU: close the fork by joining every shard stream back into + // the master stream so cudaStreamEndCapture sees a single graph + // spanning all streams. + if (mgpu_engine_ != nullptr) { mgpu_engine_->join_from_shards(stream_view_); } }); } else { graph_all.run(should_major, [&]() { + if (mgpu_engine_ != nullptr) { mgpu_engine_->fork_to_shards(stream_view_); } + // Compute next primal compute_At_y(); @@ -1454,6 +1469,8 @@ void pdhg_solver_t::compute_next_primal_dual_solution_reflected( #ifdef CUPDLP_DEBUG_MODE print("reflected_dual_", reflected_dual_); #endif + + if (mgpu_engine_ != nullptr) { mgpu_engine_->join_from_shards(stream_view_); } }); } } From 0310d50a57dbb6b7f5752a8630f19cf663658795 Mon Sep 17 00:00:00 2001 From: Bulle Mostovoi Date: Wed, 27 May 2026 18:49:12 +0200 Subject: [PATCH 40/67] updated convergence information to use potential_next rather than current in compute_primal/dual_residual, as the dual_iterate parameter --- .../convergence_information.cu | 53 ++++++++++++------- 1 file changed, 33 insertions(+), 20 deletions(-) diff --git a/cpp/src/pdlp/termination_strategy/convergence_information.cu b/cpp/src/pdlp/termination_strategy/convergence_information.cu index 28b33582ab..608590ffa0 100644 --- a/cpp/src/pdlp/termination_strategy/convergence_information.cu +++ b/cpp/src/pdlp/termination_strategy/convergence_information.cu @@ -429,22 +429,24 @@ void convergence_information_t::compute_convergence_information( error_type_t::ValidationError, "per_constraint_residual is not yet supported in multi-GPU mode"); - // Prepares halo values in primal_solution + // 
Prepares halo values in potential_next_primal_solution + engine->halo_exchange_var( [](pdhg_solver_t& pdhg) -> rmm::device_uvector& { - return pdhg.get_primal_solution(); + return pdhg.get_potential_next_primal_solution(); }); - // Compute the primal residual and objective on each shard for (auto& shard : engine->shards) { raft::device_setter guard(shard->device_id); auto& sub_pdlp = *shard->sub_pdlp; auto& sub_conv = sub_pdlp.get_current_termination_strategy().get_convergence_information(); - sub_conv.compute_primal_residual(sub_conv.op_problem_cusparse_view_, - sub_pdlp.pdhg_solver_.get_dual_tmp_resource(), - sub_pdlp.pdhg_solver_.get_dual_solution()); - sub_conv.compute_primal_objective_owned_partial(sub_pdlp.pdhg_solver_.get_primal_solution(), - shard->rank_data.owned_var_size); + sub_conv.compute_primal_residual( + sub_conv.op_problem_cusparse_view_, + sub_pdlp.pdhg_solver_.get_dual_tmp_resource(), + sub_pdlp.pdhg_solver_.get_potential_next_dual_solution()); + sub_conv.compute_primal_objective_owned_partial( + sub_pdlp.pdhg_solver_.get_potential_next_primal_solution(), + shard->rank_data.owned_var_size); } // Reduce all primal objectives across shards @@ -546,12 +548,15 @@ void convergence_information_t::compute_convergence_information( if (current_pdhg_solver.is_multi_gpu()) { auto* engine = current_pdhg_solver.get_mgpu_engine(); - // 1) Halo-exchange the dual solution on every shard so the upcoming - // A_T_shard @ dual SpMV inside compute_dual_residual reads correct - // values in the cstr halo region. + // 1) Halo-exchange potential_next_dual_solution on every shard so the + // A_T_shard @ y SpMV inside compute_dual_residual reads correct values + // in the cstr halo region. The SpMV is driven through the eval view's + // cv.dual_solution descriptor, which (cuPDLPx, see + // cusparse_view.cu:931-937) is bound to _potential_next_dual -- not to + // current.dual_solution. So we must halo-exchange the same buffer. 
engine->halo_exchange_cstr( [](pdhg_solver_t& pdhg) -> rmm::device_uvector& { - return pdhg.get_dual_solution(); + return pdhg.get_potential_next_dual_solution(); }); // 2-3) Per-shard: @@ -563,18 +568,26 @@ void convergence_information_t::compute_convergence_information( // shard.dual_objective_, with NO scaling/offset. Relies on // primal_slack_ already populated by the per-shard // compute_primal_residual above. + // + // Same primal_iterate fix as the primal block above: use the shard's + // (fresh, unscaled) potential_next_primal_solution, matching single-GPU + // cuPDLPx (pdlp.cu:1190-1203). The previous code's get_primal_solution() + // would mix scaled x with unscaled dual_slack in the dual_objective + // cublasdot. for (auto& shard : engine->shards) { raft::device_setter guard(shard->device_id); auto& sub_pdlp = *shard->sub_pdlp; auto& sub_conv = sub_pdlp.get_current_termination_strategy().get_convergence_information(); - sub_conv.compute_dual_residual(sub_conv.op_problem_cusparse_view_, - sub_pdlp.pdhg_solver_.get_primal_tmp_resource(), - sub_pdlp.pdhg_solver_.get_primal_solution(), - sub_pdlp.pdhg_solver_.get_dual_slack()); - sub_conv.compute_dual_objective_owned_partial(sub_pdlp.pdhg_solver_.get_primal_solution(), - sub_pdlp.pdhg_solver_.get_dual_slack(), - shard->rank_data.owned_var_size, - shard->rank_data.owned_cstr_size); + sub_conv.compute_dual_residual( + sub_conv.op_problem_cusparse_view_, + sub_pdlp.pdhg_solver_.get_primal_tmp_resource(), + sub_pdlp.pdhg_solver_.get_potential_next_primal_solution(), + sub_pdlp.pdhg_solver_.get_dual_slack()); + sub_conv.compute_dual_objective_owned_partial( + sub_pdlp.pdhg_solver_.get_potential_next_primal_solution(), + sub_pdlp.pdhg_solver_.get_dual_slack(), + shard->rank_data.owned_var_size, + shard->rank_data.owned_cstr_size); } // 4) Allreduce dual_objective_ across shards (sum, in place). 
Same From f811bc8459f71f6690efce337daacf6db4892141 Mon Sep 17 00:00:00 2001 From: Bulle Mostovoi Date: Wed, 27 May 2026 19:49:54 +0200 Subject: [PATCH 41/67] disabled graph, can sole afiro hehe --- cpp/src/pdlp/distributed_pdlp/shard.cu | 8 +++++++- cpp/src/pdlp/pdlp.cu | 8 +++++++- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/cpp/src/pdlp/distributed_pdlp/shard.cu b/cpp/src/pdlp/distributed_pdlp/shard.cu index 405e6fa05c..45f9f7a880 100644 --- a/cpp/src/pdlp/distributed_pdlp/shard.cu +++ b/cpp/src/pdlp/distributed_pdlp/shard.cu @@ -153,7 +153,13 @@ pdlp_shard_t::pdlp_shard_t(int device_id, // At this point sub_pdlp.op_problem_scaled_ is an unscaled copy // of sub_problem and sub_pdlp.initial_scaling_strategy_ has // unit cumulative factors (sub-settings disable Ruiz / PC iters). - sub_pdlp = std::make_unique>(*sub_problem, settings, /*batch=*/false); + // NOTE: pass is_legacy_batch_mode=true to disable CUDA-graph capture inside + // sub_pdlp while debugging fake-mGPU divergence. The flag is a pure + // graph-capture toggle (ping_pong_graph_t / manual_cuda_graph_t) and does + // not change any algorithm semantics. Restore to false once the path is + // confirmed correct. + sub_pdlp = std::make_unique>( + *sub_problem, settings, /*is_legacy_batch_mode=*/true); sub_pdlp->pdhg_solver_.set_is_multi_gpu(true); diff --git a/cpp/src/pdlp/pdlp.cu b/cpp/src/pdlp/pdlp.cu index 203547367b..ecc2e35c20 100644 --- a/cpp/src/pdlp/pdlp.cu +++ b/cpp/src/pdlp/pdlp.cu @@ -381,7 +381,13 @@ pdlp_solver_t::pdlp_solver_t(problem_t& op_problem, int num_gpus) // 1. Delegate to single-GPU ctor to bring up all the per-master state // (problem_ptr, op_problem_scaled_, pdhg_solver_, strategies, etc.). - : pdlp_solver_t(op_problem, settings, false) + // + // NOTE: pass is_legacy_batch_mode=true to disable CUDA-graph capture on the + // master while we are debugging fake-mGPU divergence. 
The flag is a pure + // graph-capture toggle (see ping_pong_graph_t / manual_cuda_graph_t); it does + // not change any algorithm semantics. Restore to false once the path is + // confirmed correct. + : pdlp_solver_t(op_problem, settings, /*is_legacy_batch_mode=*/true) { if (num_gpus == 1) { std::cout << "CAREFUL: num_gpus == 1, running dummy version" << std::endl; From 4d7e2fced7f3600ca45e6a171972483af876576a Mon Sep 17 00:00:00 2001 From: Bulle Mostovoi Date: Thu, 28 May 2026 12:49:58 +0200 Subject: [PATCH 42/67] added join_from_shards in convergence_info, now afiro is erfect 510 but a28 is 2100 vs 1500 hmmmm --- .../convergence_information.cu | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/cpp/src/pdlp/termination_strategy/convergence_information.cu b/cpp/src/pdlp/termination_strategy/convergence_information.cu index 608590ffa0..7877a64c88 100644 --- a/cpp/src/pdlp/termination_strategy/convergence_information.cu +++ b/cpp/src/pdlp/termination_strategy/convergence_information.cu @@ -459,7 +459,10 @@ void convergence_information_t::compute_convergence_information( .data(); }); - // Get the reduced primal objective from the shard[0] (arbitrary) + // Get the reduced primal objective from the shard[0] (arbitrary) + // Race fix: master stream must wait for shard streams to finish the + // allreduce above before copying scalar data out of shard 0's buffer. + engine->join_from_shards(stream_view_); { auto& s0 = *engine->shards[0]; raft::device_setter guard(s0.device_id); @@ -494,6 +497,9 @@ void convergence_information_t::compute_convergence_information( }, [](pdlp_shard_t& shard) -> i_t { return shard.rank_data.owned_cstr_size; }); + // Race fix: master stream must wait for shard streams to finish the + // distributed L2 norm before copying scalar data out of shard 0. 
+ engine->join_from_shards(stream_view_); auto& s0 = *engine->shards[0]; raft::device_setter guard(s0.device_id); raft::copy(l2_primal_residual_.data(), @@ -601,6 +607,9 @@ void convergence_information_t::compute_convergence_information( .data(); }); + // Race fix: master stream must wait for shard streams to finish the + // allreduce above before copying scalar data out of shard 0's buffer. + engine->join_from_shards(stream_view_); { auto& s0 = *engine->shards[0]; raft::device_setter guard(s0.device_id); @@ -637,6 +646,9 @@ void convergence_information_t::compute_convergence_information( .l2_dual_residual_.data(); }, [](pdlp_shard_t& shard) -> i_t { return shard.rank_data.owned_var_size; }); + // Race fix: master stream must wait for shard streams to finish the + // distributed L2 norm before copying scalar data out of shard 0. + engine->join_from_shards(stream_view_); auto& s0 = *engine->shards[0]; raft::device_setter guard(s0.device_id); raft::copy(l2_dual_residual_.data(), From 7ad460664e5c09328fa3ef3c32dcc2b7598cbc77 Mon Sep 17 00:00:00 2001 From: Bulle Mostovoi Date: Thu, 28 May 2026 13:52:22 +0200 Subject: [PATCH 43/67] use spmvop in mgpu and fixed small bug of increment_iteration_since_last_restart. now we have exact same iter for A28 --- .../distributed_pdlp/multi_gpu_engine.hpp | 26 ++++++++++++------- cpp/src/pdlp/pdlp.cu | 3 +-- 2 files changed, 18 insertions(+), 11 deletions(-) diff --git a/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.hpp b/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.hpp index ade0da1c66..637c342975 100644 --- a/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.hpp +++ b/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.hpp @@ -311,21 +311,29 @@ struct multi_gpu_engine_t { // -------- High-level: A @ x and A_T @ y --------------------------------- // Thin wrappers used from pdhg_solver_t::compute_A_x / compute_At_y when an - // engine is wired in. 
They use the canonical PDHG buffers/descriptors so the - // result lands where single-GPU PDHG would have put it (dual_gradient for A, - // current_AtY for A_T). + // engine is wired in. They drive the per-shard plan-based SpMV via the + // canonical cusparse_view bindings (no rebinding) so the descriptor binding + // is never disturbed by mGPU machinery. + // + // The halo-exchange MUST target the exact buffer the canonical descriptor + // is bound to in the PDHG cusparse_view (see cusparse_view.cu lines 516-519 + // and 595-599): + // - cv.reflected_primal_solution -> reflected_primal_ (var-shaped) + // - cv.dual_solution -> current.dual_solution_ (cstr-shaped) + // For 1 shard the halo-exchange is a no-op, but the buffer choice is what + // makes multi-shard correctness work, so we keep it accurate either way. void distributed_compute_A_x() { - distributed_spmv_A( - [](auto& pdhg) -> rmm::device_uvector& { return pdhg.get_reflected_primal(); }, - [](auto& pdhg) -> cusparseDnVecDescr_t { return pdhg.get_cusparse_view().dual_gradient; }); + halo_exchange_var( + [](auto& pdhg) -> rmm::device_uvector& { return pdhg.get_reflected_primal(); }); + for_each_shard([](auto& shard) { shard.sub_pdlp->pdhg_solver_.spmvop_A_x(); }); } void distributed_compute_At_y() { - distributed_spmv_At( - [](auto& pdhg) -> rmm::device_uvector& { return pdhg.get_dual_solution(); }, - [](auto& pdhg) -> cusparseDnVecDescr_t { return pdhg.get_cusparse_view().current_AtY; }); + halo_exchange_cstr( + [](auto& pdhg) -> rmm::device_uvector& { return pdhg.get_dual_solution(); }); + for_each_shard([](auto& shard) { shard.sub_pdlp->pdhg_solver_.spmvop_At_y(); }); } // -------- Solution gather (shards -> master) ---------------------------- diff --git a/cpp/src/pdlp/pdlp.cu b/cpp/src/pdlp/pdlp.cu index ecc2e35c20..91263828c1 100644 --- a/cpp/src/pdlp/pdlp.cu +++ b/cpp/src/pdlp/pdlp.cu @@ -3154,12 +3154,11 @@ optimization_problem_solution_t pdlp_solver_t::run_solver(co ++total_pdlp_iterations_; 
++internal_solver_iterations_; if (settings_.hyper_params.never_restart_to_average) { + restart_strategy_.increment_iteration_since_last_restart(); if (multi_gpu_engine) { multi_gpu_engine->for_each_shard([&](auto& shard) { shard.sub_pdlp->restart_strategy_.increment_iteration_since_last_restart(); }); - } else { - restart_strategy_.increment_iteration_since_last_restart(); } } } From 03d1259e668b2103c0547e7b3a0826eb9c18f311 Mon Sep 17 00:00:00 2001 From: Bulle Mostovoi Date: Thu, 28 May 2026 13:57:10 +0200 Subject: [PATCH 44/67] re-enabled graph. not working --- cpp/src/pdlp/distributed_pdlp/shard.cu | 15 +++++++++------ cpp/src/pdlp/pdlp.cu | 12 ++++++------ 2 files changed, 15 insertions(+), 12 deletions(-) diff --git a/cpp/src/pdlp/distributed_pdlp/shard.cu b/cpp/src/pdlp/distributed_pdlp/shard.cu index 45f9f7a880..93dc1403fc 100644 --- a/cpp/src/pdlp/distributed_pdlp/shard.cu +++ b/cpp/src/pdlp/distributed_pdlp/shard.cu @@ -153,13 +153,16 @@ pdlp_shard_t::pdlp_shard_t(int device_id, // At this point sub_pdlp.op_problem_scaled_ is an unscaled copy // of sub_problem and sub_pdlp.initial_scaling_strategy_ has // unit cumulative factors (sub-settings disable Ruiz / PC iters). - // NOTE: pass is_legacy_batch_mode=true to disable CUDA-graph capture inside - // sub_pdlp while debugging fake-mGPU divergence. The flag is a pure - // graph-capture toggle (ping_pong_graph_t / manual_cuda_graph_t) and does - // not change any algorithm semantics. Restore to false once the path is - // confirmed correct. + // Graph capture is enabled. The per-shard kernels invoked by the master's + // captured graph (compute_next_primal_dual_solution_reflected → for_each_shard + // → primal/dual_reflected_*_projection_transform on sub_pdlp's pdhg) are + // recorded into the same graph via the fork_to_shards / join_from_shards + // splicing on the master stream. 
Shards never own their own graph; their + // pdhg ping_pong_graph_t is only constructed because pdlp_solver_t requires + // it, but no graph.run() on a shard's pdhg is ever invoked from the mGPU + // path (compute_next_primal_dual_solution_reflected runs on master). sub_pdlp = std::make_unique>( - *sub_problem, settings, /*is_legacy_batch_mode=*/true); + *sub_problem, settings, /*is_legacy_batch_mode=*/false); sub_pdlp->pdhg_solver_.set_is_multi_gpu(true); diff --git a/cpp/src/pdlp/pdlp.cu b/cpp/src/pdlp/pdlp.cu index 91263828c1..fd78a0ac9d 100644 --- a/cpp/src/pdlp/pdlp.cu +++ b/cpp/src/pdlp/pdlp.cu @@ -382,12 +382,12 @@ pdlp_solver_t::pdlp_solver_t(problem_t& op_problem, // 1. Delegate to single-GPU ctor to bring up all the per-master state // (problem_ptr, op_problem_scaled_, pdhg_solver_, strategies, etc.). // - // NOTE: pass is_legacy_batch_mode=true to disable CUDA-graph capture on the - // master while we are debugging fake-mGPU divergence. The flag is a pure - // graph-capture toggle (see ping_pong_graph_t / manual_cuda_graph_t); it does - // not change any algorithm semantics. Restore to false once the path is - // confirmed correct. - : pdlp_solver_t(op_problem, settings, /*is_legacy_batch_mode=*/true) + // Graph capture is enabled here. The master's captured graph splices the + // shard streams via fork_to_shards/join_from_shards inside + // compute_next_primal_dual_solution_reflected (see pdhg.cu) so every + // per-shard kernel and NCCL collective is recorded into the same parent + // graph. + : pdlp_solver_t(op_problem, settings, /*is_legacy_batch_mode=*/false) { if (num_gpus == 1) { std::cout << "CAREFUL: num_gpus == 1, running dummy version" << std::endl; From cdc912b50d0d2549c6f9a43576efcc7b4a2edb3c Mon Sep 17 00:00:00 2001 From: Bulle Mostovoi Date: Thu, 28 May 2026 14:40:47 +0200 Subject: [PATCH 45/67] Cleaner sync semantics, ez ez ez, single mGPU gives exact same results as base PDLP on afiro and a28, with graphs !!!! 
EZ --- .../pdlp/distributed_pdlp/multi_gpu_engine.cu | 18 ++--- .../distributed_pdlp/multi_gpu_engine.hpp | 81 ++++++++++--------- cpp/src/pdlp/distributed_pdlp/shard.cu | 11 --- cpp/src/pdlp/pdhg.cu | 21 ++++- cpp/src/pdlp/pdlp.cu | 6 -- .../convergence_information.cu | 17 ++-- 6 files changed, 75 insertions(+), 79 deletions(-) diff --git a/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.cu b/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.cu index 796153fd79..98f33b6c88 100644 --- a/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.cu +++ b/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.cu @@ -82,17 +82,17 @@ multi_gpu_engine_t::multi_gpu_engine_t( sub_solver_settings)); } - // 4. Allocate fork/join events for cross-stream graph capture splicing. - // fork_event_ on the master device (whatever device is current when the - // engine is constructed -- pdlp_solver_t's mGPU ctor runs on master). - // join_events_[r] on shard r's device. event_handler_t uses the default - // cudaEventCreate (no flags), matching the rest of the codebase. - // Cleanup is automatic via event_handler_t's RAII destructor. 
- fork_event_ = std::make_unique(); - join_events_.reserve(nb_parts); + // Two different events + // capture_*_event_ are used inside graph capture + // ext_*_event_ are used when sync is needed outside of graph + graph_master_ready_event_ = std::make_unique(); + sync_master_ready_event_ = std::make_unique(); + graph_shard_ready_events_.reserve(nb_parts); + sync_shard_ready_events_.reserve(nb_parts); for (int r = 0; r < nb_parts; ++r) { raft::device_setter guard(devices[r]); - join_events_.emplace_back(std::make_unique()); + graph_shard_ready_events_.emplace_back(std::make_unique()); + sync_shard_ready_events_.emplace_back(std::make_unique()); } } diff --git a/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.hpp b/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.hpp index 637c342975..674c4c0ef2 100644 --- a/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.hpp +++ b/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.hpp @@ -456,56 +456,59 @@ struct multi_gpu_engine_t { // (owns device-affine resources: handle, NCCL comm, RMM buffers). std::vector>> shards; - // ===== Fork/join events for CUDA graph capture spanning shard streams ===== - // - // CUDA graph capture starts on the master pdhg stream (in pdhg_solver_t). - // The per-iteration work then dispatches kernels and NCCL collectives onto - // each shard's own stream. For these cross-stream operations to be - // recorded into the same captured graph (instead of escaping the capture - // and either invalidating it or being silently dropped), every shard - // stream must be "spliced" into the active capture via fork/join events. - // - // master_stream ──record(fork_event_)──┐ - // ├─> shard_0.stream (waits) ──┐ - // ├─> shard_1.stream (waits) ──┤ - // └─> shard_{n-1}.stream ──┘ - // (record join_events_[r]) - // master waits on each - // - // Pattern mirrors metis_tests/src/bench.cu. 
Events are reused across - // iterations (created once at engine construction) and cleaned up - // automatically by event_handler_t's RAII destructor. - // - // unique_ptr because event_handler_t is non-copyable and we need - // per-device construction (each join event must be created with its - // shard's device current). - std::unique_ptr fork_event_; - std::vector> join_events_; - - // fork_to_shards: record fork_event_ on `master_stream`, then make every - // shard stream wait on it. Inside a graph capture, this splices every - // shard stream into the same captured graph. - void fork_to_shards(rmm::cuda_stream_view master_stream) + // ===== Cross-stream synchronization events ===== + // two different events + // capture_*_event_ are used inside graph capture + // ext_*_event_ are used when sync is needed outside of graph + std::unique_ptr graph_master_ready_event_; + std::vector> graph_shard_ready_events_; + std::unique_ptr sync_master_ready_event_; + std::vector> sync_shard_ready_events_; + + // Forks master stream to shards, so that the captured graph can see the work on the shards + void graph_capture_fork_to_shards(rmm::cuda_stream_view master_stream) + { + graph_master_ready_event_->record(master_stream); + for (auto& s : shards) { + raft::device_setter guard(s->device_id); + graph_master_ready_event_->stream_wait(s->stream.view()); + } + } + + // Joins shards back to master stream for correct graph capture + void graph_capture_join_from_shards(rmm::cuda_stream_view master_stream) + { + const int nb = static_cast(shards.size()); + for (int r = 0; r < nb; ++r) { + raft::device_setter guard(shards[r]->device_id); + graph_shard_ready_events_[r]->record(shards[r]->stream.view()); + } + for (auto& e : graph_shard_ready_events_) { + e->stream_wait(master_stream); + } + } + + // Functionnaly same as graph_capture_fork_to_shards but on a different event to avoid race conditions + // Can be used as a way to sync shards with master stream + void 
sync_await_master(rmm::cuda_stream_view master_stream) { - fork_event_->record(master_stream); + sync_master_ready_event_->record(master_stream); for (auto& s : shards) { raft::device_setter guard(s->device_id); - fork_event_->stream_wait(s->stream.view()); + sync_master_ready_event_->stream_wait(s->stream.view()); } } - // join_from_shards: each shard records its join event on its own stream, - // then `master_stream` waits on every join event. Closes the captured - // sub-graph back into the master stream so cudaStreamEndCapture can - // produce a single graph spanning all streams. - void join_from_shards(rmm::cuda_stream_view master_stream) + // Same as sync_await_master + // Can be used as a way to sync master stream with shards + void sync_await_shards(rmm::cuda_stream_view master_stream) { const int nb = static_cast(shards.size()); for (int r = 0; r < nb; ++r) { raft::device_setter guard(shards[r]->device_id); - join_events_[r]->record(shards[r]->stream.view()); + sync_shard_ready_events_[r]->record(shards[r]->stream.view()); } - for (auto& e : join_events_) { + for (auto& e : sync_shard_ready_events_) { e->stream_wait(master_stream); } } diff --git a/cpp/src/pdlp/distributed_pdlp/shard.cu b/cpp/src/pdlp/distributed_pdlp/shard.cu index 93dc1403fc..3a49287362 100644 --- a/cpp/src/pdlp/distributed_pdlp/shard.cu +++ b/cpp/src/pdlp/distributed_pdlp/shard.cu @@ -150,17 +150,6 @@ pdlp_shard_t::pdlp_shard_t(int device_id, handle.sync_stream(stream_view); // ---- 5. Build sub_pdlp (single-GPU mode; multi_gpu flags cleared by caller). ---- - // At this point sub_pdlp.op_problem_scaled_ is an unscaled copy - // of sub_problem and sub_pdlp.initial_scaling_strategy_ has - // unit cumulative factors (sub-settings disable Ruiz / PC iters). - // Graph capture is enabled. 
The per-shard kernels invoked by the master's - // captured graph (compute_next_primal_dual_solution_reflected → for_each_shard - // → primal/dual_reflected_*_projection_transform on sub_pdlp's pdhg) are - // recorded into the same graph via the fork_to_shards / join_from_shards - // splicing on the master stream. Shards never own their own graph; their - // pdhg ping_pong_graph_t is only constructed because pdlp_solver_t requires - // it, but no graph.run() on a shard's pdhg is ever invoked from the mGPU - // path (compute_next_primal_dual_solution_reflected runs on master). sub_pdlp = std::make_unique>( *sub_problem, settings, /*is_legacy_batch_mode=*/false); diff --git a/cpp/src/pdlp/pdhg.cu b/cpp/src/pdlp/pdhg.cu index df183dc7e6..ec983fd01b 100644 --- a/cpp/src/pdlp/pdhg.cu +++ b/cpp/src/pdlp/pdhg.cu @@ -1245,6 +1245,8 @@ void pdhg_solver_t::compute_next_primal_dual_solution_reflected( using f_t2 = typename type_2::type; + if (mgpu_engine_ != nullptr) { mgpu_engine_->sync_await_shards(stream_view_); } + // Compute next primal solution reflected. if (should_major) { @@ -1255,7 +1257,9 @@ void pdhg_solver_t::compute_next_primal_dual_solution_reflected( // the capture or run outside the graph, leaving the captured graph // empty (or broken) -- which produces the cycling/stall behavior we // observed on larger problems. Mirrors metis_tests bench.cu fork/join. - if (mgpu_engine_ != nullptr) { mgpu_engine_->fork_to_shards(stream_view_); } + if (mgpu_engine_ != nullptr) { + mgpu_engine_->graph_capture_fork_to_shards(stream_view_); + } compute_At_y(); if (mgpu_engine_ != nullptr) { @@ -1358,12 +1362,16 @@ void pdhg_solver_t::compute_next_primal_dual_solution_reflected( // Multi-GPU: close the fork by joining every shard stream back into // the master stream so cudaStreamEndCapture sees a single graph // spanning all streams. 
- if (mgpu_engine_ != nullptr) { mgpu_engine_->join_from_shards(stream_view_); } + if (mgpu_engine_ != nullptr) { + mgpu_engine_->graph_capture_join_from_shards(stream_view_); + } }); } else { graph_all.run(should_major, [&]() { - if (mgpu_engine_ != nullptr) { mgpu_engine_->fork_to_shards(stream_view_); } + if (mgpu_engine_ != nullptr) { + mgpu_engine_->graph_capture_fork_to_shards(stream_view_); + } // Compute next primal compute_At_y(); @@ -1470,9 +1478,14 @@ void pdhg_solver_t::compute_next_primal_dual_solution_reflected( print("reflected_dual_", reflected_dual_); #endif - if (mgpu_engine_ != nullptr) { mgpu_engine_->join_from_shards(stream_view_); } + if (mgpu_engine_ != nullptr) { + mgpu_engine_->graph_capture_join_from_shards(stream_view_); + } }); } + + // sync to master stream after the graph is captured + if (mgpu_engine_ != nullptr) { mgpu_engine_->sync_await_master(stream_view_); } } template diff --git a/cpp/src/pdlp/pdlp.cu b/cpp/src/pdlp/pdlp.cu index fd78a0ac9d..8cf37cd8a1 100644 --- a/cpp/src/pdlp/pdlp.cu +++ b/cpp/src/pdlp/pdlp.cu @@ -381,12 +381,6 @@ pdlp_solver_t::pdlp_solver_t(problem_t& op_problem, int num_gpus) // 1. Delegate to single-GPU ctor to bring up all the per-master state // (problem_ptr, op_problem_scaled_, pdhg_solver_, strategies, etc.). - // - // Graph capture is enabled here. The master's captured graph splices the - // shard streams via fork_to_shards/join_from_shards inside - // compute_next_primal_dual_solution_reflected (see pdhg.cu) so every - // per-shard kernel and NCCL collective is recorded into the same parent - // graph. 
: pdlp_solver_t(op_problem, settings, /*is_legacy_batch_mode=*/false) { if (num_gpus == 1) { diff --git a/cpp/src/pdlp/termination_strategy/convergence_information.cu b/cpp/src/pdlp/termination_strategy/convergence_information.cu index 7877a64c88..da2340146a 100644 --- a/cpp/src/pdlp/termination_strategy/convergence_information.cu +++ b/cpp/src/pdlp/termination_strategy/convergence_information.cu @@ -460,9 +460,8 @@ void convergence_information_t::compute_convergence_information( }); // Get the reduced primal objective from the shard[0] (arbitrary) - // Race fix: master stream must wait for shard streams to finish the - // allreduce above before copying scalar data out of shard 0's buffer. - engine->join_from_shards(stream_view_); + // Sync shards with master stream to avoid race conditions + engine->sync_await_shards(stream_view_); { auto& s0 = *engine->shards[0]; raft::device_setter guard(s0.device_id); @@ -497,9 +496,8 @@ void convergence_information_t::compute_convergence_information( }, [](pdlp_shard_t& shard) -> i_t { return shard.rank_data.owned_cstr_size; }); - // Race fix: master stream must wait for shard streams to finish the // distributed L2 norm before copying scalar data out of shard 0. - engine->join_from_shards(stream_view_); + engine->sync_await_shards(stream_view_); auto& s0 = *engine->shards[0]; raft::device_setter guard(s0.device_id); raft::copy(l2_primal_residual_.data(), @@ -607,9 +605,8 @@ void convergence_information_t::compute_convergence_information( .data(); }); - // Race fix: master stream must wait for shard streams to finish the - // allreduce above before copying scalar data out of shard 0's buffer. 
- engine->join_from_shards(stream_view_); + // Sync shards with master stream to avoid race conditions + engine->sync_await_shards(stream_view_); { auto& s0 = *engine->shards[0]; raft::device_setter guard(s0.device_id); @@ -646,9 +643,9 @@ void convergence_information_t::compute_convergence_information( .l2_dual_residual_.data(); }, [](pdlp_shard_t& shard) -> i_t { return shard.rank_data.owned_var_size; }); - // Race fix: master stream must wait for shard streams to finish the + // distributed L2 norm before copying scalar data out of shard 0. - engine->join_from_shards(stream_view_); + engine->sync_await_shards(stream_view_); auto& s0 = *engine->shards[0]; raft::device_setter guard(s0.device_id); raft::copy(l2_dual_residual_.data(), From 04d22cf161e55aef6f37add6ff3d988dcd2c34de Mon Sep 17 00:00:00 2001 From: Bulle Mostovoi Date: Fri, 29 May 2026 05:48:01 -0700 Subject: [PATCH 46/67] pad local matrices for easier integration and allow mismatch of nnz between A and A_t for shards --- cpp/src/pdlp/cusparse_view.cu | 17 +++++++++++------ .../pdlp/distributed_pdlp/partition_loader.cu | 10 ++++++++++ 2 files changed, 21 insertions(+), 6 deletions(-) diff --git a/cpp/src/pdlp/cusparse_view.cu b/cpp/src/pdlp/cusparse_view.cu index 396fd27499..1e3638cdbd 100644 --- a/cpp/src/pdlp/cusparse_view.cu +++ b/cpp/src/pdlp/cusparse_view.cu @@ -498,14 +498,17 @@ cusparse_view_t::cusparse_view_t( // setup cusparse view A.create(op_problem_scaled.n_constraints, op_problem_scaled.n_variables, - op_problem_scaled.nnz, + static_cast(A_.size()), const_cast(op_problem_scaled.offsets.data()), const_cast(op_problem_scaled.variables.data()), const_cast(op_problem_scaled.coefficients.data())); + // A_T can have a different nnz than A in multi-GPU shards + // A is just what is needed to compute A_x for owned constraints + // A_T is just what is needed to compute A_T_y for owned variables A_T.create(op_problem_scaled.n_variables, op_problem_scaled.n_constraints, - op_problem_scaled.nnz, + 
static_cast(A_T_.size()), const_cast(A_T_offsets_.data()), const_cast(A_T_indices_.data()), const_cast(A_T_.data())); @@ -914,14 +917,14 @@ cusparse_view_t::cusparse_view_t( // setup cusparse view A.create(op_problem.n_constraints, op_problem.n_variables, - op_problem.nnz, + static_cast(A_.size()), const_cast(op_problem.offsets.data()), const_cast(op_problem.variables.data()), const_cast(op_problem.coefficients.data())); A_T.create(op_problem.n_variables, op_problem.n_constraints, - op_problem.nnz, + static_cast(A_T_.size()), const_cast(A_T_offsets_.data()), const_cast(A_T_indices_.data()), const_cast(A_T_.data())); @@ -1129,16 +1132,18 @@ cusparse_view_t::cusparse_view_t( // Copying them from the existing cuSparse view is a bad practice and creates segfault post // CUDA 12.4 Using the saved pointer of the existing cusparse view to make sure we capture the // correct pointer + // See comment in the PDHG cusparse_view_t ctor: bind the descriptor nnz to + // the actual value-buffer length so A and A_T stay symmetric and shard-safe. 
A.create(op_problem.n_constraints, op_problem.n_variables, - op_problem.nnz, + static_cast(A_.size()), const_cast(A_offsets_.data()), const_cast(A_indices_.data()), const_cast(A_.data())); A_T.create(op_problem.n_variables, op_problem.n_constraints, - op_problem.nnz, + static_cast(existing_cusparse_view.A_T_.size()), const_cast(existing_cusparse_view.A_T_offsets_.data()), const_cast(existing_cusparse_view.A_T_indices_.data()), const_cast(existing_cusparse_view.A_T_.data())); diff --git a/cpp/src/pdlp/distributed_pdlp/partition_loader.cu b/cpp/src/pdlp/distributed_pdlp/partition_loader.cu index b9bc71ae9e..5014607736 100644 --- a/cpp/src/pdlp/distributed_pdlp/partition_loader.cu +++ b/cpp/src/pdlp/distributed_pdlp/partition_loader.cu @@ -171,6 +171,16 @@ std::vector> partition_loader_t::create_rank_dat rd.total_var_size = rd.owned_var_size + needed_vars.size(); rd.total_cstr_size = rd.owned_cstr_size + needed_cstrs.size(); + + // Pad row-offset arrays so cuSPARSE sees the local matrices as + // (total_cstr x total_var) for A and (total_var x total_cstr) for A_T + const i_t a_last_nnz = + rd.h_A_row_offsets.empty() ? i_t{0} : rd.h_A_row_offsets.back(); + rd.h_A_row_offsets.resize(rd.total_cstr_size + 1, a_last_nnz); + + const i_t at_last_nnz = + rd.h_A_t_row_offsets.empty() ? i_t{0} : rd.h_A_t_row_offsets.back(); + rd.h_A_t_row_offsets.resize(rd.total_var_size + 1, at_last_nnz); } // 3. Generate local indices for contiguous [[self], [peer1], ..., [peer_k]] From b41df4583df78ec095efc351df8c10159d78ccdb Mon Sep 17 00:00:00 2001 From: Bulle Mostovoi Date: Fri, 29 May 2026 06:24:42 -0700 Subject: [PATCH 47/67] copy scalars to host rather than direct d2d. 
better --- cpp/src/pdlp/pdlp.cu | 21 ++++++++++++---- cpp/src/pdlp/pdlp.cuh | 5 ++++ .../restart_strategy/pdlp_restart_strategy.cu | 24 +++++++++++++++---- 3 files changed, 40 insertions(+), 10 deletions(-) diff --git a/cpp/src/pdlp/pdlp.cu b/cpp/src/pdlp/pdlp.cu index 8cf37cd8a1..241b9a5aeb 100644 --- a/cpp/src/pdlp/pdlp.cu +++ b/cpp/src/pdlp/pdlp.cu @@ -572,14 +572,25 @@ pdlp_solver_t::pdlp_solver_t(problem_t& op_problem, op_problem_scaled_.presolve_data.objective_scaling_factor, sub_pdlp_settings); + // Copy to host and then to shards. + // More robust than cudaDeviceEnablePeerAccess and cost-free-ish. + f_t h_step_size{}, h_primal_weight{}, h_best_primal_weight{}; + f_t h_primal_step_size{}, h_dual_step_size{}; + raft::copy(&h_step_size, step_size_.data(), 1, stream_view_); + raft::copy(&h_primal_weight, primal_weight_.data(), 1, stream_view_); + raft::copy(&h_best_primal_weight, best_primal_weight_.data(), 1, stream_view_); + raft::copy(&h_primal_step_size, primal_step_size_.data(), 1, stream_view_); + raft::copy(&h_dual_step_size, dual_step_size_.data(), 1, stream_view_); + handle_ptr_->sync_stream(stream_view_); + for (auto& shard : multi_gpu_engine->shards) { raft::device_setter guard(shard->device_id); auto& sub = *shard->sub_pdlp; - raft::copy(sub.step_size_.data(), step_size_.data(), 1, shard->stream); - raft::copy(sub.primal_weight_.data(), primal_weight_.data(), 1, shard->stream); - raft::copy(sub.best_primal_weight_.data(), best_primal_weight_.data(), 1, shard->stream); - raft::copy(sub.primal_step_size_.data(), primal_step_size_.data(), 1, shard->stream); - raft::copy(sub.dual_step_size_.data(), dual_step_size_.data(), 1, shard->stream); + raft::copy(sub.step_size_.data(), &h_step_size, 1, shard->stream); + raft::copy(sub.primal_weight_.data(), &h_primal_weight, 1, shard->stream); + raft::copy(sub.best_primal_weight_.data(), &h_best_primal_weight, 1, shard->stream); + raft::copy(sub.primal_step_size_.data(), &h_primal_step_size, 1, shard->stream); + 
raft::copy(sub.dual_step_size_.data(), &h_dual_step_size, 1, shard->stream); } // Wire the engine into the master pdhg_solver_. Shards' pdhg_solver_ keep diff --git a/cpp/src/pdlp/pdlp.cuh b/cpp/src/pdlp/pdlp.cuh index 17fb05080f..15ddfdaad3 100644 --- a/cpp/src/pdlp/pdlp.cuh +++ b/cpp/src/pdlp/pdlp.cuh @@ -117,6 +117,11 @@ class pdlp_solver_t { // call across all shards' pdhg_solver_t::*_transform methods. rmm::device_uvector& get_primal_step_size() { return primal_step_size_; } rmm::device_uvector& get_dual_step_size() { return dual_step_size_; } + // Multi-GPU restart broadcast needs to mirror master's primal_weight / + // best_primal_weight onto every shard after each cuPDLPx restart so that + // downstream shard-side restart machinery stays in sync with master. + rmm::device_uvector& get_primal_weight() { return primal_weight_; } + rmm::device_uvector& get_best_primal_weight() { return best_primal_weight_; } private: void print_termination_criteria(const timer_t& timer, bool is_average = false); diff --git a/cpp/src/pdlp/restart_strategy/pdlp_restart_strategy.cu b/cpp/src/pdlp/restart_strategy/pdlp_restart_strategy.cu index 00c5b16c8b..b7d49fc32f 100644 --- a/cpp/src/pdlp/restart_strategy/pdlp_restart_strategy.cu +++ b/cpp/src/pdlp/restart_strategy/pdlp_restart_strategy.cu @@ -1004,15 +1004,29 @@ void pdlp_restart_strategy_t::cupdlpx_restart( best_primal_weight.set_element_async(0, best_primal_weight_value, stream_view_); } - // Broadcast the primal and dual step sizes to all shards + // mGPU: Broadcast all primal-weight / step-size scalars updated by the cuPDLPx + // restart on the master to every shard so the restart-state on + // each shard stays in sync with master. 
if (auto* engine = pdhg_solver.get_mgpu_engine()) { RAFT_CUDA_TRY(cudaStreamSynchronize(stream_view_)); + + f_t h_primal_step_size{}, h_dual_step_size{}; + f_t h_primal_weight{}, h_best_primal_weight{}; + + raft::copy(&h_primal_step_size, primal_step_size.data(), 1, stream_view_); + raft::copy(&h_dual_step_size, dual_step_size.data(), 1, stream_view_); + raft::copy(&h_primal_weight, primal_weight.data(), 1, stream_view_); + raft::copy(&h_best_primal_weight, best_primal_weight.data(), 1, stream_view_); + RAFT_CUDA_TRY(cudaStreamSynchronize(stream_view_)); + engine->for_each_shard([&](auto& shard) { auto& sub = *shard.sub_pdlp; - raft::copy(sub.get_primal_step_size().data(), - primal_step_size.data(), 1, shard.stream.view()); - raft::copy(sub.get_dual_step_size().data(), - dual_step_size.data(), 1, shard.stream.view()); + raft::copy( + sub.get_primal_step_size().data(), &h_primal_step_size, 1, shard.stream.view()); + raft::copy(sub.get_dual_step_size().data(), &h_dual_step_size, 1, shard.stream.view()); + raft::copy(sub.get_primal_weight().data(), &h_primal_weight, 1, shard.stream.view()); + raft::copy( + sub.get_best_primal_weight().data(), &h_best_primal_weight, 1, shard.stream.view()); }); } // TODO later batch mode: remove if you have per climber restart From a1ffe1d791d203a903956769e89cde5452309d91 Mon Sep 17 00:00:00 2001 From: Bulle Mostovoi Date: Fri, 29 May 2026 06:29:39 -0700 Subject: [PATCH 48/67] force re-inject offset and variables to undo the sort, cheap and ugly but works --- cpp/src/pdlp/distributed_pdlp/shard.cu | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/cpp/src/pdlp/distributed_pdlp/shard.cu b/cpp/src/pdlp/distributed_pdlp/shard.cu index 3a49287362..356e10a03c 100644 --- a/cpp/src/pdlp/distributed_pdlp/shard.cu +++ b/cpp/src/pdlp/distributed_pdlp/shard.cu @@ -155,12 +155,26 @@ pdlp_shard_t::pdlp_shard_t(int device_id, sub_pdlp->pdhg_solver_.set_is_multi_gpu(true); - // Inject master-scaled buffers inside 
sub_pdlp + // Re-inject master-scaled buffers inside sub_pdlp. + // Need to also re-inject the offsets and variables arrays to revert + // the csrsort done by problem_t's constructor. auto& scaled = sub_pdlp->get_op_problem_scaled(); + raft::copy(scaled.offsets.data(), + rank_data.h_A_row_offsets.data(), + rank_data.h_A_row_offsets.size(), + stream_view); + raft::copy(scaled.variables.data(), + rank_data.h_A_col_indices.data(), + rank_data.h_A_col_indices.size(), + stream_view); raft::copy(scaled.coefficients.data(), rank_data.h_A_values_scaled.data(), rank_data.h_A_values_scaled.size(), stream_view); + // A_T side: all three arrays were already overridden together from + // rank_data on sub_problem (see step 4 above) and deep-copied into the + // scaled problem, so reverse_offsets / reverse_constraints already match + // h_A_t_values_scaled's order. Only the values need a SCALED swap-in. raft::copy(scaled.reverse_coefficients.data(), rank_data.h_A_t_values_scaled.data(), rank_data.h_A_t_values_scaled.size(), From c9394d9d00147ab740765ef02ba1ec6a77de2514 Mon Sep 17 00:00:00 2001 From: Bulle Mostovoi Date: Fri, 29 May 2026 06:48:38 -0700 Subject: [PATCH 49/67] few style changes, better args and prints --- cpp/cuopt_cli.cpp | 18 +++++++++-- .../cuopt/linear_programming/constants.h | 1 + .../pdlp/solver_settings.hpp | 3 ++ cpp/src/math_optimization/solver_settings.cu | 3 +- cpp/src/pdlp/pdlp.cu | 30 ++++++++++++------- cpp/src/pdlp/pdlp.cuh | 2 +- cpp/src/pdlp/solve.cu | 25 ++++++++++++---- 7 files changed, 60 insertions(+), 22 deletions(-) diff --git a/cpp/cuopt_cli.cpp b/cpp/cuopt_cli.cpp index 39aab47170..7c0a9111d9 100644 --- a/cpp/cuopt_cli.cpp +++ b/cpp/cuopt_cli.cpp @@ -426,10 +426,22 @@ int main(int argc, char* argv[]) std::vector memory_resources; if (memory_backend == cuopt::linear_programming::memory_backend_t::GPU) { - const int num_gpus = settings.get_parameter(CUOPT_NUM_GPUS); + // Distributed PDLP scales one shard per GPU and uses its own knob; 
everything else + // (concurrent, batch, MIP) uses num_gpus which is capped at 2. + // For distributed PDLP, -1 means "auto-detect": resolve to the visible device + // count so the RMM memory pools match what solve.cu will eventually dispatch. + const bool use_distributed_pdlp = settings.get_parameter(CUOPT_USE_DISTRIBUTED_PDLP); + int requested_gpus = + use_distributed_pdlp ? settings.get_parameter(CUOPT_DISTRIBUTED_PDLP_NUM_GPUS) + : settings.get_parameter(CUOPT_NUM_GPUS); + if (use_distributed_pdlp && requested_gpus == -1) { + requested_gpus = raft::device_setter::get_device_count(); + } + const int provisioned_gpus = + std::min(raft::device_setter::get_device_count(), requested_gpus); - memory_resources.reserve(std::min(raft::device_setter::get_device_count(), num_gpus)); - for (int i = 0; i < std::min(raft::device_setter::get_device_count(), num_gpus); ++i) { + memory_resources.reserve(provisioned_gpus); + for (int i = 0; i < provisioned_gpus; ++i) { RAFT_CUDA_TRY(cudaSetDevice(i)); memory_resources.emplace_back(); rmm::mr::set_per_device_resource(rmm::cuda_device_id{i}, memory_resources.back()); diff --git a/cpp/include/cuopt/linear_programming/constants.h b/cpp/include/cuopt/linear_programming/constants.h index 26ef3653e0..3346ab3565 100644 --- a/cpp/include/cuopt/linear_programming/constants.h +++ b/cpp/include/cuopt/linear_programming/constants.h @@ -83,6 +83,7 @@ #define CUOPT_SOLUTION_FILE "solution_file" #define CUOPT_NUM_CPU_THREADS "num_cpu_threads" #define CUOPT_NUM_GPUS "num_gpus" +#define CUOPT_DISTRIBUTED_PDLP_NUM_GPUS "distributed_pdlp_num_gpus" #define CUOPT_MULTI_GPU_PARTITION_FILE "multi_gpu_partition_file" #define CUOPT_USE_DISTRIBUTED_PDLP "use_distributed_pdlp" #define CUOPT_USER_PROBLEM_FILE "user_problem_file" diff --git a/cpp/include/cuopt/linear_programming/pdlp/solver_settings.hpp b/cpp/include/cuopt/linear_programming/pdlp/solver_settings.hpp index e8beef007d..efdbd5733c 100644 --- 
a/cpp/include/cuopt/linear_programming/pdlp/solver_settings.hpp +++ b/cpp/include/cuopt/linear_programming/pdlp/solver_settings.hpp @@ -307,6 +307,9 @@ class pdlp_solver_settings_t { presolver_t presolver{presolver_t::Default}; bool dual_postsolve{true}; int num_gpus{1}; + // Number of GPUs to use specifically for distributed PDLP (use_distributed_pdlp=true). + // -1 means auto-detect + int distributed_pdlp_num_gpus{-1}; std::string multi_gpu_partition_file{""}; // Set to true inside the shards bool is_distributed_sub_pdlp{false}; diff --git a/cpp/src/math_optimization/solver_settings.cu b/cpp/src/math_optimization/solver_settings.cu index 991b0d62c1..207e53f20d 100644 --- a/cpp/src/math_optimization/solver_settings.cu +++ b/cpp/src/math_optimization/solver_settings.cu @@ -138,8 +138,9 @@ solver_settings_t::solver_settings_t() : pdlp_settings(), mip_settings {CUOPT_MIP_IMPLIED_BOUND_CUTS, &mip_settings.implied_bound_cuts, -1, 1, -1}, {CUOPT_MIP_STRONG_CHVATAL_GOMORY_CUTS, &mip_settings.strong_chvatal_gomory_cuts, -1, 1, -1}, {CUOPT_MIP_REDUCED_COST_STRENGTHENING, &mip_settings.reduced_cost_strengthening, -1, std::numeric_limits::max(), -1}, - {CUOPT_NUM_GPUS, &pdlp_settings.num_gpus, 1, 576, 1}, + {CUOPT_NUM_GPUS, &pdlp_settings.num_gpus, 1, 2, 1}, {CUOPT_NUM_GPUS, &mip_settings.num_gpus, 1, 2, 1}, + {CUOPT_DISTRIBUTED_PDLP_NUM_GPUS, &pdlp_settings.distributed_pdlp_num_gpus, -1, 576, -1}, {CUOPT_MIP_BATCH_PDLP_STRONG_BRANCHING, &mip_settings.mip_batch_pdlp_strong_branching, 0, 2, 0}, {CUOPT_MIP_BATCH_PDLP_RELIABILITY_BRANCHING, &mip_settings.mip_batch_pdlp_reliability_branching, 0, 2, 0}, {CUOPT_MIP_STRONG_BRANCHING_SIMPLEX_ITERATION_LIMIT, &mip_settings.strong_branching_simplex_iteration_limit, -1,std::numeric_limits::max(), -1}, diff --git a/cpp/src/pdlp/pdlp.cu b/cpp/src/pdlp/pdlp.cu index 241b9a5aeb..a061a2d468 100644 --- a/cpp/src/pdlp/pdlp.cu +++ b/cpp/src/pdlp/pdlp.cu @@ -378,23 +378,29 @@ pdlp_solver_t::pdlp_solver_t(problem_t& op_problem, template 
pdlp_solver_t::pdlp_solver_t(problem_t& op_problem, pdlp_solver_settings_t const& settings, - int num_gpus) + int distributed_pdlp_num_gpus) // 1. Delegate to single-GPU ctor to bring up all the per-master state // (problem_ptr, op_problem_scaled_, pdhg_solver_, strategies, etc.). : pdlp_solver_t(op_problem, settings, /*is_legacy_batch_mode=*/false) { - if (num_gpus == 1) { - std::cout << "CAREFUL: num_gpus == 1, running dummy version" << std::endl; + CUOPT_LOG_INFO("Solving with distributed PDLP on %d GPU", + distributed_pdlp_num_gpus); + if (distributed_pdlp_num_gpus == 1) { + std::cout << "CAREFUL !!: distributed_pdlp_num_gpus == 1, running single-shard dummy path, " + "if you want to set the number of GPUs to use for distributed PDLP, set the " + "parameter --distributed-pdlp-num-gpus" + << std::endl; } - cuopt_expects(num_gpus == settings.num_gpus /*&& settings.num_gpus > 1*/, + cuopt_expects(distributed_pdlp_num_gpus == settings.distributed_pdlp_num_gpus, error_type_t::ValidationError, - "This constructor should only be used for distributed PDLP (num_gpus > 1)"); + "This constructor's distributed_pdlp_num_gpus argument must match " + "settings.distributed_pdlp_num_gpus"); // Distributed PDLP is currently double-only if constexpr (!std::is_same_v) { cuopt_expects(false, error_type_t::ValidationError, - "Distributed PDLP (num_gpus > 1) currently requires double precision"); + "Distributed PDLP currently requires double precision"); return; } else { // 2. Load or compute partition @@ -405,20 +411,21 @@ pdlp_solver_t::pdlp_solver_t(problem_t& op_problem, validate_partition(parts, op_problem_scaled_.n_constraints, op_problem_scaled_.n_variables, - num_gpus, + distributed_pdlp_num_gpus, "partition file"); } else { - if (num_gpus == 1) { + if (distributed_pdlp_num_gpus == 1) { // Single-part dummy run: useful for exercising the mGPU code paths on a // single physical GPU without a real partition file. 
- std::cout << "CAREFUL: num_gpus == 1, running dummy version (single part covering " + std::cout << "CAREFUL: distributed_pdlp_num_gpus == 1, running dummy version (single " + "part covering " << op_problem_scaled_.n_constraints << " cstrs + " << op_problem_scaled_.n_variables << " vars)" << std::endl; } partitioner_input_t partition_input; partition_input.nb_cstr = op_problem_scaled_.n_constraints; partition_input.nb_vars = op_problem_scaled_.n_variables; - partition_input.nb_parts = num_gpus; + partition_input.nb_parts = distributed_pdlp_num_gpus; // Dummy partitioner ignores A / A_t for now; future METIS partitioners will // fill these CSR views before calling partition(). auto partitioner = make_partitioner(partitioner_kind_t::Dummy); @@ -538,7 +545,7 @@ pdlp_solver_t::pdlp_solver_t(problem_t& op_problem, h_A_t_col_indices, h_A_t_values, h_A_t_values_scaled, - settings.num_gpus, + settings.distributed_pdlp_num_gpus, n_cstr, n_vars, nnz); @@ -546,6 +553,7 @@ pdlp_solver_t::pdlp_solver_t(problem_t& op_problem, // 7. 
Build the per-shard PDLP settings: pdlp_solver_settings_t sub_pdlp_settings = settings; sub_pdlp_settings.num_gpus = 1; + sub_pdlp_settings.distributed_pdlp_num_gpus = 1; sub_pdlp_settings.multi_gpu_partition_file = ""; sub_pdlp_settings.is_distributed_sub_pdlp = true; sub_pdlp_settings.hyper_params.default_l_inf_ruiz_iterations = 0; diff --git a/cpp/src/pdlp/pdlp.cuh b/cpp/src/pdlp/pdlp.cuh index 15ddfdaad3..14651eab3f 100644 --- a/cpp/src/pdlp/pdlp.cuh +++ b/cpp/src/pdlp/pdlp.cuh @@ -66,7 +66,7 @@ class pdlp_solver_t { // Distributed Solver Constructor pdlp_solver_t(problem_t& op_problem, pdlp_solver_settings_t const& settings, - int num_gpus); + int distributed_pdlp_num_gpus); optimization_problem_solution_t run_solver(const timer_t& timer); diff --git a/cpp/src/pdlp/solve.cu b/cpp/src/pdlp/solve.cu index e401ab35b6..338083f03a 100644 --- a/cpp/src/pdlp/solve.cu +++ b/cpp/src/pdlp/solve.cu @@ -771,16 +771,29 @@ static optimization_problem_solution_t run_pdlp_solver( } #endif if (settings.hyper_params.use_distributed_pdlp) { - /* - cuopt_expects(settings.num_gpus > 1, + // Resolve the -1 "auto-detect" sentinel to the actual visible-device count on + // the master process + pdlp_solver_settings_t settings_resolved = settings; + if (settings_resolved.distributed_pdlp_num_gpus == -1) { + settings_resolved.distributed_pdlp_num_gpus = raft::device_setter::get_device_count(); + CUOPT_LOG_INFO("distributed_pdlp_num_gpus == -1: auto-detected %d visible CUDA device", + settings_resolved.distributed_pdlp_num_gpus); + } + cuopt_expects(settings_resolved.distributed_pdlp_num_gpus >= 1, error_type_t::ValidationError, - "use_distributed_pdlp requires settings.num_gpus > 1"); */ - if (settings.num_gpus == 1) {std::cout << "CAREFUL: use_distributed_pdlp requires settings.num_gpus > 1" << std::endl;} + "distributed_pdlp_num_gpus must be >= 1 or -1 (auto-detect)"); + if (settings_resolved.distributed_pdlp_num_gpus == 1) { + std::cout + << "CAREFUL: use_distributed_pdlp with 
distributed_pdlp_num_gpus == 1 runs the " + "single-shard dummy path" + << std::endl; + } cuopt_expects(!is_batch_mode, error_type_t::ValidationError, "Distributed PDLP does not support batch mode"); - // Multi-GPU ctor; dispatched by 3rd-arg TYPE (int num_gpus, not bool batch). - detail::pdlp_solver_t solver(problem, settings, settings.num_gpus); + // Multi-GPU ctor; dispatched by 3rd-arg TYPE (int, not bool batch). + detail::pdlp_solver_t solver( + problem, settings_resolved, settings_resolved.distributed_pdlp_num_gpus); return solver.run_solver(timer); } detail::pdlp_solver_t solver(problem, settings, is_batch_mode); From 4faa7df79320fc5588796e6828642bce523ea726 Mon Sep 17 00:00:00 2001 From: Bulle Mostovoi Date: Fri, 29 May 2026 07:20:12 -0700 Subject: [PATCH 50/67] added disable_graph flag, afiro gets solved on non-graph just as if it was single --- .../cuopt/linear_programming/constants.h | 1 + .../pdlp/pdlp_hyper_params.cuh | 3 +++ cpp/src/math_optimization/solver_settings.cu | 1 + cpp/src/pdlp/solve.cu | 3 +++ cpp/src/pdlp/utilities/ping_pong_graph.cuh | 17 ++++++++++++++++- 5 files changed, 24 insertions(+), 1 deletion(-) diff --git a/cpp/include/cuopt/linear_programming/constants.h b/cpp/include/cuopt/linear_programming/constants.h index 3346ab3565..e695bb21d3 100644 --- a/cpp/include/cuopt/linear_programming/constants.h +++ b/cpp/include/cuopt/linear_programming/constants.h @@ -86,6 +86,7 @@ #define CUOPT_DISTRIBUTED_PDLP_NUM_GPUS "distributed_pdlp_num_gpus" #define CUOPT_MULTI_GPU_PARTITION_FILE "multi_gpu_partition_file" #define CUOPT_USE_DISTRIBUTED_PDLP "use_distributed_pdlp" +#define CUOPT_PDLP_DISABLE_GRAPH "pdlp_disable_graph" #define CUOPT_USER_PROBLEM_FILE "user_problem_file" #define CUOPT_PRESOLVE_FILE "presolve_file" #define CUOPT_RANDOM_SEED "random_seed" diff --git a/cpp/include/cuopt/linear_programming/pdlp/pdlp_hyper_params.cuh b/cpp/include/cuopt/linear_programming/pdlp/pdlp_hyper_params.cuh index 962f06ee4a..c68dc86d6a 100644 --- 
a/cpp/include/cuopt/linear_programming/pdlp/pdlp_hyper_params.cuh +++ b/cpp/include/cuopt/linear_programming/pdlp/pdlp_hyper_params.cuh @@ -48,6 +48,9 @@ struct pdlp_hyper_params_t { bool use_reflected_primal_dual = true; bool use_fixed_point_error = true; bool use_distributed_pdlp = false; + // Debug/diagnostic knob: when true, PDLP bypasses CUDA-graph capture in + // ping_pong_graph_t and executes each iteration eagerly + bool pdlp_disable_graph = false; double reflection_coefficient = 1.0; double restart_k_p = 0.99; double restart_k_i = 0.01; diff --git a/cpp/src/math_optimization/solver_settings.cu b/cpp/src/math_optimization/solver_settings.cu index 207e53f20d..629c8a8428 100644 --- a/cpp/src/math_optimization/solver_settings.cu +++ b/cpp/src/math_optimization/solver_settings.cu @@ -179,6 +179,7 @@ solver_settings_t::solver_settings_t() : pdlp_settings(), mip_settings {CUOPT_BARRIER_ITERATIVE_REFINEMENT, &pdlp_settings.barrier_iterative_refinement, true}, {CUOPT_MIP_PROBING, &mip_settings.probing, true}, {CUOPT_USE_DISTRIBUTED_PDLP, &pdlp_settings.hyper_params.use_distributed_pdlp, false}, + {CUOPT_PDLP_DISABLE_GRAPH, &pdlp_settings.hyper_params.pdlp_disable_graph, false}, }; // String parameters string_parameters = { diff --git a/cpp/src/pdlp/solve.cu b/cpp/src/pdlp/solve.cu index 338083f03a..70c488e3f3 100644 --- a/cpp/src/pdlp/solve.cu +++ b/cpp/src/pdlp/solve.cu @@ -756,6 +756,9 @@ static optimization_problem_solution_t run_pdlp_solver( const timer_t& timer, bool is_batch_mode) { + detail::pdlp_graph_disabled_flag().store(settings.hyper_params.pdlp_disable_graph, + std::memory_order_relaxed); + if (problem.n_constraints == 0) { CUOPT_LOG_CONDITIONAL_INFO( !settings.inside_mip, diff --git a/cpp/src/pdlp/utilities/ping_pong_graph.cuh b/cpp/src/pdlp/utilities/ping_pong_graph.cuh index dbc8fe5828..6b527f81b2 100644 --- a/cpp/src/pdlp/utilities/ping_pong_graph.cuh +++ b/cpp/src/pdlp/utilities/ping_pong_graph.cuh @@ -12,10 +12,25 @@ #include +#include #include 
namespace cuopt::linear_programming::detail { +// Debug/diagnostic toggle: when set, ping_pong_graph_t::run() bypasses CUDA +// graph capture and executes its work eagerly on every iteration. Useful for +// for debugging +inline std::atomic& pdlp_graph_disabled_flag() +{ + static std::atomic s_flag{false}; + return s_flag; +} + +inline bool pdlp_graph_disabled() +{ + return pdlp_graph_disabled_flag().load(std::memory_order_relaxed); +} + // Two-slot CUDA-graph cache for PDLP. PDLP swaps pointers (rather than // copying vectors) at the end of adaptive pdhg step, so the captured graph // topology alternates between two layouts depending on iteration parity. @@ -49,7 +64,7 @@ class ping_pong_graph_t { #ifdef CUPDLP_DEBUG_MODE work(); #else - if (is_legacy_batch_mode_) { + if (is_legacy_batch_mode_ || pdlp_graph_disabled()) { work(); return; } From 61acddb5cd0c0df8f09086d87264759e66ac94dd Mon Sep 17 00:00:00 2001 From: Bulle Mostovoi Date: Sun, 31 May 2026 10:02:48 -0700 Subject: [PATCH 51/67] makes reductions in compute interraction adn movement use owned_size rather than total size hehehehe --- cpp/src/pdlp/pdlp.cu | 5 ++++- .../adaptive_step_size_strategy.cu | 18 ++++++++++++++---- .../adaptive_step_size_strategy.hpp | 6 +++++- 3 files changed, 23 insertions(+), 6 deletions(-) diff --git a/cpp/src/pdlp/pdlp.cu b/cpp/src/pdlp/pdlp.cu index a061a2d468..3b77a1cf47 100644 --- a/cpp/src/pdlp/pdlp.cu +++ b/cpp/src/pdlp/pdlp.cu @@ -2327,10 +2327,13 @@ void pdlp_solver_t::compute_fixed_error(std::vector& has_restarte cusparseDnVecSetValues(sub_cv.potential_next_dual_solution, (void*)sub_pdlp.pdhg_solver_.get_reflected_dual().data())); + // Ensure norm is on owned size sub_pdlp.step_size_strategy_.compute_interaction_and_movement( sub_pdlp.pdhg_solver_.get_primal_tmp_resource(), sub_cv, - sub_pdlp.pdhg_solver_.get_saddle_point_state()); + sub_pdlp.pdhg_solver_.get_saddle_point_state(), + shard->rank_data.owned_var_size, + shard->rank_data.owned_cstr_size); 
RAFT_CUSPARSE_TRY(cusparseDnVecSetValues( sub_cv.potential_next_dual_solution, diff --git a/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.cu b/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.cu index 2cb843ae86..530a426117 100644 --- a/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.cu +++ b/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.cu @@ -364,8 +364,18 @@ template void adaptive_step_size_strategy_t::compute_interaction_and_movement( rmm::device_uvector& tmp_primal, cusparse_view_t& cusparse_view, - saddle_point_state_t& current_saddle_point_state) + saddle_point_state_t& current_saddle_point_state, + i_t owned_primal_size, + i_t owned_cstr_size) { + // mGPU needs to know owned size to restrict the reductions to the owned prefix + const i_t reduce_primal_size = (owned_primal_size >= 0) + ? owned_primal_size + : current_saddle_point_state.get_primal_size(); + const i_t reduce_dual_size = (owned_cstr_size >= 0) + ? owned_cstr_size + : current_saddle_point_state.get_dual_size(); + // QP would need this: // if iszero(problem.objective_matrix) // primal_objective_interaction = 0.0 @@ -444,7 +454,7 @@ void adaptive_step_size_strategy_t::compute_interaction_and_movement( // compute interaction (x'-x) . 
(A(y'-y)) RAFT_CUBLAS_TRY( raft::linalg::detail::cublasdot(handle_ptr_->get_cublas_handle(), - current_saddle_point_state.get_primal_size(), + reduce_primal_size, tmp_primal.data(), primal_stride, current_saddle_point_state.get_delta_primal().data(), @@ -462,7 +472,7 @@ void adaptive_step_size_strategy_t::compute_interaction_and_movement( // norm(delta_dual) ^ 2; RAFT_CUBLAS_TRY( raft::linalg::detail::cublasdot(handle_ptr_->get_cublas_handle(), - current_saddle_point_state.get_primal_size(), + reduce_primal_size, current_saddle_point_state.get_delta_primal().data(), primal_stride, current_saddle_point_state.get_delta_primal().data(), @@ -472,7 +482,7 @@ void adaptive_step_size_strategy_t::compute_interaction_and_movement( RAFT_CUBLAS_TRY( raft::linalg::detail::cublasdot(handle_ptr_->get_cublas_handle(), - current_saddle_point_state.get_dual_size(), + reduce_dual_size, current_saddle_point_state.get_delta_dual().data(), dual_stride, current_saddle_point_state.get_delta_dual().data(), diff --git a/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.hpp b/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.hpp index 896c6fa24e..238735e8ff 100644 --- a/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.hpp +++ b/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.hpp @@ -88,9 +88,13 @@ class adaptive_step_size_strategy_t { rmm::device_uvector& get_norm_squared_delta_primal(); rmm::device_uvector& get_norm_squared_delta_dual(); + // owned_primal_size / owned_cstr_size are mGPU overrides. 
+ // mGPU needs to know owned size to restrict the reductions to the owned prefix void compute_interaction_and_movement(rmm::device_uvector& tmp_primal, cusparse_view_t& cusparse_view, - saddle_point_state_t& current_saddle_point_state); + saddle_point_state_t& current_saddle_point_state, + i_t owned_primal_size = -1, + i_t owned_cstr_size = -1); void swap_context(const thrust::universal_host_pinned_vector>& swap_pairs); void resize_context(i_t new_size); From b8b59bfce89a26652d809dc2b9966d20febc28ef Mon Sep 17 00:00:00 2001 From: Bulle Mostovoi Date: Sun, 31 May 2026 11:49:27 -0700 Subject: [PATCH 52/67] added emtis partitionner, still need it in the env. it is FAST. but we lose a lot of time on actal partitionning and data movements. Everything seems to be working --- cpp/CMakeLists.txt | 37 +++++ cpp/src/pdlp/CMakeLists.txt | 1 + .../distributed_pdlp/metis_partitioner.cu | 142 ++++++++++++++++++ cpp/src/pdlp/distributed_pdlp/partitioner.cu | 3 + cpp/src/pdlp/distributed_pdlp/partitioner.hpp | 2 +- cpp/src/pdlp/utilities/mgpu_trace.cuh | 52 +++++++ 6 files changed, 236 insertions(+), 1 deletion(-) create mode 100644 cpp/src/pdlp/distributed_pdlp/metis_partitioner.cu create mode 100644 cpp/src/pdlp/utilities/mgpu_trace.cuh diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index da7d4a4d35..d27072bcf9 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -342,6 +342,42 @@ set_target_properties(nccl_external PROPERTIES ) message(STATUS "Using NCCL: ${NCCL_LIBRARY}") +# ################################################################################################## +# - METIS (graph partitioning for distributed PDLP) ----------------------------------------------- +# Found by searching CONDA_PREFIX first, then CUOPT_METIS_ROOT (cmake var or env) +# if the user wants to pull METIS from a different conda env / system path. 
+set(METIS_HINT_PREFIXES "") +if (DEFINED ENV{CONDA_PREFIX} AND NOT "$ENV{CONDA_PREFIX}" STREQUAL "") + list(APPEND METIS_HINT_PREFIXES "$ENV{CONDA_PREFIX}") +endif () +if (DEFINED CUOPT_METIS_ROOT AND NOT "${CUOPT_METIS_ROOT}" STREQUAL "") + list(APPEND METIS_HINT_PREFIXES "${CUOPT_METIS_ROOT}") +endif () +if (DEFINED ENV{CUOPT_METIS_ROOT} AND NOT "$ENV{CUOPT_METIS_ROOT}" STREQUAL "") + list(APPEND METIS_HINT_PREFIXES "$ENV{CUOPT_METIS_ROOT}") +endif () +find_path(METIS_INCLUDE_DIR + NAMES metis.h + HINTS ${METIS_HINT_PREFIXES} + PATH_SUFFIXES include +) +find_library(METIS_LIBRARY + NAMES metis libmetis + HINTS ${METIS_HINT_PREFIXES} + PATH_SUFFIXES lib lib64 +) +if (NOT METIS_INCLUDE_DIR OR NOT METIS_LIBRARY) + message(FATAL_ERROR "METIS not found. Looked in: ${METIS_HINT_PREFIXES}. " + "Install it via 'conda install -c conda-forge metis' in the active env, " + "or set CUOPT_METIS_ROOT to a prefix containing include/metis.h and lib/libmetis.{so,a}.") +endif () +add_library(metis_external UNKNOWN IMPORTED GLOBAL) +set_target_properties(metis_external PROPERTIES + IMPORTED_LOCATION "${METIS_LIBRARY}" + INTERFACE_INCLUDE_DIRECTORIES "${METIS_INCLUDE_DIR}" +) +message(STATUS "Using METIS: ${METIS_LIBRARY}") + # ################################################################################################## # - gRPC and Protobuf setup ----------------------------------------------------------------------- @@ -605,6 +641,7 @@ target_link_libraries(cuopt PRIVATE ${CUOPT_PRIVATE_CUDA_LIBS} nccl_external + metis_external $<$:protobuf::libprotobuf> $<$:gRPC::grpc++> ) diff --git a/cpp/src/pdlp/CMakeLists.txt b/cpp/src/pdlp/CMakeLists.txt index a6ef14e3ff..863cf20962 100644 --- a/cpp/src/pdlp/CMakeLists.txt +++ b/cpp/src/pdlp/CMakeLists.txt @@ -33,6 +33,7 @@ set(LP_CORE_FILES ${CMAKE_CURRENT_SOURCE_DIR}/distributed_pdlp/multi_gpu_engine.cu ${CMAKE_CURRENT_SOURCE_DIR}/distributed_pdlp/partition_loader.cu ${CMAKE_CURRENT_SOURCE_DIR}/distributed_pdlp/partitioner.cu + 
${CMAKE_CURRENT_SOURCE_DIR}/distributed_pdlp/metis_partitioner.cu ) # C and Python adapter files diff --git a/cpp/src/pdlp/distributed_pdlp/metis_partitioner.cu b/cpp/src/pdlp/distributed_pdlp/metis_partitioner.cu new file mode 100644 index 0000000000..6ed80b0047 --- /dev/null +++ b/cpp/src/pdlp/distributed_pdlp/metis_partitioner.cu @@ -0,0 +1,142 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include +#include + +#include + +#include + +#include + +#include +#include +#include + +namespace cuopt::linear_programming::detail { + +// Builds the bipartite constraint/variable graph induced by A and runs +// METIS_PartGraphKway to assign each of the (nb_cstr + nb_vars) nodes to a +// part in [0, nb_parts). Layout matches metis_tests: +// * nodes [0, nb_cstr) : constraint nodes +// * nodes [nb_cstr, nb_cstr+nb_vars): variable nodes +// * undirected edges from each A nonzero (one half via A, one via A_t) +// The output is consumed by partition_loader_t::create_rank_data_from_parts. 
+template +std::vector metis_partitioner_t::partition( + partitioner_input_t const& input) const +{ + cuopt_expects(input.nb_parts > 0, + error_type_t::ValidationError, + "metis_partitioner: nb_parts must be positive"); + cuopt_expects(input.nb_cstr >= 0 && input.nb_vars >= 0, + error_type_t::ValidationError, + "metis_partitioner: invalid problem dimensions"); + + cuopt_expects(input.A.row_offsets != nullptr && input.A.col_indices != nullptr, + error_type_t::ValidationError, + "metis_partitioner: A.row_offsets and A.col_indices are required"); + cuopt_expects(input.A_t.row_offsets != nullptr && input.A_t.col_indices != nullptr, + error_type_t::ValidationError, + "metis_partitioner: A_t.row_offsets and A_t.col_indices are required"); + + auto const& A_offsets = *input.A.row_offsets; + auto const& A_cols = *input.A.col_indices; + auto const& A_t_offsets = *input.A_t.row_offsets; + auto const& A_t_cols = *input.A_t.col_indices; + + cuopt_expects(static_cast(A_offsets.size()) == input.nb_cstr + 1, + error_type_t::ValidationError, + "metis_partitioner: A.row_offsets size mismatch (expected nb_cstr+1)"); + cuopt_expects(static_cast(A_t_offsets.size()) == input.nb_vars + 1, + error_type_t::ValidationError, + "metis_partitioner: A_t.row_offsets size mismatch (expected nb_vars+1)"); + cuopt_expects(A_cols.size() == A_t_cols.size(), + error_type_t::ValidationError, + "metis_partitioner: A and A_t nnz mismatch"); + + const i_t nb_cstr = input.nb_cstr; + const i_t nb_vars = input.nb_vars; + const i_t nnz = static_cast(A_cols.size()); + const i_t nvtx = nb_cstr + nb_vars; + + // Bipartite CSR. Same construction as metis_tests/src/main.cpp: + // xadj has length nvtx + 1 + // adjncy has length 2 * nnz (each A nonzero contributes one half-edge + // from cstr side via A and one half-edge from var side via A_t) + std::vector xadj(nvtx + 1); + std::vector adjncy(2 * static_cast(nnz)); + + // cstr-side row offsets: A_offsets[0..nb_cstr] (no shift). 
+ for (i_t i = 0; i <= nb_cstr; ++i) { xadj[i] = static_cast(A_offsets[i]); } + // var-side row offsets: A_t_offsets[0..nb_vars], shifted by +nnz so that + // they index into the second half of adjncy. + for (i_t i = 0; i <= nb_vars; ++i) { + xadj[nb_cstr + i] = static_cast(A_t_offsets[i]) + static_cast(nnz); + } + + // cstr-side neighbours: A_cols[i] shifted by +nb_cstr to index into the + // variable node block. + for (i_t k = 0; k < nnz; ++k) { + adjncy[k] = static_cast(A_cols[k]) + static_cast(nb_cstr); + } + // var-side neighbours: A_t_cols[i] already in [0, nb_cstr). + for (i_t k = 0; k < nnz; ++k) { + adjncy[nnz + k] = static_cast(A_t_cols[k]); + } + + idx_t metis_options[METIS_NOPTIONS]; + METIS_SetDefaultOptions(metis_options); + metis_options[METIS_OPTION_OBJTYPE] = METIS_OBJTYPE_CUT; + + idx_t metis_nvtx = static_cast(nvtx); + idx_t ncon = 1; + idx_t nparts = static_cast(input.nb_parts); + idx_t objval = 0; + std::vector metis_parts(nvtx); + + auto t0 = std::chrono::high_resolution_clock::now(); + const int status = METIS_PartGraphKway(&metis_nvtx, + &ncon, + xadj.data(), + adjncy.data(), + /*vwgt=*/nullptr, + /*vsize=*/nullptr, + /*adjwgt=*/nullptr, + &nparts, + /*tpwgts=*/nullptr, + /*ubvec=*/nullptr, + metis_options, + &objval, + metis_parts.data()); + auto t1 = std::chrono::high_resolution_clock::now(); + const double dt = std::chrono::duration(t1 - t0).count(); + cuopt_expects(status == METIS_OK, + error_type_t::RuntimeError, + "METIS_PartGraphKway failed (status=%d)", + status); + CUOPT_LOG_INFO( + "METIS partitioned bipartite graph: nvtx=%d nnz=%d nb_parts=%d edge_cut=%lld in %.3fs", + static_cast(nvtx), + static_cast(nnz), + static_cast(input.nb_parts), + static_cast(objval), + dt); + + std::vector parts(static_cast(nvtx)); + for (i_t i = 0; i < nvtx; ++i) { parts[i] = static_cast(metis_parts[i]); } + + validate_partition(parts, + static_cast(nb_cstr), + static_cast(nb_vars), + static_cast(input.nb_parts), + "metis_partitioner"); + return parts; 
+} + +template class metis_partitioner_t; + +} // namespace cuopt::linear_programming::detail diff --git a/cpp/src/pdlp/distributed_pdlp/partitioner.cu b/cpp/src/pdlp/distributed_pdlp/partitioner.cu index bdbfcacf06..4b809986ce 100644 --- a/cpp/src/pdlp/distributed_pdlp/partitioner.cu +++ b/cpp/src/pdlp/distributed_pdlp/partitioner.cu @@ -3,6 +3,7 @@ * SPDX-License-Identifier: Apache-2.0 */ +#include #include #include @@ -76,6 +77,8 @@ std::unique_ptr> make_partitioner(partitioner_kind_t kin switch (kind) { case partitioner_kind_t::Dummy: return std::make_unique>(); + case partitioner_kind_t::Metis: + return std::make_unique>(); } cuopt_expects(false, error_type_t::RuntimeError, "make_partitioner: unsupported partitioner kind"); return nullptr; diff --git a/cpp/src/pdlp/distributed_pdlp/partitioner.hpp b/cpp/src/pdlp/distributed_pdlp/partitioner.hpp index ee5798fd0b..82650ad805 100644 --- a/cpp/src/pdlp/distributed_pdlp/partitioner.hpp +++ b/cpp/src/pdlp/distributed_pdlp/partitioner.hpp @@ -36,7 +36,7 @@ struct partitioner_input_t { csr_host_view_t A_t{}; }; -enum class partitioner_kind_t { Dummy /*, Metis */ }; +enum class partitioner_kind_t { Dummy, Metis }; template class partitioner_i { diff --git a/cpp/src/pdlp/utilities/mgpu_trace.cuh b/cpp/src/pdlp/utilities/mgpu_trace.cuh new file mode 100644 index 0000000000..06a848b18e --- /dev/null +++ b/cpp/src/pdlp/utilities/mgpu_trace.cuh @@ -0,0 +1,52 @@ +/* clang-format off */ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + */ +/* clang-format on */ +#pragma once + +// Lightweight env-gated tracing for multi-GPU PDLP diagnosis. +// +// Enable by setting CUOPT_MGPU_TRACE=1 in the environment. +// All prints go to stderr (line-buffered + explicit flush) so they survive +// a CUDA hang and interleave with cuOpt's normal output. 
+// +// Usage: +// MGPU_TRACE("entering compute_At_y"); +// MGPU_TRACE_FMT("shard %d nnz=%lld", r, (long long)nnz); +// +// The guard reads the env var once on first use (thread-safe via static +// initialization) and the cost when disabled is a single load + branch. + +#include +#include + +namespace cuopt::linear_programming::detail { + +inline bool mgpu_trace_enabled() +{ + static const bool enabled = []() { + const char* v = std::getenv("CUOPT_MGPU_TRACE"); + return v != nullptr && v[0] != '\0' && v[0] != '0'; + }(); + return enabled; +} + +} // namespace cuopt::linear_programming::detail + +#define MGPU_TRACE(msg) \ + do { \ + if (::cuopt::linear_programming::detail::mgpu_trace_enabled()) { \ + std::fprintf(stderr, "[mgpu %s:%d] %s\n", __func__, __LINE__, (msg)); \ + std::fflush(stderr); \ + } \ + } while (0) + +#define MGPU_TRACE_FMT(fmt, ...) \ + do { \ + if (::cuopt::linear_programming::detail::mgpu_trace_enabled()) { \ + std::fprintf(stderr, "[mgpu %s:%d] " fmt "\n", __func__, __LINE__, __VA_ARGS__); \ + std::fflush(stderr); \ + } \ + } while (0) From 7d74e740ca3369ff10e9402573c1bed73dcae13a Mon Sep 17 00:00:00 2001 From: Bulle Mostovoi Date: Sun, 31 May 2026 11:53:44 -0700 Subject: [PATCH 53/67] forgot to push a file, maybe doesnt compile lol --- cpp/src/pdlp/pdlp.cu | 47 +++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 44 insertions(+), 3 deletions(-) diff --git a/cpp/src/pdlp/pdlp.cu b/cpp/src/pdlp/pdlp.cu index 3b77a1cf47..d80adf248d 100644 --- a/cpp/src/pdlp/pdlp.cu +++ b/cpp/src/pdlp/pdlp.cu @@ -426,9 +426,50 @@ pdlp_solver_t::pdlp_solver_t(problem_t& op_problem, partition_input.nb_cstr = op_problem_scaled_.n_constraints; partition_input.nb_vars = op_problem_scaled_.n_variables; partition_input.nb_parts = distributed_pdlp_num_gpus; - // Dummy partitioner ignores A / A_t for now; future METIS partitioners will - // fill these CSR views before calling partition(). 
- auto partitioner = make_partitioner(partitioner_kind_t::Dummy); + + // Topology buffers: only needed for METIS (Dummy ignores them). + // Read CSR offsets and col indices from the (unscaled) problem; the + // partitioner only needs topology, not values, and scaled/unscaled share + // the same nonzero pattern. + std::vector h_part_A_row_offsets; + std::vector h_part_A_col_indices; + std::vector h_part_A_t_row_offsets; + std::vector h_part_A_t_col_indices; + + const partitioner_kind_t kind = partitioner_kind_t::Metis; + if (kind == partitioner_kind_t::Metis) { + const auto stream = op_problem_scaled_.handle_ptr->get_stream(); + const i_t n_cstr = op_problem_scaled_.n_constraints; + const i_t n_vars = op_problem_scaled_.n_variables; + const i_t nnz = op_problem_scaled_.nnz; + h_part_A_row_offsets.resize(n_cstr + 1); + h_part_A_col_indices.resize(nnz); + h_part_A_t_row_offsets.resize(n_vars + 1); + h_part_A_t_col_indices.resize(nnz); + raft::copy( + h_part_A_row_offsets.data(), op_problem_scaled_.offsets.data(), n_cstr + 1, stream); + raft::copy( + h_part_A_col_indices.data(), op_problem_scaled_.variables.data(), nnz, stream); + raft::copy(h_part_A_t_row_offsets.data(), + op_problem_scaled_.reverse_offsets.data(), + n_vars + 1, + stream); + raft::copy(h_part_A_t_col_indices.data(), + op_problem_scaled_.reverse_constraints.data(), + nnz, + stream); + op_problem_scaled_.handle_ptr->sync_stream(stream); + + partition_input.A.row_offsets = &h_part_A_row_offsets; + partition_input.A.col_indices = &h_part_A_col_indices; + partition_input.A.num_rows = n_cstr; + partition_input.A.num_cols = n_vars; + partition_input.A_t.row_offsets = &h_part_A_t_row_offsets; + partition_input.A_t.col_indices = &h_part_A_t_col_indices; + partition_input.A_t.num_rows = n_vars; + partition_input.A_t.num_cols = n_cstr; + } + auto partitioner = make_partitioner(kind); parts = partitioner->partition(partition_input); } From 859a299b0e3b296957a6de64418b2807796b87db Mon Sep 17 00:00:00 2001 From: 
Bulle Mostovoi Date: Mon, 1 Jun 2026 10:43:00 +0200 Subject: [PATCH 54/67] fixed dummy partitionner on single gpu --- cpp/src/pdlp/distributed_pdlp/metis_partitioner.cu | 8 ++++++++ cpp/src/pdlp/pdlp.cu | 8 +++++++- 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/cpp/src/pdlp/distributed_pdlp/metis_partitioner.cu b/cpp/src/pdlp/distributed_pdlp/metis_partitioner.cu index 6ed80b0047..73e2736251 100644 --- a/cpp/src/pdlp/distributed_pdlp/metis_partitioner.cu +++ b/cpp/src/pdlp/distributed_pdlp/metis_partitioner.cu @@ -32,6 +32,14 @@ std::vector metis_partitioner_t::partition( cuopt_expects(input.nb_parts > 0, error_type_t::ValidationError, "metis_partitioner: nb_parts must be positive"); + // METIS_PartGraphKway internally does integer arithmetic of the form + // `nedges / nparts` and traps with SIGFPE when nparts == 1. The single-part + // case is also trivial (everything in part 0) so callers should route it to + // the Dummy partitioner instead (see pdlp_solver_t mGPU ctor). + cuopt_expects(input.nb_parts >= 2, + error_type_t::ValidationError, + "metis_partitioner: nb_parts must be >= 2 (METIS_PartGraphKway requirement); " + "use the Dummy partitioner for the single-shard case"); cuopt_expects(input.nb_cstr >= 0 && input.nb_vars >= 0, error_type_t::ValidationError, "metis_partitioner: invalid problem dimensions"); diff --git a/cpp/src/pdlp/pdlp.cu b/cpp/src/pdlp/pdlp.cu index d80adf248d..a747706639 100644 --- a/cpp/src/pdlp/pdlp.cu +++ b/cpp/src/pdlp/pdlp.cu @@ -436,7 +436,13 @@ pdlp_solver_t::pdlp_solver_t(problem_t& op_problem, std::vector h_part_A_t_row_offsets; std::vector h_part_A_t_col_indices; - const partitioner_kind_t kind = partitioner_kind_t::Metis; + // METIS_PartGraphKway requires nparts >= 2; calling it with nparts == 1 + // traps inside METIS (SIGFPE on integer division by zero). 
The + // num_gpus == 1 path is the single-shard dummy run anyway -- there's + // nothing for METIS to do, so route directly to Dummy which just places + // every vertex into part 0. + const partitioner_kind_t kind = + (distributed_pdlp_num_gpus == 1) ? partitioner_kind_t::Dummy : partitioner_kind_t::Metis; if (kind == partitioner_kind_t::Metis) { const auto stream = op_problem_scaled_.handle_ptr->get_stream(); const i_t n_cstr = op_problem_scaled_.n_constraints; From 7daa7400e1f1b7a421a8ac9f9fbbba3d42489c16 Mon Sep 17 00:00:00 2001 From: Bulle Mostovoi Date: Mon, 1 Jun 2026 11:34:16 +0200 Subject: [PATCH 55/67] added some plumbing, will not load full problem on gpu --- cpp/src/pdlp/solve.cu | 47 ++++++++++++++++++++++++++++++++++++++++++ cpp/src/pdlp/solve.cuh | 40 +++++++++++++++++++++++++++++++++++ 2 files changed, 87 insertions(+) diff --git a/cpp/src/pdlp/solve.cu b/cpp/src/pdlp/solve.cu index 70c488e3f3..8081c42ffb 100644 --- a/cpp/src/pdlp/solve.cu +++ b/cpp/src/pdlp/solve.cu @@ -2143,10 +2143,49 @@ optimization_problem_solution_t solve_lp( bool problem_checking, bool use_pdlp_solver_mode) { + // In distributed PDLP we can't allocate the full problem on the master device + if (settings.hyper_params.use_distributed_pdlp) { + return solve_lp_distributed_from_mps( + handle_ptr, mps_data_model, settings, problem_checking, use_pdlp_solver_mode); + } auto op_problem = mps_data_model_to_optimization_problem(handle_ptr, mps_data_model); return solve_lp(op_problem, settings, problem_checking, use_pdlp_solver_mode); } +template +optimization_problem_solution_t solve_lp_distributed_from_mps( + raft::handle_t const* handle_ptr, + const cuopt::linear_programming::io::mps_data_model_t& mps_data_model, + pdlp_solver_settings_t const& settings, + bool problem_checking, + bool use_pdlp_solver_mode) +{ + cuopt_expects(handle_ptr != nullptr, + error_type_t::ValidationError, + "solve_lp_distributed_from_mps: handle_ptr must not be null"); + 
cuopt_expects(settings.hyper_params.use_distributed_pdlp, + error_type_t::ValidationError, + "solve_lp_distributed_from_mps: settings.hyper_params.use_distributed_pdlp " + "must be true"); + + pdlp_solver_settings_t settings_resolved = settings; + if (settings_resolved.distributed_pdlp_num_gpus == -1) { + settings_resolved.distributed_pdlp_num_gpus = raft::device_setter::get_device_count(); + CUOPT_LOG_INFO( + "solve_lp_distributed_from_mps: distributed_pdlp_num_gpus == -1, auto-detected " + "%d visible CUDA device(s)", + settings_resolved.distributed_pdlp_num_gpus); + } + if (settings_resolved.distributed_pdlp_num_gpus <= 1) + { + std::cout << "CAREFUL: use_distributed_pdlp with distributed_pdlp_num_gpus == 1 runs the " + "single-shard dummy path" + << std::endl; + } + auto op_problem = mps_data_model_to_optimization_problem(handle_ptr, mps_data_model); + return solve_lp(op_problem, settings_resolved, problem_checking, use_pdlp_solver_mode); +} + // ============================================================================ // CPU problem overloads (convert to GPU, solve, convert solution back) // ============================================================================ @@ -2287,6 +2326,14 @@ std::unique_ptr> solve_lp( template optimization_problem_t mps_data_model_to_optimization_problem( \ raft::handle_t const* handle_ptr, \ const cuopt::linear_programming::io::mps_data_model_t& data_model); \ + \ + template optimization_problem_solution_t solve_lp_distributed_from_mps( \ + raft::handle_t const* handle_ptr, \ + const cuopt::linear_programming::io::mps_data_model_t& mps_data_model, \ + pdlp_solver_settings_t const& settings, \ + bool problem_checking, \ + bool use_pdlp_solver_mode); \ + \ template void set_pdlp_solver_mode(pdlp_solver_settings_t& settings); #if MIP_INSTANTIATE_FLOAT diff --git a/cpp/src/pdlp/solve.cuh b/cpp/src/pdlp/solve.cuh index 90e5e4fe95..abb657943f 100644 --- a/cpp/src/pdlp/solve.cuh +++ b/cpp/src/pdlp/solve.cuh @@ -32,6 +32,46 @@ 
cuopt::linear_programming::optimization_problem_solution_t solve_lp_wi const timer_t& timer, bool is_batch_mode = false); +/** + * @brief Distributed-PDLP entry point that consumes the host MPS data model + * directly, without ever materializing the full problem on a single + * (master) GPU. + * + * This is the entry point intended for problems whose `nnz` exceeds the memory + * of a single device. Today (Step 1 of the mGPU memory refactor) it is a thin + * routing shim: it resolves `distributed_pdlp_num_gpus == -1` against the + * visible-device count and delegates to the legacy + * `mps_data_model_to_optimization_problem(...)` + device-side `solve_lp(...)` + * pipeline, which still allocates the full problem on master. The shim exists + * so the public-facing call site is already in place; subsequent commits will + * replace the body with: + * 1. host-side METIS partitioning straight off the MPS CSR + * 2. per-shard host CSR slicing + * 3. construction of an mGPU-native pdlp_solver_t whose master only holds + * scalar metadata + gather buffers (no full A / A^T / scaled copies). + * + * Until then, behaviour and memory footprint are identical to the legacy path. + * + * @param handle_ptr Master raft handle (its stream owns the gather buffers + * and any master-side aggregator allocations). + * @param mps_data_model Host-resident MPS data (CPU vectors only). + * @param settings User-supplied PDLP solver settings; the + * `distributed_pdlp_num_gpus == -1` sentinel is resolved + * here against the visible-device count. + * @param problem_checking Forwarded to the eventual solver. + * @param use_pdlp_solver_mode Forwarded to the eventual solver. + * + * @pre `settings.hyper_params.use_distributed_pdlp == true`. 
+ */ +template +cuopt::linear_programming::optimization_problem_solution_t +solve_lp_distributed_from_mps( + raft::handle_t const* handle_ptr, + const cuopt::linear_programming::io::mps_data_model_t& mps_data_model, + pdlp_solver_settings_t const& settings, + bool problem_checking, + bool use_pdlp_solver_mode); + /** * @brief Entry point for batch PDLP. Solves multiple LPs sharing the same constraint * matrix structure in a single batched GPU run. From 8a39e8c9e1b62cff57b09e80c013ae6ee53e30d4 Mon Sep 17 00:00:00 2001 From: Bulle Mostovoi Date: Mon, 1 Jun 2026 13:53:49 +0200 Subject: [PATCH 56/67] added guard to ensure presolver is not supported in mGPU --- cpp/src/pdlp/solve.cu | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/cpp/src/pdlp/solve.cu b/cpp/src/pdlp/solve.cu index 8081c42ffb..b32bad87f8 100644 --- a/cpp/src/pdlp/solve.cu +++ b/cpp/src/pdlp/solve.cu @@ -2167,6 +2167,10 @@ optimization_problem_solution_t solve_lp_distributed_from_mps( error_type_t::ValidationError, "solve_lp_distributed_from_mps: settings.hyper_params.use_distributed_pdlp " "must be true"); + cuopt_expects(settings.presolver == cuopt::linear_programming::presolver_t::None, + error_type_t::ValidationError, + "solve_lp_distributed_from_mps: presolve is not yet supported with " + "use_distributed_pdlp; please set settings.presolver = presolver_t::None"); pdlp_solver_settings_t settings_resolved = settings; if (settings_resolved.distributed_pdlp_num_gpus == -1) { From 5a3b9ce521ac23d10a2356bcb2bb5413c66e98e0 Mon Sep 17 00:00:00 2001 From: Bulle Mostovoi Date: Tue, 2 Jun 2026 10:41:06 +0200 Subject: [PATCH 57/67] plumbed pdlp_distributed_solver with mps_data_model and now data doesnt transit on master device ! 
--- cpp/src/pdlp/pdlp.cu | 327 +++++++++++++++++------------------------- cpp/src/pdlp/pdlp.cuh | 7 +- cpp/src/pdlp/solve.cu | 104 ++++++++++---- 3 files changed, 211 insertions(+), 227 deletions(-) diff --git a/cpp/src/pdlp/pdlp.cu b/cpp/src/pdlp/pdlp.cu index a747706639..21291b853d 100644 --- a/cpp/src/pdlp/pdlp.cu +++ b/cpp/src/pdlp/pdlp.cu @@ -17,6 +17,7 @@ #include #include +#include #include #include "cuopt/linear_programming/pdlp/solver_solution.hpp" #include "distributed_pdlp/multi_gpu_engine.hpp" @@ -375,15 +376,28 @@ pdlp_solver_t::pdlp_solver_t(problem_t& op_problem, } } +// ============================================================================ +// Distributed multi-GPU ctor. +// needs placeholder_problem to be a shape-0 problem +// reads the problem from mps_data_model directly +// builds internal attributes from the placeholder_problem +// builds the engine from the mps_data_model template -pdlp_solver_t::pdlp_solver_t(problem_t& op_problem, - pdlp_solver_settings_t const& settings, - int distributed_pdlp_num_gpus) - // 1. Delegate to single-GPU ctor to bring up all the per-master state - // (problem_ptr, op_problem_scaled_, pdhg_solver_, strategies, etc.). 
- : pdlp_solver_t(op_problem, settings, /*is_legacy_batch_mode=*/false) +pdlp_solver_t::pdlp_solver_t( + problem_t& placeholder_problem, + cuopt::linear_programming::io::mps_data_model_t const& mps, + pdlp_solver_settings_t const& settings) + // Makes all inner feilds of master 0 size + : pdlp_solver_t(placeholder_problem, settings, /*is_legacy_batch_mode=*/false) { - CUOPT_LOG_INFO("Solving with distributed PDLP on %d GPU", + cuopt_expects(placeholder_problem.n_variables == 0 && + placeholder_problem.n_constraints == 0 && + placeholder_problem.nnz == 0, + error_type_t::ValidationError, + "Distributed mGPU pdlp_solver_t ctor requires a shape-0 " + "placeholder problem (n_variables == n_constraints == nnz == 0)"); + const int distributed_pdlp_num_gpus = settings.distributed_pdlp_num_gpus; + CUOPT_LOG_INFO("Solving with distributed PDLP on %d GPU (mps direct path)", distributed_pdlp_num_gpus); if (distributed_pdlp_num_gpus == 1) { std::cout << "CAREFUL !!: distributed_pdlp_num_gpus == 1, running single-shard dummy path, " @@ -391,87 +405,125 @@ pdlp_solver_t::pdlp_solver_t(problem_t& op_problem, "parameter --distributed-pdlp-num-gpus" << std::endl; } - cuopt_expects(distributed_pdlp_num_gpus == settings.distributed_pdlp_num_gpus, - error_type_t::ValidationError, - "This constructor's distributed_pdlp_num_gpus argument must match " - "settings.distributed_pdlp_num_gpus"); - // Distributed PDLP is currently double-only if constexpr (!std::is_same_v) { cuopt_expects(false, error_type_t::ValidationError, "Distributed PDLP currently requires double precision"); return; - } else { - // 2. Load or compute partition + } + // ----- 1. 
Read problem shape and bulk data directly from mps (host) ----- + const i_t n_vars = static_cast(mps.get_objective_coefficients().size()); + const i_t n_cstr = static_cast(mps.get_constraint_lower_bounds().size()); + const i_t nnz = static_cast(mps.get_constraint_matrix_values().size()); + cuopt_expects(n_vars > 0, + error_type_t::ValidationError, + "Distributed PDLP from mps requires a non-empty objective"); + cuopt_expects(n_cstr > 0, + error_type_t::ValidationError, + "Distributed PDLP from mps requires at least one constraint"); + cuopt_expects(static_cast(mps.get_constraint_matrix_offsets().size()) == n_cstr + 1, + error_type_t::ValidationError, + "mps constraint_matrix_offsets size must equal n_constraints + 1"); + cuopt_expects( + static_cast(mps.get_constraint_matrix_indices().size()) == nnz, + error_type_t::ValidationError, + "mps constraint_matrix_indices size must equal nnz (constraint_matrix_values size)"); + cuopt_expects(static_cast(mps.get_constraint_upper_bounds().size()) == n_cstr, + error_type_t::ValidationError, + "mps constraint_upper_bounds size must equal n_constraints"); + cuopt_expects(static_cast(mps.get_variable_lower_bounds().size()) == n_vars, + error_type_t::ValidationError, + "mps variable_lower_bounds size must equal n_variables"); + cuopt_expects(static_cast(mps.get_variable_upper_bounds().size()) == n_vars, + error_type_t::ValidationError, + "mps variable_upper_bounds size must equal n_variables"); + + const bool maximize = mps.get_sense(); + f_t objective_offset = mps.get_objective_offset(); + f_t objective_scaling_factor = mps.get_objective_scaling_factor(); + + // Objective: copy (mutable so we can negate for maximize, matching + // problem_helpers.cuh::convert_to_maximization_problem). 
+ std::vector h_obj = mps.get_objective_coefficients(); + if (maximize) { + for (auto& v : h_obj) v = -v; + objective_offset = -objective_offset; + objective_scaling_factor = -objective_scaling_factor; + } + + // Bounds (copy from mps; engine ctor takes by const ref to std::vector). + std::vector h_var_lower = mps.get_variable_lower_bounds(); + std::vector h_var_upper = mps.get_variable_upper_bounds(); + std::vector h_cstr_lower = mps.get_constraint_lower_bounds(); + std::vector h_cstr_upper = mps.get_constraint_upper_bounds(); + + // A (CSR) — mutable copies for the engine + partitioner consumers below. + std::vector h_A_row_offsets = mps.get_constraint_matrix_offsets(); + std::vector h_A_col_indices = mps.get_constraint_matrix_indices(); + std::vector h_A_values = mps.get_constraint_matrix_values(); + + // ----- 2. Transpose A -> A^T on the host (one-shot CSR transpose) ----- + // CSC(A) and CSR(A^T) share the same memory layout, so the CSC produced + // by dual_simplex::csr_matrix_t::to_compressed_col IS the CSR of A^T. + // O(nnz + n_vars) counting sort, same as problem_t::compute_transpose. + namespace ds = cuopt::linear_programming::dual_simplex; + ds::csr_matrix_t A_csr(n_cstr, n_vars, nnz); + A_csr.row_start = h_A_row_offsets; + A_csr.j = h_A_col_indices; + A_csr.x = h_A_values; + ds::csc_matrix_t AT_as_csc(n_vars, n_cstr, nnz); + A_csr.to_compressed_col(AT_as_csc); + std::vector h_A_t_row_offsets = std::move(AT_as_csc.col_start); + std::vector h_A_t_col_indices = std::move(AT_as_csc.i); + std::vector h_A_t_values = std::move(AT_as_csc.x); + + // ----- 3. Identity scaling for V1 ----- + // Real multi-GPU scaling is a TODO; ship the unscaled problem to shards as + // both "unscaled" and "scaled" so the engine and per-shard pdlp_solver_t + // can run end-to-end. Scaling factor vectors are 1.0 everywhere so the + // shard-side unscale at the end is a no-op. 
+ std::vector h_A_values_scaled = h_A_values; + std::vector h_A_t_values_scaled = h_A_t_values; + std::vector h_obj_scaled = h_obj; + std::vector h_var_lower_scaled = h_var_lower; + std::vector h_var_upper_scaled = h_var_upper; + std::vector h_cstr_lower_scaled = h_cstr_lower; + std::vector h_cstr_upper_scaled = h_cstr_upper; + std::vector h_cummulative_cstr_scaling(n_cstr, f_t(1.0)); + std::vector h_cummulative_var_scaling(n_vars, f_t(1.0)); + const f_t h_bound_rescaling = f_t(1.0); + const f_t h_objective_rescaling = f_t(1.0); + + // ----- 4. Partition ----- std::vector parts; if (!settings.multi_gpu_partition_file.empty()) { parts = partition_loader_t::parse_distributed_pdlp_partition_file( settings.multi_gpu_partition_file); - validate_partition(parts, - op_problem_scaled_.n_constraints, - op_problem_scaled_.n_variables, - distributed_pdlp_num_gpus, - "partition file"); + validate_partition(parts, n_cstr, n_vars, distributed_pdlp_num_gpus, "partition file"); } else { if (distributed_pdlp_num_gpus == 1) { - // Single-part dummy run: useful for exercising the mGPU code paths on a - // single physical GPU without a real partition file. std::cout << "CAREFUL: distributed_pdlp_num_gpus == 1, running dummy version (single " "part covering " - << op_problem_scaled_.n_constraints << " cstrs + " - << op_problem_scaled_.n_variables << " vars)" << std::endl; + << n_cstr << " cstrs + " << n_vars << " vars)" << std::endl; } partitioner_input_t partition_input; - partition_input.nb_cstr = op_problem_scaled_.n_constraints; - partition_input.nb_vars = op_problem_scaled_.n_variables; + partition_input.nb_cstr = n_cstr; + partition_input.nb_vars = n_vars; partition_input.nb_parts = distributed_pdlp_num_gpus; - // Topology buffers: only needed for METIS (Dummy ignores them). - // Read CSR offsets and col indices from the (unscaled) problem; the - // partitioner only needs topology, not values, and scaled/unscaled share - // the same nonzero pattern. 
- std::vector h_part_A_row_offsets; - std::vector h_part_A_col_indices; - std::vector h_part_A_t_row_offsets; - std::vector h_part_A_t_col_indices; - - // METIS_PartGraphKway requires nparts >= 2; calling it with nparts == 1 - // traps inside METIS (SIGFPE on integer division by zero). The - // num_gpus == 1 path is the single-shard dummy run anyway -- there's - // nothing for METIS to do, so route directly to Dummy which just places - // every vertex into part 0. + // METIS_PartGraphKway requires nparts >= 2; route num_gpus == 1 to Dummy. const partitioner_kind_t kind = (distributed_pdlp_num_gpus == 1) ? partitioner_kind_t::Dummy : partitioner_kind_t::Metis; if (kind == partitioner_kind_t::Metis) { - const auto stream = op_problem_scaled_.handle_ptr->get_stream(); - const i_t n_cstr = op_problem_scaled_.n_constraints; - const i_t n_vars = op_problem_scaled_.n_variables; - const i_t nnz = op_problem_scaled_.nnz; - h_part_A_row_offsets.resize(n_cstr + 1); - h_part_A_col_indices.resize(nnz); - h_part_A_t_row_offsets.resize(n_vars + 1); - h_part_A_t_col_indices.resize(nnz); - raft::copy( - h_part_A_row_offsets.data(), op_problem_scaled_.offsets.data(), n_cstr + 1, stream); - raft::copy( - h_part_A_col_indices.data(), op_problem_scaled_.variables.data(), nnz, stream); - raft::copy(h_part_A_t_row_offsets.data(), - op_problem_scaled_.reverse_offsets.data(), - n_vars + 1, - stream); - raft::copy(h_part_A_t_col_indices.data(), - op_problem_scaled_.reverse_constraints.data(), - nnz, - stream); - op_problem_scaled_.handle_ptr->sync_stream(stream); - - partition_input.A.row_offsets = &h_part_A_row_offsets; - partition_input.A.col_indices = &h_part_A_col_indices; + // partitioner_input_t holds non-const std::vector* pointers; we + // already have the data in our local mutable buffers above. 
+ partition_input.A.row_offsets = &h_A_row_offsets; + partition_input.A.col_indices = &h_A_col_indices; partition_input.A.num_rows = n_cstr; partition_input.A.num_cols = n_vars; - partition_input.A_t.row_offsets = &h_part_A_t_row_offsets; - partition_input.A_t.col_indices = &h_part_A_t_col_indices; + partition_input.A_t.row_offsets = &h_A_t_row_offsets; + partition_input.A_t.col_indices = &h_A_t_col_indices; partition_input.A_t.num_rows = n_vars; partition_input.A_t.num_cols = n_cstr; } @@ -479,109 +531,7 @@ pdlp_solver_t::pdlp_solver_t(problem_t& op_problem, parts = partitioner->partition(partition_input); } - // always compute initial step size before scaling and primal_weight after scaling to do like - // cuPDLPx - assert(settings_.hyper_params.compute_initial_primal_weight_before_scaling && - "compute_initial_primal_weight_before_scaling must be true in distributed mode"); - assert(!settings_.hyper_params.compute_initial_step_size_before_scaling && - "compute_initial_step_size_before_scaling must be false in distributed mode"); - - compute_initial_primal_weight(); - - // scale globally before dispatching to shards - initial_scaling_strategy_.scale_problem(); - - compute_initial_step_size(); - step_size_strategy_.get_primal_and_dual_stepsizes(primal_step_size_, dual_step_size_); - - const f_t initial_step_size_global = get_step_size_h(0); - const f_t initial_primal_weight_global = get_primal_weight_h(0); - - // 4. Copy both scaled and unscaled pb - auto const stream = op_problem_scaled_.handle_ptr->get_stream(); - i_t const n_cstr = op_problem_scaled_.n_constraints; - i_t const n_vars = op_problem_scaled_.n_variables; - i_t const nnz = op_problem_scaled_.nnz; - - // Shared topology (taken from the scaled problem, but identical on both). 
- std::vector h_A_row_offsets(n_cstr + 1); - std::vector h_A_col_indices(nnz); - std::vector h_A_t_row_offsets(n_vars + 1); - std::vector h_A_t_col_indices(nnz); - raft::copy(h_A_row_offsets.data(), op_problem_scaled_.offsets.data(), n_cstr + 1, stream); - raft::copy(h_A_col_indices.data(), op_problem_scaled_.variables.data(), nnz, stream); - raft::copy( - h_A_t_row_offsets.data(), op_problem_scaled_.reverse_offsets.data(), n_vars + 1, stream); - raft::copy( - h_A_t_col_indices.data(), op_problem_scaled_.reverse_constraints.data(), nnz, stream); - - // Paired value arrays for A and A_T. - std::vector h_A_values(nnz); - std::vector h_A_values_scaled(nnz); - std::vector h_A_t_values(nnz); - std::vector h_A_t_values_scaled(nnz); - raft::copy(h_A_values.data(), problem_ptr->coefficients.data(), nnz, stream); - raft::copy(h_A_t_values.data(), problem_ptr->reverse_coefficients.data(), nnz, stream); - raft::copy(h_A_values_scaled.data(), op_problem_scaled_.coefficients.data(), nnz, stream); - raft::copy( - h_A_t_values_scaled.data(), op_problem_scaled_.reverse_coefficients.data(), nnz, stream); - - using f_t2 = typename type_2::type; - - std::vector h_obj(n_vars); - std::vector h_obj_scaled(n_vars); - std::vector h_var_bounds_packed(n_vars); - std::vector h_var_bounds_scaled_packed(n_vars); - std::vector h_cstr_lower(n_cstr); - std::vector h_cstr_upper(n_cstr); - std::vector h_cstr_lower_scaled(n_cstr); - std::vector h_cstr_upper_scaled(n_cstr); - - raft::copy(h_obj.data(), problem_ptr->objective_coefficients.data(), n_vars, stream); - raft::copy( - h_obj_scaled.data(), op_problem_scaled_.objective_coefficients.data(), n_vars, stream); - raft::copy(h_var_bounds_packed.data(), problem_ptr->variable_bounds.data(), n_vars, stream); - raft::copy( - h_var_bounds_scaled_packed.data(), op_problem_scaled_.variable_bounds.data(), n_vars, stream); - raft::copy(h_cstr_lower.data(), problem_ptr->constraint_lower_bounds.data(), n_cstr, stream); - raft::copy(h_cstr_upper.data(), 
problem_ptr->constraint_upper_bounds.data(), n_cstr, stream); - raft::copy(h_cstr_lower_scaled.data(), - op_problem_scaled_.constraint_lower_bounds.data(), - n_cstr, - stream); - raft::copy(h_cstr_upper_scaled.data(), - op_problem_scaled_.constraint_upper_bounds.data(), - n_cstr, - stream); - - // 5. Get full scaling factors on host - std::vector h_cummulative_cstr_scaling(n_cstr); - std::vector h_cummulative_var_scaling(n_vars); - raft::copy(h_cummulative_cstr_scaling.data(), - initial_scaling_strategy_.get_constraint_matrix_scaling_vector().data(), - n_cstr, - stream); - raft::copy(h_cummulative_var_scaling.data(), - initial_scaling_strategy_.get_variable_scaling_vector().data(), - n_vars, - stream); - const f_t h_bound_rescaling = initial_scaling_strategy_.get_h_bound_rescaling(); - const f_t h_objective_rescaling = initial_scaling_strategy_.get_h_objective_rescaling(); - - op_problem_scaled_.handle_ptr->sync_stream(stream); - - // Unpack interleaved {lower, upper} into separate vectors for both - // versions, so the shard ctor's slicing loop is uniform. - std::vector h_var_lower(n_vars), h_var_upper(n_vars); - std::vector h_var_lower_scaled(n_vars), h_var_upper_scaled(n_vars); - for (i_t i = 0; i < n_vars; ++i) { - h_var_lower[i] = h_var_bounds_packed[i].x; - h_var_upper[i] = h_var_bounds_packed[i].y; - h_var_lower_scaled[i] = h_var_bounds_scaled_packed[i].x; - h_var_upper_scaled[i] = h_var_bounds_scaled_packed[i].y; - } - - // 6. Build per-rank data and meta-data. + // ----- 5. Build per-rank data ----- std::vector> sub_pdlp_rank_data = partition_loader_t::create_rank_data_from_parts(parts, h_A_row_offsets, @@ -597,7 +547,7 @@ pdlp_solver_t::pdlp_solver_t(problem_t& op_problem, n_vars, nnz); - // 7. Build the per-shard PDLP settings: + // ----- 6. 
Per-shard settings ----- pdlp_solver_settings_t sub_pdlp_settings = settings; sub_pdlp_settings.num_gpus = 1; sub_pdlp_settings.distributed_pdlp_num_gpus = 1; @@ -606,7 +556,7 @@ pdlp_solver_t::pdlp_solver_t(problem_t& op_problem, sub_pdlp_settings.hyper_params.default_l_inf_ruiz_iterations = 0; sub_pdlp_settings.hyper_params.default_alpha_pock_chambolle_rescaling = 0.0; - // 8. Construct the engine, creates NCCL comms and shards + // ----- 7. Construct the engine: NCCL comms + per-shard pdlp_solver_t ----- multi_gpu_engine.emplace(std::move(sub_pdlp_rank_data), h_obj, h_var_lower, @@ -622,13 +572,12 @@ pdlp_solver_t::pdlp_solver_t(problem_t& op_problem, h_cummulative_var_scaling, h_bound_rescaling, h_objective_rescaling, - op_problem_scaled_.maximize, - op_problem_scaled_.objective_offset, - op_problem_scaled_.presolve_data.objective_scaling_factor, + maximize, + objective_offset, + objective_scaling_factor, sub_pdlp_settings); - // Copy to host and then to shards. - // More robust than cudaDeviceEnablePeerAccess and cost-free-ish. + // ----- 8. Seed shard step-size / primal-weight scalars from the master ----- f_t h_step_size{}, h_primal_weight{}, h_best_primal_weight{}; f_t h_primal_step_size{}, h_dual_step_size{}; raft::copy(&h_step_size, step_size_.data(), 1, stream_view_); @@ -648,27 +597,17 @@ pdlp_solver_t::pdlp_solver_t(problem_t& op_problem, raft::copy(sub.dual_step_size_.data(), &h_dual_step_size, 1, shard->stream); } - // Wire the engine into the master pdhg_solver_. Shards' pdhg_solver_ keep - // mgpu_engine_ == nullptr so they run plain single-GPU SpMV on local A. + // Wire the engine into master's pdhg_solver_; shards keep mgpu_engine_ == nullptr. pdhg_solver_.set_multi_gpu_engine(&*multi_gpu_engine); - // Project initial primal solution - if (settings_.hyper_params.project_initial_primal) { - // Use refine_initial_primal_projection ??? 
- using f_t2 = typename type_2::type; - for (auto& shard : multi_gpu_engine->shards) { - raft::device_setter guard(shard->device_id); - auto& sub = *shard->sub_pdlp; - cub::DeviceTransform::Transform( - cuda::std::make_tuple(sub.pdhg_solver_.get_primal_solution().data(), - sub.get_op_problem_scaled().variable_bounds.data()), - sub.pdhg_solver_.get_primal_solution().data(), - sub.pdhg_solver_.get_primal_solution().size(), - clamp(), - shard->stream.view()); - } - } - } // end if constexpr (std::is_same_v) + // ----- 9. Resize master gather destinations to the full problem size ----- + pdhg_solver_.get_potential_next_primal_solution().resize(n_vars, stream_view_); + pdhg_solver_.get_potential_next_dual_solution().resize(n_cstr, stream_view_); + current_termination_strategy_.get_convergence_information().get_reduced_cost().resize( + n_vars, stream_view_); + primal_size_h_ = n_vars; + dual_size_h_ = n_cstr; + handle_ptr_->sync_stream(stream_view_); } template diff --git a/cpp/src/pdlp/pdlp.cuh b/cpp/src/pdlp/pdlp.cuh index 14651eab3f..3544de89fa 100644 --- a/cpp/src/pdlp/pdlp.cuh +++ b/cpp/src/pdlp/pdlp.cuh @@ -8,6 +8,7 @@ #pragma once #include +#include #include #include @@ -64,9 +65,9 @@ class pdlp_solver_t { bool is_batch_mode = false); // Distributed Solver Constructor - pdlp_solver_t(problem_t& op_problem, - pdlp_solver_settings_t const& settings, - int distributed_pdlp_num_gpus); + pdlp_solver_t(problem_t& placeholder_problem, + cuopt::linear_programming::io::mps_data_model_t const& mps, + pdlp_solver_settings_t const& settings); optimization_problem_solution_t run_solver(const timer_t& timer); diff --git a/cpp/src/pdlp/solve.cu b/cpp/src/pdlp/solve.cu index b32bad87f8..ef273faf13 100644 --- a/cpp/src/pdlp/solve.cu +++ b/cpp/src/pdlp/solve.cu @@ -773,32 +773,15 @@ static optimization_problem_solution_t run_pdlp_solver( } } #endif - if (settings.hyper_params.use_distributed_pdlp) { - // Resolve the -1 "auto-detect" sentinel to the actual visible-device count on - 
// the master process - pdlp_solver_settings_t settings_resolved = settings; - if (settings_resolved.distributed_pdlp_num_gpus == -1) { - settings_resolved.distributed_pdlp_num_gpus = raft::device_setter::get_device_count(); - CUOPT_LOG_INFO("distributed_pdlp_num_gpus == -1: auto-detected %d visible CUDA device", - settings_resolved.distributed_pdlp_num_gpus); - } - cuopt_expects(settings_resolved.distributed_pdlp_num_gpus >= 1, - error_type_t::ValidationError, - "distributed_pdlp_num_gpus must be >= 1 or -1 (auto-detect)"); - if (settings_resolved.distributed_pdlp_num_gpus == 1) { - std::cout - << "CAREFUL: use_distributed_pdlp with distributed_pdlp_num_gpus == 1 runs the " - "single-shard dummy path" - << std::endl; - } - cuopt_expects(!is_batch_mode, - error_type_t::ValidationError, - "Distributed PDLP does not support batch mode"); - // Multi-GPU ctor; dispatched by 3rd-arg TYPE (int, not bool batch). - detail::pdlp_solver_t solver( - problem, settings_resolved, settings_resolved.distributed_pdlp_num_gpus); - return solver.run_solver(timer); - } + // Distributed PDLP cannot enter through this path: by the time we have a + // problem_t, the full problem already lives on the master GPU, which defeats + // the purpose of distributed mode. Callers must route to + // solve_lp_distributed_from_mps via solve_lp(mps_data_model, ...). + cuopt_expects(!settings.hyper_params.use_distributed_pdlp, + error_type_t::ValidationError, + "Distributed PDLP must be entered via solve_lp(mps_data_model, ...) " + "so the master GPU never materializes the full problem. 
Call sites " + "with a problem_t cannot dispatch to distributed mode."); detail::pdlp_solver_t solver(problem, settings, is_batch_mode); if (settings.inside_mip) { solver.set_inside_mip(true); } return solver.run_solver(timer); @@ -2180,14 +2163,75 @@ optimization_problem_solution_t solve_lp_distributed_from_mps( "%d visible CUDA device(s)", settings_resolved.distributed_pdlp_num_gpus); } - if (settings_resolved.distributed_pdlp_num_gpus <= 1) - { + if (settings_resolved.distributed_pdlp_num_gpus <= 1) { std::cout << "CAREFUL: use_distributed_pdlp with distributed_pdlp_num_gpus == 1 runs the " "single-shard dummy path" << std::endl; } - auto op_problem = mps_data_model_to_optimization_problem(handle_ptr, mps_data_model); - return solve_lp(op_problem, settings_resolved, problem_checking, use_pdlp_solver_mode); + // PDLP precision validations (mirror the checks in run_pdlp; distributed + // path only supports the default-precision, non-batch double config). + cuopt_expects(settings_resolved.pdlp_precision == pdlp_precision_t::DefaultPrecision, + error_type_t::ValidationError, + "Distributed PDLP only supports DefaultPrecision (double)."); + cuopt_expects(!settings_resolved.inside_mip, + error_type_t::ValidationError, + "Distributed PDLP is not yet supported from inside MIP."); + + init_logger_t log(settings_resolved.log_file, settings_resolved.log_to_console); + print_version_info(); + init_handler(handle_ptr); + + const i_t n_vars = static_cast(mps_data_model.get_objective_coefficients().size()); + const i_t n_cstr = static_cast(mps_data_model.get_constraint_lower_bounds().size()); + const i_t nnz = static_cast(mps_data_model.get_constraint_matrix_values().size()); + CUOPT_LOG_INFO("Solving a problem with %d constraints, %d variables (%d integers), and %d " + "nonzeros (distributed mps-direct path)", + n_cstr, + n_vars, + 0, + nnz); + + auto lp_timer = cuopt::timer_t(settings_resolved.time_limit); + + // Shape-0 placeholder: needed to build an empty pdlp_solver + 
cuopt::linear_programming::optimization_problem_t placeholder_op(handle_ptr); + { + std::vector empty_offsets = {0}; + placeholder_op.set_csr_constraint_matrix( + nullptr, 0, nullptr, 0, empty_offsets.data(), static_cast(empty_offsets.size())); + } + detail::problem_t placeholder_problem(placeholder_op); + + detail::pdlp_solver_t solver( + placeholder_problem, mps_data_model, settings_resolved); + + auto sol = solver.run_solver(lp_timer); + + // Maximization post-processing (matches run_pdlp at solve.cu:835-839): + // PDLP internally solves the negated objective, so flip dual / reduced + // cost signs on the gathered solution before returning. + if (mps_data_model.get_sense()) { + adjust_dual_solution_and_reduced_cost( + sol.get_dual_solution(), sol.get_reduced_cost(), handle_ptr->get_stream()); + handle_ptr->sync_stream(); + } + + sol.set_solve_time(lp_timer.elapsed_time()); + CUOPT_LOG_INFO("PDLP finished"); + if (sol.get_termination_status() != pdlp_termination_status_t::ConcurrentLimit) { + CUOPT_LOG_INFO("Status: %s Objective: %.8e Iterations: %d Time: %.3fs", + sol.get_termination_status_string().c_str(), + sol.get_objective_value(), + sol.get_additional_termination_information().number_of_steps_taken, + sol.get_solve_time()); + } + + if (settings_resolved.sol_file != "") { + CUOPT_LOG_INFO("Writing solution to file %s", settings_resolved.sol_file.c_str()); + sol.write_to_sol_file(settings_resolved.sol_file, handle_ptr->get_stream()); + } + + return sol; } // ============================================================================ From e4739b5a16c94d719187e28cd4ea3e32740c8f0b Mon Sep 17 00:00:00 2001 From: Bulle Mostovoi Date: Tue, 2 Jun 2026 15:21:20 +0200 Subject: [PATCH 58/67] removed usage of problem_t for distributed PDLP --- cpp/cuopt_cli.cpp | 7 +- .../distributed_pdlp/multi_gpu_engine.hpp | 499 ++++++++++++++++++ .../initial_scaling.cu | 120 +++-- .../initial_scaling.cuh | 15 +- cpp/src/pdlp/pdlp.cu | 112 +++- cpp/src/pdlp/saddle_point.cu | 7 +- 
.../convergence_information.cu | 71 +++ .../convergence_information.hpp | 5 + 8 files changed, 790 insertions(+), 46 deletions(-) diff --git a/cpp/cuopt_cli.cpp b/cpp/cuopt_cli.cpp index 7c0a9111d9..0ea79bd4ec 100644 --- a/cpp/cuopt_cli.cpp +++ b/cpp/cuopt_cli.cpp @@ -176,7 +176,12 @@ int run_single_file(const std::string& file_path, auto solution = cuopt::linear_programming::solve_mip(problem_interface.get(), mip_settings); } else { auto& lp_settings = settings.get_pdlp_settings(); - auto solution = cuopt::linear_programming::solve_lp(problem_interface.get(), lp_settings); + + if (lp_settings.hyper_params.use_distributed_pdlp) { + cuopt::linear_programming::solve_lp(handle_ptr.get(), mps_data_model, lp_settings); + } else { + cuopt::linear_programming::solve_lp(problem_interface.get(), lp_settings); + } } } catch (const std::exception& e) { fprintf(stderr, "cuopt_cli error: %s\n", e.what()); diff --git a/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.hpp b/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.hpp index 674c4c0ef2..6ab4e35b71 100644 --- a/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.hpp +++ b/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.hpp @@ -12,6 +12,8 @@ #include +#include +#include #include #include @@ -27,6 +29,7 @@ #include #include +#include #include #include @@ -336,6 +339,502 @@ struct multi_gpu_engine_t { for_each_shard([](auto& shard) { shard.sub_pdlp->pdhg_solver_.spmvop_At_y(); }); } + // -------- Distributed Ruiz inf-scaling ----------------------------------- + void alloc_global_var_scratch(i_t n_global_vars, + std::vector>& global_var_buf, + std::vector>& local_to_global_var_d) + { + const int nb = static_cast(shards.size()); + global_var_buf.reserve(nb); + local_to_global_var_d.reserve(nb); + for (int r = 0; r < nb; ++r) { + auto& s = *shards[r]; + raft::device_setter guard(s.device_id); + global_var_buf.emplace_back(static_cast(n_global_vars), s.stream.view()); + 
local_to_global_var_d.emplace_back(static_cast(s.rank_data.total_var_size), + s.stream.view()); + if (s.rank_data.total_var_size > 0) { + RAFT_CUDA_TRY(cudaMemcpyAsync(local_to_global_var_d.back().data(), + s.rank_data.local_to_global_var.data(), + sizeof(i_t) * s.rank_data.local_to_global_var.size(), + cudaMemcpyHostToDevice, + s.stream.view().value())); + } + } + } + + void reduce_iteration_variable_scaling_across_shards( + ncclRedOp_t op, + i_t n_global_vars, + std::vector>& global_var_buf, + std::vector>& local_to_global_var_d) + { + const int nb = static_cast(shards.size()); + + // Zero global buffers, then scatter each shard's local values into their + // global column indices. + for (int r = 0; r < nb; ++r) { + auto& s = *shards[r]; + raft::device_setter guard(s.device_id); + RAFT_CUDA_TRY(cudaMemsetAsync(global_var_buf[r].data(), + 0, + sizeof(f_t) * static_cast(n_global_vars), + s.stream.view().value())); + auto& iter_var_scaling = + s.sub_pdlp->get_initial_scaling_strategy().get_iteration_variable_scaling(); + if (s.rank_data.total_var_size > 0) { + thrust::scatter(rmm::exec_policy_nosync(s.stream.view()), + iter_var_scaling.begin(), + iter_var_scaling.begin() + s.rank_data.total_var_size, + local_to_global_var_d[r].begin(), + global_var_buf[r].begin()); + } + } + + ncclGroupStart(); + for (int r = 0; r < nb; ++r) { + auto& s = *shards[r]; + raft::device_setter guard(s.device_id); + ncclAllReduce(global_var_buf[r].data(), + global_var_buf[r].data(), + static_cast(n_global_vars), + ncclFloat64, + op, + s.comm.get(), + s.stream.view().value()); + } + ncclGroupEnd(); + + // Gather the global per-column value back into each shard's local iter vector. 
+ for (int r = 0; r < nb; ++r) { + auto& s = *shards[r]; + raft::device_setter guard(s.device_id); + auto& iter_var_scaling = + s.sub_pdlp->get_initial_scaling_strategy().get_iteration_variable_scaling(); + if (s.rank_data.total_var_size > 0) { + thrust::gather(rmm::exec_policy_nosync(s.stream.view()), + local_to_global_var_d[r].begin(), + local_to_global_var_d[r].begin() + s.rank_data.total_var_size, + global_var_buf[r].begin(), + iter_var_scaling.begin()); + } + } + } + + void distributed_ruiz_inf_scaling(int num_iter, i_t n_global_vars) + { + if (num_iter <= 0 || n_global_vars <= 0) return; + raft::common::nvtx::range scope("distributed_ruiz_inf_scaling"); + + std::vector> global_var_buf; + std::vector> local_to_global_var_d; + alloc_global_var_scratch(n_global_vars, global_var_buf, local_to_global_var_d); + + for (int it = 0; it < num_iter; ++it) { + // 1) per-shard local kernel: writes iteration_variable_scaling (per-column + // inf-norm partial) and iteration_constraint_matrix_scaling (row, complete). + for_each_shard([](auto& shard) { + shard.sub_pdlp->get_initial_scaling_strategy().ruiz_iter_compute_local_iteration_vectors(); + }); + + // 2) cross-shard column inf-norm reduction (MAX). + reduce_iteration_variable_scaling_across_shards( + ncclMax, n_global_vars, global_var_buf, local_to_global_var_d); + + // 3) per-shard fold into cumulative + reset iter vectors. + for_each_shard([](auto& shard) { + shard.sub_pdlp->get_initial_scaling_strategy().ruiz_iter_apply_cumulative_update(); + }); + } + + // Make sure per-shard cumulative writes are observable on subsequent + // calls (e.g., the next distributed_max_singular_value). + for_each_shard([](auto& shard) { shard.stream.synchronize(); }); + } + + // Distributed Pock-Chambolle: one pass, mirroring single-GPU + // pock_chambolle_scaling but with the per-column sum-of-powers reduced across + // shards (SUM) between the local kernels and the cumulative fold. 
Rows are + // owned exclusively, so the row half stays local. Runs after the distributed + // Ruiz pass, matching the single-GPU order (Ruiz then Pock-Chambolle). + void distributed_pock_chambolle_scaling(f_t alpha, i_t n_global_vars) + { + if (n_global_vars <= 0) return; + raft::common::nvtx::range scope("distributed_pock_chambolle_scaling"); + + std::vector> global_var_buf; + std::vector> local_to_global_var_d; + alloc_global_var_scratch(n_global_vars, global_var_buf, local_to_global_var_d); + + // 1) per-shard local kernels: row sum (complete) + column sum (partial). + for_each_shard([alpha](auto& shard) { + shard.sub_pdlp->get_initial_scaling_strategy().pock_chambolle_compute_local_iteration_vectors( + alpha); + }); + + // 2) cross-shard column sum-of-powers reduction (SUM). + reduce_iteration_variable_scaling_across_shards( + ncclSum, n_global_vars, global_var_buf, local_to_global_var_d); + + // 3) per-shard fold into cumulative (cumulative /= sqrt(iteration)). + for_each_shard([](auto& shard) { + shard.sub_pdlp->get_initial_scaling_strategy().pock_chambolle_apply_cumulative_update(); + }); + + for_each_shard([](auto& shard) { shard.stream.synchronize(); }); + } + + // -------- Distributed σ_max(A) via power iteration ---------------------- + f_t distributed_max_singular_value(i_t n_global_cstrs, + int max_iterations = 5000, + f_t tolerance = 1e-4) + { + raft::common::nvtx::range scope("distributed_max_singular_value"); + + const int nb = static_cast(shards.size()); + + // Generate the GLOBAL z[] sequence in cstr-index order from a fresh + // mt19937(1), once per call. It's m doubles regardless of N (cheap). + // Each shard then keeps only z[global_idx_for_owned_local_i]. + std::vector h_global_z(static_cast(n_global_cstrs)); + { + std::mt19937 gen(1); + std::normal_distribution dist(f_t(0.0), f_t(1.0)); + for (i_t i = 0; i < n_global_cstrs; ++i) { + h_global_z[i] = dist(gen); + } + } + + // Per-shard scratch lives on each shard's device. 
We use total (owned + + // halo) sizes for q/z/atq because they're SpMV inputs that need halo + // space. Norms / dot are scalars. + // We use size-1 rmm::device_uvector instead of rmm::device_scalar for the + // per-shard scratch scalars: nvcc + libcudacxx fail the + // copy_constructible concept check when device_scalar appears in a + // std::vector (the check transitively touches rmm::cuda_stream, which is + // non-copyable). device_uvector avoids that path. + std::vector> q; + std::vector> z; + std::vector> atq; + std::vector> sigma_sq; + std::vector> norm_q; + std::vector> residual_norm; + std::vector z_dn(nb, nullptr); + std::vector atq_dn(nb, nullptr); + q.reserve(nb); + z.reserve(nb); + atq.reserve(nb); + sigma_sq.reserve(nb); + norm_q.reserve(nb); + residual_norm.reserve(nb); + + for (int r = 0; r < nb; ++r) { + auto& s = *shards[r]; + raft::device_setter guard(s.device_id); + const i_t cstr_total = s.rank_data.total_cstr_size; + const i_t var_total = s.rank_data.total_var_size; + q.emplace_back(static_cast(cstr_total), s.stream.view()); + z.emplace_back(static_cast(cstr_total), s.stream.view()); + atq.emplace_back(static_cast(var_total), s.stream.view()); + sigma_sq.emplace_back(std::size_t{1}, s.stream.view()); + norm_q.emplace_back(std::size_t{1}, s.stream.view()); + residual_norm.emplace_back(std::size_t{1}, s.stream.view()); + RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsecreatednvec( + &z_dn[r], static_cast(cstr_total), z.back().data())); + RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsecreatednvec( + &atq_dn[r], static_cast(var_total), atq.back().data())); + + std::vector h_owned_z(static_cast(s.rank_data.owned_cstr_size)); + for (i_t i = 0; i < s.rank_data.owned_cstr_size; ++i) { + const i_t g = s.rank_data.local_to_global_cstr[i]; + h_owned_z[i] = h_global_z[g]; + } + if (s.rank_data.owned_cstr_size > 0) { + RAFT_CUDA_TRY( + cudaMemcpyAsync(z.back().data(), + h_owned_z.data(), + sizeof(f_t) * static_cast(s.rank_data.owned_cstr_size), + 
cudaMemcpyHostToDevice, + s.stream.view().value())); + } + if (cstr_total > s.rank_data.owned_cstr_size) { + RAFT_CUDA_TRY(cudaMemsetAsync( + z.back().data() + s.rank_data.owned_cstr_size, + 0, + sizeof(f_t) * static_cast(cstr_total - s.rank_data.owned_cstr_size), + s.stream.view().value())); + } + // Sync to ensure h_owned_z stays valid through the H2D copy (it goes + // out of scope at end of this iteration of the per-shard loop). + s.stream.synchronize(); + } + + // Local halo-exchange helpers that work directly on per-shard external + // buffers (the engine's halo_exchange_var/cstr expect accessors that + // resolve through pdhg_solver_t, which doesn't see our scratch). + auto halo_exchange_cstr_bufs = [&](std::vector>& bufs) { + for (int r = 0; r < nb; ++r) { + auto& s = *shards[r]; + raft::device_setter guard(s.device_id); + auto& y = bufs[r]; + for (int peer = 0; peer < nb; ++peer) { + if (peer == r) continue; + if (s.cstr_send_indices_d[peer].size() == 0) continue; + thrust::gather(rmm::exec_policy_nosync(s.stream.view()), + s.cstr_send_indices_d[peer].begin(), + s.cstr_send_indices_d[peer].end(), + y.begin(), + s.cstr_send_buf_d[peer].begin()); + } + } + ncclGroupStart(); + for (int r = 0; r < nb; ++r) { + auto& s = *shards[r]; + raft::device_setter guard(s.device_id); + for (int peer = 0; peer < nb; ++peer) { + if (peer == r) continue; + ncclSend(s.cstr_send_buf_d[peer].data(), + s.cstr_send_buf_d[peer].size(), + ncclFloat64, + peer, + s.comm.get(), + s.stream.view().value()); + } + } + for (int r = 0; r < nb; ++r) { + auto& s = *shards[r]; + auto& rd = s.rank_data; + raft::device_setter guard(s.device_id); + auto& y = bufs[r]; + for (int peer = 0; peer < nb; ++peer) { + if (peer == r) continue; + f_t* recv_ptr = y.data() + rd.owned_cstr_size + rd.cstr_recv_offsets[peer]; + ncclRecv(recv_ptr, + static_cast(rd.cstr_recv_counts[peer]), + ncclFloat64, + peer, + s.comm.get(), + s.stream.view().value()); + } + } + ncclGroupEnd(); + }; + auto 
halo_exchange_var_bufs = [&](std::vector>& bufs) { + for (int r = 0; r < nb; ++r) { + auto& s = *shards[r]; + raft::device_setter guard(s.device_id); + auto& x = bufs[r]; + for (int peer = 0; peer < nb; ++peer) { + if (peer == r) continue; + if (s.var_send_indices_d[peer].size() == 0) continue; + thrust::gather(rmm::exec_policy_nosync(s.stream.view()), + s.var_send_indices_d[peer].begin(), + s.var_send_indices_d[peer].end(), + x.begin(), + s.var_send_buf_d[peer].begin()); + } + } + ncclGroupStart(); + for (int r = 0; r < nb; ++r) { + auto& s = *shards[r]; + raft::device_setter guard(s.device_id); + for (int peer = 0; peer < nb; ++peer) { + if (peer == r) continue; + ncclSend(s.var_send_buf_d[peer].data(), + s.var_send_buf_d[peer].size(), + ncclFloat64, + peer, + s.comm.get(), + s.stream.view().value()); + } + } + for (int r = 0; r < nb; ++r) { + auto& s = *shards[r]; + auto& rd = s.rank_data; + raft::device_setter guard(s.device_id); + auto& x = bufs[r]; + for (int peer = 0; peer < nb; ++peer) { + if (peer == r) continue; + f_t* recv_ptr = x.data() + rd.owned_var_size + rd.var_recv_offsets[peer]; + ncclRecv(recv_ptr, + static_cast(rd.var_recv_counts[peer]), + ncclFloat64, + peer, + s.comm.get(), + s.stream.view().value()); + } + } + ncclGroupEnd(); + }; + + // Per-shard partial reductions over the OWNED cstr slice + NCCL allreduce. + // For norm: out := sqrt(Σ_r ||bufs[r][0:owned_cstr]||²). + // For dot : out := Σ_r a[r][0:owned_cstr] · b[r][0:owned_cstr]. 
+ auto distributed_norm_owned_cstr = [&](std::vector>& bufs, + std::vector>& out) { + for (int r = 0; r < nb; ++r) { + auto& s = *shards[r]; + raft::device_setter guard(s.device_id); + const i_t n_owned = s.rank_data.owned_cstr_size; + RAFT_CUBLAS_TRY(raft::linalg::detail::cublasdot(s.handle.get_cublas_handle(), + static_cast(n_owned), + bufs[r].data(), + 1, + bufs[r].data(), + 1, + out[r].data(), + s.stream.view().value())); + } + ncclGroupStart(); + for (int r = 0; r < nb; ++r) { + auto& s = *shards[r]; + raft::device_setter guard(s.device_id); + ncclAllReduce(out[r].data(), + out[r].data(), + 1, + ncclFloat64, + ncclSum, + s.comm.get(), + s.stream.view().value()); + } + ncclGroupEnd(); + for (int r = 0; r < nb; ++r) { + auto& s = *shards[r]; + raft::device_setter guard(s.device_id); + cub::DeviceTransform::Transform( + out[r].data(), out[r].data(), 1, sqrt_inplace_op_t{}, s.stream.view().value()); + } + }; + auto distributed_dot_owned_cstr = [&](std::vector>& a, + std::vector>& b, + std::vector>& out) { + for (int r = 0; r < nb; ++r) { + auto& s = *shards[r]; + raft::device_setter guard(s.device_id); + const i_t n_owned = s.rank_data.owned_cstr_size; + RAFT_CUBLAS_TRY(raft::linalg::detail::cublasdot(s.handle.get_cublas_handle(), + static_cast(n_owned), + a[r].data(), + 1, + b[r].data(), + 1, + out[r].data(), + s.stream.view().value())); + } + ncclGroupStart(); + for (int r = 0; r < nb; ++r) { + auto& s = *shards[r]; + raft::device_setter guard(s.device_id); + ncclAllReduce(out[r].data(), + out[r].data(), + 1, + ncclFloat64, + ncclSum, + s.comm.get(), + s.stream.view().value()); + } + ncclGroupEnd(); + }; + + // ===== Power iteration ===== + // Mirrors single-GPU compute_initial_step_size: z is the carried iterate + // (A Aᵀ q each step); at the top of each iteration q := z then q is + // normalized; the residual z − σ²q is written back into q only to drive + // the convergence check (next iteration's q := z discards it). 
+ for (int it = 0; it < max_iterations; ++it) { + // q := z on the owned slice (the carried iterate), then normalize. + for (int r = 0; r < nb; ++r) { + auto& s = *shards[r]; + raft::device_setter guard(s.device_id); + const i_t n_owned = s.rank_data.owned_cstr_size; + raft::copy(q[r].data(), z[r].data(), n_owned, s.stream.view()); + } + + // ||q||₂ over the global OWNED cstr slice (one allreduce-sum + sqrt). + distributed_norm_owned_cstr(q, norm_q); + + // q /= ||q||₂ on owned slice (halo gets refreshed by next exchange). + for (int r = 0; r < nb; ++r) { + auto& s = *shards[r]; + raft::device_setter guard(s.device_id); + const i_t n_owned = s.rank_data.owned_cstr_size; + cub::DeviceTransform::Transform( + q[r].data(), + q[r].data(), + n_owned, + [n = norm_q[r].data()] __device__(f_t v) { return v / *n; }, + s.stream.view().value()); + } + + // atq = A^T q : halo-exchange q, then per-shard SpMV. spmv_At_into + // rebinds the dual_solution dnvec to q[r].data() and restores the + // canonical binding after the call (see pdhg.cu:643-644). + halo_exchange_cstr_bufs(q); + for (int r = 0; r < nb; ++r) { + auto& s = *shards[r]; + raft::device_setter guard(s.device_id); + s.sub_pdlp->pdhg_solver_.spmv_At_into(q[r], atq_dn[r]); + } + + // z = A atq : halo-exchange atq, then per-shard SpMV. + halo_exchange_var_bufs(atq); + for (int r = 0; r < nb; ++r) { + auto& s = *shards[r]; + raft::device_setter guard(s.device_id); + s.sub_pdlp->pdhg_solver_.spmv_A_into(atq[r], z_dn[r]); + } + + // σ² = q · z over the global OWNED cstr slice (= q^T A A^T q = σ_max² + // when q is the dominant left-singular vector). + distributed_dot_owned_cstr(q, z, sigma_sq); + + // q := -σ² q + z (owned slice) — residual of the eigen-equation. 
+ for (int r = 0; r < nb; ++r) { + auto& s = *shards[r]; + raft::device_setter guard(s.device_id); + const i_t n_owned = s.rank_data.owned_cstr_size; + cub::DeviceTransform::Transform( + cuda::std::make_tuple(q[r].data(), z[r].data()), + q[r].data(), + n_owned, + [s2 = sigma_sq[r].data()] __device__(f_t qv, f_t zv) { return -(*s2) * qv + zv; }, + s.stream.view().value()); + } + + // Convergence check via global residual norm. + distributed_norm_owned_cstr(q, residual_norm); + auto& s0 = *shards[0]; + raft::device_setter guard0(s0.device_id); + f_t h_res{}; + RAFT_CUDA_TRY(cudaMemcpyAsync(&h_res, + residual_norm[0].data(), + sizeof(f_t), + cudaMemcpyDeviceToHost, + s0.stream.view().value())); + s0.stream.synchronize(); + if (h_res < tolerance) break; + } + + // σ_max² is the same on every shard after the last allreduce. + auto& s0 = *shards[0]; + raft::device_setter guard0(s0.device_id); + f_t sigma_sq_h{}; + RAFT_CUDA_TRY(cudaMemcpyAsync(&sigma_sq_h, + sigma_sq[0].data(), + sizeof(f_t), + cudaMemcpyDeviceToHost, + s0.stream.view().value())); + s0.stream.synchronize(); + + for (int r = 0; r < nb; ++r) { + raft::device_setter guard(shards[r]->device_id); + RAFT_CUSPARSE_TRY(cusparseDestroyDnVec(z_dn[r])); + RAFT_CUSPARSE_TRY(cusparseDestroyDnVec(atq_dn[r])); + } + + return std::sqrt(std::max(sigma_sq_h, f_t(0))); + } + // -------- Solution gather (shards -> master) ---------------------------- // Assembles the global potential_next primal/dual solutions and the // reduced_cost on the master from the owned slices distributed across diff --git a/cpp/src/pdlp/initial_scaling_strategy/initial_scaling.cu b/cpp/src/pdlp/initial_scaling_strategy/initial_scaling.cu index 478753e9d9..dcc3e662b0 100644 --- a/cpp/src/pdlp/initial_scaling_strategy/initial_scaling.cu +++ b/cpp/src/pdlp/initial_scaling_strategy/initial_scaling.cu @@ -142,6 +142,10 @@ void pdlp_initial_scaling_strategy_t::compute_scaling_vectors( { raft::common::nvtx::range fun_scope("compute_scaling_vectors"); + 
// Skip scaling entirely for a shape-0 problem (distributed PDLP builds the + // master pdlp_solver_t from a shape-0 placeholder) + if (primal_size_h_ == 0 || dual_size_h_ == 0) return; + if (hyper_params_.do_ruiz_scaling) { ruiz_inf_scaling(number_of_ruiz_iterations); } if (hyper_params_.do_pock_chambolle_scaling) { pock_chambolle_scaling(alpha); } } @@ -213,6 +217,72 @@ __global__ void inf_norm_row_and_col_kernel( } } +template +void pdlp_initial_scaling_strategy_t::ruiz_iter_compute_local_iteration_vectors() +{ + // find inf norm over rows and columns of the scaled matrix in given iteration + i_t number_of_blocks = op_problem_scaled_.n_constraints / block_size; + if (op_problem_scaled_.n_constraints % block_size) number_of_blocks++; + i_t number_of_threads = std::min(op_problem_scaled_.n_variables, (i_t)block_size); + inf_norm_row_and_col_kernel<<>>( + op_problem_scaled_.view(), this->view()); + RAFT_CUDA_TRY(cudaPeekAtLastError()); + + if (running_mip_) { reset_integer_variables(); } +} + +template +void pdlp_initial_scaling_strategy_t::ruiz_iter_apply_cumulative_update() +{ + raft::linalg::binaryOp(cummulative_constraint_matrix_scaling_.data(), + cummulative_constraint_matrix_scaling_.data(), + iteration_constraint_matrix_scaling_.data(), + dual_size_h_, + a_divides_sqrt_b_bounded(), + stream_view_); + + raft::linalg::binaryOp(cummulative_variable_scaling_.data(), + cummulative_variable_scaling_.data(), + iteration_variable_scaling_.data(), + primal_size_h_, + a_divides_sqrt_b_bounded(), + stream_view_); + + // Reset the iteration_scaling vectors to all 0 + RAFT_CUDA_TRY(cudaMemsetAsync( + iteration_constraint_matrix_scaling_.data(), 0.0, sizeof(f_t) * dual_size_h_, stream_view_)); + RAFT_CUDA_TRY(cudaMemsetAsync( + iteration_variable_scaling_.data(), 0.0, sizeof(f_t) * primal_size_h_, stream_view_)); +} + +template +void pdlp_initial_scaling_strategy_t::reset_scaling_state_for_distributed() +{ + if (primal_size_h_ == 0 || dual_size_h_ == 0) return; + + // 
Re-allocate the iteration vectors the ctor shrank to 0 and zero them. + iteration_constraint_matrix_scaling_.resize(static_cast(dual_size_h_), stream_view_); + iteration_variable_scaling_.resize(static_cast(primal_size_h_), stream_view_); + RAFT_CUDA_TRY(cudaMemsetAsync( + iteration_constraint_matrix_scaling_.data(), 0, sizeof(f_t) * dual_size_h_, stream_view_)); + RAFT_CUDA_TRY(cudaMemsetAsync( + iteration_variable_scaling_.data(), 0, sizeof(f_t) * primal_size_h_, stream_view_)); + + // Reset cumulative scaling + rescaling to identity (the ctor's stray + // Pock-Chambolle pass and shard.cu's set_cummulative_scaling left these in + // an arbitrary state; distributed scaling recomputes from a clean slate). + thrust::fill(handle_ptr_->get_thrust_policy(), + cummulative_constraint_matrix_scaling_.begin(), + cummulative_constraint_matrix_scaling_.end(), + f_t(1)); + thrust::fill(handle_ptr_->get_thrust_policy(), + cummulative_variable_scaling_.begin(), + cummulative_variable_scaling_.end(), + f_t(1)); + set_h_bound_rescaling(f_t(1)); + set_h_objective_rescaling(f_t(1)); +} + template void pdlp_initial_scaling_strategy_t::ruiz_inf_scaling(i_t number_of_ruiz_iterations) { @@ -221,36 +291,8 @@ void pdlp_initial_scaling_strategy_t::ruiz_inf_scaling(i_t number_of_r std::cout << "Doing ruiz_inf_scaling" << std::endl; #endif for (int i = 0; i < number_of_ruiz_iterations; i++) { - // find inf norm over rows and columns of the scaled matrix in given iteration (matrix is not - // actually updated, but the scaled value is computed and evaluated) - i_t number_of_blocks = op_problem_scaled_.n_constraints / block_size; - if (op_problem_scaled_.n_constraints % block_size) number_of_blocks++; - i_t number_of_threads = std::min(op_problem_scaled_.n_variables, (i_t)block_size); - inf_norm_row_and_col_kernel<<>>( - op_problem_scaled_.view(), this->view()); - RAFT_CUDA_TRY(cudaPeekAtLastError()); - - if (running_mip_) { reset_integer_variables(); } - - 
raft::linalg::binaryOp(cummulative_constraint_matrix_scaling_.data(), - cummulative_constraint_matrix_scaling_.data(), - iteration_constraint_matrix_scaling_.data(), - dual_size_h_, - a_divides_sqrt_b_bounded(), - stream_view_); - - raft::linalg::binaryOp(cummulative_variable_scaling_.data(), - cummulative_variable_scaling_.data(), - iteration_variable_scaling_.data(), - primal_size_h_, - a_divides_sqrt_b_bounded(), - stream_view_); - - // Reset the iteration_scaling vectors to all 0 - RAFT_CUDA_TRY(cudaMemsetAsync( - iteration_constraint_matrix_scaling_.data(), 0.0, sizeof(f_t) * dual_size_h_, stream_view_)); - RAFT_CUDA_TRY(cudaMemsetAsync( - iteration_variable_scaling_.data(), 0.0, sizeof(f_t) * primal_size_h_, stream_view_)); + ruiz_iter_compute_local_iteration_vectors(); + ruiz_iter_apply_cumulative_update(); } } @@ -343,8 +385,12 @@ __global__ void pock_chambolle_scaling_kernel_col( if (threadIdx.x == 0) initial_scaling_view.iteration_variable_scaling[col] = accumulated_value; } +// Local half of one Pock-Chambolle pass: writes the per-row and per-column +// sums-of-powers into iteration_constraint_matrix_scaling_ / +// iteration_variable_scaling_ template -void pdlp_initial_scaling_strategy_t::pock_chambolle_scaling(f_t alpha) +void pdlp_initial_scaling_strategy_t::pock_chambolle_compute_local_iteration_vectors( + f_t alpha) { // Reset the iteration_scaling vectors to all 0 RAFT_CUDA_TRY(cudaMemsetAsync( @@ -379,7 +425,12 @@ void pdlp_initial_scaling_strategy_t::pock_chambolle_scaling(f_t alpha A_T_offsets_.data(), A_T_indices_.data()); RAFT_CUDA_TRY(cudaPeekAtLastError()); +} +// Fold half of one Pock-Chambolle pass: cumulative /= sqrt(iteration). 
+template +void pdlp_initial_scaling_strategy_t::pock_chambolle_apply_cumulative_update() +{ if (running_mip_) { reset_integer_variables(); } // divide the sqrt of the vectors of the sums from above to the respective scaling vectors @@ -398,6 +449,13 @@ void pdlp_initial_scaling_strategy_t::pock_chambolle_scaling(f_t alpha stream_view_); } +template +void pdlp_initial_scaling_strategy_t::pock_chambolle_scaling(f_t alpha) +{ + pock_chambolle_compute_local_iteration_vectors(alpha); + pock_chambolle_apply_cumulative_update(); +} + template __global__ void scale_problem_kernel( const typename pdlp_initial_scaling_strategy_t::view_t initial_scaling_view, diff --git a/cpp/src/pdlp/initial_scaling_strategy/initial_scaling.cuh b/cpp/src/pdlp/initial_scaling_strategy/initial_scaling.cuh index dbdb604082..148ccce238 100644 --- a/cpp/src/pdlp/initial_scaling_strategy/initial_scaling.cuh +++ b/cpp/src/pdlp/initial_scaling_strategy/initial_scaling.cuh @@ -94,6 +94,20 @@ class pdlp_initial_scaling_strategy_t { void bound_objective_rescaling(); + // Public for distributed PDLP + void compute_scaling_vectors(i_t number_of_ruiz_iterations, f_t alpha); + + // ----- Distributed-PDLP hooks ----- + + void ruiz_iter_compute_local_iteration_vectors(); + void ruiz_iter_apply_cumulative_update(); + void pock_chambolle_compute_local_iteration_vectors(f_t alpha); + void pock_chambolle_apply_cumulative_update(); + rmm::device_uvector& get_iteration_variable_scaling() { return iteration_variable_scaling_; } + + // Restore the clean pre-scaling state for the distributed path. 
+ void reset_scaling_state_for_distributed(); + /** * @brief Gets the device-side view (with raw pointers), for ease of access * inside cuda kernels @@ -101,7 +115,6 @@ class pdlp_initial_scaling_strategy_t { view_t view(); private: - void compute_scaling_vectors(i_t number_of_ruiz_iterations, f_t alpha); void ruiz_inf_scaling(i_t number_of_ruiz_iterations); void pock_chambolle_scaling(f_t alpha); void reset_integer_variables(); diff --git a/cpp/src/pdlp/pdlp.cu b/cpp/src/pdlp/pdlp.cu index 21291b853d..013905b4fb 100644 --- a/cpp/src/pdlp/pdlp.cu +++ b/cpp/src/pdlp/pdlp.cu @@ -577,24 +577,71 @@ pdlp_solver_t::pdlp_solver_t( objective_scaling_factor, sub_pdlp_settings); - // ----- 8. Seed shard step-size / primal-weight scalars from the master ----- - f_t h_step_size{}, h_primal_weight{}, h_best_primal_weight{}; - f_t h_primal_step_size{}, h_dual_step_size{}; - raft::copy(&h_step_size, step_size_.data(), 1, stream_view_); - raft::copy(&h_primal_weight, primal_weight_.data(), 1, stream_view_); - raft::copy(&h_best_primal_weight, best_primal_weight_.data(), 1, stream_view_); - raft::copy(&h_primal_step_size, primal_step_size_.data(), 1, stream_view_); - raft::copy(&h_dual_step_size, dual_step_size_.data(), 1, stream_view_); + // ----- 8 Distributed Scaling ----- + for (auto& shard : multi_gpu_engine->shards) { + raft::device_setter guard(shard->device_id); + shard->sub_pdlp->get_initial_scaling_strategy().reset_scaling_state_for_distributed(); + } + for (auto& shard : multi_gpu_engine->shards) { + raft::device_setter guard(shard->device_id); + shard->stream.synchronize(); + } + + // Distributed scaling + if (settings_.hyper_params.do_ruiz_scaling) { + multi_gpu_engine->distributed_ruiz_inf_scaling( + settings_.hyper_params.default_l_inf_ruiz_iterations, n_vars); + } + if (settings_.hyper_params.do_pock_chambolle_scaling) { + multi_gpu_engine->distributed_pock_chambolle_scaling( + static_cast(settings_.hyper_params.default_alpha_pock_chambolle_rescaling), n_vars); + } 
+ + for (auto& shard : multi_gpu_engine->shards) { + raft::device_setter guard(shard->device_id); + auto& scaling = shard->sub_pdlp->get_initial_scaling_strategy(); + scaling.scale_problem(); + + shard->sub_pdlp->pdhg_solver_.get_cusparse_view().create_spmv_op_plans( + /*is_reflected=*/settings_.hyper_params.use_reflected_primal_dual); + } + for (auto& shard : multi_gpu_engine->shards) { + raft::device_setter guard(shard->device_id); + shard->stream.synchronize(); + } + + // ----- 8b. Seed initial step-size / primal-weight (distributed, scales to N shards) ----- + constexpr f_t kStepSizeScale = f_t{0.998}; + const f_t sigma_max = multi_gpu_engine->distributed_max_singular_value(n_cstr); + const f_t h_primal_weight = f_t{1}; + const f_t h_step_size = (sigma_max > f_t{0}) ? kStepSizeScale / sigma_max : f_t{1}; + // With primal_weight = 1 the adaptive step-size strategy collapses to + // primal_step_size = step_size / primal_weight = step_size + // dual_step_size = step_size * primal_weight = step_size. 
+ const f_t h_primal_step_size = h_step_size; + const f_t h_dual_step_size = h_step_size; + + // Put the values on master + raft::copy(step_size_.data(), &h_step_size, 1, stream_view_); + raft::copy(primal_weight_.data(), &h_primal_weight, 1, stream_view_); + raft::copy(best_primal_weight_.data(), &h_primal_weight, 1, stream_view_); + raft::copy(primal_step_size_.data(), &h_primal_step_size, 1, stream_view_); + raft::copy(dual_step_size_.data(), &h_dual_step_size, 1, stream_view_); handle_ptr_->sync_stream(stream_view_); + // put the values on each shard for (auto& shard : multi_gpu_engine->shards) { raft::device_setter guard(shard->device_id); auto& sub = *shard->sub_pdlp; raft::copy(sub.step_size_.data(), &h_step_size, 1, shard->stream); raft::copy(sub.primal_weight_.data(), &h_primal_weight, 1, shard->stream); - raft::copy(sub.best_primal_weight_.data(), &h_best_primal_weight, 1, shard->stream); - raft::copy(sub.primal_step_size_.data(), &h_primal_step_size, 1, shard->stream); - raft::copy(sub.dual_step_size_.data(), &h_dual_step_size, 1, shard->stream); + raft::copy(sub.best_primal_weight_.data(), &h_primal_weight, 1, shard->stream); + raft::copy(sub.get_primal_step_size().data(), &h_primal_step_size, 1, shard->stream); + raft::copy(sub.get_dual_step_size().data(), &h_dual_step_size, 1, shard->stream); + } + for (auto& shard : multi_gpu_engine->shards) { + raft::device_setter guard(shard->device_id); + shard->stream.synchronize(); } // Wire the engine into master's pdhg_solver_; shards keep mgpu_engine_ == nullptr. 
@@ -607,6 +654,49 @@ pdlp_solver_t::pdlp_solver_t( n_vars, stream_view_); primal_size_h_ = n_vars; dual_size_h_ = n_cstr; + + // Distributed convergence_information::init_l2_norms + for (auto& shard : multi_gpu_engine->shards) { + raft::device_setter guard(shard->device_id); + shard->sub_pdlp->get_current_termination_strategy() + .get_convergence_information() + .compute_owned_reference_norm_partials(shard->rank_data.owned_var_size, + shard->rank_data.owned_cstr_size); + } + multi_gpu_engine->allreduce_sum_inplace([](pdlp_solver_t& sp) -> f_t* { + return sp.get_current_termination_strategy() + .get_convergence_information() + .l2_norm_primal_right_hand_side_data(); + }); + multi_gpu_engine->allreduce_sum_inplace([](pdlp_solver_t& sp) -> f_t* { + return sp.get_current_termination_strategy() + .get_convergence_information() + .l2_norm_primal_linear_objective_data(); + }); + for (auto& shard : multi_gpu_engine->shards) { + raft::device_setter guard(shard->device_id); + shard->sub_pdlp->get_current_termination_strategy() + .get_convergence_information() + .sqrt_reference_norms_inplace(); + shard->stream.synchronize(); + } + // Broadcast the values to the master + { + auto& s0 = *multi_gpu_engine->shards[0]; + auto& s0_conv = s0.sub_pdlp->get_current_termination_strategy().get_convergence_information(); + raft::device_setter guard(s0.device_id); + for (auto* ts : {&current_termination_strategy_, &average_termination_strategy_}) { + auto& ci = ts->get_convergence_information(); + raft::copy(ci.l2_norm_primal_right_hand_side_data(), + s0_conv.l2_norm_primal_right_hand_side_data(), + 1, + stream_view_); + raft::copy(ci.l2_norm_primal_linear_objective_data(), + s0_conv.l2_norm_primal_linear_objective_data(), + 1, + stream_view_); + } + } handle_ptr_->sync_stream(stream_view_); } diff --git a/cpp/src/pdlp/saddle_point.cu b/cpp/src/pdlp/saddle_point.cu index f740176a3c..07a5d0146e 100644 --- a/cpp/src/pdlp/saddle_point.cu +++ b/cpp/src/pdlp/saddle_point.cu @@ -38,8 +38,11 @@ 
saddle_point_state_t::saddle_point_state_t( current_AtY_{batch_size * primal_size, handle_ptr->get_stream()}, next_AtY_{batch_size * primal_size, handle_ptr->get_stream()} { - EXE_CUOPT_EXPECTS(primal_size > 0, "Size of the primal problem must be larger than 0"); - EXE_CUOPT_EXPECTS(dual_size > 0, "Size of the dual problem must be larger than 0"); + // >= 0 (not > 0): distributed PDLP builds the master pdlp_solver_t from a + // shape-0 placeholder problem so the master never materializes per-variable + // / per-constraint vectors; size-0 device_uvectors are valid throughout. + EXE_CUOPT_EXPECTS(primal_size >= 0, "Size of the primal problem must be non-negative"); + EXE_CUOPT_EXPECTS(dual_size >= 0, "Size of the dual problem must be non-negative"); // Starting from all 0 thrust::fill( diff --git a/cpp/src/pdlp/termination_strategy/convergence_information.cu b/cpp/src/pdlp/termination_strategy/convergence_information.cu index da2340146a..44ddd5b2a1 100644 --- a/cpp/src/pdlp/termination_strategy/convergence_information.cu +++ b/cpp/src/pdlp/termination_strategy/convergence_information.cu @@ -212,6 +212,77 @@ void convergence_information_t::init_l2_norms() } } +template +void convergence_information_t::compute_owned_reference_norm_partials( + i_t owned_var_size, i_t owned_cstr_size) +{ + cuopt_assert(!batch_mode_, "owned reference-norm partials only used in non-batch mGPU mode"); + cuopt_assert(owned_var_size <= primal_size_h_, "owned_var_size must be <= primal_size_h_"); + cuopt_assert(owned_cstr_size <= dual_size_h_, "owned_cstr_size must be <= dual_size_h_"); + + // Σ objective[0:owned_var]² + RAFT_CUBLAS_TRY(raft::linalg::detail::cublasdot(handle_ptr_->get_cublas_handle(), + static_cast(owned_var_size), + problem_ptr->objective_coefficients.data(), + 1, + problem_ptr->objective_coefficients.data(), + 1, + l2_norm_primal_linear_objective_.data(), + stream_view_)); + + // rhs_sum_of_squares(lower[0:owned_cstr], upper[0:owned_cstr]) (no sqrt) + { + rmm::device_buffer 
d_temp_storage; + size_t bytes = 0; + auto zip_begin = thrust::make_zip_iterator(problem_ptr->constraint_lower_bounds.data(), + problem_ptr->constraint_upper_bounds.data()); + cub::DeviceReduce::TransformReduce(nullptr, + bytes, + zip_begin, + l2_norm_primal_right_hand_side_.data(), + static_cast(owned_cstr_size), + cuda::std::plus<>{}, + rhs_sum_of_squares_t{}, + f_t(0), + stream_view_); + d_temp_storage.resize(bytes, stream_view_); + cub::DeviceReduce::TransformReduce(d_temp_storage.data(), + bytes, + zip_begin, + l2_norm_primal_right_hand_side_.data(), + static_cast(owned_cstr_size), + cuda::std::plus<>{}, + rhs_sum_of_squares_t{}, + f_t(0), + stream_view_); + } + RAFT_CUDA_TRY(cudaStreamSynchronize(stream_view_)); +} + +template +void convergence_information_t::sqrt_reference_norms_inplace() +{ + cub::DeviceTransform::Transform(l2_norm_primal_linear_objective_.data(), + l2_norm_primal_linear_objective_.data(), + 1, + sqrt_func_t{}, + stream_view_); + cub::DeviceTransform::Transform(l2_norm_primal_right_hand_side_.data(), + l2_norm_primal_right_hand_side_.data(), + 1, + sqrt_func_t{}, + stream_view_); + // Broadcast slot [0] to all climbers (no-op outside batch mode). + thrust::fill(handle_ptr_->get_thrust_policy(), + l2_norm_primal_linear_objective_.begin(), + l2_norm_primal_linear_objective_.end(), + l2_norm_primal_linear_objective_.element(0, stream_view_)); + thrust::fill(handle_ptr_->get_thrust_policy(), + l2_norm_primal_right_hand_side_.begin(), + l2_norm_primal_right_hand_side_.end(), + l2_norm_primal_right_hand_side_.element(0, stream_view_)); +} + // --------------------------------------------------------------------------- // init_reduction_storage: allocate and size the temporary buffers used by // cub::DeviceReduce and cub::DeviceSegmentedReduce throughout solving. 
diff --git a/cpp/src/pdlp/termination_strategy/convergence_information.hpp b/cpp/src/pdlp/termination_strategy/convergence_information.hpp index 6325622a2b..7ff45e46f0 100644 --- a/cpp/src/pdlp/termination_strategy/convergence_information.hpp +++ b/cpp/src/pdlp/termination_strategy/convergence_information.hpp @@ -69,6 +69,11 @@ class convergence_information_t { const rmm::device_uvector& get_l2_norm_primal_linear_objective() const; const rmm::device_uvector& get_l2_norm_primal_right_hand_side() const; + void compute_owned_reference_norm_partials(i_t owned_var_size, i_t owned_cstr_size); + void sqrt_reference_norms_inplace(); + f_t* l2_norm_primal_right_hand_side_data() { return l2_norm_primal_right_hand_side_.data(); } + f_t* l2_norm_primal_linear_objective_data() { return l2_norm_primal_linear_objective_.data(); } + struct view_t { i_t primal_size; i_t dual_size; From 1903f4bfea48d25ba4042bb7d2a02e0a41267718 Mon Sep 17 00:00:00 2001 From: Bulle Mostovoi Date: Tue, 2 Jun 2026 15:39:56 +0200 Subject: [PATCH 59/67] added a cuopt assert for solve_lp in mgpu mode --- cpp/src/pdlp/solve.cu | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/cpp/src/pdlp/solve.cu b/cpp/src/pdlp/solve.cu index ef273faf13..feaeb7bd57 100644 --- a/cpp/src/pdlp/solve.cu +++ b/cpp/src/pdlp/solve.cu @@ -2126,13 +2126,11 @@ optimization_problem_solution_t solve_lp( bool problem_checking, bool use_pdlp_solver_mode) { - // In distributed PDLP we can't allocate the full problem on the master device - if (settings.hyper_params.use_distributed_pdlp) { + cuopt_expects(settings.hyper_params.use_distributed_pdlp, + error_type_t::ValidationError, + "solve_lp from mps_data_model: settings.hyper_params.use_distributed_pdlp must be true"); return solve_lp_distributed_from_mps( handle_ptr, mps_data_model, settings, problem_checking, use_pdlp_solver_mode); - } - auto op_problem = mps_data_model_to_optimization_problem(handle_ptr, mps_data_model); - return solve_lp(op_problem, settings, 
problem_checking, use_pdlp_solver_mode); } template From 0aacb4f702fe0a413623f07522dac6745f484692 Mon Sep 17 00:00:00 2001 From: Bulle Mostovoi Date: Tue, 2 Jun 2026 15:42:45 +0200 Subject: [PATCH 60/67] style --- cpp/cuopt_cli.cpp | 9 +- .../cuopt/linear_programming/constants.h | 22 +- .../pdlp/pdlp_hyper_params.cuh | 14 +- .../distributed_pdlp/metis_partitioner.cu | 14 +- .../distributed_pdlp/multi_gpu_engine.hpp | 18 +- .../pdlp/distributed_pdlp/partition_loader.cu | 6 +- cpp/src/pdlp/distributed_pdlp/partitioner.cu | 25 +- cpp/src/pdlp/distributed_pdlp/partitioner.hpp | 2 +- cpp/src/pdlp/pdhg.cu | 16 +- cpp/src/pdlp/pdlp.cu | 586 +++++++++--------- .../restart_strategy/pdlp_restart_strategy.cu | 17 +- cpp/src/pdlp/solve.cu | 27 +- cpp/src/pdlp/solve.cuh | 3 +- .../adaptive_step_size_strategy.cu | 10 +- .../convergence_information.cu | 101 ++- cpp/src/pdlp/utilities/mgpu_trace.cuh | 24 +- 16 files changed, 429 insertions(+), 465 deletions(-) diff --git a/cpp/cuopt_cli.cpp b/cpp/cuopt_cli.cpp index 0ea79bd4ec..b730067a28 100644 --- a/cpp/cuopt_cli.cpp +++ b/cpp/cuopt_cli.cpp @@ -436,14 +436,13 @@ int main(int argc, char* argv[]) // For distributed PDLP, -1 means "auto-detect": resolve to the visible device // count so the RMM memory pools match what solve.cu will eventually dispatch. const bool use_distributed_pdlp = settings.get_parameter(CUOPT_USE_DISTRIBUTED_PDLP); - int requested_gpus = - use_distributed_pdlp ? settings.get_parameter(CUOPT_DISTRIBUTED_PDLP_NUM_GPUS) - : settings.get_parameter(CUOPT_NUM_GPUS); + int requested_gpus = use_distributed_pdlp + ? 
settings.get_parameter(CUOPT_DISTRIBUTED_PDLP_NUM_GPUS) + : settings.get_parameter(CUOPT_NUM_GPUS); if (use_distributed_pdlp && requested_gpus == -1) { requested_gpus = raft::device_setter::get_device_count(); } - const int provisioned_gpus = - std::min(raft::device_setter::get_device_count(), requested_gpus); + const int provisioned_gpus = std::min(raft::device_setter::get_device_count(), requested_gpus); memory_resources.reserve(provisioned_gpus); for (int i = 0; i < provisioned_gpus; ++i) { diff --git a/cpp/include/cuopt/linear_programming/constants.h b/cpp/include/cuopt/linear_programming/constants.h index e695bb21d3..e2cc264cdc 100644 --- a/cpp/include/cuopt/linear_programming/constants.h +++ b/cpp/include/cuopt/linear_programming/constants.h @@ -80,18 +80,18 @@ #define CUOPT_MIP_STRONG_BRANCHING_SIMPLEX_ITERATION_LIMIT \ "mip_strong_branching_simplex_iteration_limit" -#define CUOPT_SOLUTION_FILE "solution_file" -#define CUOPT_NUM_CPU_THREADS "num_cpu_threads" -#define CUOPT_NUM_GPUS "num_gpus" +#define CUOPT_SOLUTION_FILE "solution_file" +#define CUOPT_NUM_CPU_THREADS "num_cpu_threads" +#define CUOPT_NUM_GPUS "num_gpus" #define CUOPT_DISTRIBUTED_PDLP_NUM_GPUS "distributed_pdlp_num_gpus" -#define CUOPT_MULTI_GPU_PARTITION_FILE "multi_gpu_partition_file" -#define CUOPT_USE_DISTRIBUTED_PDLP "use_distributed_pdlp" -#define CUOPT_PDLP_DISABLE_GRAPH "pdlp_disable_graph" -#define CUOPT_USER_PROBLEM_FILE "user_problem_file" -#define CUOPT_PRESOLVE_FILE "presolve_file" -#define CUOPT_RANDOM_SEED "random_seed" -#define CUOPT_PDLP_PRECISION "pdlp_precision" -#define CUOPT_MIP_SEMICONTINUOUS_BIG_M "mip_semi_continuous_big_m" +#define CUOPT_MULTI_GPU_PARTITION_FILE "multi_gpu_partition_file" +#define CUOPT_USE_DISTRIBUTED_PDLP "use_distributed_pdlp" +#define CUOPT_PDLP_DISABLE_GRAPH "pdlp_disable_graph" +#define CUOPT_USER_PROBLEM_FILE "user_problem_file" +#define CUOPT_PRESOLVE_FILE "presolve_file" +#define CUOPT_RANDOM_SEED "random_seed" +#define CUOPT_PDLP_PRECISION 
"pdlp_precision" +#define CUOPT_MIP_SEMICONTINUOUS_BIG_M "mip_semi_continuous_big_m" #define CUOPT_MIP_HYPER_HEURISTIC_POPULATION_SIZE "mip_hyper_heuristic_population_size" #define CUOPT_MIP_HYPER_HEURISTIC_NUM_CPUFJ_THREADS "mip_hyper_heuristic_num_cpufj_threads" diff --git a/cpp/include/cuopt/linear_programming/pdlp/pdlp_hyper_params.cuh b/cpp/include/cuopt/linear_programming/pdlp/pdlp_hyper_params.cuh index c68dc86d6a..0ce90e7228 100644 --- a/cpp/include/cuopt/linear_programming/pdlp/pdlp_hyper_params.cuh +++ b/cpp/include/cuopt/linear_programming/pdlp/pdlp_hyper_params.cuh @@ -50,13 +50,13 @@ struct pdlp_hyper_params_t { bool use_distributed_pdlp = false; // Debug/diagnostic knob: when true, PDLP bypasses CUDA-graph capture in // ping_pong_graph_t and executes each iteration eagerly - bool pdlp_disable_graph = false; - double reflection_coefficient = 1.0; - double restart_k_p = 0.99; - double restart_k_i = 0.01; - double restart_k_d = 0.0; - double restart_i_smooth = 0.3; - bool use_conditional_major = true; + bool pdlp_disable_graph = false; + double reflection_coefficient = 1.0; + double restart_k_p = 0.99; + double restart_k_i = 0.01; + double restart_k_d = 0.0; + double restart_i_smooth = 0.3; + bool use_conditional_major = true; }; // TODO most likely we want to get rid of pdlp_solver_mode and just have prebuilt diff --git a/cpp/src/pdlp/distributed_pdlp/metis_partitioner.cu b/cpp/src/pdlp/distributed_pdlp/metis_partitioner.cu index 73e2736251..ecc60adda0 100644 --- a/cpp/src/pdlp/distributed_pdlp/metis_partitioner.cu +++ b/cpp/src/pdlp/distributed_pdlp/metis_partitioner.cu @@ -79,7 +79,9 @@ std::vector metis_partitioner_t::partition( std::vector adjncy(2 * static_cast(nnz)); // cstr-side row offsets: A_offsets[0..nb_cstr] (no shift). 
- for (i_t i = 0; i <= nb_cstr; ++i) { xadj[i] = static_cast(A_offsets[i]); } + for (i_t i = 0; i <= nb_cstr; ++i) { + xadj[i] = static_cast(A_offsets[i]); + } // var-side row offsets: A_t_offsets[0..nb_vars], shifted by +nnz so that // they index into the second half of adjncy. for (i_t i = 0; i <= nb_vars; ++i) { @@ -106,7 +108,7 @@ std::vector metis_partitioner_t::partition( idx_t objval = 0; std::vector metis_parts(nvtx); - auto t0 = std::chrono::high_resolution_clock::now(); + auto t0 = std::chrono::high_resolution_clock::now(); const int status = METIS_PartGraphKway(&metis_nvtx, &ncon, xadj.data(), @@ -120,8 +122,8 @@ std::vector metis_partitioner_t::partition( metis_options, &objval, metis_parts.data()); - auto t1 = std::chrono::high_resolution_clock::now(); - const double dt = std::chrono::duration(t1 - t0).count(); + auto t1 = std::chrono::high_resolution_clock::now(); + const double dt = std::chrono::duration(t1 - t0).count(); cuopt_expects(status == METIS_OK, error_type_t::RuntimeError, "METIS_PartGraphKway failed (status=%d)", @@ -135,7 +137,9 @@ std::vector metis_partitioner_t::partition( dt); std::vector parts(static_cast(nvtx)); - for (i_t i = 0; i < nvtx; ++i) { parts[i] = static_cast(metis_parts[i]); } + for (i_t i = 0; i < nvtx; ++i) { + parts[i] = static_cast(metis_parts[i]); + } validate_partition(parts, static_cast(nb_cstr), diff --git a/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.hpp b/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.hpp index 6ab4e35b71..0297ecc0a6 100644 --- a/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.hpp +++ b/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.hpp @@ -255,9 +255,7 @@ struct multi_gpu_engine_t { // OutAccess : pdlp_solver_t& -> f_t* (single scalar in shard memory) // SizeAccess : pdlp_shard_t& -> i_t (owned slice length) template - void distributed_l2_norm(BufAccess&& buf_access, - OutAccess&& out_access, - SizeAccess&& size_access) + void distributed_l2_norm(BufAccess&& buf_access, OutAccess&& out_access, 
SizeAccess&& size_access) { for_each_shard([&](auto& shard) { auto& sub = *shard.sub_pdlp; @@ -858,13 +856,11 @@ struct multi_gpu_engine_t { // master_reduced_cost : destination for the reduced_cost (var-shaped, lives // in the master pdlp_solver_t's termination strategy // convergence_information_). - void gather_potential_next_solutions_to_master( - pdhg_solver_t& master_pdhg, rmm::device_uvector& master_reduced_cost) + void gather_potential_next_solutions_to_master(pdhg_solver_t& master_pdhg, + rmm::device_uvector& master_reduced_cost) { - const std::size_t total_vars = - master_pdhg.get_potential_next_primal_solution().size(); - const std::size_t total_cstrs = - master_pdhg.get_potential_next_dual_solution().size(); + const std::size_t total_vars = master_pdhg.get_potential_next_primal_solution().size(); + const std::size_t total_cstrs = master_pdhg.get_potential_next_dual_solution().size(); std::vector h_primal(total_vars); std::vector h_dual(total_cstrs); @@ -987,8 +983,8 @@ struct multi_gpu_engine_t { } } - // Functionnaly same as graph_capture_fork_to_shards but on a different event to avoid race conditions - // Can be used as a way to sync shards with master stream + // Functionnaly same as graph_capture_fork_to_shards but on a different event to avoid race + // conditions Can be used as a way to sync shards with master stream void sync_await_master(rmm::cuda_stream_view master_stream) { sync_master_ready_event_->record(master_stream); diff --git a/cpp/src/pdlp/distributed_pdlp/partition_loader.cu b/cpp/src/pdlp/distributed_pdlp/partition_loader.cu index 5014607736..5c317f664e 100644 --- a/cpp/src/pdlp/distributed_pdlp/partition_loader.cu +++ b/cpp/src/pdlp/distributed_pdlp/partition_loader.cu @@ -174,12 +174,10 @@ std::vector> partition_loader_t::create_rank_dat // Pad row-offset arrays so cuSPARSE sees the local matrices as // (total_cstr x total_var) for A and (total_var x total_cstr) for A_T - const i_t a_last_nnz = - rd.h_A_row_offsets.empty() ? 
i_t{0} : rd.h_A_row_offsets.back(); + const i_t a_last_nnz = rd.h_A_row_offsets.empty() ? i_t{0} : rd.h_A_row_offsets.back(); rd.h_A_row_offsets.resize(rd.total_cstr_size + 1, a_last_nnz); - const i_t at_last_nnz = - rd.h_A_t_row_offsets.empty() ? i_t{0} : rd.h_A_t_row_offsets.back(); + const i_t at_last_nnz = rd.h_A_t_row_offsets.empty() ? i_t{0} : rd.h_A_t_row_offsets.back(); rd.h_A_t_row_offsets.resize(rd.total_var_size + 1, at_last_nnz); } diff --git a/cpp/src/pdlp/distributed_pdlp/partitioner.cu b/cpp/src/pdlp/distributed_pdlp/partitioner.cu index 4b809986ce..bc84e521e2 100644 --- a/cpp/src/pdlp/distributed_pdlp/partitioner.cu +++ b/cpp/src/pdlp/distributed_pdlp/partitioner.cu @@ -38,11 +38,8 @@ std::vector dummy_partitioner_t::partition( return parts; } -void validate_partition(std::vector const& parts, - int nb_cstr, - int nb_vars, - int nb_parts, - char const* context) +void validate_partition( + std::vector const& parts, int nb_cstr, int nb_vars, int nb_parts, char const* context) { const std::size_t expected = static_cast(nb_cstr) + static_cast(nb_vars); @@ -52,10 +49,8 @@ void validate_partition(std::vector const& parts, context, expected, parts.size()); - cuopt_expects(nb_parts > 0, - error_type_t::ValidationError, - "%s: nb_parts must be positive", - context); + cuopt_expects( + nb_parts > 0, error_type_t::ValidationError, "%s: nb_parts must be positive", context); if (parts.empty()) { return; } const auto [min_it, max_it] = std::minmax_element(parts.begin(), parts.end()); cuopt_expects(*min_it >= 0, @@ -75,16 +70,16 @@ template std::unique_ptr> make_partitioner(partitioner_kind_t kind) { switch (kind) { - case partitioner_kind_t::Dummy: - return std::make_unique>(); - case partitioner_kind_t::Metis: - return std::make_unique>(); + case partitioner_kind_t::Dummy: return std::make_unique>(); + case partitioner_kind_t::Metis: return std::make_unique>(); } - cuopt_expects(false, error_type_t::RuntimeError, "make_partitioner: unsupported partitioner 
kind"); + cuopt_expects( + false, error_type_t::RuntimeError, "make_partitioner: unsupported partitioner kind"); return nullptr; } template class dummy_partitioner_t; -template std::unique_ptr> make_partitioner(partitioner_kind_t); +template std::unique_ptr> make_partitioner( + partitioner_kind_t); } // namespace cuopt::linear_programming::detail diff --git a/cpp/src/pdlp/distributed_pdlp/partitioner.hpp b/cpp/src/pdlp/distributed_pdlp/partitioner.hpp index 82650ad805..2a2149db63 100644 --- a/cpp/src/pdlp/distributed_pdlp/partitioner.hpp +++ b/cpp/src/pdlp/distributed_pdlp/partitioner.hpp @@ -41,7 +41,7 @@ enum class partitioner_kind_t { Dummy, Metis }; template class partitioner_i { public: - virtual ~partitioner_i() = default; + virtual ~partitioner_i() = default; virtual std::vector partition(partitioner_input_t const& input) const = 0; }; diff --git a/cpp/src/pdlp/pdhg.cu b/cpp/src/pdlp/pdhg.cu index ec983fd01b..b1f1a59ada 100644 --- a/cpp/src/pdlp/pdhg.cu +++ b/cpp/src/pdlp/pdhg.cu @@ -1257,9 +1257,7 @@ void pdhg_solver_t::compute_next_primal_dual_solution_reflected( // the capture or run outside the graph, leaving the captured graph // empty (or broken) -- which produces the cycling/stall behavior we // observed on larger problems. Mirrors metis_tests bench.cu fork/join. - if (mgpu_engine_ != nullptr) { - mgpu_engine_->graph_capture_fork_to_shards(stream_view_); - } + if (mgpu_engine_ != nullptr) { mgpu_engine_->graph_capture_fork_to_shards(stream_view_); } compute_At_y(); if (mgpu_engine_ != nullptr) { @@ -1362,16 +1360,12 @@ void pdhg_solver_t::compute_next_primal_dual_solution_reflected( // Multi-GPU: close the fork by joining every shard stream back into // the master stream so cudaStreamEndCapture sees a single graph // spanning all streams. 
- if (mgpu_engine_ != nullptr) { - mgpu_engine_->graph_capture_join_from_shards(stream_view_); - } + if (mgpu_engine_ != nullptr) { mgpu_engine_->graph_capture_join_from_shards(stream_view_); } }); } else { graph_all.run(should_major, [&]() { - if (mgpu_engine_ != nullptr) { - mgpu_engine_->graph_capture_fork_to_shards(stream_view_); - } + if (mgpu_engine_ != nullptr) { mgpu_engine_->graph_capture_fork_to_shards(stream_view_); } // Compute next primal compute_At_y(); @@ -1478,9 +1472,7 @@ void pdhg_solver_t::compute_next_primal_dual_solution_reflected( print("reflected_dual_", reflected_dual_); #endif - if (mgpu_engine_ != nullptr) { - mgpu_engine_->graph_capture_join_from_shards(stream_view_); - } + if (mgpu_engine_ != nullptr) { mgpu_engine_->graph_capture_join_from_shards(stream_view_); } }); } diff --git a/cpp/src/pdlp/pdlp.cu b/cpp/src/pdlp/pdlp.cu index 013905b4fb..576ab417f1 100644 --- a/cpp/src/pdlp/pdlp.cu +++ b/cpp/src/pdlp/pdlp.cu @@ -390,8 +390,7 @@ pdlp_solver_t::pdlp_solver_t( // Makes all inner feilds of master 0 size : pdlp_solver_t(placeholder_problem, settings, /*is_legacy_batch_mode=*/false) { - cuopt_expects(placeholder_problem.n_variables == 0 && - placeholder_problem.n_constraints == 0 && + cuopt_expects(placeholder_problem.n_variables == 0 && placeholder_problem.n_constraints == 0 && placeholder_problem.nnz == 0, error_type_t::ValidationError, "Distributed mGPU pdlp_solver_t ctor requires a shape-0 " @@ -407,297 +406,297 @@ pdlp_solver_t::pdlp_solver_t( } if constexpr (!std::is_same_v) { - cuopt_expects(false, - error_type_t::ValidationError, - "Distributed PDLP currently requires double precision"); + cuopt_expects( + false, error_type_t::ValidationError, "Distributed PDLP currently requires double precision"); return; } - // ----- 1. 
Read problem shape and bulk data directly from mps (host) ----- - const i_t n_vars = static_cast(mps.get_objective_coefficients().size()); - const i_t n_cstr = static_cast(mps.get_constraint_lower_bounds().size()); - const i_t nnz = static_cast(mps.get_constraint_matrix_values().size()); - cuopt_expects(n_vars > 0, - error_type_t::ValidationError, - "Distributed PDLP from mps requires a non-empty objective"); - cuopt_expects(n_cstr > 0, - error_type_t::ValidationError, - "Distributed PDLP from mps requires at least one constraint"); - cuopt_expects(static_cast(mps.get_constraint_matrix_offsets().size()) == n_cstr + 1, - error_type_t::ValidationError, - "mps constraint_matrix_offsets size must equal n_constraints + 1"); - cuopt_expects( - static_cast(mps.get_constraint_matrix_indices().size()) == nnz, - error_type_t::ValidationError, - "mps constraint_matrix_indices size must equal nnz (constraint_matrix_values size)"); - cuopt_expects(static_cast(mps.get_constraint_upper_bounds().size()) == n_cstr, - error_type_t::ValidationError, - "mps constraint_upper_bounds size must equal n_constraints"); - cuopt_expects(static_cast(mps.get_variable_lower_bounds().size()) == n_vars, - error_type_t::ValidationError, - "mps variable_lower_bounds size must equal n_variables"); - cuopt_expects(static_cast(mps.get_variable_upper_bounds().size()) == n_vars, - error_type_t::ValidationError, - "mps variable_upper_bounds size must equal n_variables"); - - const bool maximize = mps.get_sense(); - f_t objective_offset = mps.get_objective_offset(); - f_t objective_scaling_factor = mps.get_objective_scaling_factor(); - - // Objective: copy (mutable so we can negate for maximize, matching - // problem_helpers.cuh::convert_to_maximization_problem). 
- std::vector h_obj = mps.get_objective_coefficients(); - if (maximize) { - for (auto& v : h_obj) v = -v; - objective_offset = -objective_offset; - objective_scaling_factor = -objective_scaling_factor; - } - - // Bounds (copy from mps; engine ctor takes by const ref to std::vector). - std::vector h_var_lower = mps.get_variable_lower_bounds(); - std::vector h_var_upper = mps.get_variable_upper_bounds(); - std::vector h_cstr_lower = mps.get_constraint_lower_bounds(); - std::vector h_cstr_upper = mps.get_constraint_upper_bounds(); - - // A (CSR) — mutable copies for the engine + partitioner consumers below. - std::vector h_A_row_offsets = mps.get_constraint_matrix_offsets(); - std::vector h_A_col_indices = mps.get_constraint_matrix_indices(); - std::vector h_A_values = mps.get_constraint_matrix_values(); - - // ----- 2. Transpose A -> A^T on the host (one-shot CSR transpose) ----- - // CSC(A) and CSR(A^T) share the same memory layout, so the CSC produced - // by dual_simplex::csr_matrix_t::to_compressed_col IS the CSR of A^T. - // O(nnz + n_vars) counting sort, same as problem_t::compute_transpose. - namespace ds = cuopt::linear_programming::dual_simplex; - ds::csr_matrix_t A_csr(n_cstr, n_vars, nnz); - A_csr.row_start = h_A_row_offsets; - A_csr.j = h_A_col_indices; - A_csr.x = h_A_values; - ds::csc_matrix_t AT_as_csc(n_vars, n_cstr, nnz); - A_csr.to_compressed_col(AT_as_csc); - std::vector h_A_t_row_offsets = std::move(AT_as_csc.col_start); - std::vector h_A_t_col_indices = std::move(AT_as_csc.i); - std::vector h_A_t_values = std::move(AT_as_csc.x); - - // ----- 3. Identity scaling for V1 ----- - // Real multi-GPU scaling is a TODO; ship the unscaled problem to shards as - // both "unscaled" and "scaled" so the engine and per-shard pdlp_solver_t - // can run end-to-end. Scaling factor vectors are 1.0 everywhere so the - // shard-side unscale at the end is a no-op. 
- std::vector h_A_values_scaled = h_A_values; - std::vector h_A_t_values_scaled = h_A_t_values; - std::vector h_obj_scaled = h_obj; - std::vector h_var_lower_scaled = h_var_lower; - std::vector h_var_upper_scaled = h_var_upper; - std::vector h_cstr_lower_scaled = h_cstr_lower; - std::vector h_cstr_upper_scaled = h_cstr_upper; - std::vector h_cummulative_cstr_scaling(n_cstr, f_t(1.0)); - std::vector h_cummulative_var_scaling(n_vars, f_t(1.0)); - const f_t h_bound_rescaling = f_t(1.0); - const f_t h_objective_rescaling = f_t(1.0); - - // ----- 4. Partition ----- - std::vector parts; - if (!settings.multi_gpu_partition_file.empty()) { - parts = partition_loader_t::parse_distributed_pdlp_partition_file( - settings.multi_gpu_partition_file); - validate_partition(parts, n_cstr, n_vars, distributed_pdlp_num_gpus, "partition file"); - } else { - if (distributed_pdlp_num_gpus == 1) { - std::cout << "CAREFUL: distributed_pdlp_num_gpus == 1, running dummy version (single " - "part covering " - << n_cstr << " cstrs + " << n_vars << " vars)" << std::endl; - } - partitioner_input_t partition_input; - partition_input.nb_cstr = n_cstr; - partition_input.nb_vars = n_vars; - partition_input.nb_parts = distributed_pdlp_num_gpus; - - // METIS_PartGraphKway requires nparts >= 2; route num_gpus == 1 to Dummy. - const partitioner_kind_t kind = - (distributed_pdlp_num_gpus == 1) ? partitioner_kind_t::Dummy : partitioner_kind_t::Metis; - if (kind == partitioner_kind_t::Metis) { - // partitioner_input_t holds non-const std::vector* pointers; we - // already have the data in our local mutable buffers above. 
- partition_input.A.row_offsets = &h_A_row_offsets; - partition_input.A.col_indices = &h_A_col_indices; - partition_input.A.num_rows = n_cstr; - partition_input.A.num_cols = n_vars; - partition_input.A_t.row_offsets = &h_A_t_row_offsets; - partition_input.A_t.col_indices = &h_A_t_col_indices; - partition_input.A_t.num_rows = n_vars; - partition_input.A_t.num_cols = n_cstr; - } - auto partitioner = make_partitioner(kind); - parts = partitioner->partition(partition_input); - } - - // ----- 5. Build per-rank data ----- - std::vector> sub_pdlp_rank_data = - partition_loader_t::create_rank_data_from_parts(parts, - h_A_row_offsets, - h_A_col_indices, - h_A_values, - h_A_values_scaled, - h_A_t_row_offsets, - h_A_t_col_indices, - h_A_t_values, - h_A_t_values_scaled, - settings.distributed_pdlp_num_gpus, - n_cstr, - n_vars, - nnz); - - // ----- 6. Per-shard settings ----- - pdlp_solver_settings_t sub_pdlp_settings = settings; - sub_pdlp_settings.num_gpus = 1; - sub_pdlp_settings.distributed_pdlp_num_gpus = 1; - sub_pdlp_settings.multi_gpu_partition_file = ""; - sub_pdlp_settings.is_distributed_sub_pdlp = true; - sub_pdlp_settings.hyper_params.default_l_inf_ruiz_iterations = 0; - sub_pdlp_settings.hyper_params.default_alpha_pock_chambolle_rescaling = 0.0; - - // ----- 7. 
Construct the engine: NCCL comms + per-shard pdlp_solver_t ----- - multi_gpu_engine.emplace(std::move(sub_pdlp_rank_data), - h_obj, - h_var_lower, - h_var_upper, - h_cstr_lower, - h_cstr_upper, - h_obj_scaled, - h_var_lower_scaled, - h_var_upper_scaled, - h_cstr_lower_scaled, - h_cstr_upper_scaled, - h_cummulative_cstr_scaling, - h_cummulative_var_scaling, - h_bound_rescaling, - h_objective_rescaling, - maximize, - objective_offset, - objective_scaling_factor, - sub_pdlp_settings); - - // ----- 8 Distributed Scaling ----- - for (auto& shard : multi_gpu_engine->shards) { - raft::device_setter guard(shard->device_id); - shard->sub_pdlp->get_initial_scaling_strategy().reset_scaling_state_for_distributed(); - } - for (auto& shard : multi_gpu_engine->shards) { - raft::device_setter guard(shard->device_id); - shard->stream.synchronize(); - } - - // Distributed scaling - if (settings_.hyper_params.do_ruiz_scaling) { - multi_gpu_engine->distributed_ruiz_inf_scaling( - settings_.hyper_params.default_l_inf_ruiz_iterations, n_vars); - } - if (settings_.hyper_params.do_pock_chambolle_scaling) { - multi_gpu_engine->distributed_pock_chambolle_scaling( - static_cast(settings_.hyper_params.default_alpha_pock_chambolle_rescaling), n_vars); - } - - for (auto& shard : multi_gpu_engine->shards) { - raft::device_setter guard(shard->device_id); - auto& scaling = shard->sub_pdlp->get_initial_scaling_strategy(); - scaling.scale_problem(); - - shard->sub_pdlp->pdhg_solver_.get_cusparse_view().create_spmv_op_plans( - /*is_reflected=*/settings_.hyper_params.use_reflected_primal_dual); - } - for (auto& shard : multi_gpu_engine->shards) { - raft::device_setter guard(shard->device_id); - shard->stream.synchronize(); - } - - // ----- 8b. 
Seed initial step-size / primal-weight (distributed, scales to N shards) ----- - constexpr f_t kStepSizeScale = f_t{0.998}; - const f_t sigma_max = multi_gpu_engine->distributed_max_singular_value(n_cstr); - const f_t h_primal_weight = f_t{1}; - const f_t h_step_size = (sigma_max > f_t{0}) ? kStepSizeScale / sigma_max : f_t{1}; - // With primal_weight = 1 the adaptive step-size strategy collapses to - // primal_step_size = step_size / primal_weight = step_size - // dual_step_size = step_size * primal_weight = step_size. - const f_t h_primal_step_size = h_step_size; - const f_t h_dual_step_size = h_step_size; - - // Put the values on master - raft::copy(step_size_.data(), &h_step_size, 1, stream_view_); - raft::copy(primal_weight_.data(), &h_primal_weight, 1, stream_view_); - raft::copy(best_primal_weight_.data(), &h_primal_weight, 1, stream_view_); - raft::copy(primal_step_size_.data(), &h_primal_step_size, 1, stream_view_); - raft::copy(dual_step_size_.data(), &h_dual_step_size, 1, stream_view_); - handle_ptr_->sync_stream(stream_view_); - - // put the values on each shard - for (auto& shard : multi_gpu_engine->shards) { - raft::device_setter guard(shard->device_id); - auto& sub = *shard->sub_pdlp; - raft::copy(sub.step_size_.data(), &h_step_size, 1, shard->stream); - raft::copy(sub.primal_weight_.data(), &h_primal_weight, 1, shard->stream); - raft::copy(sub.best_primal_weight_.data(), &h_primal_weight, 1, shard->stream); - raft::copy(sub.get_primal_step_size().data(), &h_primal_step_size, 1, shard->stream); - raft::copy(sub.get_dual_step_size().data(), &h_dual_step_size, 1, shard->stream); - } - for (auto& shard : multi_gpu_engine->shards) { - raft::device_setter guard(shard->device_id); - shard->stream.synchronize(); - } - - // Wire the engine into master's pdhg_solver_; shards keep mgpu_engine_ == nullptr. - pdhg_solver_.set_multi_gpu_engine(&*multi_gpu_engine); - - // ----- 9. 
Resize master gather destinations to the full problem size ----- - pdhg_solver_.get_potential_next_primal_solution().resize(n_vars, stream_view_); - pdhg_solver_.get_potential_next_dual_solution().resize(n_cstr, stream_view_); - current_termination_strategy_.get_convergence_information().get_reduced_cost().resize( - n_vars, stream_view_); - primal_size_h_ = n_vars; - dual_size_h_ = n_cstr; - - // Distributed conergence_information::init_l2_norms - for (auto& shard : multi_gpu_engine->shards) { - raft::device_setter guard(shard->device_id); - shard->sub_pdlp->get_current_termination_strategy() - .get_convergence_information() - .compute_owned_reference_norm_partials(shard->rank_data.owned_var_size, - shard->rank_data.owned_cstr_size); + // ----- 1. Read problem shape and bulk data directly from mps (host) ----- + const i_t n_vars = static_cast(mps.get_objective_coefficients().size()); + const i_t n_cstr = static_cast(mps.get_constraint_lower_bounds().size()); + const i_t nnz = static_cast(mps.get_constraint_matrix_values().size()); + cuopt_expects(n_vars > 0, + error_type_t::ValidationError, + "Distributed PDLP from mps requires a non-empty objective"); + cuopt_expects(n_cstr > 0, + error_type_t::ValidationError, + "Distributed PDLP from mps requires at least one constraint"); + cuopt_expects(static_cast(mps.get_constraint_matrix_offsets().size()) == n_cstr + 1, + error_type_t::ValidationError, + "mps constraint_matrix_offsets size must equal n_constraints + 1"); + cuopt_expects( + static_cast(mps.get_constraint_matrix_indices().size()) == nnz, + error_type_t::ValidationError, + "mps constraint_matrix_indices size must equal nnz (constraint_matrix_values size)"); + cuopt_expects(static_cast(mps.get_constraint_upper_bounds().size()) == n_cstr, + error_type_t::ValidationError, + "mps constraint_upper_bounds size must equal n_constraints"); + cuopt_expects(static_cast(mps.get_variable_lower_bounds().size()) == n_vars, + error_type_t::ValidationError, + "mps 
variable_lower_bounds size must equal n_variables"); + cuopt_expects(static_cast(mps.get_variable_upper_bounds().size()) == n_vars, + error_type_t::ValidationError, + "mps variable_upper_bounds size must equal n_variables"); + + const bool maximize = mps.get_sense(); + f_t objective_offset = mps.get_objective_offset(); + f_t objective_scaling_factor = mps.get_objective_scaling_factor(); + + // Objective: copy (mutable so we can negate for maximize, matching + // problem_helpers.cuh::convert_to_maximization_problem). + std::vector h_obj = mps.get_objective_coefficients(); + if (maximize) { + for (auto& v : h_obj) + v = -v; + objective_offset = -objective_offset; + objective_scaling_factor = -objective_scaling_factor; + } + + // Bounds (copy from mps; engine ctor takes by const ref to std::vector). + std::vector h_var_lower = mps.get_variable_lower_bounds(); + std::vector h_var_upper = mps.get_variable_upper_bounds(); + std::vector h_cstr_lower = mps.get_constraint_lower_bounds(); + std::vector h_cstr_upper = mps.get_constraint_upper_bounds(); + + // A (CSR) — mutable copies for the engine + partitioner consumers below. + std::vector h_A_row_offsets = mps.get_constraint_matrix_offsets(); + std::vector h_A_col_indices = mps.get_constraint_matrix_indices(); + std::vector h_A_values = mps.get_constraint_matrix_values(); + + // ----- 2. Transpose A -> A^T on the host (one-shot CSR transpose) ----- + // CSC(A) and CSR(A^T) share the same memory layout, so the CSC produced + // by dual_simplex::csr_matrix_t::to_compressed_col IS the CSR of A^T. + // O(nnz + n_vars) counting sort, same as problem_t::compute_transpose. 
+ namespace ds = cuopt::linear_programming::dual_simplex; + ds::csr_matrix_t A_csr(n_cstr, n_vars, nnz); + A_csr.row_start = h_A_row_offsets; + A_csr.j = h_A_col_indices; + A_csr.x = h_A_values; + ds::csc_matrix_t AT_as_csc(n_vars, n_cstr, nnz); + A_csr.to_compressed_col(AT_as_csc); + std::vector h_A_t_row_offsets = std::move(AT_as_csc.col_start); + std::vector h_A_t_col_indices = std::move(AT_as_csc.i); + std::vector h_A_t_values = std::move(AT_as_csc.x); + + // ----- 3. Identity scaling for V1 ----- + // Real multi-GPU scaling is a TODO; ship the unscaled problem to shards as + // both "unscaled" and "scaled" so the engine and per-shard pdlp_solver_t + // can run end-to-end. Scaling factor vectors are 1.0 everywhere so the + // shard-side unscale at the end is a no-op. + std::vector h_A_values_scaled = h_A_values; + std::vector h_A_t_values_scaled = h_A_t_values; + std::vector h_obj_scaled = h_obj; + std::vector h_var_lower_scaled = h_var_lower; + std::vector h_var_upper_scaled = h_var_upper; + std::vector h_cstr_lower_scaled = h_cstr_lower; + std::vector h_cstr_upper_scaled = h_cstr_upper; + std::vector h_cummulative_cstr_scaling(n_cstr, f_t(1.0)); + std::vector h_cummulative_var_scaling(n_vars, f_t(1.0)); + const f_t h_bound_rescaling = f_t(1.0); + const f_t h_objective_rescaling = f_t(1.0); + + // ----- 4. 
Partition ----- + std::vector parts; + if (!settings.multi_gpu_partition_file.empty()) { + parts = partition_loader_t::parse_distributed_pdlp_partition_file( + settings.multi_gpu_partition_file); + validate_partition(parts, n_cstr, n_vars, distributed_pdlp_num_gpus, "partition file"); + } else { + if (distributed_pdlp_num_gpus == 1) { + std::cout << "CAREFUL: distributed_pdlp_num_gpus == 1, running dummy version (single " + "part covering " + << n_cstr << " cstrs + " << n_vars << " vars)" << std::endl; } - multi_gpu_engine->allreduce_sum_inplace([](pdlp_solver_t& sp) -> f_t* { - return sp.get_current_termination_strategy() - .get_convergence_information() - .l2_norm_primal_right_hand_side_data(); - }); - multi_gpu_engine->allreduce_sum_inplace([](pdlp_solver_t& sp) -> f_t* { - return sp.get_current_termination_strategy() - .get_convergence_information() - .l2_norm_primal_linear_objective_data(); - }); - for (auto& shard : multi_gpu_engine->shards) { - raft::device_setter guard(shard->device_id); - shard->sub_pdlp->get_current_termination_strategy() - .get_convergence_information() - .sqrt_reference_norms_inplace(); - shard->stream.synchronize(); + partitioner_input_t partition_input; + partition_input.nb_cstr = n_cstr; + partition_input.nb_vars = n_vars; + partition_input.nb_parts = distributed_pdlp_num_gpus; + + // METIS_PartGraphKway requires nparts >= 2; route num_gpus == 1 to Dummy. + const partitioner_kind_t kind = + (distributed_pdlp_num_gpus == 1) ? partitioner_kind_t::Dummy : partitioner_kind_t::Metis; + if (kind == partitioner_kind_t::Metis) { + // partitioner_input_t holds non-const std::vector* pointers; we + // already have the data in our local mutable buffers above. 
+ partition_input.A.row_offsets = &h_A_row_offsets; + partition_input.A.col_indices = &h_A_col_indices; + partition_input.A.num_rows = n_cstr; + partition_input.A.num_cols = n_vars; + partition_input.A_t.row_offsets = &h_A_t_row_offsets; + partition_input.A_t.col_indices = &h_A_t_col_indices; + partition_input.A_t.num_rows = n_vars; + partition_input.A_t.num_cols = n_cstr; } - // Broadcast the values to the master - { - auto& s0 = *multi_gpu_engine->shards[0]; - auto& s0_conv = s0.sub_pdlp->get_current_termination_strategy().get_convergence_information(); - raft::device_setter guard(s0.device_id); - for (auto* ts : {&current_termination_strategy_, &average_termination_strategy_}) { - auto& ci = ts->get_convergence_information(); - raft::copy(ci.l2_norm_primal_right_hand_side_data(), - s0_conv.l2_norm_primal_right_hand_side_data(), - 1, - stream_view_); - raft::copy(ci.l2_norm_primal_linear_objective_data(), - s0_conv.l2_norm_primal_linear_objective_data(), - 1, - stream_view_); - } + auto partitioner = make_partitioner(kind); + parts = partitioner->partition(partition_input); + } + + // ----- 5. Build per-rank data ----- + std::vector> sub_pdlp_rank_data = + partition_loader_t::create_rank_data_from_parts(parts, + h_A_row_offsets, + h_A_col_indices, + h_A_values, + h_A_values_scaled, + h_A_t_row_offsets, + h_A_t_col_indices, + h_A_t_values, + h_A_t_values_scaled, + settings.distributed_pdlp_num_gpus, + n_cstr, + n_vars, + nnz); + + // ----- 6. Per-shard settings ----- + pdlp_solver_settings_t sub_pdlp_settings = settings; + sub_pdlp_settings.num_gpus = 1; + sub_pdlp_settings.distributed_pdlp_num_gpus = 1; + sub_pdlp_settings.multi_gpu_partition_file = ""; + sub_pdlp_settings.is_distributed_sub_pdlp = true; + sub_pdlp_settings.hyper_params.default_l_inf_ruiz_iterations = 0; + sub_pdlp_settings.hyper_params.default_alpha_pock_chambolle_rescaling = 0.0; + + // ----- 7. 
Construct the engine: NCCL comms + per-shard pdlp_solver_t ----- + multi_gpu_engine.emplace(std::move(sub_pdlp_rank_data), + h_obj, + h_var_lower, + h_var_upper, + h_cstr_lower, + h_cstr_upper, + h_obj_scaled, + h_var_lower_scaled, + h_var_upper_scaled, + h_cstr_lower_scaled, + h_cstr_upper_scaled, + h_cummulative_cstr_scaling, + h_cummulative_var_scaling, + h_bound_rescaling, + h_objective_rescaling, + maximize, + objective_offset, + objective_scaling_factor, + sub_pdlp_settings); + + // ----- 8 Distributed Scaling ----- + for (auto& shard : multi_gpu_engine->shards) { + raft::device_setter guard(shard->device_id); + shard->sub_pdlp->get_initial_scaling_strategy().reset_scaling_state_for_distributed(); + } + for (auto& shard : multi_gpu_engine->shards) { + raft::device_setter guard(shard->device_id); + shard->stream.synchronize(); + } + + // Distributed scaling + if (settings_.hyper_params.do_ruiz_scaling) { + multi_gpu_engine->distributed_ruiz_inf_scaling( + settings_.hyper_params.default_l_inf_ruiz_iterations, n_vars); + } + if (settings_.hyper_params.do_pock_chambolle_scaling) { + multi_gpu_engine->distributed_pock_chambolle_scaling( + static_cast(settings_.hyper_params.default_alpha_pock_chambolle_rescaling), n_vars); + } + + for (auto& shard : multi_gpu_engine->shards) { + raft::device_setter guard(shard->device_id); + auto& scaling = shard->sub_pdlp->get_initial_scaling_strategy(); + scaling.scale_problem(); + + shard->sub_pdlp->pdhg_solver_.get_cusparse_view().create_spmv_op_plans( + /*is_reflected=*/settings_.hyper_params.use_reflected_primal_dual); + } + for (auto& shard : multi_gpu_engine->shards) { + raft::device_setter guard(shard->device_id); + shard->stream.synchronize(); + } + + // ----- 8b. 
Seed initial step-size / primal-weight (distributed, scales to N shards) ----- + constexpr f_t kStepSizeScale = f_t{0.998}; + const f_t sigma_max = multi_gpu_engine->distributed_max_singular_value(n_cstr); + const f_t h_primal_weight = f_t{1}; + const f_t h_step_size = (sigma_max > f_t{0}) ? kStepSizeScale / sigma_max : f_t{1}; + // With primal_weight = 1 the adaptive step-size strategy collapses to + // primal_step_size = step_size / primal_weight = step_size + // dual_step_size = step_size * primal_weight = step_size. + const f_t h_primal_step_size = h_step_size; + const f_t h_dual_step_size = h_step_size; + + // Put the values on master + raft::copy(step_size_.data(), &h_step_size, 1, stream_view_); + raft::copy(primal_weight_.data(), &h_primal_weight, 1, stream_view_); + raft::copy(best_primal_weight_.data(), &h_primal_weight, 1, stream_view_); + raft::copy(primal_step_size_.data(), &h_primal_step_size, 1, stream_view_); + raft::copy(dual_step_size_.data(), &h_dual_step_size, 1, stream_view_); + handle_ptr_->sync_stream(stream_view_); + + // put the values on each shard + for (auto& shard : multi_gpu_engine->shards) { + raft::device_setter guard(shard->device_id); + auto& sub = *shard->sub_pdlp; + raft::copy(sub.step_size_.data(), &h_step_size, 1, shard->stream); + raft::copy(sub.primal_weight_.data(), &h_primal_weight, 1, shard->stream); + raft::copy(sub.best_primal_weight_.data(), &h_primal_weight, 1, shard->stream); + raft::copy(sub.get_primal_step_size().data(), &h_primal_step_size, 1, shard->stream); + raft::copy(sub.get_dual_step_size().data(), &h_dual_step_size, 1, shard->stream); + } + for (auto& shard : multi_gpu_engine->shards) { + raft::device_setter guard(shard->device_id); + shard->stream.synchronize(); + } + + // Wire the engine into master's pdhg_solver_; shards keep mgpu_engine_ == nullptr. + pdhg_solver_.set_multi_gpu_engine(&*multi_gpu_engine); + + // ----- 9. 
Resize master gather destinations to the full problem size ----- + pdhg_solver_.get_potential_next_primal_solution().resize(n_vars, stream_view_); + pdhg_solver_.get_potential_next_dual_solution().resize(n_cstr, stream_view_); + current_termination_strategy_.get_convergence_information().get_reduced_cost().resize( + n_vars, stream_view_); + primal_size_h_ = n_vars; + dual_size_h_ = n_cstr; + + // Distributed conergence_information::init_l2_norms + for (auto& shard : multi_gpu_engine->shards) { + raft::device_setter guard(shard->device_id); + shard->sub_pdlp->get_current_termination_strategy() + .get_convergence_information() + .compute_owned_reference_norm_partials(shard->rank_data.owned_var_size, + shard->rank_data.owned_cstr_size); + } + multi_gpu_engine->allreduce_sum_inplace([](pdlp_solver_t& sp) -> f_t* { + return sp.get_current_termination_strategy() + .get_convergence_information() + .l2_norm_primal_right_hand_side_data(); + }); + multi_gpu_engine->allreduce_sum_inplace([](pdlp_solver_t& sp) -> f_t* { + return sp.get_current_termination_strategy() + .get_convergence_information() + .l2_norm_primal_linear_objective_data(); + }); + for (auto& shard : multi_gpu_engine->shards) { + raft::device_setter guard(shard->device_id); + shard->sub_pdlp->get_current_termination_strategy() + .get_convergence_information() + .sqrt_reference_norms_inplace(); + shard->stream.synchronize(); + } + // Broadcast the values to the master + { + auto& s0 = *multi_gpu_engine->shards[0]; + auto& s0_conv = s0.sub_pdlp->get_current_termination_strategy().get_convergence_information(); + raft::device_setter guard(s0.device_id); + for (auto* ts : {&current_termination_strategy_, &average_termination_strategy_}) { + auto& ci = ts->get_convergence_information(); + raft::copy(ci.l2_norm_primal_right_hand_side_data(), + s0_conv.l2_norm_primal_right_hand_side_data(), + 1, + stream_view_); + raft::copy(ci.l2_norm_primal_linear_objective_data(), + s0_conv.l2_norm_primal_linear_objective_data(), + 1, + 
stream_view_); } - handle_ptr_->sync_stream(stream_view_); + } + handle_ptr_->sync_stream(stream_view_); } template @@ -2418,10 +2417,9 @@ void pdlp_solver_t::compute_fixed_error(std::vector& has_restarte multi_gpu_engine->allreduce_sum_inplace( [](auto& sp) -> f_t* { return sp.step_size_strategy_.get_interaction().data(); }); - multi_gpu_engine->allreduce_sum_inplace( - [](auto& sp) -> f_t* { - return sp.step_size_strategy_.get_norm_squared_delta_primal().data(); - }); + multi_gpu_engine->allreduce_sum_inplace([](auto& sp) -> f_t* { + return sp.step_size_strategy_.get_norm_squared_delta_primal().data(); + }); multi_gpu_engine->allreduce_sum_inplace( [](auto& sp) -> f_t* { return sp.step_size_strategy_.get_norm_squared_delta_dual().data(); }); @@ -3022,9 +3020,7 @@ optimization_problem_solution_t pdlp_solver_t::run_solver(co // 1. At the very beginning of the solver, when no steps have been taken yet // 2. After a single step, since average of one step is the same step if (internal_solver_iterations_ <= 1) { - if (multi_gpu_engine) { - assert(false && "Not implemented"); - } + if (multi_gpu_engine) { assert(false && "Not implemented"); } raft::copy(unscaled_primal_avg_solution_.data(), pdhg_solver_.get_primal_solution().data(), primal_size_h_, diff --git a/cpp/src/pdlp/restart_strategy/pdlp_restart_strategy.cu b/cpp/src/pdlp/restart_strategy/pdlp_restart_strategy.cu index b7d49fc32f..ee1d19b96b 100644 --- a/cpp/src/pdlp/restart_strategy/pdlp_restart_strategy.cu +++ b/cpp/src/pdlp/restart_strategy/pdlp_restart_strategy.cu @@ -931,11 +931,11 @@ void pdlp_restart_strategy_t::cupdlpx_restart( .last_restart_duality_gap_.primal_distance_traveled_.data(), 1, stream_view_); - raft::copy(last_restart_duality_gap_.dual_distance_traveled_.data(), - s0.sub_pdlp->get_restart_strategy() - .last_restart_duality_gap_.dual_distance_traveled_.data(), - 1, - stream_view_); + raft::copy( + last_restart_duality_gap_.dual_distance_traveled_.data(), + 
s0.sub_pdlp->get_restart_strategy().last_restart_duality_gap_.dual_distance_traveled_.data(), + 1, + stream_view_); } else { distance_squared_moved_from_last_restart_period( pdhg_solver.get_potential_next_primal_solution(), @@ -1021,8 +1021,7 @@ void pdlp_restart_strategy_t::cupdlpx_restart( engine->for_each_shard([&](auto& shard) { auto& sub = *shard.sub_pdlp; - raft::copy( - sub.get_primal_step_size().data(), &h_primal_step_size, 1, shard.stream.view()); + raft::copy(sub.get_primal_step_size().data(), &h_primal_step_size, 1, shard.stream.view()); raft::copy(sub.get_dual_step_size().data(), &h_dual_step_size, 1, shard.stream.view()); raft::copy(sub.get_primal_weight().data(), &h_primal_weight, 1, shard.stream.view()); raft::copy( @@ -1087,8 +1086,8 @@ void pdlp_restart_strategy_t::cupdlpx_restart( if (auto* engine = pdhg_solver.get_mgpu_engine()) { engine->for_each_shard([&](auto& shard) { - shard.sub_pdlp->get_restart_strategy().weighted_average_solution_.iterations_since_last_restart_ = - 0; + shard.sub_pdlp->get_restart_strategy() + .weighted_average_solution_.iterations_since_last_restart_ = 0; }); } } diff --git a/cpp/src/pdlp/solve.cu b/cpp/src/pdlp/solve.cu index feaeb7bd57..156a601b29 100644 --- a/cpp/src/pdlp/solve.cu +++ b/cpp/src/pdlp/solve.cu @@ -2126,11 +2126,12 @@ optimization_problem_solution_t solve_lp( bool problem_checking, bool use_pdlp_solver_mode) { - cuopt_expects(settings.hyper_params.use_distributed_pdlp, - error_type_t::ValidationError, - "solve_lp from mps_data_model: settings.hyper_params.use_distributed_pdlp must be true"); - return solve_lp_distributed_from_mps( - handle_ptr, mps_data_model, settings, problem_checking, use_pdlp_solver_mode); + cuopt_expects( + settings.hyper_params.use_distributed_pdlp, + error_type_t::ValidationError, + "solve_lp from mps_data_model: settings.hyper_params.use_distributed_pdlp must be true"); + return solve_lp_distributed_from_mps( + handle_ptr, mps_data_model, settings, problem_checking, 
use_pdlp_solver_mode); } template @@ -2182,12 +2183,13 @@ optimization_problem_solution_t solve_lp_distributed_from_mps( const i_t n_vars = static_cast(mps_data_model.get_objective_coefficients().size()); const i_t n_cstr = static_cast(mps_data_model.get_constraint_lower_bounds().size()); const i_t nnz = static_cast(mps_data_model.get_constraint_matrix_values().size()); - CUOPT_LOG_INFO("Solving a problem with %d constraints, %d variables (%d integers), and %d " - "nonzeros (distributed mps-direct path)", - n_cstr, - n_vars, - 0, - nnz); + CUOPT_LOG_INFO( + "Solving a problem with %d constraints, %d variables (%d integers), and %d " + "nonzeros (distributed mps-direct path)", + n_cstr, + n_vars, + 0, + nnz); auto lp_timer = cuopt::timer_t(settings_resolved.time_limit); @@ -2200,8 +2202,7 @@ optimization_problem_solution_t solve_lp_distributed_from_mps( } detail::problem_t placeholder_problem(placeholder_op); - detail::pdlp_solver_t solver( - placeholder_problem, mps_data_model, settings_resolved); + detail::pdlp_solver_t solver(placeholder_problem, mps_data_model, settings_resolved); auto sol = solver.run_solver(lp_timer); diff --git a/cpp/src/pdlp/solve.cuh b/cpp/src/pdlp/solve.cuh index abb657943f..160f4602ba 100644 --- a/cpp/src/pdlp/solve.cuh +++ b/cpp/src/pdlp/solve.cuh @@ -64,8 +64,7 @@ cuopt::linear_programming::optimization_problem_solution_t solve_lp_wi * @pre `settings.hyper_params.use_distributed_pdlp == true`. 
*/ template -cuopt::linear_programming::optimization_problem_solution_t -solve_lp_distributed_from_mps( +cuopt::linear_programming::optimization_problem_solution_t solve_lp_distributed_from_mps( raft::handle_t const* handle_ptr, const cuopt::linear_programming::io::mps_data_model_t& mps_data_model, pdlp_solver_settings_t const& settings, diff --git a/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.cu b/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.cu index 530a426117..aac777a44e 100644 --- a/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.cu +++ b/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.cu @@ -369,12 +369,10 @@ void adaptive_step_size_strategy_t::compute_interaction_and_movement( i_t owned_cstr_size) { // mGPU needs to know owned size to restrict the reductions to the owned prefix - const i_t reduce_primal_size = (owned_primal_size >= 0) - ? owned_primal_size - : current_saddle_point_state.get_primal_size(); - const i_t reduce_dual_size = (owned_cstr_size >= 0) - ? owned_cstr_size - : current_saddle_point_state.get_dual_size(); + const i_t reduce_primal_size = + (owned_primal_size >= 0) ? owned_primal_size : current_saddle_point_state.get_primal_size(); + const i_t reduce_dual_size = + (owned_cstr_size >= 0) ? 
owned_cstr_size : current_saddle_point_state.get_dual_size(); // QP would need this: // if iszero(problem.objective_matrix) diff --git a/cpp/src/pdlp/termination_strategy/convergence_information.cu b/cpp/src/pdlp/termination_strategy/convergence_information.cu index 44ddd5b2a1..1dfc8229da 100644 --- a/cpp/src/pdlp/termination_strategy/convergence_information.cu +++ b/cpp/src/pdlp/termination_strategy/convergence_information.cu @@ -213,8 +213,8 @@ void convergence_information_t::init_l2_norms() } template -void convergence_information_t::compute_owned_reference_norm_partials( - i_t owned_var_size, i_t owned_cstr_size) +void convergence_information_t::compute_owned_reference_norm_partials(i_t owned_var_size, + i_t owned_cstr_size) { cuopt_assert(!batch_mode_, "owned reference-norm partials only used in non-batch mGPU mode"); cuopt_assert(owned_var_size <= primal_size_h_, "owned_var_size must be <= primal_size_h_"); @@ -233,7 +233,7 @@ void convergence_information_t::compute_owned_reference_norm_partials( // rhs_sum_of_squares(lower[0:owned_cstr], upper[0:owned_cstr]) (no sqrt) { rmm::device_buffer d_temp_storage; - size_t bytes = 0; + size_t bytes = 0; auto zip_begin = thrust::make_zip_iterator(problem_ptr->constraint_lower_bounds.data(), problem_ptr->constraint_upper_bounds.data()); cub::DeviceReduce::TransformReduce(nullptr, @@ -491,8 +491,7 @@ void convergence_information_t::compute_convergence_information( print("dual_slack", dual_slack); #endif - if (current_pdhg_solver.is_multi_gpu()) - { + if (current_pdhg_solver.is_multi_gpu()) { auto* engine = current_pdhg_solver.get_mgpu_engine(); cuopt_assert(engine != nullptr, "mGPU branch reached but current_pdhg_solver has no engine (shard pdhg?)"); @@ -502,19 +501,17 @@ void convergence_information_t::compute_convergence_information( // Prepares halo values in potential_next_primal_solution - engine->halo_exchange_var( - [](pdhg_solver_t& pdhg) -> rmm::device_uvector& { - return 
pdhg.get_potential_next_primal_solution(); - }); + engine->halo_exchange_var([](pdhg_solver_t& pdhg) -> rmm::device_uvector& { + return pdhg.get_potential_next_primal_solution(); + }); for (auto& shard : engine->shards) { raft::device_setter guard(shard->device_id); auto& sub_pdlp = *shard->sub_pdlp; auto& sub_conv = sub_pdlp.get_current_termination_strategy().get_convergence_information(); - sub_conv.compute_primal_residual( - sub_conv.op_problem_cusparse_view_, - sub_pdlp.pdhg_solver_.get_dual_tmp_resource(), - sub_pdlp.pdhg_solver_.get_potential_next_dual_solution()); + sub_conv.compute_primal_residual(sub_conv.op_problem_cusparse_view_, + sub_pdlp.pdhg_solver_.get_dual_tmp_resource(), + sub_pdlp.pdhg_solver_.get_potential_next_dual_solution()); sub_conv.compute_primal_objective_owned_partial( sub_pdlp.pdhg_solver_.get_potential_next_primal_solution(), shard->rank_data.owned_var_size); @@ -522,13 +519,12 @@ void convergence_information_t::compute_convergence_information( // Reduce all primal objectives across shards cuopt_assert(!batch_mode_, "multi-GPU PDLP is not supported in batch mode"); - engine->allreduce_sum_inplace( - [](pdlp_solver_t& sp) -> f_t* { - return sp.get_current_termination_strategy() - .get_convergence_information() - .get_primal_objective() - .data(); - }); + engine->allreduce_sum_inplace([](pdlp_solver_t& sp) -> f_t* { + return sp.get_current_termination_strategy() + .get_convergence_information() + .get_primal_objective() + .data(); + }); // Get the reduced primal objective from the shard[0] (arbitrary) // Sync shards with master stream to avoid race conditions @@ -536,16 +532,15 @@ void convergence_information_t::compute_convergence_information( { auto& s0 = *engine->shards[0]; raft::device_setter guard(s0.device_id); - auto& s0_conv = - s0.sub_pdlp->get_current_termination_strategy().get_convergence_information(); + auto& s0_conv = s0.sub_pdlp->get_current_termination_strategy().get_convergence_information(); 
raft::copy(primal_objective_.data(), s0_conv.get_primal_objective().data(), 1, stream_view_); } apply_primal_objective_scaling_and_offset(); + } else { + compute_primal_residual( + op_problem_cusparse_view_, current_pdhg_solver.get_dual_tmp_resource(), dual_iterate); + compute_primal_objective(primal_iterate); } - else { - compute_primal_residual( - op_problem_cusparse_view_, current_pdhg_solver.get_dual_tmp_resource(), dual_iterate); - compute_primal_objective(primal_iterate);} #ifdef CUPDLP_DEBUG_MODE print("Primal Residual", primal_residual_); @@ -556,9 +551,7 @@ void convergence_information_t::compute_convergence_information( auto* engine = current_pdhg_solver.get_mgpu_engine(); engine->distributed_l2_norm( [](pdlp_solver_t& sp) -> rmm::device_uvector& { - return sp.get_current_termination_strategy() - .get_convergence_information() - .primal_residual_; + return sp.get_current_termination_strategy().get_convergence_information().primal_residual_; }, [](pdlp_solver_t& sp) -> f_t* { return sp.get_current_termination_strategy() @@ -629,10 +622,9 @@ void convergence_information_t::compute_convergence_information( // cv.dual_solution descriptor, which (cuPDLPx, see // cusparse_view.cu:931-937) is bound to _potential_next_dual -- not to // current.dual_solution. So we must halo-exchange the same buffer. 
- engine->halo_exchange_cstr( - [](pdhg_solver_t& pdhg) -> rmm::device_uvector& { - return pdhg.get_potential_next_dual_solution(); - }); + engine->halo_exchange_cstr([](pdhg_solver_t& pdhg) -> rmm::device_uvector& { + return pdhg.get_potential_next_dual_solution(); + }); // 2-3) Per-shard: // - compute_dual_residual: shard.dual_residual_ has owned-var entries @@ -653,11 +645,10 @@ void convergence_information_t::compute_convergence_information( raft::device_setter guard(shard->device_id); auto& sub_pdlp = *shard->sub_pdlp; auto& sub_conv = sub_pdlp.get_current_termination_strategy().get_convergence_information(); - sub_conv.compute_dual_residual( - sub_conv.op_problem_cusparse_view_, - sub_pdlp.pdhg_solver_.get_primal_tmp_resource(), - sub_pdlp.pdhg_solver_.get_potential_next_primal_solution(), - sub_pdlp.pdhg_solver_.get_dual_slack()); + sub_conv.compute_dual_residual(sub_conv.op_problem_cusparse_view_, + sub_pdlp.pdhg_solver_.get_primal_tmp_resource(), + sub_pdlp.pdhg_solver_.get_potential_next_primal_solution(), + sub_pdlp.pdhg_solver_.get_dual_slack()); sub_conv.compute_dual_objective_owned_partial( sub_pdlp.pdhg_solver_.get_potential_next_primal_solution(), sub_pdlp.pdhg_solver_.get_dual_slack(), @@ -668,21 +659,19 @@ void convergence_information_t::compute_convergence_information( // 4) Allreduce dual_objective_ across shards (sum, in place). Same // offset/scaling-after-allreduce reasoning as primal: applying offset // per-shard would over-count it Nshards times. 
- engine->allreduce_sum_inplace( - [](pdlp_solver_t& sp) -> f_t* { - return sp.get_current_termination_strategy() - .get_convergence_information() - .get_dual_objective() - .data(); - }); + engine->allreduce_sum_inplace([](pdlp_solver_t& sp) -> f_t* { + return sp.get_current_termination_strategy() + .get_convergence_information() + .get_dual_objective() + .data(); + }); // Sync shards with master stream to avoid race conditions engine->sync_await_shards(stream_view_); { auto& s0 = *engine->shards[0]; raft::device_setter guard(s0.device_id); - auto& s0_conv = - s0.sub_pdlp->get_current_termination_strategy().get_convergence_information(); + auto& s0_conv = s0.sub_pdlp->get_current_termination_strategy().get_convergence_information(); raft::copy(dual_objective_.data(), s0_conv.get_dual_objective().data(), 1, stream_view_); } apply_dual_objective_scaling_and_offset(); @@ -704,9 +693,7 @@ void convergence_information_t::compute_convergence_information( auto* engine = current_pdhg_solver.get_mgpu_engine(); engine->distributed_l2_norm( [](pdlp_solver_t& sp) -> rmm::device_uvector& { - return sp.get_current_termination_strategy() - .get_convergence_information() - .dual_residual_; + return sp.get_current_termination_strategy().get_convergence_information().dual_residual_; }, [](pdlp_solver_t& sp) -> f_t* { return sp.get_current_termination_strategy() @@ -758,7 +745,8 @@ void convergence_information_t::compute_convergence_information( std::numeric_limits::lowest()); } - // In mGPU, full primal_objective and dual_objective already mirrored to master so no special behaviour + // In mGPU, full primal_objective and dual_objective already mirrored to master so no special + // behaviour const auto [grid_size, block_size] = kernel_config_from_batch_size(climber_strategies_.size()); compute_remaining_stats_kernel <<>>(this->view(), climber_strategies_.size()); @@ -1049,12 +1037,11 @@ void convergence_information_t::compute_dual_objective_owned_partial( stream_view_); // 
dual_objective_ = dual_dot_ + sum_primal_slack_ (still a partial sum). - cub::DeviceTransform::Transform( - cuda::std::make_tuple(dual_dot_.data(), sum_primal_slack_.data()), - dual_objective_.data(), - 1, - cuda::std::plus<>{}, - stream_view_); + cub::DeviceTransform::Transform(cuda::std::make_tuple(dual_dot_.data(), sum_primal_slack_.data()), + dual_objective_.data(), + 1, + cuda::std::plus<>{}, + stream_view_); } template diff --git a/cpp/src/pdlp/utilities/mgpu_trace.cuh b/cpp/src/pdlp/utilities/mgpu_trace.cuh index 06a848b18e..d9975d3202 100644 --- a/cpp/src/pdlp/utilities/mgpu_trace.cuh +++ b/cpp/src/pdlp/utilities/mgpu_trace.cuh @@ -35,18 +35,18 @@ inline bool mgpu_trace_enabled() } // namespace cuopt::linear_programming::detail -#define MGPU_TRACE(msg) \ - do { \ - if (::cuopt::linear_programming::detail::mgpu_trace_enabled()) { \ - std::fprintf(stderr, "[mgpu %s:%d] %s\n", __func__, __LINE__, (msg)); \ - std::fflush(stderr); \ - } \ +#define MGPU_TRACE(msg) \ + do { \ + if (::cuopt::linear_programming::detail::mgpu_trace_enabled()) { \ + std::fprintf(stderr, "[mgpu %s:%d] %s\n", __func__, __LINE__, (msg)); \ + std::fflush(stderr); \ + } \ } while (0) -#define MGPU_TRACE_FMT(fmt, ...) \ - do { \ - if (::cuopt::linear_programming::detail::mgpu_trace_enabled()) { \ - std::fprintf(stderr, "[mgpu %s:%d] " fmt "\n", __func__, __LINE__, __VA_ARGS__); \ - std::fflush(stderr); \ - } \ +#define MGPU_TRACE_FMT(fmt, ...) 
\ + do { \ + if (::cuopt::linear_programming::detail::mgpu_trace_enabled()) { \ + std::fprintf(stderr, "[mgpu %s:%d] " fmt "\n", __func__, __LINE__, __VA_ARGS__); \ + std::fflush(stderr); \ + } \ } while (0) From 6df81454a13b14e51bf504615996f066e72172ec Mon Sep 17 00:00:00 2001 From: Bulle Mostovoi Date: Tue, 2 Jun 2026 09:16:25 -0700 Subject: [PATCH 61/67] fixed bound/objective rescaling, now afiro on 8 shards work but hangs in the end --- .../distributed_pdlp/multi_gpu_engine.hpp | 179 ++++++++++++++++++ .../pdlp/distributed_pdlp/partition_loader.cu | 5 +- .../initial_scaling.cu | 47 ++++- .../initial_scaling.cuh | 16 ++ cpp/src/pdlp/pdlp.cu | 13 ++ 5 files changed, 256 insertions(+), 4 deletions(-) diff --git a/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.hpp b/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.hpp index 0297ecc0a6..3a0fcb755d 100644 --- a/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.hpp +++ b/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.hpp @@ -18,16 +18,21 @@ #include #include +#include #include #include #include +#include +#include #include +#include #include #include #include +#include #include #include #include @@ -45,6 +50,29 @@ struct sqrt_inplace_op_t { __host__ __device__ f_t operator()(f_t x) const { return raft::sqrt(x); } }; +// Squared-norm contribution of a constraint's [lower, upper] bound pair, used to +// build the distributed bound rescaling (mirrors rhs_sum_of_squares_t). Defined +// at namespace scope to avoid extended-lambda-in-template restrictions. +template +struct mgpu_rhs_sq_op_t { + __host__ __device__ f_t operator()(const thrust::tuple& t) const + { + const f_t lower = thrust::get<0>(t); + const f_t upper = thrust::get<1>(t); + f_t sum = f_t(0); + if (isfinite(lower) && (lower != upper)) sum += lower * lower; + if (isfinite(upper)) sum += upper * upper; + return sum; + } +}; + +// Weighted square of an objective coefficient (mirrors weighted_square_op). 
+template +struct mgpu_weighted_sq_op_t { + f_t weight; + __host__ __device__ f_t operator()(f_t v) const { return v * v * weight; } +}; + template struct multi_gpu_engine_t { // Constructs shards from rank_data @@ -219,6 +247,63 @@ struct multi_gpu_engine_t { ncclGroupEnd(); } + // -------- Broadcast owned constraint (row) scaling into halo ------------ + void broadcast_constraint_scaling_to_halo() + { + const int nb = static_cast(shards.size()); + auto buf_access = [](pdlp_shard_t& s) -> rmm::device_uvector& { + return s.sub_pdlp->get_initial_scaling_strategy().get_cummulative_constraint_matrix_scaling(); + }; + + // Gather each owner's owned scaling values that peers need. + for (int r = 0; r < nb; ++r) { + auto& s = *shards[r]; + raft::device_setter guard(s.device_id); + auto& y = buf_access(s); + for (int peer = 0; peer < nb; ++peer) { + if (peer == r) continue; + if (s.cstr_send_indices_d[peer].size() == 0) continue; + thrust::gather(rmm::exec_policy_nosync(s.stream.view()), + s.cstr_send_indices_d[peer].begin(), + s.cstr_send_indices_d[peer].end(), + y.begin(), + s.cstr_send_buf_d[peer].begin()); + } + } + + ncclGroupStart(); + for (int r = 0; r < nb; ++r) { + auto& s = *shards[r]; + raft::device_setter guard(s.device_id); + for (int peer = 0; peer < nb; ++peer) { + if (peer == r) continue; + ncclSend(s.cstr_send_buf_d[peer].data(), + s.cstr_send_buf_d[peer].size(), + ncclFloat64, + peer, + s.comm.get(), + s.stream.view().value()); + } + } + for (int r = 0; r < nb; ++r) { + auto& s = *shards[r]; + auto& rd = s.rank_data; + raft::device_setter guard(s.device_id); + auto& y = buf_access(s); + for (int peer = 0; peer < nb; ++peer) { + if (peer == r) continue; + f_t* recv_ptr = y.data() + rd.owned_cstr_size + rd.cstr_recv_offsets[peer]; + ncclRecv(recv_ptr, + static_cast(rd.cstr_recv_counts[peer]), + ncclFloat64, + peer, + s.comm.get(), + s.stream.view().value()); + } + } + ncclGroupEnd(); + } + // -------- NCCL allreduce (sum, in place) 
-------------------------------- // Per-shard in-place sum-allreduce. Each shard's stream issues an // ncclAllReduce(buf, buf, count, ncclFloat64, ncclSum, ...) inside a single @@ -281,6 +366,100 @@ struct multi_gpu_engine_t { }); } + // -------- Distributed bound / objective rescaling ----------------------- + void distributed_bound_objective_rescaling(f_t c_scaling_weight) + { + const int nb = static_cast(shards.size()); + + std::vector> bound_sq; + std::vector> obj_sq; + bound_sq.reserve(nb); + obj_sq.reserve(nb); + + // 1) per-shard partial squared norms over OWNED entries only (halo rhs is + // +/-inf and would otherwise double-count owned entries shared as halo). + for (int r = 0; r < nb; ++r) { + auto& s = *shards[r]; + raft::device_setter guard(s.device_id); + bound_sq.emplace_back(1, s.stream.view()); + obj_sq.emplace_back(1, s.stream.view()); + + const auto& scaled = s.sub_pdlp->get_initial_scaling_strategy().get_scaled_op_problem(); + const int n_owned_cstr = static_cast(s.rank_data.owned_cstr_size); + const int n_owned_var = static_cast(s.rank_data.owned_var_size); + + auto bound_in = thrust::make_transform_iterator( + thrust::make_zip_iterator(scaled.constraint_lower_bounds.data(), + scaled.constraint_upper_bounds.data()), + mgpu_rhs_sq_op_t{}); + size_t tmp_bytes_b = 0; + cub::DeviceReduce::Sum( + nullptr, tmp_bytes_b, bound_in, bound_sq[r].data(), n_owned_cstr, s.stream.view().value()); + rmm::device_buffer scratch_b(tmp_bytes_b, s.stream.view()); + cub::DeviceReduce::Sum(scratch_b.data(), + tmp_bytes_b, + bound_in, + bound_sq[r].data(), + n_owned_cstr, + s.stream.view().value()); + + auto obj_in = thrust::make_transform_iterator(scaled.objective_coefficients.data(), + mgpu_weighted_sq_op_t{c_scaling_weight}); + size_t tmp_bytes_o = 0; + cub::DeviceReduce::Sum( + nullptr, tmp_bytes_o, obj_in, obj_sq[r].data(), n_owned_var, s.stream.view().value()); + rmm::device_buffer scratch_o(tmp_bytes_o, s.stream.view()); + 
cub::DeviceReduce::Sum(scratch_o.data(), + tmp_bytes_o, + obj_in, + obj_sq[r].data(), + n_owned_var, + s.stream.view().value()); + } + + // 2) NCCL allreduce SUM -> every shard holds the global squared norms. + ncclGroupStart(); + for (int r = 0; r < nb; ++r) { + auto& s = *shards[r]; + raft::device_setter guard(s.device_id); + ncclAllReduce(bound_sq[r].data(), + bound_sq[r].data(), + 1, + ncclFloat64, + ncclSum, + s.comm.get(), + s.stream.view().value()); + ncclAllReduce(obj_sq[r].data(), + obj_sq[r].data(), + 1, + ncclFloat64, + ncclSum, + s.comm.get(), + s.stream.view().value()); + } + ncclGroupEnd(); + + // 3) derive the identical scalars and apply on every shard. + for (int r = 0; r < nb; ++r) { + auto& s = *shards[r]; + raft::device_setter guard(s.device_id); + f_t h_bound_sq = f_t(0); + f_t h_obj_sq = f_t(0); + raft::copy(&h_bound_sq, bound_sq[r].data(), 1, s.stream.view()); + raft::copy(&h_obj_sq, obj_sq[r].data(), 1, s.stream.view()); + s.stream.synchronize(); + const f_t bound_rescaling = f_t(1) / (std::sqrt(h_bound_sq) + f_t(1)); + const f_t objective_rescaling = f_t(1) / (std::sqrt(h_obj_sq) + f_t(1)); + s.sub_pdlp->get_initial_scaling_strategy().apply_distributed_bound_objective_rescaling( + bound_rescaling, objective_rescaling); + } + for (int r = 0; r < nb; ++r) { + auto& s = *shards[r]; + raft::device_setter guard(s.device_id); + s.stream.synchronize(); + } + } + // -------- Generic distributed SpMVs ------------------------------------- // distributed_spmv_A : halo-update the var-shaped input buffer returned by // `in_buf(pdhg)`, then per-shard A @ in_buf -> out_desc. 
diff --git a/cpp/src/pdlp/distributed_pdlp/partition_loader.cu b/cpp/src/pdlp/distributed_pdlp/partition_loader.cu index 5c317f664e..0ef1eaf4da 100644 --- a/cpp/src/pdlp/distributed_pdlp/partition_loader.cu +++ b/cpp/src/pdlp/distributed_pdlp/partition_loader.cu @@ -196,8 +196,7 @@ std::vector> partition_loader_t::create_rank_dat if (peer == rank) continue; for (auto recv_cstr : rank_data[peer].cstr_send_per_peer[rank]) { rd.global_to_local_cstr[recv_cstr] = curr_id; - // rd.local_to_global_cstr.push_back(recv_cstr); // Not needed, we only do local_to_global - // on owned side + rd.local_to_global_cstr.push_back(recv_cstr); curr_id++; } } @@ -212,7 +211,7 @@ std::vector> partition_loader_t::create_rank_dat if (peer == rank) continue; for (auto recv_var : rank_data[peer].var_send_per_peer[rank]) { rd.global_to_local_var[recv_var] = curr_id; - // rd.local_to_global_var.push_back(recv_var); // same as over + rd.local_to_global_var.push_back(recv_var); curr_id++; } } diff --git a/cpp/src/pdlp/initial_scaling_strategy/initial_scaling.cu b/cpp/src/pdlp/initial_scaling_strategy/initial_scaling.cu index dcc3e662b0..cb498b3756 100644 --- a/cpp/src/pdlp/initial_scaling_strategy/initial_scaling.cu +++ b/cpp/src/pdlp/initial_scaling_strategy/initial_scaling.cu @@ -644,7 +644,8 @@ void pdlp_initial_scaling_strategy_t::scale_problem() cuda::std::multiplies{}, stream_view_); - if (hyper_params_.bound_objective_rescaling && !running_mip_) { + if (hyper_params_.bound_objective_rescaling && !running_mip_ && + !skip_distributed_local_rescaling_) { // Coefficients are computed on the already scaled values bound_objective_rescaling(); @@ -957,6 +958,50 @@ const problem_t& pdlp_initial_scaling_strategy_t::get_scaled return op_problem_scaled_; } +template +void pdlp_initial_scaling_strategy_t::apply_distributed_bound_objective_rescaling( + f_t bound_rescaling, f_t objective_rescaling) +{ + using f_t2 = typename type_2::type; + + // constraint bounds *= bound_rescaling (matches 
scale_problem() bound block) + cub::DeviceTransform::Transform( + cuda::std::make_tuple(op_problem_scaled_.constraint_lower_bounds.data(), + op_problem_scaled_.constraint_upper_bounds.data()), + thrust::make_zip_iterator(op_problem_scaled_.constraint_lower_bounds.data(), + op_problem_scaled_.constraint_upper_bounds.data()), + op_problem_scaled_.constraint_upper_bounds.size(), + [bound_rescaling] __device__(f_t lower, f_t upper) -> thrust::tuple { + return {lower * bound_rescaling, upper * bound_rescaling}; + }, + stream_view_.value()); + + // variable bounds *= bound_rescaling (batch-1 path only; distributed is batch 1) + cub::DeviceTransform::Transform( + op_problem_scaled_.variable_bounds.data(), + op_problem_scaled_.variable_bounds.data(), + op_problem_scaled_.variable_bounds.size(), + [bound_rescaling] __device__(f_t2 variable_bounds) -> f_t2 { + return {variable_bounds.x * bound_rescaling, variable_bounds.y * bound_rescaling}; + }, + stream_view_); + + // objective *= objective_rescaling + cub::DeviceTransform::Transform( + op_problem_scaled_.objective_coefficients.data(), + op_problem_scaled_.objective_coefficients.data(), + op_problem_scaled_.objective_coefficients.size(), + [objective_rescaling] __device__(f_t c) -> f_t { return c * objective_rescaling; }, + stream_view_); + + // Store the factors (sets both host copies and the device rescaling vectors) + // so unscale_solutions() / scale_solutions() apply them consistently. The flag + // hyper_params_.bound_objective_rescaling stays true on shards so those paths + // are active; only scale_problem()'s local recompute is skipped. 
+ set_h_bound_rescaling(bound_rescaling); + set_h_objective_rescaling(objective_rescaling); +} + template const rmm::device_uvector& pdlp_initial_scaling_strategy_t::get_constraint_matrix_scaling_vector() const diff --git a/cpp/src/pdlp/initial_scaling_strategy/initial_scaling.cuh b/cpp/src/pdlp/initial_scaling_strategy/initial_scaling.cuh index 148ccce238..409df5340a 100644 --- a/cpp/src/pdlp/initial_scaling_strategy/initial_scaling.cuh +++ b/cpp/src/pdlp/initial_scaling_strategy/initial_scaling.cuh @@ -75,6 +75,11 @@ class pdlp_initial_scaling_strategy_t { rmm::device_uvector& dual_slack) const; void unscale_solutions(solution_t& solution) const; const rmm::device_uvector& get_constraint_matrix_scaling_vector() const; + // Mutable access needed by distributed PDLP to broadcast owned constraint + rmm::device_uvector& get_cummulative_constraint_matrix_scaling() + { + return cummulative_constraint_matrix_scaling_; + } const rmm::device_uvector& get_variable_scaling_vector() const; const problem_t& get_scaled_op_problem(); @@ -94,6 +99,14 @@ class pdlp_initial_scaling_strategy_t { void bound_objective_rescaling(); + // Distributed PDLP: apply an externally-computed GLOBAL bound / objective + // rescaling to the already-scaled problem. + void apply_distributed_bound_objective_rescaling(f_t bound_rescaling, f_t objective_rescaling); + + // Distributed PDLP: skip the LOCAL bound/objective rescaling inside + // scale_problem() + void set_skip_distributed_local_rescaling(bool value) { skip_distributed_local_rescaling_ = value; } + // Public for distributed PDLP void compute_scaling_vectors(i_t number_of_ruiz_iterations, f_t alpha); @@ -144,5 +157,8 @@ class pdlp_initial_scaling_strategy_t { rmm::device_uvector& A_T_indices_; const pdlp_hyper_params::pdlp_hyper_params_t& hyper_params_; bool running_mip_; + // Distributed PDLP: when true, scale_problem() skips its local + // bound/objective rescaling (the global factor is applied separately). 
+ bool skip_distributed_local_rescaling_{false}; }; } // namespace cuopt::linear_programming::detail diff --git a/cpp/src/pdlp/pdlp.cu b/cpp/src/pdlp/pdlp.cu index 576ab417f1..4200b487c8 100644 --- a/cpp/src/pdlp/pdlp.cu +++ b/cpp/src/pdlp/pdlp.cu @@ -591,14 +591,21 @@ pdlp_solver_t::pdlp_solver_t( multi_gpu_engine->distributed_ruiz_inf_scaling( settings_.hyper_params.default_l_inf_ruiz_iterations, n_vars); } + // push local scaling to halo + multi_gpu_engine->broadcast_constraint_scaling_to_halo(); if (settings_.hyper_params.do_pock_chambolle_scaling) { multi_gpu_engine->distributed_pock_chambolle_scaling( static_cast(settings_.hyper_params.default_alpha_pock_chambolle_rescaling), n_vars); } + // Refresh the halo constraint scaling after Pock-Chambolle + multi_gpu_engine->broadcast_constraint_scaling_to_halo(); for (auto& shard : multi_gpu_engine->shards) { raft::device_setter guard(shard->device_id); auto& scaling = shard->sub_pdlp->get_initial_scaling_strategy(); + // Skip the per-shard local bound/objective rescaling; the global factor is + // applied below. Keeps the unscale path active (flag stays true). + scaling.set_skip_distributed_local_rescaling(true); scaling.scale_problem(); shard->sub_pdlp->pdhg_solver_.get_cusparse_view().create_spmv_op_plans( @@ -609,6 +616,12 @@ pdlp_solver_t::pdlp_solver_t( shard->stream.synchronize(); } + // Global bound/objective rescaling: allreduce the owned partial squared-norms + if (settings_.hyper_params.bound_objective_rescaling && !inside_mip_) { + multi_gpu_engine->distributed_bound_objective_rescaling( + static_cast(settings_.hyper_params.initial_primal_weight_c_scaling)); + } + // ----- 8b. 
Seed initial step-size / primal-weight (distributed, scales to N shards) ----- constexpr f_t kStepSizeScale = f_t{0.998}; const f_t sigma_max = multi_gpu_engine->distributed_max_singular_value(n_cstr); From df9f79366cfc9a997bd59480bae9ae623edafcc6 Mon Sep 17 00:00:00 2001 From: Bulle Mostovoi Date: Tue, 2 Jun 2026 12:29:09 -0700 Subject: [PATCH 62/67] actually disable the graph ^^ (kms) --- cpp/src/pdlp/solve.cu | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/cpp/src/pdlp/solve.cu b/cpp/src/pdlp/solve.cu index 156a601b29..228bacfd21 100644 --- a/cpp/src/pdlp/solve.cu +++ b/cpp/src/pdlp/solve.cu @@ -2155,6 +2155,10 @@ optimization_problem_solution_t solve_lp_distributed_from_mps( "use_distributed_pdlp; please set settings.presolver = presolver_t::None"); pdlp_solver_settings_t settings_resolved = settings; + + detail::pdlp_graph_disabled_flag().store(settings_resolved.hyper_params.pdlp_disable_graph, + std::memory_order_relaxed); + if (settings_resolved.distributed_pdlp_num_gpus == -1) { settings_resolved.distributed_pdlp_num_gpus = raft::device_setter::get_device_count(); CUOPT_LOG_INFO( From 4c8bcd1a6710fc4b56b74d1d99ed21f39221b0c1 Mon Sep 17 00:00:00 2001 From: Bulle Mostovoi Date: Thu, 4 Jun 2026 12:21:34 +0200 Subject: [PATCH 63/67] added option to export parts file --- .../cuopt/linear_programming/constants.h | 1 + .../pdlp/solver_settings.hpp | 5 +++++ cpp/src/math_optimization/solver_settings.cu | 1 + .../pdlp/distributed_pdlp/partition_loader.cu | 18 ++++++++++++++++++ .../pdlp/distributed_pdlp/partition_loader.hpp | 6 ++++++ cpp/src/pdlp/pdlp.cu | 9 +++++++++ 6 files changed, 40 insertions(+) diff --git a/cpp/include/cuopt/linear_programming/constants.h b/cpp/include/cuopt/linear_programming/constants.h index e2cc264cdc..e24ca5c346 100644 --- a/cpp/include/cuopt/linear_programming/constants.h +++ b/cpp/include/cuopt/linear_programming/constants.h @@ -85,6 +85,7 @@ #define CUOPT_NUM_GPUS "num_gpus" #define CUOPT_DISTRIBUTED_PDLP_NUM_GPUS 
"distributed_pdlp_num_gpus" #define CUOPT_MULTI_GPU_PARTITION_FILE "multi_gpu_partition_file" +#define CUOPT_MULTI_GPU_EXPORT_PARTITION_FILE "multi_gpu_export_partition_file" #define CUOPT_USE_DISTRIBUTED_PDLP "use_distributed_pdlp" #define CUOPT_PDLP_DISABLE_GRAPH "pdlp_disable_graph" #define CUOPT_USER_PROBLEM_FILE "user_problem_file" diff --git a/cpp/include/cuopt/linear_programming/pdlp/solver_settings.hpp b/cpp/include/cuopt/linear_programming/pdlp/solver_settings.hpp index efdbd5733c..1443333df4 100644 --- a/cpp/include/cuopt/linear_programming/pdlp/solver_settings.hpp +++ b/cpp/include/cuopt/linear_programming/pdlp/solver_settings.hpp @@ -311,6 +311,11 @@ class pdlp_solver_settings_t { // -1 means auto-detect int distributed_pdlp_num_gpus{-1}; std::string multi_gpu_partition_file{""}; + // If non-empty, the partition computed for distributed PDLP is written to this + // path (one part-id per line) right after partitioning. The file can be fed + // back via multi_gpu_partition_file. Exposed as the multi_gpu_export_partition_file + // parameter (CLI: --multi-gpu-export-partition-file ). 
+ std::string multi_gpu_export_partition_file{""}; // Set to true inside the shards bool is_distributed_sub_pdlp{false}; method_t method{method_t::Concurrent}; diff --git a/cpp/src/math_optimization/solver_settings.cu b/cpp/src/math_optimization/solver_settings.cu index 629c8a8428..87324524f1 100644 --- a/cpp/src/math_optimization/solver_settings.cu +++ b/cpp/src/math_optimization/solver_settings.cu @@ -192,6 +192,7 @@ solver_settings_t::solver_settings_t() : pdlp_settings(), mip_settings {CUOPT_PRESOLVE_FILE, &mip_settings.presolve_file, ""}, {CUOPT_PRESOLVE_FILE, &pdlp_settings.presolve_file, ""}, {CUOPT_MULTI_GPU_PARTITION_FILE, &pdlp_settings.multi_gpu_partition_file, ""}, + {CUOPT_MULTI_GPU_EXPORT_PARTITION_FILE, &pdlp_settings.multi_gpu_export_partition_file, ""}, }; // clang-format on } diff --git a/cpp/src/pdlp/distributed_pdlp/partition_loader.cu b/cpp/src/pdlp/distributed_pdlp/partition_loader.cu index 0ef1eaf4da..a6db3a9fe8 100644 --- a/cpp/src/pdlp/distributed_pdlp/partition_loader.cu +++ b/cpp/src/pdlp/distributed_pdlp/partition_loader.cu @@ -40,6 +40,24 @@ std::vector partition_loader_t::parse_distributed_pdlp_partition_ return parts; } +template +void partition_loader_t::export_distributed_pdlp_partition_file( + std::string const& file, std::vector const& parts) +{ + std::ofstream part_file(file); + cuopt_expects(part_file.is_open(), + error_type_t::ValidationError, + "Failed to open partition file for export: %s", + file.c_str()); + for (auto const& part : parts) { + part_file << part << "\n"; + } + cuopt_expects(part_file.good(), + error_type_t::RuntimeError, + "Failed while writing partition file: %s", + file.c_str()); +} + template std::vector> partition_loader_t::create_rank_data_from_parts( const std::vector& parts, diff --git a/cpp/src/pdlp/distributed_pdlp/partition_loader.hpp b/cpp/src/pdlp/distributed_pdlp/partition_loader.hpp index 915c24a828..ce12d241f9 100644 --- a/cpp/src/pdlp/distributed_pdlp/partition_loader.hpp +++ 
b/cpp/src/pdlp/distributed_pdlp/partition_loader.hpp @@ -19,6 +19,12 @@ struct partition_loader_t { // nb_cstr + nb_vars, indexed as in create_rank_data_from_parts (cstrs first, then vars). static std::vector parse_distributed_pdlp_partition_file(std::string const& file); + // Write a partition vector to file in the same format parse_... reads back: + // one part-id per line. Useful for inspecting / reusing a computed partition + // (e.g. CLI --distributed-pdlp-export-parts). + static void export_distributed_pdlp_partition_file(std::string const& file, + std::vector const& parts); + // Slices the data to prepare a split from metis partitionning with halo communication static std::vector> create_rank_data_from_parts( const std::vector& parts, diff --git a/cpp/src/pdlp/pdlp.cu b/cpp/src/pdlp/pdlp.cu index 4200b487c8..150311ae33 100644 --- a/cpp/src/pdlp/pdlp.cu +++ b/cpp/src/pdlp/pdlp.cu @@ -530,6 +530,14 @@ pdlp_solver_t::pdlp_solver_t( parts = partitioner->partition(partition_input); } + // Optionally dump the partition right after computing it (one part-id per line). + if (!settings.multi_gpu_export_partition_file.empty()) { + partition_loader_t::export_distributed_pdlp_partition_file( + settings.multi_gpu_export_partition_file, parts); + std::cout << "Exported " << parts.size() << " part-ids to " + << settings.multi_gpu_export_partition_file << std::endl; + } + // ----- 5. 
Build per-rank data ----- std::vector> sub_pdlp_rank_data = partition_loader_t::create_rank_data_from_parts(parts, @@ -551,6 +559,7 @@ pdlp_solver_t::pdlp_solver_t( sub_pdlp_settings.num_gpus = 1; sub_pdlp_settings.distributed_pdlp_num_gpus = 1; sub_pdlp_settings.multi_gpu_partition_file = ""; + sub_pdlp_settings.multi_gpu_export_partition_file = ""; sub_pdlp_settings.is_distributed_sub_pdlp = true; sub_pdlp_settings.hyper_params.default_l_inf_ruiz_iterations = 0; sub_pdlp_settings.hyper_params.default_alpha_pock_chambolle_rescaling = 0.0; From a8a8054b36333ffebeaba6312c2d998bfa9156ec Mon Sep 17 00:00:00 2001 From: Bulle Mostovoi Date: Thu, 4 Jun 2026 13:29:27 +0200 Subject: [PATCH 64/67] addded test for import export parts file --- cpp/tests/linear_programming/pdlp_test.cu | 62 +++++++++++++++++++++++ 1 file changed, 62 insertions(+) diff --git a/cpp/tests/linear_programming/pdlp_test.cu b/cpp/tests/linear_programming/pdlp_test.cu index d29995efc5..b20ce4a1c9 100644 --- a/cpp/tests/linear_programming/pdlp_test.cu +++ b/cpp/tests/linear_programming/pdlp_test.cu @@ -8,12 +8,16 @@ #include #include #include +#include +#include #include #include #include #include #include +#include + #include "utilities/pdlp_test_utilities.cuh" #include "../mip/mip_utils.cuh" @@ -91,6 +95,64 @@ TEST(pdlp_class, run_double) afiro_primal_objective, solution.get_additional_termination_information().primal_objective)); } +// Distributed-PDLP partition round-trip: partition the afiro constraint/variable +// bipartite graph with METIS, write it out, read it back, and confirm the parsed +// vector is identical to what the partitioner produced. 
+TEST(pdlp_class, distributed_partition_metis_export_import_roundtrip) +{ + using namespace cuopt::linear_programming::detail; + namespace ds = cuopt::linear_programming::dual_simplex; + + auto path = make_path_absolute("linear_programming/afiro_original.mps"); + cuopt::linear_programming::io::mps_data_model_t mps = + cuopt::linear_programming::io::parse_mps(path, true); + + const int n_vars = static_cast(mps.get_objective_coefficients().size()); + const int n_cstr = static_cast(mps.get_constraint_lower_bounds().size()); + const int nnz = static_cast(mps.get_constraint_matrix_values().size()); + + std::vector h_A_row_offsets = mps.get_constraint_matrix_offsets(); + std::vector h_A_col_indices = mps.get_constraint_matrix_indices(); + std::vector h_A_values = mps.get_constraint_matrix_values(); + + // Transpose A -> A^T (CSR of A^T == CSC of A), mirroring solve_lp_distributed_from_mps. + ds::csr_matrix_t A_csr(n_cstr, n_vars, nnz); + A_csr.row_start = h_A_row_offsets; + A_csr.j = h_A_col_indices; + A_csr.x = h_A_values; + ds::csc_matrix_t AT_as_csc(n_vars, n_cstr, nnz); + A_csr.to_compressed_col(AT_as_csc); + std::vector h_A_t_row_offsets = AT_as_csc.col_start; + std::vector h_A_t_col_indices = AT_as_csc.i; + + partitioner_input_t input; + input.nb_cstr = n_cstr; + input.nb_vars = n_vars; + input.nb_parts = 2; + input.A.row_offsets = &h_A_row_offsets; + input.A.col_indices = &h_A_col_indices; + input.A.num_rows = n_cstr; + input.A.num_cols = n_vars; + input.A_t.row_offsets = &h_A_t_row_offsets; + input.A_t.col_indices = &h_A_t_col_indices; + input.A_t.num_rows = n_vars; + input.A_t.num_cols = n_cstr; + + auto partitioner = make_partitioner(partitioner_kind_t::Metis); + std::vector parts = partitioner->partition(input); + ASSERT_EQ(parts.size(), static_cast(n_cstr + n_vars)); + + std::string dir = ::testing::TempDir(); + if (!dir.empty() && dir.back() != '/') { dir.push_back('/'); } + const std::string out_path = dir + "afiro_metis_roundtrip.parts"; + + 
partition_loader_t::export_distributed_pdlp_partition_file(out_path, parts); + std::vector reloaded = + partition_loader_t::parse_distributed_pdlp_partition_file(out_path); + + EXPECT_EQ(parts, reloaded); +} + TEST(pdlp_class, precision_mixed) { using namespace cuopt::linear_programming::detail; From 5abcd2e0feaa00efa9a43daa1be94cf4cb89f034 Mon Sep 17 00:00:00 2001 From: Bulle Mostovoi Date: Thu, 4 Jun 2026 14:43:36 +0200 Subject: [PATCH 65/67] added full solve tests --- cpp/tests/linear_programming/pdlp_test.cu | 104 ++++++++++++++++++++++ 1 file changed, 104 insertions(+) diff --git a/cpp/tests/linear_programming/pdlp_test.cu b/cpp/tests/linear_programming/pdlp_test.cu index b20ce4a1c9..65cc2f0d9f 100644 --- a/cpp/tests/linear_programming/pdlp_test.cu +++ b/cpp/tests/linear_programming/pdlp_test.cu @@ -49,11 +49,13 @@ #include #include +#include #include #include #include #include #include +#include #include #include #include @@ -153,6 +155,108 @@ TEST(pdlp_class, distributed_partition_metis_export_import_roundtrip) EXPECT_EQ(parts, reloaded); } +namespace { + +// Solve `mps_rel_path` with the single-GPU PDLP ("base") and with distributed PDLP +// (num_gpus = -1 => auto-detect; 1 GPU is fine), then assert the distributed run +// matches the base run on everything meaningful: termination status, step count +// (within 15%), primal/dual objective, and the full primal/dual solution vectors. +// All value comparisons use a loose relative tolerance. 
+void expect_distributed_matches_base(raft::handle_t const& handle, + std::string const& mps_rel_path, + bool fixed_mps_format = false) +{ + constexpr double loose_rel = 1e-3; + auto near_rel = [](double a, double b, double rel) { + return std::fabs(a - b) <= rel * (1.0 + std::fabs(a)); + }; + + auto path = make_path_absolute(mps_rel_path); + io::mps_data_model_t problem = io::parse_mps(path, fixed_mps_format); + + // Shared settings: PDLP, no presolve (distributed requires presolver == None, so the + // base run must match to keep the two problems identical). + pdlp_solver_settings_t base_settings{}; + base_settings.method = method_t::PDLP; + base_settings.presolver = presolver_t::None; + + // ----- base: single-GPU PDLP (materialize the full problem on one GPU) ----- + auto base_op = mps_data_model_to_optimization_problem(&handle, problem); + auto base = solve_lp(base_op, base_settings); + + // ----- distributed PDLP (identical settings, only the distributed flags flipped) ----- + pdlp_solver_settings_t dist_settings = base_settings; + dist_settings.hyper_params.use_distributed_pdlp = true; + dist_settings.distributed_pdlp_num_gpus = -1; + auto dist = solve_lp(&handle, problem, dist_settings); + + // ----- termination status ----- + ASSERT_EQ(static_cast(base.get_termination_status()), CUOPT_TERMINATION_STATUS_OPTIMAL) + << mps_rel_path << ": base did not reach optimal"; + EXPECT_EQ(static_cast(dist.get_termination_status()), + static_cast(base.get_termination_status())) + << mps_rel_path << ": distributed termination status differs from base"; + + const auto& base_info = base.get_additional_termination_information(); + const auto& dist_info = dist.get_additional_termination_information(); + + // ----- objectives ----- + EXPECT_TRUE(near_rel(base_info.primal_objective, dist_info.primal_objective, loose_rel)) + << mps_rel_path << ": primal objective base=" << base_info.primal_objective + << " distributed=" << dist_info.primal_objective; + 
EXPECT_TRUE(near_rel(base_info.dual_objective, dist_info.dual_objective, loose_rel)) + << mps_rel_path << ": dual objective base=" << base_info.dual_objective + << " distributed=" << dist_info.dual_objective; + + // ----- step count: within 15% of the larger of the two ----- + const int base_steps = base_info.number_of_steps_taken; + const int dist_steps = dist_info.number_of_steps_taken; + const int max_steps = std::max(base_steps, dist_steps); + const int step_diff = std::max(base_steps, dist_steps) - std::min(base_steps, dist_steps); + EXPECT_LE(static_cast(step_diff), 0.15 * max_steps) + << mps_rel_path << ": step counts differ by >15% (base=" << base_steps + << ", distributed=" << dist_steps << ")"; + + // ----- primal / dual solution vectors ----- + auto base_primal = cuopt::host_copy(base.get_primal_solution(), handle.get_stream()); + auto dist_primal = cuopt::host_copy(dist.get_primal_solution(), handle.get_stream()); + ASSERT_EQ(base_primal.size(), dist_primal.size()) << mps_rel_path << ": primal size mismatch"; + for (std::size_t i = 0; i < base_primal.size(); ++i) { + EXPECT_TRUE(near_rel(base_primal[i], dist_primal[i], loose_rel)) + << mps_rel_path << ": primal[" << i << "] base=" << base_primal[i] + << " distributed=" << dist_primal[i]; + } + + auto base_dual = cuopt::host_copy(base.get_dual_solution(), handle.get_stream()); + auto dist_dual = cuopt::host_copy(dist.get_dual_solution(), handle.get_stream()); + ASSERT_EQ(base_dual.size(), dist_dual.size()) << mps_rel_path << ": dual size mismatch"; + for (std::size_t i = 0; i < base_dual.size(); ++i) { + EXPECT_TRUE(near_rel(base_dual[i], dist_dual[i], loose_rel)) + << mps_rel_path << ": dual[" << i << "] base=" << base_dual[i] + << " distributed=" << dist_dual[i]; + } +} + +} // namespace + +TEST(pdlp_class, distributed_parity_afiro) +{ + const raft::handle_t handle{}; + expect_distributed_matches_base(handle, "linear_programming/afiro_original.mps", true); +} + +TEST(pdlp_class, 
distributed_parity_square41) +{ + const raft::handle_t handle{}; + expect_distributed_matches_base(handle, "linear_programming/neos3/neos3.mps"); +} + +TEST(pdlp_class, distributed_parity_a2864) +{ + const raft::handle_t handle{}; + expect_distributed_matches_base(handle, "linear_programming/a2864/a2864.mps"); +} + TEST(pdlp_class, precision_mixed) { using namespace cuopt::linear_programming::detail; From 0b0ce2ccd9b2d4f2e1273d7c5a548f81619836ba Mon Sep 17 00:00:00 2001 From: Bulle Mostovoi Date: Thu, 4 Jun 2026 16:15:58 +0200 Subject: [PATCH 66/67] added kaminpar partitionner and possibility to chose the partitionner --- cpp/CMakeLists.txt | 12 ++ cpp/cmake/thirdparty/get_kaminpar.cmake | 48 ++++++ .../cuopt/linear_programming/constants.h | 1 + .../pdlp/solver_settings.hpp | 8 + cpp/src/math_optimization/solver_settings.cu | 1 + cpp/src/pdlp/CMakeLists.txt | 1 + .../distributed_pdlp/kaminpar_partitioner.cpp | 142 ++++++++++++++++++ .../distributed_pdlp/kaminpar_partitioner.hpp | 23 +++ .../distributed_pdlp/metis_partitioner.cu | 13 +- cpp/src/pdlp/distributed_pdlp/partitioner.cu | 3 + cpp/src/pdlp/distributed_pdlp/partitioner.hpp | 9 +- cpp/src/pdlp/pdlp.cu | 54 +++++-- cpp/src/pdlp/solve.cu | 5 - 13 files changed, 291 insertions(+), 29 deletions(-) create mode 100644 cpp/cmake/thirdparty/get_kaminpar.cmake create mode 100644 cpp/src/pdlp/distributed_pdlp/kaminpar_partitioner.cpp create mode 100644 cpp/src/pdlp/distributed_pdlp/kaminpar_partitioner.hpp diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index d27072bcf9..0bf2b0f3f7 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -378,6 +378,17 @@ set_target_properties(metis_external PROPERTIES ) message(STATUS "Using METIS: ${METIS_LIBRARY}") +# ################################################################################################## +# - KaMinPar (multi-threaded partitioning for distributed PDLP) ------------------------------------ +# Brought in the RAPIDS way (rapids_cpm_find): uses an 
installed KaMinPar (deb/rpm/conda, +# discovered via its CMake config) if present, otherwise builds the pinned source via CPM. +# Distributed PDLP prefers KaMinPar over METIS. +include(cmake/thirdparty/get_kaminpar.cmake) +if (NOT TARGET KaMinPar::KaMinPar) + message(FATAL_ERROR "KaMinPar::KaMinPar was not made available by get_kaminpar.cmake") +endif () +message(STATUS "Using KaMinPar (distributed PDLP prefers KaMinPar over METIS)") + # ################################################################################################## # - gRPC and Protobuf setup ----------------------------------------------------------------------- @@ -642,6 +653,7 @@ target_link_libraries(cuopt ${CUOPT_PRIVATE_CUDA_LIBS} nccl_external metis_external + KaMinPar::KaMinPar $<$:protobuf::libprotobuf> $<$:gRPC::grpc++> ) diff --git a/cpp/cmake/thirdparty/get_kaminpar.cmake b/cpp/cmake/thirdparty/get_kaminpar.cmake new file mode 100644 index 0000000000..d548a76115 --- /dev/null +++ b/cpp/cmake/thirdparty/get_kaminpar.cmake @@ -0,0 +1,48 @@ +# cmake-format: off +# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# cmake-format: on + +# Multi-threaded graph partitioner for distributed PDLP. +# Uses rapids_cpm_find so a system / conda / .deb install of KaMinPar (which ships a +# CMake config package exporting KaMinPar::KaMinPar) is used when available, and +# otherwise the pinned source is cloned and built via CPM. KaMinPar depends on TBB, +# which cuOpt already requires (see find_package(TBB) for papilo). 
+function(find_and_configure_kaminpar) + set(oneValueArgs VERSION PINNED_TAG) + cmake_parse_arguments(PKG "" "${oneValueArgs}" "" ${ARGN}) + + rapids_cpm_find(KaMinPar ${PKG_VERSION} + GLOBAL_TARGETS KaMinPar::KaMinPar + CPM_ARGS + GIT_REPOSITORY https://github.com/KaHIP/KaMinPar.git + GIT_TAG ${PKG_PINNED_TAG} + EXCLUDE_FROM_ALL + OPTIONS + "KAMINPAR_BUILD_APPS OFF" + "KAMINPAR_BUILD_TOOLS OFF" + "KAMINPAR_BUILD_TESTS OFF" + "KAMINPAR_BUILD_BENCHMARKS OFF" + "KAMINPAR_BUILD_EXAMPLES OFF" + "KAMINPAR_BUILD_DISTRIBUTED OFF" + # Timers use global state and force single-threaded use of the library + # interface; disable so cuOpt can call the partitioner freely. + "KAMINPAR_ENABLE_TIMERS OFF" + # Avoid an extra hard dependency on Google Sparsehash. + "KAMINPAR_BUILD_WITH_SPARSEHASH OFF" + # cuOpt's TBB is discovered via a legacy find that only exposes TBB::tbb + # (no TBB::tbbmalloc target); disable KaMinPar's optional tbbmalloc use. + "KAMINPAR_ENABLE_TBB_MALLOC OFF" + # Large LP constraint graphs can exceed 2^31 directed edges. 
+ "KAMINPAR_64BIT_EDGE_IDS ON" + "INSTALL_KAMINPAR OFF" + ) + + if(KaMinPar_ADDED) + message(VERBOSE "CUOPT: Using KaMinPar located in ${KaMinPar_SOURCE_DIR}") + else() + message(VERBOSE "CUOPT: Using KaMinPar located in ${KaMinPar_DIR}") + endif() +endfunction() + +find_and_configure_kaminpar(VERSION 3.7.3 PINNED_TAG v3.7.3) diff --git a/cpp/include/cuopt/linear_programming/constants.h b/cpp/include/cuopt/linear_programming/constants.h index e24ca5c346..420a03526b 100644 --- a/cpp/include/cuopt/linear_programming/constants.h +++ b/cpp/include/cuopt/linear_programming/constants.h @@ -86,6 +86,7 @@ #define CUOPT_DISTRIBUTED_PDLP_NUM_GPUS "distributed_pdlp_num_gpus" #define CUOPT_MULTI_GPU_PARTITION_FILE "multi_gpu_partition_file" #define CUOPT_MULTI_GPU_EXPORT_PARTITION_FILE "multi_gpu_export_partition_file" +#define CUOPT_DISTRIBUTED_PDLP_PARTITIONER "distributed_pdlp_partitioner" #define CUOPT_USE_DISTRIBUTED_PDLP "use_distributed_pdlp" #define CUOPT_PDLP_DISABLE_GRAPH "pdlp_disable_graph" #define CUOPT_USER_PROBLEM_FILE "user_problem_file" diff --git a/cpp/include/cuopt/linear_programming/pdlp/solver_settings.hpp b/cpp/include/cuopt/linear_programming/pdlp/solver_settings.hpp index 1443333df4..42ef1f592a 100644 --- a/cpp/include/cuopt/linear_programming/pdlp/solver_settings.hpp +++ b/cpp/include/cuopt/linear_programming/pdlp/solver_settings.hpp @@ -316,6 +316,14 @@ class pdlp_solver_settings_t { // back via multi_gpu_partition_file. Exposed as the multi_gpu_export_partition_file // parameter (CLI: --multi-gpu-export-partition-file ). std::string multi_gpu_export_partition_file{""}; + // Which graph partitioner distributed PDLP uses. One of: + // "auto" - 1 GPU => Dummy; otherwise KaMinPar + // "dummy" - round-robin, no graph (trivial) + // "metis" - serial METIS_PartGraphKway + // "kaminpar" - multi-threaded KaMinPar + // Exposed as the distributed_pdlp_partitioner parameter + // (CLI: --distributed-pdlp-partitioner ). 
+ std::string distributed_pdlp_partitioner{"auto"}; // Set to true inside the shards bool is_distributed_sub_pdlp{false}; method_t method{method_t::Concurrent}; diff --git a/cpp/src/math_optimization/solver_settings.cu b/cpp/src/math_optimization/solver_settings.cu index 87324524f1..254a3afb38 100644 --- a/cpp/src/math_optimization/solver_settings.cu +++ b/cpp/src/math_optimization/solver_settings.cu @@ -193,6 +193,7 @@ solver_settings_t::solver_settings_t() : pdlp_settings(), mip_settings {CUOPT_PRESOLVE_FILE, &pdlp_settings.presolve_file, ""}, {CUOPT_MULTI_GPU_PARTITION_FILE, &pdlp_settings.multi_gpu_partition_file, ""}, {CUOPT_MULTI_GPU_EXPORT_PARTITION_FILE, &pdlp_settings.multi_gpu_export_partition_file, ""}, + {CUOPT_DISTRIBUTED_PDLP_PARTITIONER, &pdlp_settings.distributed_pdlp_partitioner, "auto"}, }; // clang-format on } diff --git a/cpp/src/pdlp/CMakeLists.txt b/cpp/src/pdlp/CMakeLists.txt index 863cf20962..12f2550203 100644 --- a/cpp/src/pdlp/CMakeLists.txt +++ b/cpp/src/pdlp/CMakeLists.txt @@ -34,6 +34,7 @@ set(LP_CORE_FILES ${CMAKE_CURRENT_SOURCE_DIR}/distributed_pdlp/partition_loader.cu ${CMAKE_CURRENT_SOURCE_DIR}/distributed_pdlp/partitioner.cu ${CMAKE_CURRENT_SOURCE_DIR}/distributed_pdlp/metis_partitioner.cu + ${CMAKE_CURRENT_SOURCE_DIR}/distributed_pdlp/kaminpar_partitioner.cpp ) # C and Python adapter files diff --git a/cpp/src/pdlp/distributed_pdlp/kaminpar_partitioner.cpp b/cpp/src/pdlp/distributed_pdlp/kaminpar_partitioner.cpp new file mode 100644 index 0000000000..e7bf943f92 --- /dev/null +++ b/cpp/src/pdlp/distributed_pdlp/kaminpar_partitioner.cpp @@ -0,0 +1,142 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + */ + +// Plain C++ translation unit (not .cu): KaMinPar's public header is C++20 host code +// and pulls in TBB; keeping it out of nvcc avoids device-compiler friction. 
+ +#include +#include + +#include + +#include + +#include + +#include +#include +#include +#include +#include + +namespace cuopt::linear_programming::detail { + +// Builds the bipartite constraint/variable graph induced by A (identical layout +// to metis_partitioner_t) and runs the multi-threaded KaMinPar k-way kernel. +// * nodes [0, nb_cstr) : constraint nodes +// * nodes [nb_cstr, nb_cstr+nb_vars): variable nodes +// * undirected edges from each A nonzero (one half via A, one via A_t) +template +std::vector kaminpar_partitioner_t::partition( + partitioner_input_t const& input) const +{ + cuopt_expects(input.nb_parts >= 1, + error_type_t::ValidationError, + "kaminpar_partitioner: nb_parts must be >= 1"); + cuopt_expects(input.nb_cstr >= 0 && input.nb_vars >= 0, + error_type_t::ValidationError, + "kaminpar_partitioner: invalid problem dimensions"); + + // The k-way kernel needs at least 2 blocks. For the single-shard case the + // partition is trivial (everything in block 0); short-circuit so KaMinPar can + // still be selected with distributed_pdlp_num_gpus == 1 without crashing. 
+ if (input.nb_parts == 1) { + CUOPT_LOG_INFO("KaMinPar: nb_parts == 1, returning trivial single-block partition"); + return std::vector(static_cast(input.nb_cstr + input.nb_vars), i_t{0}); + } + cuopt_expects(input.A.row_offsets != nullptr && input.A.col_indices != nullptr, + error_type_t::ValidationError, + "kaminpar_partitioner: A.row_offsets and A.col_indices are required"); + cuopt_expects(input.A_t.row_offsets != nullptr && input.A_t.col_indices != nullptr, + error_type_t::ValidationError, + "kaminpar_partitioner: A_t.row_offsets and A_t.col_indices are required"); + + auto const& A_offsets = *input.A.row_offsets; + auto const& A_cols = *input.A.col_indices; + auto const& A_t_offsets = *input.A_t.row_offsets; + auto const& A_t_cols = *input.A_t.col_indices; + + cuopt_expects(static_cast(A_offsets.size()) == input.nb_cstr + 1, + error_type_t::ValidationError, + "kaminpar_partitioner: A.row_offsets size mismatch (expected nb_cstr+1)"); + cuopt_expects(static_cast(A_t_offsets.size()) == input.nb_vars + 1, + error_type_t::ValidationError, + "kaminpar_partitioner: A_t.row_offsets size mismatch (expected nb_vars+1)"); + cuopt_expects(A_cols.size() == A_t_cols.size(), + error_type_t::ValidationError, + "kaminpar_partitioner: A and A_t nnz mismatch"); + + const i_t nb_cstr = input.nb_cstr; + const i_t nb_vars = input.nb_vars; + const i_t nnz = static_cast(A_cols.size()); + const i_t nvtx = nb_cstr + nb_vars; + + // Resolve thread count: <= 0 => all hardware threads (1 as a last resort). + int nthreads = input.nb_threads > 0 ? static_cast(input.nb_threads) : 0; + if (nthreads <= 0) { + nthreads = static_cast(std::thread::hardware_concurrency()); + if (nthreads <= 0) { nthreads = 1; } + } + + // Bipartite CSR using KaMinPar index types (EdgeID for offsets, NodeID for neighbours). 
+ std::vector xadj(static_cast(nvtx) + 1); + std::vector adjncy(2 * static_cast(nnz)); + + for (i_t i = 0; i <= nb_cstr; ++i) { + xadj[i] = static_cast(A_offsets[i]); + } + for (i_t i = 0; i <= nb_vars; ++i) { + xadj[nb_cstr + i] = + static_cast(A_t_offsets[i]) + static_cast(nnz); + } + for (i_t k = 0; k < nnz; ++k) { + adjncy[k] = + static_cast(A_cols[k]) + static_cast(nb_cstr); + } + for (i_t k = 0; k < nnz; ++k) { + adjncy[nnz + k] = static_cast(A_t_cols[k]); + } + + std::vector block_of(static_cast(nvtx)); + + kaminpar::KaMinPar engine(nthreads, kaminpar::shm::create_default_context()); + engine.copy_graph(std::span(xadj), + std::span(adjncy)); + engine.set_k(static_cast(input.nb_parts)); + // ~3% imbalance, matching METIS_PartGraphKway's default balance constraint. + engine.set_uniform_max_block_weights(0.03); + + auto t0 = std::chrono::high_resolution_clock::now(); + const kaminpar::shm::EdgeWeight edge_cut = + engine.compute_partition(std::span(block_of)); + auto t1 = std::chrono::high_resolution_clock::now(); + const double dt = std::chrono::duration(t1 - t0).count(); + + CUOPT_LOG_INFO( + "KaMinPar partitioned bipartite graph: nvtx=%d nnz=%d nb_parts=%d nthreads=%d edge_cut=%lld " + "in %.3fs", + static_cast(nvtx), + static_cast(nnz), + static_cast(input.nb_parts), + nthreads, + static_cast(edge_cut), + dt); + + std::vector parts(static_cast(nvtx)); + for (i_t i = 0; i < nvtx; ++i) { + parts[i] = static_cast(block_of[i]); + } + + validate_partition(parts, + static_cast(nb_cstr), + static_cast(nb_vars), + static_cast(input.nb_parts), + "kaminpar_partitioner"); + return parts; +} + +template class kaminpar_partitioner_t; + +} // namespace cuopt::linear_programming::detail diff --git a/cpp/src/pdlp/distributed_pdlp/kaminpar_partitioner.hpp b/cpp/src/pdlp/distributed_pdlp/kaminpar_partitioner.hpp new file mode 100644 index 0000000000..43fda76f9f --- /dev/null +++ b/cpp/src/pdlp/distributed_pdlp/kaminpar_partitioner.hpp @@ -0,0 +1,23 @@ +/* + * 
SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + */ + +#pragma once + +#include + +namespace cuopt::linear_programming::detail { + +// Multi-threaded k-way partitioner backed by KaMinPar. Builds the same +// constraint/variable bipartite graph as metis_partitioner_t, but runs the +// shared-memory parallel KaMinPar kernel so partitioning scales across all CPU +// cores of a node (set via partitioner_input_t::nb_threads; <= 0 => all +// hardware threads). +template +class kaminpar_partitioner_t : public partitioner_i { + public: + std::vector partition(partitioner_input_t const& input) const override; +}; + +} // namespace cuopt::linear_programming::detail diff --git a/cpp/src/pdlp/distributed_pdlp/metis_partitioner.cu b/cpp/src/pdlp/distributed_pdlp/metis_partitioner.cu index ecc60adda0..9a4f0f50b1 100644 --- a/cpp/src/pdlp/distributed_pdlp/metis_partitioner.cu +++ b/cpp/src/pdlp/distributed_pdlp/metis_partitioner.cu @@ -32,18 +32,15 @@ std::vector metis_partitioner_t::partition( cuopt_expects(input.nb_parts > 0, error_type_t::ValidationError, "metis_partitioner: nb_parts must be positive"); - // METIS_PartGraphKway internally does integer arithmetic of the form - // `nedges / nparts` and traps with SIGFPE when nparts == 1. The single-part - // case is also trivial (everything in part 0) so callers should route it to - // the Dummy partitioner instead (see pdlp_solver_t mGPU ctor). 
- cuopt_expects(input.nb_parts >= 2, - error_type_t::ValidationError, - "metis_partitioner: nb_parts must be >= 2 (METIS_PartGraphKway requirement); " - "use the Dummy partitioner for the single-shard case"); cuopt_expects(input.nb_cstr >= 0 && input.nb_vars >= 0, error_type_t::ValidationError, "metis_partitioner: invalid problem dimensions"); + if (input.nb_parts == 1) { + CUOPT_LOG_INFO("METIS: nb_parts == 1, returning trivial single-block partition"); + return std::vector(static_cast(input.nb_cstr + input.nb_vars), i_t{0}); + } + cuopt_expects(input.A.row_offsets != nullptr && input.A.col_indices != nullptr, error_type_t::ValidationError, "metis_partitioner: A.row_offsets and A.col_indices are required"); diff --git a/cpp/src/pdlp/distributed_pdlp/partitioner.cu b/cpp/src/pdlp/distributed_pdlp/partitioner.cu index bc84e521e2..e3866c3ad1 100644 --- a/cpp/src/pdlp/distributed_pdlp/partitioner.cu +++ b/cpp/src/pdlp/distributed_pdlp/partitioner.cu @@ -3,6 +3,7 @@ * SPDX-License-Identifier: Apache-2.0 */ +#include #include #include @@ -72,6 +73,8 @@ std::unique_ptr> make_partitioner(partitioner_kind_t kin switch (kind) { case partitioner_kind_t::Dummy: return std::make_unique>(); case partitioner_kind_t::Metis: return std::make_unique>(); + case partitioner_kind_t::KaMinPar: + return std::make_unique>(); } cuopt_expects( false, error_type_t::RuntimeError, "make_partitioner: unsupported partitioner kind"); diff --git a/cpp/src/pdlp/distributed_pdlp/partitioner.hpp b/cpp/src/pdlp/distributed_pdlp/partitioner.hpp index 2a2149db63..70b2e34c06 100644 --- a/cpp/src/pdlp/distributed_pdlp/partitioner.hpp +++ b/cpp/src/pdlp/distributed_pdlp/partitioner.hpp @@ -29,6 +29,10 @@ struct partitioner_input_t { i_t nb_cstr{0}; i_t nb_vars{0}; i_t nb_parts{0}; + // Number of CPU threads the partitioner may use. Only honored by the + // multi-threaded KaMinPar backend; <= 0 means "auto" (all hardware threads). + // Serial backends (METIS, Dummy) ignore it. 
+ i_t nb_threads{0}; // Constraint matrix A (rows = constraints, cols = variables). csr_host_view_t A{}; // Transpose A_t (rows = variables, cols = constraints). Optional for partitioners @@ -36,7 +40,10 @@ struct partitioner_input_t { csr_host_view_t A_t{}; }; -enum class partitioner_kind_t { Dummy, Metis }; +// Dummy: round-robin, no graph (single-shard / debugging). +// Metis: serial METIS_PartGraphKway. +// KaMinPar: multi-threaded KaMinPar (preferred for multi-shard partitioning). +enum class partitioner_kind_t { Dummy, Metis, KaMinPar }; template class partitioner_i { diff --git a/cpp/src/pdlp/pdlp.cu b/cpp/src/pdlp/pdlp.cu index 150311ae33..0514ae1d13 100644 --- a/cpp/src/pdlp/pdlp.cu +++ b/cpp/src/pdlp/pdlp.cu @@ -44,6 +44,7 @@ #include #include +#include #include #include #include @@ -398,12 +399,6 @@ pdlp_solver_t::pdlp_solver_t( const int distributed_pdlp_num_gpus = settings.distributed_pdlp_num_gpus; CUOPT_LOG_INFO("Solving with distributed PDLP on %d GPU (mps direct path)", distributed_pdlp_num_gpus); - if (distributed_pdlp_num_gpus == 1) { - std::cout << "CAREFUL !!: distributed_pdlp_num_gpus == 1, running single-shard dummy path, " - "if you want to set the number of GPUs to use for distributed PDLP, set the " - "parameter --distributed-pdlp-num-gpus" - << std::endl; - } if constexpr (!std::is_same_v) { cuopt_expects( @@ -501,20 +496,37 @@ pdlp_solver_t::pdlp_solver_t( settings.multi_gpu_partition_file); validate_partition(parts, n_cstr, n_vars, distributed_pdlp_num_gpus, "partition file"); } else { - if (distributed_pdlp_num_gpus == 1) { - std::cout << "CAREFUL: distributed_pdlp_num_gpus == 1, running dummy version (single " - "part covering " - << n_cstr << " cstrs + " << n_vars << " vars)" << std::endl; - } partitioner_input_t partition_input; partition_input.nb_cstr = n_cstr; partition_input.nb_vars = n_vars; partition_input.nb_parts = distributed_pdlp_num_gpus; - // METIS_PartGraphKway requires nparts >= 2; route num_gpus == 1 to Dummy. 
- const partitioner_kind_t kind = - (distributed_pdlp_num_gpus == 1) ? partitioner_kind_t::Dummy : partitioner_kind_t::Metis; - if (kind == partitioner_kind_t::Metis) { + // Resolve which partitioner to use. + std::string partitioner_choice = settings.distributed_pdlp_partitioner; + std::transform(partitioner_choice.begin(), + partitioner_choice.end(), + partitioner_choice.begin(), + [](unsigned char c) { return std::tolower(c); }); + partitioner_kind_t kind; + if (partitioner_choice.empty() || partitioner_choice == "auto") { + kind = (distributed_pdlp_num_gpus == 1) ? partitioner_kind_t::Dummy + : partitioner_kind_t::KaMinPar; + } else if (partitioner_choice == "dummy") { + kind = partitioner_kind_t::Dummy; + } else if (partitioner_choice == "metis") { + kind = partitioner_kind_t::Metis; + } else if (partitioner_choice == "kaminpar") { + kind = partitioner_kind_t::KaMinPar; + } else { + cuopt_expects(false, + error_type_t::ValidationError, + "Unknown distributed_pdlp_partitioner '%s' (expected auto|dummy|metis|kaminpar)", + settings.distributed_pdlp_partitioner.c_str()); + kind = partitioner_kind_t::Dummy; // unreachable; silences -Wmaybe-uninitialized + } + const bool needs_graph = + (kind == partitioner_kind_t::Metis || kind == partitioner_kind_t::KaMinPar); + if (needs_graph) { // partitioner_input_t holds non-const std::vector* pointers; we // already have the data in our local mutable buffers above. partition_input.A.row_offsets = &h_A_row_offsets; @@ -525,7 +537,19 @@ pdlp_solver_t::pdlp_solver_t( partition_input.A_t.col_indices = &h_A_t_col_indices; partition_input.A_t.num_rows = n_vars; partition_input.A_t.num_cols = n_cstr; + // 0 => KaMinPar auto-detects and uses all hardware threads (ignored by METIS). + partition_input.nb_threads = 0; } + const char* kind_name = (kind == partitioner_kind_t::Dummy) ? "dummy" + : (kind == partitioner_kind_t::Metis) ? "metis" + : (kind == partitioner_kind_t::KaMinPar) ? 
"kaminpar" + : "unknown"; + CUOPT_LOG_INFO("Partitioning %d constraints + %d variables into %d part(s) using the %s " + "partitioner", + n_cstr, + n_vars, + distributed_pdlp_num_gpus, + kind_name); auto partitioner = make_partitioner(kind); parts = partitioner->partition(partition_input); } diff --git a/cpp/src/pdlp/solve.cu b/cpp/src/pdlp/solve.cu index 228bacfd21..595c06b20a 100644 --- a/cpp/src/pdlp/solve.cu +++ b/cpp/src/pdlp/solve.cu @@ -2166,11 +2166,6 @@ optimization_problem_solution_t solve_lp_distributed_from_mps( "%d visible CUDA device(s)", settings_resolved.distributed_pdlp_num_gpus); } - if (settings_resolved.distributed_pdlp_num_gpus <= 1) { - std::cout << "CAREFUL: use_distributed_pdlp with distributed_pdlp_num_gpus == 1 runs the " - "single-shard dummy path" - << std::endl; - } // PDLP precision validations (mirror the checks in run_pdlp; distributed // path only supports the default-precision, non-batch double config). cuopt_expects(settings_resolved.pdlp_precision == pdlp_precision_t::DefaultPrecision, From 91b1ae5a619bb9edec2a6775a24304a95b73fdf6 Mon Sep 17 00:00:00 2001 From: Bulle Mostovoi Date: Thu, 4 Jun 2026 16:16:19 +0200 Subject: [PATCH 67/67] style --- .../cuopt/linear_programming/constants.h | 26 +++++++++---------- .../distributed_pdlp/multi_gpu_engine.hpp | 6 ++--- cpp/src/pdlp/distributed_pdlp/partitioner.cu | 3 +-- .../initial_scaling.cuh | 5 +++- cpp/src/pdlp/pdlp.cu | 26 ++++++++++--------- cpp/tests/linear_programming/pdlp_test.cu | 14 +++++----- 6 files changed, 42 insertions(+), 38 deletions(-) diff --git a/cpp/include/cuopt/linear_programming/constants.h b/cpp/include/cuopt/linear_programming/constants.h index 420a03526b..29648d1a0f 100644 --- a/cpp/include/cuopt/linear_programming/constants.h +++ b/cpp/include/cuopt/linear_programming/constants.h @@ -80,20 +80,20 @@ #define CUOPT_MIP_STRONG_BRANCHING_SIMPLEX_ITERATION_LIMIT \ "mip_strong_branching_simplex_iteration_limit" -#define CUOPT_SOLUTION_FILE "solution_file" -#define 
CUOPT_NUM_CPU_THREADS "num_cpu_threads" -#define CUOPT_NUM_GPUS "num_gpus" -#define CUOPT_DISTRIBUTED_PDLP_NUM_GPUS "distributed_pdlp_num_gpus" -#define CUOPT_MULTI_GPU_PARTITION_FILE "multi_gpu_partition_file" +#define CUOPT_SOLUTION_FILE "solution_file" +#define CUOPT_NUM_CPU_THREADS "num_cpu_threads" +#define CUOPT_NUM_GPUS "num_gpus" +#define CUOPT_DISTRIBUTED_PDLP_NUM_GPUS "distributed_pdlp_num_gpus" +#define CUOPT_MULTI_GPU_PARTITION_FILE "multi_gpu_partition_file" #define CUOPT_MULTI_GPU_EXPORT_PARTITION_FILE "multi_gpu_export_partition_file" -#define CUOPT_DISTRIBUTED_PDLP_PARTITIONER "distributed_pdlp_partitioner" -#define CUOPT_USE_DISTRIBUTED_PDLP "use_distributed_pdlp" -#define CUOPT_PDLP_DISABLE_GRAPH "pdlp_disable_graph" -#define CUOPT_USER_PROBLEM_FILE "user_problem_file" -#define CUOPT_PRESOLVE_FILE "presolve_file" -#define CUOPT_RANDOM_SEED "random_seed" -#define CUOPT_PDLP_PRECISION "pdlp_precision" -#define CUOPT_MIP_SEMICONTINUOUS_BIG_M "mip_semi_continuous_big_m" +#define CUOPT_DISTRIBUTED_PDLP_PARTITIONER "distributed_pdlp_partitioner" +#define CUOPT_USE_DISTRIBUTED_PDLP "use_distributed_pdlp" +#define CUOPT_PDLP_DISABLE_GRAPH "pdlp_disable_graph" +#define CUOPT_USER_PROBLEM_FILE "user_problem_file" +#define CUOPT_PRESOLVE_FILE "presolve_file" +#define CUOPT_RANDOM_SEED "random_seed" +#define CUOPT_PDLP_PRECISION "pdlp_precision" +#define CUOPT_MIP_SEMICONTINUOUS_BIG_M "mip_semi_continuous_big_m" #define CUOPT_MIP_HYPER_HEURISTIC_POPULATION_SIZE "mip_hyper_heuristic_population_size" #define CUOPT_MIP_HYPER_HEURISTIC_NUM_CPUFJ_THREADS "mip_hyper_heuristic_num_cpufj_threads" diff --git a/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.hpp b/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.hpp index 3a0fcb755d..89153e8bd7 100644 --- a/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.hpp +++ b/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.hpp @@ -250,7 +250,7 @@ struct multi_gpu_engine_t { // -------- Broadcast owned constraint (row) scaling into halo 
------------ void broadcast_constraint_scaling_to_halo() { - const int nb = static_cast(shards.size()); + const int nb = static_cast(shards.size()); auto buf_access = [](pdlp_shard_t& s) -> rmm::device_uvector& { return s.sub_pdlp->get_initial_scaling_strategy().get_cummulative_constraint_matrix_scaling(); }; @@ -384,7 +384,7 @@ struct multi_gpu_engine_t { bound_sq.emplace_back(1, s.stream.view()); obj_sq.emplace_back(1, s.stream.view()); - const auto& scaled = s.sub_pdlp->get_initial_scaling_strategy().get_scaled_op_problem(); + const auto& scaled = s.sub_pdlp->get_initial_scaling_strategy().get_scaled_op_problem(); const int n_owned_cstr = static_cast(s.rank_data.owned_cstr_size); const int n_owned_var = static_cast(s.rank_data.owned_var_size); @@ -403,7 +403,7 @@ struct multi_gpu_engine_t { n_owned_cstr, s.stream.view().value()); - auto obj_in = thrust::make_transform_iterator(scaled.objective_coefficients.data(), + auto obj_in = thrust::make_transform_iterator(scaled.objective_coefficients.data(), mgpu_weighted_sq_op_t{c_scaling_weight}); size_t tmp_bytes_o = 0; cub::DeviceReduce::Sum( diff --git a/cpp/src/pdlp/distributed_pdlp/partitioner.cu b/cpp/src/pdlp/distributed_pdlp/partitioner.cu index e3866c3ad1..727a8b56f9 100644 --- a/cpp/src/pdlp/distributed_pdlp/partitioner.cu +++ b/cpp/src/pdlp/distributed_pdlp/partitioner.cu @@ -73,8 +73,7 @@ std::unique_ptr> make_partitioner(partitioner_kind_t kin switch (kind) { case partitioner_kind_t::Dummy: return std::make_unique>(); case partitioner_kind_t::Metis: return std::make_unique>(); - case partitioner_kind_t::KaMinPar: - return std::make_unique>(); + case partitioner_kind_t::KaMinPar: return std::make_unique>(); } cuopt_expects( false, error_type_t::RuntimeError, "make_partitioner: unsupported partitioner kind"); diff --git a/cpp/src/pdlp/initial_scaling_strategy/initial_scaling.cuh b/cpp/src/pdlp/initial_scaling_strategy/initial_scaling.cuh index 409df5340a..13f639079d 100644 --- 
a/cpp/src/pdlp/initial_scaling_strategy/initial_scaling.cuh +++ b/cpp/src/pdlp/initial_scaling_strategy/initial_scaling.cuh @@ -105,7 +105,10 @@ class pdlp_initial_scaling_strategy_t { // Distributed PDLP: skip the LOCAL bound/objective rescaling inside // scale_problem() - void set_skip_distributed_local_rescaling(bool value) { skip_distributed_local_rescaling_ = value; } + void set_skip_distributed_local_rescaling(bool value) + { + skip_distributed_local_rescaling_ = value; + } // Public for distributed PDLP void compute_scaling_vectors(i_t number_of_ruiz_iterations, f_t alpha); diff --git a/cpp/src/pdlp/pdlp.cu b/cpp/src/pdlp/pdlp.cu index 0514ae1d13..71c6b0a48c 100644 --- a/cpp/src/pdlp/pdlp.cu +++ b/cpp/src/pdlp/pdlp.cu @@ -509,8 +509,8 @@ pdlp_solver_t::pdlp_solver_t( [](unsigned char c) { return std::tolower(c); }); partitioner_kind_t kind; if (partitioner_choice.empty() || partitioner_choice == "auto") { - kind = (distributed_pdlp_num_gpus == 1) ? partitioner_kind_t::Dummy - : partitioner_kind_t::KaMinPar; + kind = + (distributed_pdlp_num_gpus == 1) ? partitioner_kind_t::Dummy : partitioner_kind_t::KaMinPar; } else if (partitioner_choice == "dummy") { kind = partitioner_kind_t::Dummy; } else if (partitioner_choice == "metis") { @@ -518,10 +518,11 @@ pdlp_solver_t::pdlp_solver_t( } else if (partitioner_choice == "kaminpar") { kind = partitioner_kind_t::KaMinPar; } else { - cuopt_expects(false, - error_type_t::ValidationError, - "Unknown distributed_pdlp_partitioner '%s' (expected auto|dummy|metis|kaminpar)", - settings.distributed_pdlp_partitioner.c_str()); + cuopt_expects( + false, + error_type_t::ValidationError, + "Unknown distributed_pdlp_partitioner '%s' (expected auto|dummy|metis|kaminpar)", + settings.distributed_pdlp_partitioner.c_str()); kind = partitioner_kind_t::Dummy; // unreachable; silences -Wmaybe-uninitialized } const bool needs_graph = @@ -544,12 +545,13 @@ pdlp_solver_t::pdlp_solver_t( : (kind == partitioner_kind_t::Metis) ? 
"metis" : (kind == partitioner_kind_t::KaMinPar) ? "kaminpar" : "unknown"; - CUOPT_LOG_INFO("Partitioning %d constraints + %d variables into %d part(s) using the %s " - "partitioner", - n_cstr, - n_vars, - distributed_pdlp_num_gpus, - kind_name); + CUOPT_LOG_INFO( + "Partitioning %d constraints + %d variables into %d part(s) using the %s " + "partitioner", + n_cstr, + n_vars, + distributed_pdlp_num_gpus, + kind_name); auto partitioner = make_partitioner(kind); parts = partitioner->partition(partition_input); } diff --git a/cpp/tests/linear_programming/pdlp_test.cu b/cpp/tests/linear_programming/pdlp_test.cu index 65cc2f0d9f..d17cf2af6f 100644 --- a/cpp/tests/linear_programming/pdlp_test.cu +++ b/cpp/tests/linear_programming/pdlp_test.cu @@ -113,9 +113,9 @@ TEST(pdlp_class, distributed_partition_metis_export_import_roundtrip) const int n_cstr = static_cast(mps.get_constraint_lower_bounds().size()); const int nnz = static_cast(mps.get_constraint_matrix_values().size()); - std::vector h_A_row_offsets = mps.get_constraint_matrix_offsets(); - std::vector h_A_col_indices = mps.get_constraint_matrix_indices(); - std::vector h_A_values = mps.get_constraint_matrix_values(); + std::vector h_A_row_offsets = mps.get_constraint_matrix_offsets(); + std::vector h_A_col_indices = mps.get_constraint_matrix_indices(); + std::vector h_A_values = mps.get_constraint_matrix_values(); // Transpose A -> A^T (CSR of A^T == CSC of A), mirroring solve_lp_distributed_from_mps. 
ds::csr_matrix_t A_csr(n_cstr, n_vars, nnz); @@ -171,7 +171,7 @@ void expect_distributed_matches_base(raft::handle_t const& handle, return std::fabs(a - b) <= rel * (1.0 + std::fabs(a)); }; - auto path = make_path_absolute(mps_rel_path); + auto path = make_path_absolute(mps_rel_path); io::mps_data_model_t problem = io::parse_mps(path, fixed_mps_format); // Shared settings: PDLP, no presolve (distributed requires presolver == None, so the @@ -186,9 +186,9 @@ void expect_distributed_matches_base(raft::handle_t const& handle, // ----- distributed PDLP (identical settings, only the distributed flags flipped) ----- pdlp_solver_settings_t dist_settings = base_settings; - dist_settings.hyper_params.use_distributed_pdlp = true; - dist_settings.distributed_pdlp_num_gpus = -1; - auto dist = solve_lp(&handle, problem, dist_settings); + dist_settings.hyper_params.use_distributed_pdlp = true; + dist_settings.distributed_pdlp_num_gpus = -1; + auto dist = solve_lp(&handle, problem, dist_settings); // ----- termination status ----- ASSERT_EQ(static_cast(base.get_termination_status()), CUOPT_TERMINATION_STATUS_OPTIMAL)