diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 3ac02c917a..0bf2b0f3f7 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -314,6 +314,81 @@ create_logger_macros(CUOPT "cuopt::default_logger()" include/cuopt) find_package(CUDSS REQUIRED) +# ################################################################################################## +# - NCCL (multi-GPU distributed PDLP) ------------------------------------------------------------- +# NCCL is shipped via the conda env; no canonical CMake config target, so look it +# up by name in the standard lib paths (plus CONDA_PREFIX as a hint). +set(NCCL_HINT_PREFIXES "") +if (DEFINED ENV{CONDA_PREFIX} AND NOT "$ENV{CONDA_PREFIX}" STREQUAL "") + list(APPEND NCCL_HINT_PREFIXES "$ENV{CONDA_PREFIX}") +endif () +find_path(NCCL_INCLUDE_DIR + NAMES nccl.h + HINTS ${NCCL_HINT_PREFIXES} + PATH_SUFFIXES include +) +find_library(NCCL_LIBRARY + NAMES nccl + HINTS ${NCCL_HINT_PREFIXES} + PATH_SUFFIXES lib lib64 +) +if (NOT NCCL_INCLUDE_DIR OR NOT NCCL_LIBRARY) + message(FATAL_ERROR "NCCL not found. Looked in ${NCCL_HINT_PREFIXES}. Install nccl-dev / libnccl-dev in the active env.") +endif () +add_library(nccl_external UNKNOWN IMPORTED GLOBAL) +set_target_properties(nccl_external PROPERTIES + IMPORTED_LOCATION "${NCCL_LIBRARY}" + INTERFACE_INCLUDE_DIRECTORIES "${NCCL_INCLUDE_DIR}" +) +message(STATUS "Using NCCL: ${NCCL_LIBRARY}") + +# ################################################################################################## +# - METIS (graph partitioning for distributed PDLP) ----------------------------------------------- +# Found by searching CONDA_PREFIX first, then CUOPT_METIS_ROOT (cmake var or env) +# if the user wants to pull METIS from a different conda env / system path. +set(METIS_HINT_PREFIXES "") +if (DEFINED ENV{CONDA_PREFIX} AND NOT "$ENV{CONDA_PREFIX}" STREQUAL "") + list(APPEND METIS_HINT_PREFIXES "$ENV{CONDA_PREFIX}") +endif () +if (DEFINED CUOPT_METIS_ROOT AND NOT "${CUOPT_METIS_ROOT}" STREQUAL "") + list(APPEND METIS_HINT_PREFIXES "${CUOPT_METIS_ROOT}") +endif () +if (DEFINED ENV{CUOPT_METIS_ROOT} AND NOT "$ENV{CUOPT_METIS_ROOT}" STREQUAL "") + list(APPEND METIS_HINT_PREFIXES "$ENV{CUOPT_METIS_ROOT}") +endif () +find_path(METIS_INCLUDE_DIR + NAMES metis.h + HINTS ${METIS_HINT_PREFIXES} + PATH_SUFFIXES include +) +find_library(METIS_LIBRARY + NAMES metis libmetis + HINTS ${METIS_HINT_PREFIXES} + PATH_SUFFIXES lib lib64 +) +if (NOT METIS_INCLUDE_DIR OR NOT METIS_LIBRARY) + message(FATAL_ERROR "METIS not found. Looked in: ${METIS_HINT_PREFIXES}. " + "Install it via 'conda install -c conda-forge metis' in the active env, " + "or set CUOPT_METIS_ROOT to a prefix containing include/metis.h and lib/libmetis.{so,a}.") +endif () +add_library(metis_external UNKNOWN IMPORTED GLOBAL) +set_target_properties(metis_external PROPERTIES + IMPORTED_LOCATION "${METIS_LIBRARY}" + INTERFACE_INCLUDE_DIRECTORIES "${METIS_INCLUDE_DIR}" +) +message(STATUS "Using METIS: ${METIS_LIBRARY}") + +# ################################################################################################## +# - KaMinPar (multi-threaded partitioning for distributed PDLP) ------------------------------------ +# Brought in the RAPIDS way (rapids_cpm_find): uses an installed KaMinPar (deb/rpm/conda, +# discovered via its CMake config) if present, otherwise builds the pinned source via CPM. +# Distributed PDLP prefers KaMinPar over METIS. +include(cmake/thirdparty/get_kaminpar.cmake) +if (NOT TARGET KaMinPar::KaMinPar) + message(FATAL_ERROR "KaMinPar::KaMinPar was not made available by get_kaminpar.cmake") +endif () +message(STATUS "Using KaMinPar (distributed PDLP prefers KaMinPar over METIS)") + # ################################################################################################## # - gRPC and Protobuf setup ----------------------------------------------------------------------- @@ -576,6 +651,9 @@ target_link_libraries(cuopt ${CUDSS_LIB_FILE} PRIVATE ${CUOPT_PRIVATE_CUDA_LIBS} + nccl_external + metis_external + KaMinPar::KaMinPar $<$:protobuf::libprotobuf> $<$:gRPC::grpc++> ) diff --git a/cpp/cmake/thirdparty/get_kaminpar.cmake b/cpp/cmake/thirdparty/get_kaminpar.cmake new file mode 100644 index 0000000000..d548a76115 --- /dev/null +++ b/cpp/cmake/thirdparty/get_kaminpar.cmake @@ -0,0 +1,48 @@ +# cmake-format: off +# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# cmake-format: on + +# Multi-threaded graph partitioner for distributed PDLP. +# Uses rapids_cpm_find so a system / conda / .deb install of KaMinPar (which ships a +# CMake config package exporting KaMinPar::KaMinPar) is used when available, and +# otherwise the pinned source is cloned and built via CPM. KaMinPar depends on TBB, +# which cuOpt already requires (see find_package(TBB) for papilo). +function(find_and_configure_kaminpar) + set(oneValueArgs VERSION PINNED_TAG) + cmake_parse_arguments(PKG "" "${oneValueArgs}" "" ${ARGN}) + + rapids_cpm_find(KaMinPar ${PKG_VERSION} + GLOBAL_TARGETS KaMinPar::KaMinPar + CPM_ARGS + GIT_REPOSITORY https://github.com/KaHIP/KaMinPar.git + GIT_TAG ${PKG_PINNED_TAG} + EXCLUDE_FROM_ALL + OPTIONS + "KAMINPAR_BUILD_APPS OFF" + "KAMINPAR_BUILD_TOOLS OFF" + "KAMINPAR_BUILD_TESTS OFF" + "KAMINPAR_BUILD_BENCHMARKS OFF" + "KAMINPAR_BUILD_EXAMPLES OFF" + "KAMINPAR_BUILD_DISTRIBUTED OFF" + # Timers use global state and force single-threaded use of the library + # interface; disable so cuOpt can call the partitioner freely. + "KAMINPAR_ENABLE_TIMERS OFF" + # Avoid an extra hard dependency on Google Sparsehash. + "KAMINPAR_BUILD_WITH_SPARSEHASH OFF" + # cuOpt's TBB is discovered via a legacy find that only exposes TBB::tbb + # (no TBB::tbbmalloc target); disable KaMinPar's optional tbbmalloc use. + "KAMINPAR_ENABLE_TBB_MALLOC OFF" + # Large LP constraint graphs can exceed 2^31 directed edges. + "KAMINPAR_64BIT_EDGE_IDS ON" + "INSTALL_KAMINPAR OFF" + ) + + if(KaMinPar_ADDED) + message(VERBOSE "CUOPT: Using KaMinPar located in ${KaMinPar_SOURCE_DIR}") + else() + message(VERBOSE "CUOPT: Using KaMinPar located in ${KaMinPar_DIR}") + endif() +endfunction() + +find_and_configure_kaminpar(VERSION 3.7.3 PINNED_TAG v3.7.3) diff --git a/cpp/cuopt_cli.cpp b/cpp/cuopt_cli.cpp index 39aab47170..b730067a28 100644 --- a/cpp/cuopt_cli.cpp +++ b/cpp/cuopt_cli.cpp @@ -176,7 +176,12 @@ int run_single_file(const std::string& file_path, auto solution = cuopt::linear_programming::solve_mip(problem_interface.get(), mip_settings); } else { auto& lp_settings = settings.get_pdlp_settings(); - auto solution = cuopt::linear_programming::solve_lp(problem_interface.get(), lp_settings); + + if (lp_settings.hyper_params.use_distributed_pdlp) { + cuopt::linear_programming::solve_lp(handle_ptr.get(), mps_data_model, lp_settings); + } else { + cuopt::linear_programming::solve_lp(problem_interface.get(), lp_settings); + } } } catch (const std::exception& e) { fprintf(stderr, "cuopt_cli error: %s\n", e.what()); @@ -426,10 +431,21 @@ int main(int argc, char* argv[]) std::vector memory_resources; if (memory_backend == cuopt::linear_programming::memory_backend_t::GPU) { - const int num_gpus = settings.get_parameter(CUOPT_NUM_GPUS); + // Distributed PDLP scales one shard per GPU and uses its own knob; everything else + // (concurrent, batch, MIP) uses num_gpus which is capped at 2. + // For distributed PDLP, -1 means "auto-detect": resolve to the visible device + // count so the RMM memory pools match what solve.cu will eventually dispatch. + const bool use_distributed_pdlp = settings.get_parameter(CUOPT_USE_DISTRIBUTED_PDLP); + int requested_gpus = use_distributed_pdlp + ? settings.get_parameter(CUOPT_DISTRIBUTED_PDLP_NUM_GPUS) + : settings.get_parameter(CUOPT_NUM_GPUS); + if (use_distributed_pdlp && requested_gpus == -1) { + requested_gpus = raft::device_setter::get_device_count(); + } + const int provisioned_gpus = std::min(raft::device_setter::get_device_count(), requested_gpus); - memory_resources.reserve(std::min(raft::device_setter::get_device_count(), num_gpus)); - for (int i = 0; i < std::min(raft::device_setter::get_device_count(), num_gpus); ++i) { + memory_resources.reserve(provisioned_gpus); + for (int i = 0; i < provisioned_gpus; ++i) { RAFT_CUDA_TRY(cudaSetDevice(i)); memory_resources.emplace_back(); rmm::mr::set_per_device_resource(rmm::cuda_device_id{i}, memory_resources.back()); diff --git a/cpp/include/cuopt/linear_programming/constants.h b/cpp/include/cuopt/linear_programming/constants.h index 3342ba375c..29648d1a0f 100644 --- a/cpp/include/cuopt/linear_programming/constants.h +++ b/cpp/include/cuopt/linear_programming/constants.h @@ -80,14 +80,20 @@ #define CUOPT_MIP_STRONG_BRANCHING_SIMPLEX_ITERATION_LIMIT \ "mip_strong_branching_simplex_iteration_limit" -#define CUOPT_SOLUTION_FILE "solution_file" -#define CUOPT_NUM_CPU_THREADS "num_cpu_threads" -#define CUOPT_NUM_GPUS "num_gpus" -#define CUOPT_USER_PROBLEM_FILE "user_problem_file" -#define CUOPT_PRESOLVE_FILE "presolve_file" -#define CUOPT_RANDOM_SEED "random_seed" -#define CUOPT_PDLP_PRECISION "pdlp_precision" -#define CUOPT_MIP_SEMICONTINUOUS_BIG_M "mip_semi_continuous_big_m" +#define CUOPT_SOLUTION_FILE "solution_file" +#define CUOPT_NUM_CPU_THREADS "num_cpu_threads" +#define CUOPT_NUM_GPUS "num_gpus" +#define CUOPT_DISTRIBUTED_PDLP_NUM_GPUS "distributed_pdlp_num_gpus" +#define CUOPT_MULTI_GPU_PARTITION_FILE "multi_gpu_partition_file" +#define CUOPT_MULTI_GPU_EXPORT_PARTITION_FILE "multi_gpu_export_partition_file" +#define CUOPT_DISTRIBUTED_PDLP_PARTITIONER "distributed_pdlp_partitioner" +#define CUOPT_USE_DISTRIBUTED_PDLP "use_distributed_pdlp" +#define CUOPT_PDLP_DISABLE_GRAPH "pdlp_disable_graph" +#define CUOPT_USER_PROBLEM_FILE "user_problem_file" +#define CUOPT_PRESOLVE_FILE "presolve_file" +#define CUOPT_RANDOM_SEED "random_seed" +#define CUOPT_PDLP_PRECISION "pdlp_precision" +#define CUOPT_MIP_SEMICONTINUOUS_BIG_M "mip_semi_continuous_big_m" #define CUOPT_MIP_HYPER_HEURISTIC_POPULATION_SIZE "mip_hyper_heuristic_population_size" #define CUOPT_MIP_HYPER_HEURISTIC_NUM_CPUFJ_THREADS "mip_hyper_heuristic_num_cpufj_threads" diff --git a/cpp/include/cuopt/linear_programming/pdlp/pdlp_hyper_params.cuh b/cpp/include/cuopt/linear_programming/pdlp/pdlp_hyper_params.cuh index 282e91d7ef..0ce90e7228 100644 --- a/cpp/include/cuopt/linear_programming/pdlp/pdlp_hyper_params.cuh +++ b/cpp/include/cuopt/linear_programming/pdlp/pdlp_hyper_params.cuh @@ -47,12 +47,16 @@ struct pdlp_hyper_params_t { bool bound_objective_rescaling = true; bool use_reflected_primal_dual = true; bool use_fixed_point_error = true; - double reflection_coefficient = 1.0; - double restart_k_p = 0.99; - double restart_k_i = 0.01; - double restart_k_d = 0.0; - double restart_i_smooth = 0.3; - bool use_conditional_major = true; + bool use_distributed_pdlp = false; + // Debug/diagnostic knob: when true, PDLP bypasses CUDA-graph capture in + // ping_pong_graph_t and executes each iteration eagerly + bool pdlp_disable_graph = false; + double reflection_coefficient = 1.0; + double restart_k_p = 0.99; + double restart_k_i = 0.01; + double restart_k_d = 0.0; + double restart_i_smooth = 0.3; + bool use_conditional_major = true; }; // TODO most likely we want to get rid of pdlp_solver_mode and just have prebuilt diff --git a/cpp/include/cuopt/linear_programming/pdlp/solver_settings.hpp b/cpp/include/cuopt/linear_programming/pdlp/solver_settings.hpp index b30286f9ce..42ef1f592a 100644 --- a/cpp/include/cuopt/linear_programming/pdlp/solver_settings.hpp +++ b/cpp/include/cuopt/linear_programming/pdlp/solver_settings.hpp @@ -307,6 +307,25 @@ class pdlp_solver_settings_t { presolver_t presolver{presolver_t::Default}; bool dual_postsolve{true}; int num_gpus{1}; + // Number of GPUs to use specifically for distributed PDLP (use_distributed_pdlp=true). + // -1 means auto-detect + int distributed_pdlp_num_gpus{-1}; + std::string multi_gpu_partition_file{""}; + // If non-empty, the partition computed for distributed PDLP is written to this + // path (one part-id per line) right after partitioning. The file can be fed + // back via multi_gpu_partition_file. Exposed as the multi_gpu_export_partition_file + // parameter (CLI: --multi-gpu-export-partition-file ). + std::string multi_gpu_export_partition_file{""}; + // Which graph partitioner distributed PDLP uses. One of: + // "auto" - 1 GPU => Dummy; otherwise KaMinPar + // "dummy" - round-robin, no graph (trivial) + // "metis" - serial METIS_PartGraphKway + // "kaminpar" - multi-threaded KaMinPar + // Exposed as the distributed_pdlp_partitioner parameter + // (CLI: --distributed-pdlp-partitioner ). + std::string distributed_pdlp_partitioner{"auto"}; + // Set to true inside the shards + bool is_distributed_sub_pdlp{false}; method_t method{method_t::Concurrent}; bool inside_mip{false}; // For concurrent termination diff --git a/cpp/src/math_optimization/solver_settings.cu b/cpp/src/math_optimization/solver_settings.cu index 0b4c360b10..254a3afb38 100644 --- a/cpp/src/math_optimization/solver_settings.cu +++ b/cpp/src/math_optimization/solver_settings.cu @@ -140,6 +140,7 @@ solver_settings_t::solver_settings_t() : pdlp_settings(), mip_settings {CUOPT_MIP_REDUCED_COST_STRENGTHENING, &mip_settings.reduced_cost_strengthening, -1, std::numeric_limits::max(), -1}, {CUOPT_NUM_GPUS, &pdlp_settings.num_gpus, 1, 2, 1}, {CUOPT_NUM_GPUS, &mip_settings.num_gpus, 1, 2, 1}, + {CUOPT_DISTRIBUTED_PDLP_NUM_GPUS, &pdlp_settings.distributed_pdlp_num_gpus, -1, 576, -1}, {CUOPT_MIP_BATCH_PDLP_STRONG_BRANCHING, &mip_settings.mip_batch_pdlp_strong_branching, 0, 2, 0}, {CUOPT_MIP_BATCH_PDLP_RELIABILITY_BRANCHING, &mip_settings.mip_batch_pdlp_reliability_branching, 0, 2, 0}, {CUOPT_MIP_STRONG_BRANCHING_SIMPLEX_ITERATION_LIMIT, &mip_settings.strong_branching_simplex_iteration_limit, -1,std::numeric_limits::max(), -1}, @@ -177,6 +178,8 @@ solver_settings_t::solver_settings_t() : pdlp_settings(), mip_settings {CUOPT_DUAL_POSTSOLVE, &pdlp_settings.dual_postsolve, true}, {CUOPT_BARRIER_ITERATIVE_REFINEMENT, &pdlp_settings.barrier_iterative_refinement, true}, {CUOPT_MIP_PROBING, &mip_settings.probing, true}, + {CUOPT_USE_DISTRIBUTED_PDLP, &pdlp_settings.hyper_params.use_distributed_pdlp, false}, + {CUOPT_PDLP_DISABLE_GRAPH, &pdlp_settings.hyper_params.pdlp_disable_graph, false}, }; // String parameters string_parameters = { @@ -187,7 +190,10 @@ solver_settings_t::solver_settings_t() : pdlp_settings(), mip_settings {CUOPT_USER_PROBLEM_FILE, &mip_settings.user_problem_file, ""}, {CUOPT_USER_PROBLEM_FILE, &pdlp_settings.user_problem_file, ""}, {CUOPT_PRESOLVE_FILE, &mip_settings.presolve_file, ""}, - {CUOPT_PRESOLVE_FILE, &pdlp_settings.presolve_file, ""} + {CUOPT_PRESOLVE_FILE, &pdlp_settings.presolve_file, ""}, + {CUOPT_MULTI_GPU_PARTITION_FILE, &pdlp_settings.multi_gpu_partition_file, ""}, + {CUOPT_MULTI_GPU_EXPORT_PARTITION_FILE, &pdlp_settings.multi_gpu_export_partition_file, ""}, + {CUOPT_DISTRIBUTED_PDLP_PARTITIONER, &pdlp_settings.distributed_pdlp_partitioner, "auto"}, }; // clang-format on } diff --git a/cpp/src/pdlp/CMakeLists.txt b/cpp/src/pdlp/CMakeLists.txt index f5f26837b6..12f2550203 100644 --- a/cpp/src/pdlp/CMakeLists.txt +++ b/cpp/src/pdlp/CMakeLists.txt @@ -29,6 +29,12 @@ set(LP_CORE_FILES ${CMAKE_CURRENT_SOURCE_DIR}/termination_strategy/convergence_information.cu ${CMAKE_CURRENT_SOURCE_DIR}/optimal_batch_size_handler/optimal_batch_size_handler.cu ${CMAKE_CURRENT_SOURCE_DIR}/utilities/ping_pong_graph.cu + ${CMAKE_CURRENT_SOURCE_DIR}/distributed_pdlp/shard.cu + ${CMAKE_CURRENT_SOURCE_DIR}/distributed_pdlp/multi_gpu_engine.cu + ${CMAKE_CURRENT_SOURCE_DIR}/distributed_pdlp/partition_loader.cu + ${CMAKE_CURRENT_SOURCE_DIR}/distributed_pdlp/partitioner.cu + ${CMAKE_CURRENT_SOURCE_DIR}/distributed_pdlp/metis_partitioner.cu + ${CMAKE_CURRENT_SOURCE_DIR}/distributed_pdlp/kaminpar_partitioner.cpp ) # C and Python adapter files diff --git a/cpp/src/pdlp/cusparse_view.cu b/cpp/src/pdlp/cusparse_view.cu index 396fd27499..1e3638cdbd 100644 --- a/cpp/src/pdlp/cusparse_view.cu +++ b/cpp/src/pdlp/cusparse_view.cu @@ -498,14 +498,17 @@ cusparse_view_t::cusparse_view_t( // setup cusparse view A.create(op_problem_scaled.n_constraints, op_problem_scaled.n_variables, - op_problem_scaled.nnz, + static_cast(A_.size()), const_cast(op_problem_scaled.offsets.data()), const_cast(op_problem_scaled.variables.data()), const_cast(op_problem_scaled.coefficients.data())); + // A_T can have a different nnz than A in multi-GPU shards + // A is just what is needed to compute A_x for owned constraints + // A_T is just what is needed to compute A_T_y for owned variables A_T.create(op_problem_scaled.n_variables, op_problem_scaled.n_constraints, - op_problem_scaled.nnz, + static_cast(A_T_.size()), const_cast(A_T_offsets_.data()), const_cast(A_T_indices_.data()), const_cast(A_T_.data())); @@ -914,14 +917,14 @@ cusparse_view_t::cusparse_view_t( // setup cusparse view A.create(op_problem.n_constraints, op_problem.n_variables, - op_problem.nnz, + static_cast(A_.size()), const_cast(op_problem.offsets.data()), const_cast(op_problem.variables.data()), const_cast(op_problem.coefficients.data())); A_T.create(op_problem.n_variables, op_problem.n_constraints, - op_problem.nnz, + static_cast(A_T_.size()), const_cast(A_T_offsets_.data()), const_cast(A_T_indices_.data()), const_cast(A_T_.data())); @@ -1129,16 +1132,18 @@ cusparse_view_t::cusparse_view_t( // Copying them from the existing cuSparse view is a bad practice and creates segfault post // CUDA 12.4 Using the saved pointer of the existing cusparse view to make sure we capture the // correct pointer + // See comment in the PDHG cusparse_view_t ctor: bind the descriptor nnz to + // the actual value-buffer length so A and A_T stay symmetric and shard-safe. A.create(op_problem.n_constraints, op_problem.n_variables, - op_problem.nnz, + static_cast(A_.size()), const_cast(A_offsets_.data()), const_cast(A_indices_.data()), const_cast(A_.data())); A_T.create(op_problem.n_variables, op_problem.n_constraints, - op_problem.nnz, + static_cast(existing_cusparse_view.A_T_.size()), const_cast(existing_cusparse_view.A_T_offsets_.data()), const_cast(existing_cusparse_view.A_T_indices_.data()), const_cast(existing_cusparse_view.A_T_.data())); diff --git a/cpp/src/pdlp/distributed_pdlp/kaminpar_partitioner.cpp b/cpp/src/pdlp/distributed_pdlp/kaminpar_partitioner.cpp new file mode 100644 index 0000000000..e7bf943f92 --- /dev/null +++ b/cpp/src/pdlp/distributed_pdlp/kaminpar_partitioner.cpp @@ -0,0 +1,142 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + */ + +// Plain C++ translation unit (not .cu): KaMinPar's public header is C++20 host code +// and pulls in TBB; keeping it out of nvcc avoids device-compiler friction. + +#include +#include + +#include + +#include + +#include + +#include +#include +#include +#include +#include + +namespace cuopt::linear_programming::detail { + +// Builds the bipartite constraint/variable graph induced by A (identical layout +// to metis_partitioner_t) and runs the multi-threaded KaMinPar k-way kernel. +// * nodes [0, nb_cstr) : constraint nodes +// * nodes [nb_cstr, nb_cstr+nb_vars): variable nodes +// * undirected edges from each A nonzero (one half via A, one via A_t) +template +std::vector kaminpar_partitioner_t::partition( + partitioner_input_t const& input) const +{ + cuopt_expects(input.nb_parts >= 1, + error_type_t::ValidationError, + "kaminpar_partitioner: nb_parts must be >= 1"); + cuopt_expects(input.nb_cstr >= 0 && input.nb_vars >= 0, + error_type_t::ValidationError, + "kaminpar_partitioner: invalid problem dimensions"); + + // The k-way kernel needs at least 2 blocks. For the single-shard case the + // partition is trivial (everything in block 0); short-circuit so KaMinPar can + // still be selected with distributed_pdlp_num_gpus == 1 without crashing. + if (input.nb_parts == 1) { + CUOPT_LOG_INFO("KaMinPar: nb_parts == 1, returning trivial single-block partition"); + return std::vector(static_cast(input.nb_cstr + input.nb_vars), i_t{0}); + } + cuopt_expects(input.A.row_offsets != nullptr && input.A.col_indices != nullptr, + error_type_t::ValidationError, + "kaminpar_partitioner: A.row_offsets and A.col_indices are required"); + cuopt_expects(input.A_t.row_offsets != nullptr && input.A_t.col_indices != nullptr, + error_type_t::ValidationError, + "kaminpar_partitioner: A_t.row_offsets and A_t.col_indices are required"); + + auto const& A_offsets = *input.A.row_offsets; + auto const& A_cols = *input.A.col_indices; + auto const& A_t_offsets = *input.A_t.row_offsets; + auto const& A_t_cols = *input.A_t.col_indices; + + cuopt_expects(static_cast(A_offsets.size()) == input.nb_cstr + 1, + error_type_t::ValidationError, + "kaminpar_partitioner: A.row_offsets size mismatch (expected nb_cstr+1)"); + cuopt_expects(static_cast(A_t_offsets.size()) == input.nb_vars + 1, + error_type_t::ValidationError, + "kaminpar_partitioner: A_t.row_offsets size mismatch (expected nb_vars+1)"); + cuopt_expects(A_cols.size() == A_t_cols.size(), + error_type_t::ValidationError, + "kaminpar_partitioner: A and A_t nnz mismatch"); + + const i_t nb_cstr = input.nb_cstr; + const i_t nb_vars = input.nb_vars; + const i_t nnz = static_cast(A_cols.size()); + const i_t nvtx = nb_cstr + nb_vars; + + // Resolve thread count: <= 0 => all hardware threads (1 as a last resort). + int nthreads = input.nb_threads > 0 ? static_cast(input.nb_threads) : 0; + if (nthreads <= 0) { + nthreads = static_cast(std::thread::hardware_concurrency()); + if (nthreads <= 0) { nthreads = 1; } + } + + // Bipartite CSR using KaMinPar index types (EdgeID for offsets, NodeID for neighbours). + std::vector xadj(static_cast(nvtx) + 1); + std::vector adjncy(2 * static_cast(nnz)); + + for (i_t i = 0; i <= nb_cstr; ++i) { + xadj[i] = static_cast(A_offsets[i]); + } + for (i_t i = 0; i <= nb_vars; ++i) { + xadj[nb_cstr + i] = + static_cast(A_t_offsets[i]) + static_cast(nnz); + } + for (i_t k = 0; k < nnz; ++k) { + adjncy[k] = + static_cast(A_cols[k]) + static_cast(nb_cstr); + } + for (i_t k = 0; k < nnz; ++k) { + adjncy[nnz + k] = static_cast(A_t_cols[k]); + } + + std::vector block_of(static_cast(nvtx)); + + kaminpar::KaMinPar engine(nthreads, kaminpar::shm::create_default_context()); + engine.copy_graph(std::span(xadj), + std::span(adjncy)); + engine.set_k(static_cast(input.nb_parts)); + // ~3% imbalance, matching METIS_PartGraphKway's default balance constraint. + engine.set_uniform_max_block_weights(0.03); + + auto t0 = std::chrono::high_resolution_clock::now(); + const kaminpar::shm::EdgeWeight edge_cut = + engine.compute_partition(std::span(block_of)); + auto t1 = std::chrono::high_resolution_clock::now(); + const double dt = std::chrono::duration(t1 - t0).count(); + + CUOPT_LOG_INFO( + "KaMinPar partitioned bipartite graph: nvtx=%d nnz=%d nb_parts=%d nthreads=%d edge_cut=%lld " + "in %.3fs", + static_cast(nvtx), + static_cast(nnz), + static_cast(input.nb_parts), + nthreads, + static_cast(edge_cut), + dt); + + std::vector parts(static_cast(nvtx)); + for (i_t i = 0; i < nvtx; ++i) { + parts[i] = static_cast(block_of[i]); + } + + validate_partition(parts, + static_cast(nb_cstr), + static_cast(nb_vars), + static_cast(input.nb_parts), + "kaminpar_partitioner"); + return parts; +} + +template class kaminpar_partitioner_t; + +} // namespace cuopt::linear_programming::detail diff --git a/cpp/src/pdlp/distributed_pdlp/kaminpar_partitioner.hpp b/cpp/src/pdlp/distributed_pdlp/kaminpar_partitioner.hpp new file mode 100644 index 0000000000..43fda76f9f --- /dev/null +++ b/cpp/src/pdlp/distributed_pdlp/kaminpar_partitioner.hpp @@ -0,0 +1,23 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + */ + +#pragma once + +#include + +namespace cuopt::linear_programming::detail { + +// Multi-threaded k-way partitioner backed by KaMinPar. Builds the same +// constraint/variable bipartite graph as metis_partitioner_t, but runs the +// shared-memory parallel KaMinPar kernel so partitioning scales across all CPU +// cores of a node (set via partitioner_input_t::nb_threads; <= 0 => all +// hardware threads). +template +class kaminpar_partitioner_t : public partitioner_i { + public: + std::vector partition(partitioner_input_t const& input) const override; +}; + +} // namespace cuopt::linear_programming::detail diff --git a/cpp/src/pdlp/distributed_pdlp/metis_partitioner.cu b/cpp/src/pdlp/distributed_pdlp/metis_partitioner.cu new file mode 100644 index 0000000000..9a4f0f50b1 --- /dev/null +++ b/cpp/src/pdlp/distributed_pdlp/metis_partitioner.cu @@ -0,0 +1,151 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include +#include + +#include + +#include + +#include + +#include +#include +#include + +namespace cuopt::linear_programming::detail { + +// Builds the bipartite constraint/variable graph induced by A and runs +// METIS_PartGraphKway to assign each of the (nb_cstr + nb_vars) nodes to a +// part in [0, nb_parts). Layout matches metis_tests: +// * nodes [0, nb_cstr) : constraint nodes +// * nodes [nb_cstr, nb_cstr+nb_vars): variable nodes +// * undirected edges from each A nonzero (one half via A, one via A_t) +// The output is consumed by partition_loader_t::create_rank_data_from_parts. +template +std::vector metis_partitioner_t::partition( + partitioner_input_t const& input) const +{ + cuopt_expects(input.nb_parts > 0, + error_type_t::ValidationError, + "metis_partitioner: nb_parts must be positive"); + cuopt_expects(input.nb_cstr >= 0 && input.nb_vars >= 0, + error_type_t::ValidationError, + "metis_partitioner: invalid problem dimensions"); + + if (input.nb_parts == 1) { + CUOPT_LOG_INFO("METIS: nb_parts == 1, returning trivial single-block partition"); + return std::vector(static_cast(input.nb_cstr + input.nb_vars), i_t{0}); + } + + cuopt_expects(input.A.row_offsets != nullptr && input.A.col_indices != nullptr, + error_type_t::ValidationError, + "metis_partitioner: A.row_offsets and A.col_indices are required"); + cuopt_expects(input.A_t.row_offsets != nullptr && input.A_t.col_indices != nullptr, + error_type_t::ValidationError, + "metis_partitioner: A_t.row_offsets and A_t.col_indices are required"); + + auto const& A_offsets = *input.A.row_offsets; + auto const& A_cols = *input.A.col_indices; + auto const& A_t_offsets = *input.A_t.row_offsets; + auto const& A_t_cols = *input.A_t.col_indices; + + cuopt_expects(static_cast(A_offsets.size()) == input.nb_cstr + 1, + error_type_t::ValidationError, + "metis_partitioner: A.row_offsets size mismatch (expected nb_cstr+1)"); + cuopt_expects(static_cast(A_t_offsets.size()) == input.nb_vars + 1, + error_type_t::ValidationError, + "metis_partitioner: A_t.row_offsets size mismatch (expected nb_vars+1)"); + cuopt_expects(A_cols.size() == A_t_cols.size(), + error_type_t::ValidationError, + "metis_partitioner: A and A_t nnz mismatch"); + + const i_t nb_cstr = input.nb_cstr; + const i_t nb_vars = input.nb_vars; + const i_t nnz = static_cast(A_cols.size()); + const i_t nvtx = nb_cstr + nb_vars; + + // Bipartite CSR. Same construction as metis_tests/src/main.cpp: + // xadj has length nvtx + 1 + // adjncy has length 2 * nnz (each A nonzero contributes one half-edge + // from cstr side via A and one half-edge from var side via A_t) + std::vector xadj(nvtx + 1); + std::vector adjncy(2 * static_cast(nnz)); + + // cstr-side row offsets: A_offsets[0..nb_cstr] (no shift). + for (i_t i = 0; i <= nb_cstr; ++i) { + xadj[i] = static_cast(A_offsets[i]); + } + // var-side row offsets: A_t_offsets[0..nb_vars], shifted by +nnz so that + // they index into the second half of adjncy. + for (i_t i = 0; i <= nb_vars; ++i) { + xadj[nb_cstr + i] = static_cast(A_t_offsets[i]) + static_cast(nnz); + } + + // cstr-side neighbours: A_cols[i] shifted by +nb_cstr to index into the + // variable node block. + for (i_t k = 0; k < nnz; ++k) { + adjncy[k] = static_cast(A_cols[k]) + static_cast(nb_cstr); + } + // var-side neighbours: A_t_cols[i] already in [0, nb_cstr). + for (i_t k = 0; k < nnz; ++k) { + adjncy[nnz + k] = static_cast(A_t_cols[k]); + } + + idx_t metis_options[METIS_NOPTIONS]; + METIS_SetDefaultOptions(metis_options); + metis_options[METIS_OPTION_OBJTYPE] = METIS_OBJTYPE_CUT; + + idx_t metis_nvtx = static_cast(nvtx); + idx_t ncon = 1; + idx_t nparts = static_cast(input.nb_parts); + idx_t objval = 0; + std::vector metis_parts(nvtx); + + auto t0 = std::chrono::high_resolution_clock::now(); + const int status = METIS_PartGraphKway(&metis_nvtx, + &ncon, + xadj.data(), + adjncy.data(), + /*vwgt=*/nullptr, + /*vsize=*/nullptr, + /*adjwgt=*/nullptr, + &nparts, + /*tpwgts=*/nullptr, + /*ubvec=*/nullptr, + metis_options, + &objval, + metis_parts.data()); + auto t1 = std::chrono::high_resolution_clock::now(); + const double dt = std::chrono::duration(t1 - t0).count(); + cuopt_expects(status == METIS_OK, + error_type_t::RuntimeError, + "METIS_PartGraphKway failed (status=%d)", + status); + CUOPT_LOG_INFO( + "METIS partitioned bipartite graph: nvtx=%d nnz=%d nb_parts=%d edge_cut=%lld in %.3fs", + static_cast(nvtx), + static_cast(nnz), + static_cast(input.nb_parts), + static_cast(objval), + dt); + + std::vector parts(static_cast(nvtx)); + for (i_t i = 0; i < nvtx; ++i) { + parts[i] = static_cast(metis_parts[i]); + } + + validate_partition(parts, + static_cast(nb_cstr), + static_cast(nb_vars), + static_cast(input.nb_parts), + "metis_partitioner"); + return parts; +} + +template class metis_partitioner_t; + +} // namespace cuopt::linear_programming::detail diff --git a/cpp/src/pdlp/distributed_pdlp/metis_partitioner.hpp b/cpp/src/pdlp/distributed_pdlp/metis_partitioner.hpp new file mode 100644 index 0000000000..c4e37f57a9 --- /dev/null +++ b/cpp/src/pdlp/distributed_pdlp/metis_partitioner.hpp @@ -0,0 +1,24 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + */ + +#pragma once + +#include + +namespace cuopt::linear_programming::detail { + +// METIS k-way partitioner on the constraint/variable bipartite graph induced by A. +// Requires partitioner_input_t::A and A_t (or A row_offsets/col_indices only — the +// implementation builds the bipartite adjacency the same way as metis_tests: +// cstr nodes [0, nb_cstr), var nodes [nb_cstr, nb_cstr+nb_vars), edges from A and A_t). +// +// Wire into make_partitioner() once METIS is an optional cuOpt dependency. +template +class metis_partitioner_t : public partitioner_i { + public: + std::vector partition(partitioner_input_t const& input) const override; +}; + +} // namespace cuopt::linear_programming::detail diff --git a/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.cu b/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.cu new file mode 100644 index 0000000000..98f33b6c88 --- /dev/null +++ b/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.cu @@ -0,0 +1,102 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include +// compute_A_x() / compute_At_y() (defined inline in the engine header) call +// shard.sub_pdlp->pdhg_solver_.compute_* — pdlp_solver_t must be complete at +// the explicit instantiation point below. +#include + +#include + +#include + +#include + +#include + +namespace cuopt::linear_programming::detail { + +template +multi_gpu_engine_t::multi_gpu_engine_t( + std::vector>&& rank_data, + std::vector const& h_global_obj, + std::vector const& h_global_var_lower, + std::vector const& h_global_var_upper, + std::vector const& h_global_cstr_lower, + std::vector const& h_global_cstr_upper, + std::vector const& h_global_obj_scaled, + std::vector const& h_global_var_lower_scaled, + std::vector const& h_global_var_upper_scaled, + std::vector const& h_global_cstr_lower_scaled, + std::vector const& h_global_cstr_upper_scaled, + std::vector const& h_global_cummulative_cstr_scaling, + std::vector const& h_global_cummulative_var_scaling, + f_t h_bound_rescaling, + f_t h_objective_rescaling, + bool maximize, + f_t objective_offset, + f_t objective_scaling_factor, + pdlp_solver_settings_t const& sub_solver_settings) + : stream() +{ + const int nb_parts = static_cast(rank_data.size()); + cuopt_expects( + nb_parts > 0, error_type_t::ValidationError, "multi_gpu_engine_t: rank_data must be non-empty"); + + shards.reserve(nb_parts); + std::vector devices(nb_parts); + std::iota(devices.begin(), devices.end(), 0); + + // Create NCCL Comms then let shards own them + std::vector raw_comms(nb_parts); + cuopt_expects(ncclCommInitAll(raw_comms.data(), nb_parts, devices.data()) == ncclSuccess, + error_type_t::RuntimeError, + "ncclCommInitAll failed"); + + // 3. Construct one shard per rank, pinned to its device. + for (int r = 0; r < nb_parts; ++r) { + raft::device_setter guard(devices[r]); // shard ctor needs device set + shards.emplace_back(std::make_unique>(devices[r], + std::move(rank_data[r]), + raw_comms[r], + h_global_obj, + h_global_var_lower, + h_global_var_upper, + h_global_cstr_lower, + h_global_cstr_upper, + h_global_obj_scaled, + h_global_var_lower_scaled, + h_global_var_upper_scaled, + h_global_cstr_lower_scaled, + h_global_cstr_upper_scaled, + h_global_cummulative_cstr_scaling, + h_global_cummulative_var_scaling, + h_bound_rescaling, + h_objective_rescaling, + maximize, + objective_offset, + objective_scaling_factor, + sub_solver_settings)); + } + + // Two different events + // capture_*_event_ are used inside graph capture + // ext_*_event_ are used when sync is needed outside of graph + graph_master_ready_event_ = std::make_unique(); + sync_master_ready_event_ = std::make_unique(); + graph_shard_ready_events_.reserve(nb_parts); + sync_shard_ready_events_.reserve(nb_parts); + for (int r = 0; r < nb_parts; ++r) { + raft::device_setter guard(devices[r]); + graph_shard_ready_events_.emplace_back(std::make_unique()); + sync_shard_ready_events_.emplace_back(std::make_unique()); + } +} + +template struct multi_gpu_engine_t; +// template struct multi_gpu_engine_t; + +} // namespace cuopt::linear_programming::detail diff --git a/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.hpp b/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.hpp new file mode 100644 index 0000000000..89153e8bd7 --- /dev/null +++ b/cpp/src/pdlp/distributed_pdlp/multi_gpu_engine.hpp @@ -0,0 +1,1191 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + */ +#pragma once + +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include + +namespace cuopt::linear_programming::detail { + +// Element-wise sqrt functor. Defined at namespace scope (not as a local +// extended HD lambda) because nvcc disallows extended __host__ __device__ +// lambdas appearing inside templates whose template arguments are +// themselves local lambda types (which happens when distributed_l2_norm is +// invoked with closure accessors). +template +struct sqrt_inplace_op_t { + __host__ __device__ f_t operator()(f_t x) const { return raft::sqrt(x); } +}; + +// Squared-norm contribution of a constraint's [lower, upper] bound pair, used to +// build the distributed bound rescaling (mirrors rhs_sum_of_squares_t). Defined +// at namespace scope to avoid extended-lambda-in-template restrictions. +template +struct mgpu_rhs_sq_op_t { + __host__ __device__ f_t operator()(const thrust::tuple& t) const + { + const f_t lower = thrust::get<0>(t); + const f_t upper = thrust::get<1>(t); + f_t sum = f_t(0); + if (isfinite(lower) && (lower != upper)) sum += lower * lower; + if (isfinite(upper)) sum += upper * upper; + return sum; + } +}; + +// Weighted square of an objective coefficient (mirrors weighted_square_op). +template +struct mgpu_weighted_sq_op_t { + f_t weight; + __host__ __device__ f_t operator()(f_t v) const { return v * v * weight; } +}; + +template +struct multi_gpu_engine_t { + // Constructs shards from rank_data + multi_gpu_engine_t(std::vector>&& rank_data, + std::vector const& h_global_obj, + std::vector const& h_global_var_lower, + std::vector const& h_global_var_upper, + std::vector const& h_global_cstr_lower, + std::vector const& h_global_cstr_upper, + std::vector const& h_global_obj_scaled, + std::vector const& h_global_var_lower_scaled, + std::vector const& h_global_var_upper_scaled, + std::vector const& h_global_cstr_lower_scaled, + std::vector const& h_global_cstr_upper_scaled, + std::vector const& h_global_cummulative_cstr_scaling, + std::vector const& h_global_cummulative_var_scaling, + f_t h_bound_rescaling, + f_t h_objective_rescaling, + bool maximize, + f_t objective_offset, + f_t objective_scaling_factor, + pdlp_solver_settings_t const& sub_solver_settings); + + multi_gpu_engine_t(const multi_gpu_engine_t&) = delete; + multi_gpu_engine_t& operator=(const multi_gpu_engine_t&) = delete; + + template + void for_each_shard(Fn&& fn) + { + for (auto& s : shards) { + raft::device_setter guard(s->device_id); + fn(*s); + } + } + + template + void distributed_transform(std::tuple in_accessors, + OutAccess out, + SizeAccess sz, + Op op) + { + for_each_shard([&](auto& shard) { + auto& sub = *shard.sub_pdlp; + // turns the Tuple of lambdas into a tuple of rmm::device_uvector + auto cub_inputs = std::apply( + [&sub](auto&... acc) { return cuda::std::make_tuple(acc(sub)...); }, in_accessors); + + cub::DeviceTransform::Transform(cub_inputs, out(sub), sz(sub), op, shard.stream.view()); + }); + } + // --- 2) convenience: single input accessor (delegates) --- + template + void distributed_transform(InAccess in, OutAccess out, SizeAccess sz, Op op) + { + distributed_transform(std::make_tuple(in), out, sz, op); + } + + // -------- Halo exchange (variables / x) --------------------------------- + // Fills the halo slice [owned_var_size, total_var_size) of the per-shard + // input buffer returned by `buf_access(pdhg)` (the buffer A @ x will read). + // Step 1: thrust::gather per-peer outgoing values into staging buffers. + // Step 2: a single NCCL group with matched ncclSend / ncclRecv across all + // (rank, peer) pairs. + template + void halo_exchange_var(BufAccess&& buf_access) + { + const int nb = static_cast(shards.size()); + + // Step 1: gather owned values that each peer needs into per-peer staging. + for (int r = 0; r < nb; ++r) { + auto& s = *shards[r]; + raft::device_setter guard(s.device_id); + auto& x = buf_access(s.sub_pdlp->pdhg_solver_); + for (int peer = 0; peer < nb; ++peer) { + if (peer == r) continue; + if (s.var_send_indices_d[peer].size() == 0) continue; + thrust::gather(rmm::exec_policy_nosync(s.stream.view()), + s.var_send_indices_d[peer].begin(), + s.var_send_indices_d[peer].end(), + x.begin(), + s.var_send_buf_d[peer].begin()); + } + } + + // Step 2: matched send / recv across the whole topology in one NCCL group. + ncclGroupStart(); + for (int r = 0; r < nb; ++r) { + auto& s = *shards[r]; + raft::device_setter guard(s.device_id); + for (int peer = 0; peer < nb; ++peer) { + if (peer == r) continue; + ncclSend(s.var_send_buf_d[peer].data(), + s.var_send_buf_d[peer].size(), + ncclFloat64, + peer, + s.comm.get(), + s.stream.view().value()); + } + } + for (int r = 0; r < nb; ++r) { + auto& s = *shards[r]; + auto& rd = s.rank_data; + raft::device_setter guard(s.device_id); + auto& x = buf_access(s.sub_pdlp->pdhg_solver_); + for (int peer = 0; peer < nb; ++peer) { + if (peer == r) continue; + f_t* recv_ptr = x.data() + rd.owned_var_size + rd.var_recv_offsets[peer]; + ncclRecv(recv_ptr, + static_cast(rd.var_recv_counts[peer]), + ncclFloat64, + peer, + s.comm.get(), + s.stream.view().value()); + } + } + ncclGroupEnd(); + } + + // -------- Halo exchange (constraints / y) ------------------------------- + // Same as halo_exchange_var but for a constraint-shaped buffer (the input + // A_T @ y will read) and constraint halos. + template + void halo_exchange_cstr(BufAccess&& buf_access) + { + const int nb = static_cast(shards.size()); + + for (int r = 0; r < nb; ++r) { + auto& s = *shards[r]; + raft::device_setter guard(s.device_id); + auto& y = buf_access(s.sub_pdlp->pdhg_solver_); + for (int peer = 0; peer < nb; ++peer) { + if (peer == r) continue; + if (s.cstr_send_indices_d[peer].size() == 0) continue; + thrust::gather(rmm::exec_policy_nosync(s.stream.view()), + s.cstr_send_indices_d[peer].begin(), + s.cstr_send_indices_d[peer].end(), + y.begin(), + s.cstr_send_buf_d[peer].begin()); + } + } + + ncclGroupStart(); + for (int r = 0; r < nb; ++r) { + auto& s = *shards[r]; + raft::device_setter guard(s.device_id); + for (int peer = 0; peer < nb; ++peer) { + if (peer == r) continue; + ncclSend(s.cstr_send_buf_d[peer].data(), + s.cstr_send_buf_d[peer].size(), + ncclFloat64, + peer, + s.comm.get(), + s.stream.view().value()); + } + } + for (int r = 0; r < nb; ++r) { + auto& s = *shards[r]; + auto& rd = s.rank_data; + raft::device_setter guard(s.device_id); + auto& y = buf_access(s.sub_pdlp->pdhg_solver_); + for (int peer = 0; peer < nb; ++peer) { + if (peer == r) continue; + f_t* recv_ptr = y.data() + rd.owned_cstr_size + rd.cstr_recv_offsets[peer]; + ncclRecv(recv_ptr, + static_cast(rd.cstr_recv_counts[peer]), + ncclFloat64, + peer, + s.comm.get(), + s.stream.view().value()); + } + } + ncclGroupEnd(); + } + + // -------- Broadcast owned constraint (row) scaling into halo ------------ + void broadcast_constraint_scaling_to_halo() + { + const int nb = static_cast(shards.size()); + auto buf_access = [](pdlp_shard_t& s) -> rmm::device_uvector& { + return s.sub_pdlp->get_initial_scaling_strategy().get_cummulative_constraint_matrix_scaling(); + }; + + // Gather each owner's owned scaling values that peers need. + for (int r = 0; r < nb; ++r) { + auto& s = *shards[r]; + raft::device_setter guard(s.device_id); + auto& y = buf_access(s); + for (int peer = 0; peer < nb; ++peer) { + if (peer == r) continue; + if (s.cstr_send_indices_d[peer].size() == 0) continue; + thrust::gather(rmm::exec_policy_nosync(s.stream.view()), + s.cstr_send_indices_d[peer].begin(), + s.cstr_send_indices_d[peer].end(), + y.begin(), + s.cstr_send_buf_d[peer].begin()); + } + } + + ncclGroupStart(); + for (int r = 0; r < nb; ++r) { + auto& s = *shards[r]; + raft::device_setter guard(s.device_id); + for (int peer = 0; peer < nb; ++peer) { + if (peer == r) continue; + ncclSend(s.cstr_send_buf_d[peer].data(), + s.cstr_send_buf_d[peer].size(), + ncclFloat64, + peer, + s.comm.get(), + s.stream.view().value()); + } + } + for (int r = 0; r < nb; ++r) { + auto& s = *shards[r]; + auto& rd = s.rank_data; + raft::device_setter guard(s.device_id); + auto& y = buf_access(s); + for (int peer = 0; peer < nb; ++peer) { + if (peer == r) continue; + f_t* recv_ptr = y.data() + rd.owned_cstr_size + rd.cstr_recv_offsets[peer]; + ncclRecv(recv_ptr, + static_cast(rd.cstr_recv_counts[peer]), + ncclFloat64, + peer, + s.comm.get(), + s.stream.view().value()); + } + } + ncclGroupEnd(); + } + + // -------- NCCL allreduce (sum, in place) -------------------------------- + // Per-shard in-place sum-allreduce. Each shard's stream issues an + // ncclAllReduce(buf, buf, count, ncclFloat64, ncclSum, ...) inside a single + // group. After this returns, every shard's buffer holds the global sum. + // + // PtrAccess: pdlp_solver_t& -> f_t* (e.g. into step_size_strategy_). + template + void allreduce_sum_inplace(PtrAccess&& ptr_access, size_t count = 1) + { + ncclGroupStart(); + for (auto& s : shards) { + raft::device_setter guard(s->device_id); + f_t* buf = ptr_access(*s->sub_pdlp); + ncclAllReduce(buf, buf, count, ncclFloat64, ncclSum, s->comm.get(), s->stream.view().value()); + } + ncclGroupEnd(); + } + + // -------- Distributed L2 norm ------------------------------------------ + // Computes sqrt(Σ_k Σ_{i ∈ owned_k} buf_k[i]²) and writes the scalar into + // the buffer returned by `out_access` on EVERY shard. + // + // Algorithm: + // 1) per shard: out = cublasdot(buf[0:n_owned], buf[0:n_owned]) (partial Σ²) + // 2) NCCL allreduce SUM on out (count = 1) (global Σ²) + // 3) per shard: out = sqrt(out) + // + // The caller is responsible for clipping correctness via `size_access` + // (which picks `rank_data.owned_var_size` or `rank_data.owned_cstr_size` + // depending on the shape of the input buffer), and for mirroring the + // result back to master if downstream code needs it there. + // + // BufAccess : pdlp_solver_t& -> rmm::device_uvector& + // OutAccess : pdlp_solver_t& -> f_t* (single scalar in shard memory) + // SizeAccess : pdlp_shard_t& -> i_t (owned slice length) + template + void distributed_l2_norm(BufAccess&& buf_access, OutAccess&& out_access, SizeAccess&& size_access) + { + for_each_shard([&](auto& shard) { + auto& sub = *shard.sub_pdlp; + auto& buf = buf_access(sub); + const i_t n = size_access(shard); + f_t* out = out_access(sub); + RAFT_CUBLAS_TRY(raft::linalg::detail::cublasdot(shard.handle.get_cublas_handle(), + static_cast(n), + buf.data(), + 1, + buf.data(), + 1, + out, + shard.stream.view().value())); + }); + + allreduce_sum_inplace(out_access, /*count=*/1); + + for_each_shard([&](auto& shard) { + f_t* out = out_access(*shard.sub_pdlp); + cub::DeviceTransform::Transform( + out, out, 1, sqrt_inplace_op_t{}, shard.stream.view().value()); + }); + } + + // -------- Distributed bound / objective rescaling ----------------------- + void distributed_bound_objective_rescaling(f_t c_scaling_weight) + { + const int nb = static_cast(shards.size()); + + std::vector> bound_sq; + std::vector> obj_sq; + bound_sq.reserve(nb); + obj_sq.reserve(nb); + + // 1) per-shard partial squared norms over OWNED entries only (halo rhs is + // +/-inf and would otherwise double-count owned entries shared as halo). + for (int r = 0; r < nb; ++r) { + auto& s = *shards[r]; + raft::device_setter guard(s.device_id); + bound_sq.emplace_back(1, s.stream.view()); + obj_sq.emplace_back(1, s.stream.view()); + + const auto& scaled = s.sub_pdlp->get_initial_scaling_strategy().get_scaled_op_problem(); + const int n_owned_cstr = static_cast(s.rank_data.owned_cstr_size); + const int n_owned_var = static_cast(s.rank_data.owned_var_size); + + auto bound_in = thrust::make_transform_iterator( + thrust::make_zip_iterator(scaled.constraint_lower_bounds.data(), + scaled.constraint_upper_bounds.data()), + mgpu_rhs_sq_op_t{}); + size_t tmp_bytes_b = 0; + cub::DeviceReduce::Sum( + nullptr, tmp_bytes_b, bound_in, bound_sq[r].data(), n_owned_cstr, s.stream.view().value()); + rmm::device_buffer scratch_b(tmp_bytes_b, s.stream.view()); + cub::DeviceReduce::Sum(scratch_b.data(), + tmp_bytes_b, + bound_in, + bound_sq[r].data(), + n_owned_cstr, + s.stream.view().value()); + + auto obj_in = thrust::make_transform_iterator(scaled.objective_coefficients.data(), + mgpu_weighted_sq_op_t{c_scaling_weight}); + size_t tmp_bytes_o = 0; + cub::DeviceReduce::Sum( + nullptr, tmp_bytes_o, obj_in, obj_sq[r].data(), n_owned_var, s.stream.view().value()); + rmm::device_buffer scratch_o(tmp_bytes_o, s.stream.view()); + cub::DeviceReduce::Sum(scratch_o.data(), + tmp_bytes_o, + obj_in, + obj_sq[r].data(), + n_owned_var, + s.stream.view().value()); + } + + // 2) NCCL allreduce SUM -> every shard holds the global squared norms. + ncclGroupStart(); + for (int r = 0; r < nb; ++r) { + auto& s = *shards[r]; + raft::device_setter guard(s.device_id); + ncclAllReduce(bound_sq[r].data(), + bound_sq[r].data(), + 1, + ncclFloat64, + ncclSum, + s.comm.get(), + s.stream.view().value()); + ncclAllReduce(obj_sq[r].data(), + obj_sq[r].data(), + 1, + ncclFloat64, + ncclSum, + s.comm.get(), + s.stream.view().value()); + } + ncclGroupEnd(); + + // 3) derive the identical scalars and apply on every shard. + for (int r = 0; r < nb; ++r) { + auto& s = *shards[r]; + raft::device_setter guard(s.device_id); + f_t h_bound_sq = f_t(0); + f_t h_obj_sq = f_t(0); + raft::copy(&h_bound_sq, bound_sq[r].data(), 1, s.stream.view()); + raft::copy(&h_obj_sq, obj_sq[r].data(), 1, s.stream.view()); + s.stream.synchronize(); + const f_t bound_rescaling = f_t(1) / (std::sqrt(h_bound_sq) + f_t(1)); + const f_t objective_rescaling = f_t(1) / (std::sqrt(h_obj_sq) + f_t(1)); + s.sub_pdlp->get_initial_scaling_strategy().apply_distributed_bound_objective_rescaling( + bound_rescaling, objective_rescaling); + } + for (int r = 0; r < nb; ++r) { + auto& s = *shards[r]; + raft::device_setter guard(s.device_id); + s.stream.synchronize(); + } + } + + // -------- Generic distributed SpMVs ------------------------------------- + // distributed_spmv_A : halo-update the var-shaped input buffer returned by + // `in_buf(pdhg)`, then per-shard A @ in_buf -> out_desc. + // distributed_spmv_At: halo-update the cstr-shaped input buffer returned by + // `in_buf(pdhg)`, then per-shard A_T @ in_buf -> out_desc. + // + // Accessor signatures: + // in_buf (pdhg_solver_t&) -> rmm::device_uvector& + // out_desc(pdhg_solver_t&) -> cusparseDnVecDescr_t + template + void distributed_spmv_A(InBufAccess&& in_buf, OutDescAccess&& out_desc) + { + halo_exchange_var(in_buf); + for_each_shard([&](auto& shard) { + auto& sub_pdhg = shard.sub_pdlp->pdhg_solver_; + sub_pdhg.spmv_A_into(in_buf(sub_pdhg), out_desc(sub_pdhg)); + }); + } + + template + void distributed_spmv_At(InBufAccess&& in_buf, OutDescAccess&& out_desc) + { + halo_exchange_cstr(in_buf); + for_each_shard([&](auto& shard) { + auto& sub_pdhg = shard.sub_pdlp->pdhg_solver_; + sub_pdhg.spmv_At_into(in_buf(sub_pdhg), out_desc(sub_pdhg)); + }); + } + + // -------- High-level: A @ x and A_T @ y --------------------------------- + // Thin wrappers used from pdhg_solver_t::compute_A_x / compute_At_y when an + // engine is wired in. They drive the per-shard plan-based SpMV via the + // canonical cusparse_view bindings (no rebinding) so the descriptor binding + // is never disturbed by mGPU machinery. + // + // The halo-exchange MUST target the exact buffer the canonical descriptor + // is bound to in the PDHG cusparse_view (see cusparse_view.cu lines 516-519 + // and 595-599): + // - cv.reflected_primal_solution -> reflected_primal_ (var-shaped) + // - cv.dual_solution -> current.dual_solution_ (cstr-shaped) + // For 1 shard the halo-exchange is a no-op, but the buffer choice is what + // makes multi-shard correctness work, so we keep it accurate either way. + void distributed_compute_A_x() + { + halo_exchange_var( + [](auto& pdhg) -> rmm::device_uvector& { return pdhg.get_reflected_primal(); }); + for_each_shard([](auto& shard) { shard.sub_pdlp->pdhg_solver_.spmvop_A_x(); }); + } + + void distributed_compute_At_y() + { + halo_exchange_cstr( + [](auto& pdhg) -> rmm::device_uvector& { return pdhg.get_dual_solution(); }); + for_each_shard([](auto& shard) { shard.sub_pdlp->pdhg_solver_.spmvop_At_y(); }); + } + + // -------- Distributed Ruiz inf-scaling ----------------------------------- + void alloc_global_var_scratch(i_t n_global_vars, + std::vector>& global_var_buf, + std::vector>& local_to_global_var_d) + { + const int nb = static_cast(shards.size()); + global_var_buf.reserve(nb); + local_to_global_var_d.reserve(nb); + for (int r = 0; r < nb; ++r) { + auto& s = *shards[r]; + raft::device_setter guard(s.device_id); + global_var_buf.emplace_back(static_cast(n_global_vars), s.stream.view()); + local_to_global_var_d.emplace_back(static_cast(s.rank_data.total_var_size), + s.stream.view()); + if (s.rank_data.total_var_size > 0) { + RAFT_CUDA_TRY(cudaMemcpyAsync(local_to_global_var_d.back().data(), + s.rank_data.local_to_global_var.data(), + sizeof(i_t) * s.rank_data.local_to_global_var.size(), + cudaMemcpyHostToDevice, + s.stream.view().value())); + } + } + } + + void reduce_iteration_variable_scaling_across_shards( + ncclRedOp_t op, + i_t n_global_vars, + std::vector>& global_var_buf, + std::vector>& local_to_global_var_d) + { + const int nb = static_cast(shards.size()); + + // Zero global buffers, then scatter each shard's local values into their + // global column indices. + for (int r = 0; r < nb; ++r) { + auto& s = *shards[r]; + raft::device_setter guard(s.device_id); + RAFT_CUDA_TRY(cudaMemsetAsync(global_var_buf[r].data(), + 0, + sizeof(f_t) * static_cast(n_global_vars), + s.stream.view().value())); + auto& iter_var_scaling = + s.sub_pdlp->get_initial_scaling_strategy().get_iteration_variable_scaling(); + if (s.rank_data.total_var_size > 0) { + thrust::scatter(rmm::exec_policy_nosync(s.stream.view()), + iter_var_scaling.begin(), + iter_var_scaling.begin() + s.rank_data.total_var_size, + local_to_global_var_d[r].begin(), + global_var_buf[r].begin()); + } + } + + ncclGroupStart(); + for (int r = 0; r < nb; ++r) { + auto& s = *shards[r]; + raft::device_setter guard(s.device_id); + ncclAllReduce(global_var_buf[r].data(), + global_var_buf[r].data(), + static_cast(n_global_vars), + ncclFloat64, + op, + s.comm.get(), + s.stream.view().value()); + } + ncclGroupEnd(); + + // Gather the global per-column value back into each shard's local iter vector. + for (int r = 0; r < nb; ++r) { + auto& s = *shards[r]; + raft::device_setter guard(s.device_id); + auto& iter_var_scaling = + s.sub_pdlp->get_initial_scaling_strategy().get_iteration_variable_scaling(); + if (s.rank_data.total_var_size > 0) { + thrust::gather(rmm::exec_policy_nosync(s.stream.view()), + local_to_global_var_d[r].begin(), + local_to_global_var_d[r].begin() + s.rank_data.total_var_size, + global_var_buf[r].begin(), + iter_var_scaling.begin()); + } + } + } + + void distributed_ruiz_inf_scaling(int num_iter, i_t n_global_vars) + { + if (num_iter <= 0 || n_global_vars <= 0) return; + raft::common::nvtx::range scope("distributed_ruiz_inf_scaling"); + + std::vector> global_var_buf; + std::vector> local_to_global_var_d; + alloc_global_var_scratch(n_global_vars, global_var_buf, local_to_global_var_d); + + for (int it = 0; it < num_iter; ++it) { + // 1) per-shard local kernel: writes iteration_variable_scaling (per-column + // inf-norm partial) and iteration_constraint_matrix_scaling (row, complete). + for_each_shard([](auto& shard) { + shard.sub_pdlp->get_initial_scaling_strategy().ruiz_iter_compute_local_iteration_vectors(); + }); + + // 2) cross-shard column inf-norm reduction (MAX). + reduce_iteration_variable_scaling_across_shards( + ncclMax, n_global_vars, global_var_buf, local_to_global_var_d); + + // 3) per-shard fold into cumulative + reset iter vectors. + for_each_shard([](auto& shard) { + shard.sub_pdlp->get_initial_scaling_strategy().ruiz_iter_apply_cumulative_update(); + }); + } + + // Make sure per-shard cumulative writes are observable on subsequent + // calls (e.g., the next distributed_max_singular_value). + for_each_shard([](auto& shard) { shard.stream.synchronize(); }); + } + + // Distributed Pock-Chambolle: one pass, mirroring single-GPU + // pock_chambolle_scaling but with the per-column sum-of-powers reduced across + // shards (SUM) between the local kernels and the cumulative fold. Rows are + // owned exclusively, so the row half stays local. Runs after the distributed + // Ruiz pass, matching the single-GPU order (Ruiz then Pock-Chambolle). + void distributed_pock_chambolle_scaling(f_t alpha, i_t n_global_vars) + { + if (n_global_vars <= 0) return; + raft::common::nvtx::range scope("distributed_pock_chambolle_scaling"); + + std::vector> global_var_buf; + std::vector> local_to_global_var_d; + alloc_global_var_scratch(n_global_vars, global_var_buf, local_to_global_var_d); + + // 1) per-shard local kernels: row sum (complete) + column sum (partial). + for_each_shard([alpha](auto& shard) { + shard.sub_pdlp->get_initial_scaling_strategy().pock_chambolle_compute_local_iteration_vectors( + alpha); + }); + + // 2) cross-shard column sum-of-powers reduction (SUM). + reduce_iteration_variable_scaling_across_shards( + ncclSum, n_global_vars, global_var_buf, local_to_global_var_d); + + // 3) per-shard fold into cumulative (cumulative /= sqrt(iteration)). + for_each_shard([](auto& shard) { + shard.sub_pdlp->get_initial_scaling_strategy().pock_chambolle_apply_cumulative_update(); + }); + + for_each_shard([](auto& shard) { shard.stream.synchronize(); }); + } + + // -------- Distributed σ_max(A) via power iteration ---------------------- + f_t distributed_max_singular_value(i_t n_global_cstrs, + int max_iterations = 5000, + f_t tolerance = 1e-4) + { + raft::common::nvtx::range scope("distributed_max_singular_value"); + + const int nb = static_cast(shards.size()); + + // Generate the GLOBAL z[] sequence in cstr-index order from a fresh + // mt19937(1), once per call. It's m doubles regardless of N (cheap). + // Each shard then keeps only z[global_idx_for_owned_local_i]. + std::vector h_global_z(static_cast(n_global_cstrs)); + { + std::mt19937 gen(1); + std::normal_distribution dist(f_t(0.0), f_t(1.0)); + for (i_t i = 0; i < n_global_cstrs; ++i) { + h_global_z[i] = dist(gen); + } + } + + // Per-shard scratch lives on each shard's device. We use total (owned + + // halo) sizes for q/z/atq because they're SpMV inputs that need halo + // space. Norms / dot are scalars. + // We use size-1 rmm::device_uvector instead of rmm::device_scalar for the + // per-shard scratch scalars: nvcc + libcudacxx fail the + // copy_constructible concept check when device_scalar appears in a + // std::vector (the check transitively touches rmm::cuda_stream, which is + // non-copyable). device_uvector avoids that path. + std::vector> q; + std::vector> z; + std::vector> atq; + std::vector> sigma_sq; + std::vector> norm_q; + std::vector> residual_norm; + std::vector z_dn(nb, nullptr); + std::vector atq_dn(nb, nullptr); + q.reserve(nb); + z.reserve(nb); + atq.reserve(nb); + sigma_sq.reserve(nb); + norm_q.reserve(nb); + residual_norm.reserve(nb); + + for (int r = 0; r < nb; ++r) { + auto& s = *shards[r]; + raft::device_setter guard(s.device_id); + const i_t cstr_total = s.rank_data.total_cstr_size; + const i_t var_total = s.rank_data.total_var_size; + q.emplace_back(static_cast(cstr_total), s.stream.view()); + z.emplace_back(static_cast(cstr_total), s.stream.view()); + atq.emplace_back(static_cast(var_total), s.stream.view()); + sigma_sq.emplace_back(std::size_t{1}, s.stream.view()); + norm_q.emplace_back(std::size_t{1}, s.stream.view()); + residual_norm.emplace_back(std::size_t{1}, s.stream.view()); + RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsecreatednvec( + &z_dn[r], static_cast(cstr_total), z.back().data())); + RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsecreatednvec( + &atq_dn[r], static_cast(var_total), atq.back().data())); + + std::vector h_owned_z(static_cast(s.rank_data.owned_cstr_size)); + for (i_t i = 0; i < s.rank_data.owned_cstr_size; ++i) { + const i_t g = s.rank_data.local_to_global_cstr[i]; + h_owned_z[i] = h_global_z[g]; + } + if (s.rank_data.owned_cstr_size > 0) { + RAFT_CUDA_TRY( + cudaMemcpyAsync(z.back().data(), + h_owned_z.data(), + sizeof(f_t) * static_cast(s.rank_data.owned_cstr_size), + cudaMemcpyHostToDevice, + s.stream.view().value())); + } + if (cstr_total > s.rank_data.owned_cstr_size) { + RAFT_CUDA_TRY(cudaMemsetAsync( + z.back().data() + s.rank_data.owned_cstr_size, + 0, + sizeof(f_t) * static_cast(cstr_total - s.rank_data.owned_cstr_size), + s.stream.view().value())); + } + // Sync to ensure h_owned_z stays valid through the H2D copy (it goes + // out of scope at end of this iteration of the per-shard loop). + s.stream.synchronize(); + } + + // Local halo-exchange helpers that work directly on per-shard external + // buffers (the engine's halo_exchange_var/cstr expect accessors that + // resolve through pdhg_solver_t, which doesn't see our scratch). + auto halo_exchange_cstr_bufs = [&](std::vector>& bufs) { + for (int r = 0; r < nb; ++r) { + auto& s = *shards[r]; + raft::device_setter guard(s.device_id); + auto& y = bufs[r]; + for (int peer = 0; peer < nb; ++peer) { + if (peer == r) continue; + if (s.cstr_send_indices_d[peer].size() == 0) continue; + thrust::gather(rmm::exec_policy_nosync(s.stream.view()), + s.cstr_send_indices_d[peer].begin(), + s.cstr_send_indices_d[peer].end(), + y.begin(), + s.cstr_send_buf_d[peer].begin()); + } + } + ncclGroupStart(); + for (int r = 0; r < nb; ++r) { + auto& s = *shards[r]; + raft::device_setter guard(s.device_id); + for (int peer = 0; peer < nb; ++peer) { + if (peer == r) continue; + ncclSend(s.cstr_send_buf_d[peer].data(), + s.cstr_send_buf_d[peer].size(), + ncclFloat64, + peer, + s.comm.get(), + s.stream.view().value()); + } + } + for (int r = 0; r < nb; ++r) { + auto& s = *shards[r]; + auto& rd = s.rank_data; + raft::device_setter guard(s.device_id); + auto& y = bufs[r]; + for (int peer = 0; peer < nb; ++peer) { + if (peer == r) continue; + f_t* recv_ptr = y.data() + rd.owned_cstr_size + rd.cstr_recv_offsets[peer]; + ncclRecv(recv_ptr, + static_cast(rd.cstr_recv_counts[peer]), + ncclFloat64, + peer, + s.comm.get(), + s.stream.view().value()); + } + } + ncclGroupEnd(); + }; + auto halo_exchange_var_bufs = [&](std::vector>& bufs) { + for (int r = 0; r < nb; ++r) { + auto& s = *shards[r]; + raft::device_setter guard(s.device_id); + auto& x = bufs[r]; + for (int peer = 0; peer < nb; ++peer) { + if (peer == r) continue; + if (s.var_send_indices_d[peer].size() == 0) continue; + thrust::gather(rmm::exec_policy_nosync(s.stream.view()), + s.var_send_indices_d[peer].begin(), + s.var_send_indices_d[peer].end(), + x.begin(), + s.var_send_buf_d[peer].begin()); + } + } + ncclGroupStart(); + for (int r = 0; r < nb; ++r) { + auto& s = *shards[r]; + raft::device_setter guard(s.device_id); + for (int peer = 0; peer < nb; ++peer) { + if (peer == r) continue; + ncclSend(s.var_send_buf_d[peer].data(), + s.var_send_buf_d[peer].size(), + ncclFloat64, + peer, + s.comm.get(), + s.stream.view().value()); + } + } + for (int r = 0; r < nb; ++r) { + auto& s = *shards[r]; + auto& rd = s.rank_data; + raft::device_setter guard(s.device_id); + auto& x = bufs[r]; + for (int peer = 0; peer < nb; ++peer) { + if (peer == r) continue; + f_t* recv_ptr = x.data() + rd.owned_var_size + rd.var_recv_offsets[peer]; + ncclRecv(recv_ptr, + static_cast(rd.var_recv_counts[peer]), + ncclFloat64, + peer, + s.comm.get(), + s.stream.view().value()); + } + } + ncclGroupEnd(); + }; + + // Per-shard partial reductions over the OWNED cstr slice + NCCL allreduce. + // For norm: out := sqrt(Σ_r ||bufs[r][0:owned_cstr]||²). + // For dot : out := Σ_r . + auto distributed_norm_owned_cstr = [&](std::vector>& bufs, + std::vector>& out) { + for (int r = 0; r < nb; ++r) { + auto& s = *shards[r]; + raft::device_setter guard(s.device_id); + const i_t n_owned = s.rank_data.owned_cstr_size; + RAFT_CUBLAS_TRY(raft::linalg::detail::cublasdot(s.handle.get_cublas_handle(), + static_cast(n_owned), + bufs[r].data(), + 1, + bufs[r].data(), + 1, + out[r].data(), + s.stream.view().value())); + } + ncclGroupStart(); + for (int r = 0; r < nb; ++r) { + auto& s = *shards[r]; + raft::device_setter guard(s.device_id); + ncclAllReduce(out[r].data(), + out[r].data(), + 1, + ncclFloat64, + ncclSum, + s.comm.get(), + s.stream.view().value()); + } + ncclGroupEnd(); + for (int r = 0; r < nb; ++r) { + auto& s = *shards[r]; + raft::device_setter guard(s.device_id); + cub::DeviceTransform::Transform( + out[r].data(), out[r].data(), 1, sqrt_inplace_op_t{}, s.stream.view().value()); + } + }; + auto distributed_dot_owned_cstr = [&](std::vector>& a, + std::vector>& b, + std::vector>& out) { + for (int r = 0; r < nb; ++r) { + auto& s = *shards[r]; + raft::device_setter guard(s.device_id); + const i_t n_owned = s.rank_data.owned_cstr_size; + RAFT_CUBLAS_TRY(raft::linalg::detail::cublasdot(s.handle.get_cublas_handle(), + static_cast(n_owned), + a[r].data(), + 1, + b[r].data(), + 1, + out[r].data(), + s.stream.view().value())); + } + ncclGroupStart(); + for (int r = 0; r < nb; ++r) { + auto& s = *shards[r]; + raft::device_setter guard(s.device_id); + ncclAllReduce(out[r].data(), + out[r].data(), + 1, + ncclFloat64, + ncclSum, + s.comm.get(), + s.stream.view().value()); + } + ncclGroupEnd(); + }; + + // ===== Power iteration ===== + // Mirrors single-GPU compute_initial_step_size: z is the carried iterate + // (A Aᵀ q each step); at the top of each iteration q := z then q is + // normalized; the residual z − σ²q is written back into q only to drive + // the convergence check (next iteration's q := z discards it). + for (int it = 0; it < max_iterations; ++it) { + // q := z on the owned slice (the carried iterate), then normalize. + for (int r = 0; r < nb; ++r) { + auto& s = *shards[r]; + raft::device_setter guard(s.device_id); + const i_t n_owned = s.rank_data.owned_cstr_size; + raft::copy(q[r].data(), z[r].data(), n_owned, s.stream.view()); + } + + // ||q||₂ over the global OWNED cstr slice (one allreduce-sum + sqrt). + distributed_norm_owned_cstr(q, norm_q); + + // q /= ||q||₂ on owned slice (halo gets refreshed by next exchange). + for (int r = 0; r < nb; ++r) { + auto& s = *shards[r]; + raft::device_setter guard(s.device_id); + const i_t n_owned = s.rank_data.owned_cstr_size; + cub::DeviceTransform::Transform( + q[r].data(), + q[r].data(), + n_owned, + [n = norm_q[r].data()] __device__(f_t v) { return v / *n; }, + s.stream.view().value()); + } + + // atq = A^T q : halo-exchange q, then per-shard SpMV. spmv_At_into + // rebinds the dual_solution dnvec to q[r].data() and restores the + // canonical binding after the call (see pdhg.cu:643-644). + halo_exchange_cstr_bufs(q); + for (int r = 0; r < nb; ++r) { + auto& s = *shards[r]; + raft::device_setter guard(s.device_id); + s.sub_pdlp->pdhg_solver_.spmv_At_into(q[r], atq_dn[r]); + } + + // z = A atq : halo-exchange atq, then per-shard SpMV. + halo_exchange_var_bufs(atq); + for (int r = 0; r < nb; ++r) { + auto& s = *shards[r]; + raft::device_setter guard(s.device_id); + s.sub_pdlp->pdhg_solver_.spmv_A_into(atq[r], z_dn[r]); + } + + // σ² = q · z over the global OWNED cstr slice (= q^T A A^T q = σ_max² + // when q is the dominant left-singular vector). + distributed_dot_owned_cstr(q, z, sigma_sq); + + // q := -σ² q + z (owned slice) — residual of the eigen-equation. + for (int r = 0; r < nb; ++r) { + auto& s = *shards[r]; + raft::device_setter guard(s.device_id); + const i_t n_owned = s.rank_data.owned_cstr_size; + cub::DeviceTransform::Transform( + cuda::std::make_tuple(q[r].data(), z[r].data()), + q[r].data(), + n_owned, + [s2 = sigma_sq[r].data()] __device__(f_t qv, f_t zv) { return -(*s2) * qv + zv; }, + s.stream.view().value()); + } + + // Convergence check via global residual norm. + distributed_norm_owned_cstr(q, residual_norm); + auto& s0 = *shards[0]; + raft::device_setter guard0(s0.device_id); + f_t h_res{}; + RAFT_CUDA_TRY(cudaMemcpyAsync(&h_res, + residual_norm[0].data(), + sizeof(f_t), + cudaMemcpyDeviceToHost, + s0.stream.view().value())); + s0.stream.synchronize(); + if (h_res < tolerance) break; + } + + // σ_max² is the same on every shard after the last allreduce. + auto& s0 = *shards[0]; + raft::device_setter guard0(s0.device_id); + f_t sigma_sq_h{}; + RAFT_CUDA_TRY(cudaMemcpyAsync(&sigma_sq_h, + sigma_sq[0].data(), + sizeof(f_t), + cudaMemcpyDeviceToHost, + s0.stream.view().value())); + s0.stream.synchronize(); + + for (int r = 0; r < nb; ++r) { + raft::device_setter guard(shards[r]->device_id); + RAFT_CUSPARSE_TRY(cusparseDestroyDnVec(z_dn[r])); + RAFT_CUSPARSE_TRY(cusparseDestroyDnVec(atq_dn[r])); + } + + return std::sqrt(std::max(sigma_sq_h, f_t(0))); + } + + // -------- Solution gather (shards -> master) ---------------------------- + // Assembles the global potential_next primal/dual solutions and the + // reduced_cost on the master from the owned slices distributed across + // shards. Each shard's first owned_var_size (resp. owned_cstr_size) entries + // of its potential_next_primal_solution_ / reduced_cost_ (resp. + // potential_next_dual_solution_) are the live, up-to-date owned values; the + // master buffers are not updated during iterations and would otherwise + // return stale data. + // + // Used right before fill_return_problem_solution() at the return sites in + // pdlp_solver_t::check_termination() and pdlp_solver_t::check_limits(): the + // user-visible solution must contain gathered global values for primal, + // dual, and reduced_cost. + // + // Mirrors the metis_tests engine::get_x_output / get_y_output pattern: + // per shard: alloc small host tmp, copy owned slice device->host, sync, + // host-scatter via rank_data.local_to_global_{var,cstr} into a contiguous + // host buffer. Then one host->device copy into the master buffer per field. + // + // master_pdhg : provides destinations for primal / dual. + // master_reduced_cost : destination for the reduced_cost (var-shaped, lives + // in the master pdlp_solver_t's termination strategy + // convergence_information_). + void gather_potential_next_solutions_to_master(pdhg_solver_t& master_pdhg, + rmm::device_uvector& master_reduced_cost) + { + const std::size_t total_vars = master_pdhg.get_potential_next_primal_solution().size(); + const std::size_t total_cstrs = master_pdhg.get_potential_next_dual_solution().size(); + + std::vector h_primal(total_vars); + std::vector h_dual(total_cstrs); + std::vector h_reduced_cost(total_vars); + + for (auto& s_uptr : shards) { + auto& s = *s_uptr; + raft::device_setter guard(s.device_id); + const i_t nv = s.rank_data.owned_var_size; + const i_t nc = s.rank_data.owned_cstr_size; + + std::vector tmp_primal(nv); + std::vector tmp_dual(nc); + std::vector tmp_reduced_cost(nv); + + auto& sub_reduced_cost = s.sub_pdlp->get_current_termination_strategy() + .get_convergence_information() + .get_reduced_cost(); + + if (nv > 0) { + RAFT_CUDA_TRY( + cudaMemcpyAsync(tmp_primal.data(), + s.sub_pdlp->pdhg_solver_.get_potential_next_primal_solution().data(), + static_cast(nv) * sizeof(f_t), + cudaMemcpyDeviceToHost, + s.stream.view().value())); + RAFT_CUDA_TRY(cudaMemcpyAsync(tmp_reduced_cost.data(), + sub_reduced_cost.data(), + static_cast(nv) * sizeof(f_t), + cudaMemcpyDeviceToHost, + s.stream.view().value())); + } + if (nc > 0) { + RAFT_CUDA_TRY( + cudaMemcpyAsync(tmp_dual.data(), + s.sub_pdlp->pdhg_solver_.get_potential_next_dual_solution().data(), + static_cast(nc) * sizeof(f_t), + cudaMemcpyDeviceToHost, + s.stream.view().value())); + } + RAFT_CUDA_TRY(cudaStreamSynchronize(s.stream.view().value())); + + if (nv > 0) { + thrust::scatter(thrust::host, + tmp_primal.begin(), + tmp_primal.end(), + s.rank_data.local_to_global_var.begin(), + h_primal.begin()); + thrust::scatter(thrust::host, + tmp_reduced_cost.begin(), + tmp_reduced_cost.end(), + s.rank_data.local_to_global_var.begin(), + h_reduced_cost.begin()); + } + if (nc > 0) { + thrust::scatter(thrust::host, + tmp_dual.begin(), + tmp_dual.end(), + s.rank_data.local_to_global_cstr.begin(), + h_dual.begin()); + } + } + + // Host -> master device. engine.stream lives on the master device + // (created at engine construction when master device was current). + RAFT_CUDA_TRY(cudaMemcpyAsync(master_pdhg.get_potential_next_primal_solution().data(), + h_primal.data(), + total_vars * sizeof(f_t), + cudaMemcpyHostToDevice, + stream.view().value())); + RAFT_CUDA_TRY(cudaMemcpyAsync(master_pdhg.get_potential_next_dual_solution().data(), + h_dual.data(), + total_cstrs * sizeof(f_t), + cudaMemcpyHostToDevice, + stream.view().value())); + RAFT_CUDA_TRY(cudaMemcpyAsync(master_reduced_cost.data(), + h_reduced_cost.data(), + total_vars * sizeof(f_t), + cudaMemcpyHostToDevice, + stream.view().value())); + RAFT_CUDA_TRY(cudaStreamSynchronize(stream.view().value())); + } + + // Engine-level stream for fork/join orchestration (master side). + rmm::cuda_stream stream; + + // Shards stored by unique_ptr because pdlp_shard_t is immovable + // (owns device-affine resources: handle, NCCL comm, RMM buffers). + std::vector>> shards; + + // ===== Cross-stream synchronization events ===== + // two different events + // capture_*_event_ are used inside graph capture + // ext_*_event_ are used when sync is needed outside of graph + std::unique_ptr graph_master_ready_event_; + std::vector> graph_shard_ready_events_; + std::unique_ptr sync_master_ready_event_; + std::vector> sync_shard_ready_events_; + + // Forks master stream to shards, so that the captured graph can see the work on the shards + void graph_capture_fork_to_shards(rmm::cuda_stream_view master_stream) + { + graph_master_ready_event_->record(master_stream); + for (auto& s : shards) { + raft::device_setter guard(s->device_id); + graph_master_ready_event_->stream_wait(s->stream.view()); + } + } + + // Joins shards back to master stream for correct graph capture + void graph_capture_join_from_shards(rmm::cuda_stream_view master_stream) + { + const int nb = static_cast(shards.size()); + for (int r = 0; r < nb; ++r) { + raft::device_setter guard(shards[r]->device_id); + graph_shard_ready_events_[r]->record(shards[r]->stream.view()); + } + for (auto& e : graph_shard_ready_events_) { + e->stream_wait(master_stream); + } + } + + // Functionnaly same as graph_capture_fork_to_shards but on a different event to avoid race + // conditions Can be used as a way to sync shards with master stream + void sync_await_master(rmm::cuda_stream_view master_stream) + { + sync_master_ready_event_->record(master_stream); + for (auto& s : shards) { + raft::device_setter guard(s->device_id); + sync_master_ready_event_->stream_wait(s->stream.view()); + } + } + + // Same as sync_await_master + // Can be used as a way to sync master stream with shards + void sync_await_shards(rmm::cuda_stream_view master_stream) + { + const int nb = static_cast(shards.size()); + for (int r = 0; r < nb; ++r) { + raft::device_setter guard(shards[r]->device_id); + sync_shard_ready_events_[r]->record(shards[r]->stream.view()); + } + for (auto& e : sync_shard_ready_events_) { + e->stream_wait(master_stream); + } + } +}; + +} // namespace cuopt::linear_programming::detail diff --git a/cpp/src/pdlp/distributed_pdlp/partition_loader.cu b/cpp/src/pdlp/distributed_pdlp/partition_loader.cu new file mode 100644 index 0000000000..a6db3a9fe8 --- /dev/null +++ b/cpp/src/pdlp/distributed_pdlp/partition_loader.cu @@ -0,0 +1,262 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include + +#include + +#include +#include +#include + +namespace cuopt::linear_programming::detail { + +template +std::vector partition_loader_t::parse_distributed_pdlp_partition_file( + std::string const& file) +{ + std::ifstream part_file(file); + cuopt_expects(part_file.is_open(), + error_type_t::ValidationError, + "Failed to open partition file: %s", + file.c_str()); + + // One integer per line; operator>> skips whitespace so blank lines and + // trailing newlines are tolerated. + std::vector parts; + i_t part = 0; + while (part_file >> part) { + parts.push_back(part); + } + + // We must have hit EOF cleanly; any other state means a malformed token. + cuopt_expects(part_file.eof(), + error_type_t::ValidationError, + "Malformed partition file (expected one integer per line): %s", + file.c_str()); + + return parts; +} + +template +void partition_loader_t::export_distributed_pdlp_partition_file( + std::string const& file, std::vector const& parts) +{ + std::ofstream part_file(file); + cuopt_expects(part_file.is_open(), + error_type_t::ValidationError, + "Failed to open partition file for export: %s", + file.c_str()); + for (auto const& part : parts) { + part_file << part << "\n"; + } + cuopt_expects(part_file.good(), + error_type_t::RuntimeError, + "Failed while writing partition file: %s", + file.c_str()); +} + +template +std::vector> partition_loader_t::create_rank_data_from_parts( + const std::vector& parts, + const std::vector& A_row_offsets, + const std::vector& A_col_indices, + const std::vector& A_values, + const std::vector& A_values_scaled, + const std::vector& A_t_row_offsets, + const std::vector& A_t_col_indices, + const std::vector& A_t_values, + const std::vector& A_t_values_scaled, + i_t nb_parts, + i_t nb_cstr, + i_t nb_vars, + i_t nnz) +{ + cuopt_expects(A_values.size() == A_values_scaled.size(), + error_type_t::ValidationError, + "A_values and A_values_scaled must have the same length"); + cuopt_expects(A_t_values.size() == A_t_values_scaled.size(), + error_type_t::ValidationError, + "A_t_values and A_t_values_scaled must have the same length"); + + std::vector> rank_data(nb_parts, rank_data_t(nb_parts)); + std::vector cstr_parts(parts.begin(), parts.begin() + nb_cstr); + std::vector var_parts(parts.begin() + nb_cstr, parts.begin() + nb_cstr + nb_vars); + + // 1. Compute ownership + for (i_t i = 0; i < nb_cstr; i++) { + rank_data[cstr_parts[i]].owned_cstr_indices.push_back(i); + } + for (i_t i = 0; i < nb_vars; i++) { + rank_data[var_parts[i]].owned_var_indices.push_back(i); + } + + // 2. Compute local matrices and rank_data + for (i_t rank = 0; rank < nb_parts; rank++) { + auto& rd = rank_data[rank]; + rd.owned_var_size = rd.owned_var_indices.size(); + rd.owned_cstr_size = rd.owned_cstr_indices.size(); + // ---- A side ---- + std::vector local_A_row_offsets; + std::vector local_A_col_indices; + std::vector local_A_values; + std::vector local_A_values_scaled; + + i_t local_A_nnz = 0; + local_A_row_offsets.push_back(local_A_nnz); + + // For each owned constraint, build local matrix A. We walk both the + // unscaled and scaled global value arrays in lockstep so the produced + // local arrays share identical (offsets, col_indices) and differ only + // in values. + for (auto owned_cstr : rd.owned_cstr_indices) { + i_t cstr_len = A_row_offsets[owned_cstr + 1] - A_row_offsets[owned_cstr]; + i_t row_start = A_row_offsets[owned_cstr]; + for (i_t v = 0; v < cstr_len; v++) { + local_A_col_indices.push_back(A_col_indices[row_start + v]); + local_A_values.push_back(A_values[row_start + v]); + local_A_values_scaled.push_back(A_values_scaled[row_start + v]); + } + local_A_nnz += cstr_len; + local_A_row_offsets.push_back(local_A_nnz); + } + + std::set needed_vars; + for (auto indice : local_A_col_indices) { + if (var_parts[indice] != rank) needed_vars.insert(indice); + } + + for (i_t peer = 0; peer < nb_parts; peer++) { + std::vector needed_var_from_peer; + for (auto needed_var : needed_vars) { + if (var_parts[needed_var] == peer) needed_var_from_peer.push_back(needed_var); + } + i_t nb_recv_from_peer = needed_var_from_peer.size(); + rd.var_recv_counts[peer] = nb_recv_from_peer; + rd.var_recv_offsets[peer] = + peer == 0 ? 0 : rd.var_recv_offsets[peer - 1] + rd.var_recv_counts[peer - 1]; + rank_data[peer].var_send_per_peer[rank] = std::move(needed_var_from_peer); + } + + rd.h_A_row_offsets = std::move(local_A_row_offsets); + rd.h_A_col_indices = std::move(local_A_col_indices); + rd.h_A_values = std::move(local_A_values); + rd.h_A_values_scaled = std::move(local_A_values_scaled); + + // ---- A_t side ---- + std::vector local_A_t_row_offsets; + std::vector local_A_t_col_indices; + std::vector local_A_t_values; + std::vector local_A_t_values_scaled; + i_t local_A_t_nnz = 0; + local_A_t_row_offsets.push_back(local_A_t_nnz); + + for (auto owned_var : rd.owned_var_indices) { + i_t var_len = A_t_row_offsets[owned_var + 1] - A_t_row_offsets[owned_var]; + i_t row_start = A_t_row_offsets[owned_var]; + for (i_t v = 0; v < var_len; v++) { + local_A_t_col_indices.push_back(A_t_col_indices[row_start + v]); + local_A_t_values.push_back(A_t_values[row_start + v]); + local_A_t_values_scaled.push_back(A_t_values_scaled[row_start + v]); + } + local_A_t_nnz += var_len; + local_A_t_row_offsets.push_back(local_A_t_nnz); + } + + std::set needed_cstrs; + for (auto indice : local_A_t_col_indices) { + if (cstr_parts[indice] != rank) needed_cstrs.insert(indice); + } + + for (i_t peer = 0; peer < nb_parts; peer++) { + std::vector needed_cstr_from_peer; + for (auto needed_cstr : needed_cstrs) { + if (cstr_parts[needed_cstr] == peer) needed_cstr_from_peer.push_back(needed_cstr); + } + i_t nb_recv_from_peer = needed_cstr_from_peer.size(); + rd.cstr_recv_counts[peer] = nb_recv_from_peer; + rd.cstr_recv_offsets[peer] = + peer == 0 ? 0 : rd.cstr_recv_offsets[peer - 1] + rd.cstr_recv_counts[peer - 1]; + rank_data[peer].cstr_send_per_peer[rank] = std::move(needed_cstr_from_peer); + } + + rd.h_A_t_row_offsets = std::move(local_A_t_row_offsets); + rd.h_A_t_col_indices = std::move(local_A_t_col_indices); + rd.h_A_t_values = std::move(local_A_t_values); + rd.h_A_t_values_scaled = std::move(local_A_t_values_scaled); + + rd.total_var_size = rd.owned_var_size + needed_vars.size(); + rd.total_cstr_size = rd.owned_cstr_size + needed_cstrs.size(); + + // Pad row-offset arrays so cuSPARSE sees the local matrices as + // (total_cstr x total_var) for A and (total_var x total_cstr) for A_T + const i_t a_last_nnz = rd.h_A_row_offsets.empty() ? i_t{0} : rd.h_A_row_offsets.back(); + rd.h_A_row_offsets.resize(rd.total_cstr_size + 1, a_last_nnz); + + const i_t at_last_nnz = rd.h_A_t_row_offsets.empty() ? i_t{0} : rd.h_A_t_row_offsets.back(); + rd.h_A_t_row_offsets.resize(rd.total_var_size + 1, at_last_nnz); + } + + // 3. Generate local indices for contiguous [[self], [peer1], ..., [peer_k]] + // Build scatter_gather_maps + for (i_t rank = 0; rank < nb_parts; rank++) { + auto& rd = rank_data[rank]; + + i_t curr_id = 0; + for (auto owned_cstr : rd.owned_cstr_indices) { + rd.global_to_local_cstr[owned_cstr] = curr_id; + rd.local_to_global_cstr.push_back(owned_cstr); + curr_id++; + } + for (i_t peer = 0; peer < nb_parts; peer++) { + if (peer == rank) continue; + for (auto recv_cstr : rank_data[peer].cstr_send_per_peer[rank]) { + rd.global_to_local_cstr[recv_cstr] = curr_id; + rd.local_to_global_cstr.push_back(recv_cstr); + curr_id++; + } + } + + curr_id = 0; + for (auto owned_var : rd.owned_var_indices) { + rd.global_to_local_var[owned_var] = curr_id; + rd.local_to_global_var.push_back(owned_var); + curr_id++; + } + for (i_t peer = 0; peer < nb_parts; peer++) { + if (peer == rank) continue; + for (auto recv_var : rank_data[peer].var_send_per_peer[rank]) { + rd.global_to_local_var[recv_var] = curr_id; + rd.local_to_global_var.push_back(recv_var); + curr_id++; + } + } + } + + // 4. Remap global -> local everywhere + for (i_t rank = 0; rank < nb_parts; rank++) { + auto& rd = rank_data[rank]; + + for (auto& send_vec : rd.var_send_per_peer) { + for (auto& v : send_vec) + v = rd.global_to_local_var.at(v); + } + for (auto& send_vec : rd.cstr_send_per_peer) { + for (auto& v : send_vec) + v = rd.global_to_local_cstr.at(v); + } + + for (auto& v : rd.h_A_col_indices) + v = rd.global_to_local_var.at(v); + for (auto& v : rd.h_A_t_col_indices) + v = rd.global_to_local_cstr.at(v); + } + + return rank_data; +} + +template struct partition_loader_t; + +} // namespace cuopt::linear_programming::detail diff --git a/cpp/src/pdlp/distributed_pdlp/partition_loader.hpp b/cpp/src/pdlp/distributed_pdlp/partition_loader.hpp new file mode 100644 index 0000000000..ce12d241f9 --- /dev/null +++ b/cpp/src/pdlp/distributed_pdlp/partition_loader.hpp @@ -0,0 +1,45 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + */ + +#pragma once + +#include + +#include +#include + +namespace cuopt::linear_programming::detail { + +template +struct partition_loader_t { + // Read a Metis-style partition file: one part-id per line (whitespace-tolerant), + // ASCII integers in [0, nb_parts). Returns a flat vector of length + // nb_cstr + nb_vars, indexed as in create_rank_data_from_parts (cstrs first, then vars). + static std::vector parse_distributed_pdlp_partition_file(std::string const& file); + + // Write a partition vector to file in the same format parse_... reads back: + // one part-id per line. Useful for inspecting / reusing a computed partition + // (e.g. CLI --distributed-pdlp-export-parts). + static void export_distributed_pdlp_partition_file(std::string const& file, + std::vector const& parts); + + // Slices the data to prepare a split from metis partitionning with halo communication + static std::vector> create_rank_data_from_parts( + const std::vector& parts, + const std::vector& A_row_offsets, + const std::vector& A_col_indices, + const std::vector& A_values, + const std::vector& A_values_scaled, + const std::vector& A_t_row_offsets, + const std::vector& A_t_col_indices, + const std::vector& A_t_values, + const std::vector& A_t_values_scaled, + i_t nb_parts, + i_t nb_cstr, + i_t nb_vars, + i_t nnz); +}; + +} // namespace cuopt::linear_programming::detail diff --git a/cpp/src/pdlp/distributed_pdlp/partitioner.cu b/cpp/src/pdlp/distributed_pdlp/partitioner.cu new file mode 100644 index 0000000000..727a8b56f9 --- /dev/null +++ b/cpp/src/pdlp/distributed_pdlp/partitioner.cu @@ -0,0 +1,87 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include +#include +#include + +#include + +#include +#include + +namespace cuopt::linear_programming::detail { + +template +std::vector dummy_partitioner_t::partition( + partitioner_input_t const& input) const +{ + cuopt_expects(input.nb_parts > 0, + error_type_t::ValidationError, + "dummy_partitioner: nb_parts must be positive"); + cuopt_expects(input.nb_cstr >= 0 && input.nb_vars >= 0, + error_type_t::ValidationError, + "dummy_partitioner: invalid problem dimensions"); + + const std::size_t nvtx = + static_cast(input.nb_cstr) + static_cast(input.nb_vars); + std::vector parts(nvtx); + for (std::size_t i = 0; i < nvtx; ++i) { + parts[i] = static_cast(i % static_cast(input.nb_parts)); + } + validate_partition(parts, + static_cast(input.nb_cstr), + static_cast(input.nb_vars), + static_cast(input.nb_parts), + "dummy_partitioner"); + return parts; +} + +void validate_partition( + std::vector const& parts, int nb_cstr, int nb_vars, int nb_parts, char const* context) +{ + const std::size_t expected = + static_cast(nb_cstr) + static_cast(nb_vars); + cuopt_expects(parts.size() == expected, + error_type_t::ValidationError, + "%s: expected %zu part entries (cstrs + vars), got %zu", + context, + expected, + parts.size()); + cuopt_expects( + nb_parts > 0, error_type_t::ValidationError, "%s: nb_parts must be positive", context); + if (parts.empty()) { return; } + const auto [min_it, max_it] = std::minmax_element(parts.begin(), parts.end()); + cuopt_expects(*min_it >= 0, + error_type_t::ValidationError, + "%s: partition ids must be non-negative (min=%d)", + context, + static_cast(*min_it)); + cuopt_expects(*max_it < nb_parts, + error_type_t::ValidationError, + "%s: partition ids must be in [0, %d) (max=%d)", + context, + static_cast(nb_parts), + static_cast(*max_it)); +} + +template +std::unique_ptr> make_partitioner(partitioner_kind_t kind) +{ + switch (kind) { + case partitioner_kind_t::Dummy: return std::make_unique>(); + case partitioner_kind_t::Metis: return std::make_unique>(); + case partitioner_kind_t::KaMinPar: return std::make_unique>(); + } + cuopt_expects( + false, error_type_t::RuntimeError, "make_partitioner: unsupported partitioner kind"); + return nullptr; +} + +template class dummy_partitioner_t; +template std::unique_ptr> make_partitioner( + partitioner_kind_t); + +} // namespace cuopt::linear_programming::detail diff --git a/cpp/src/pdlp/distributed_pdlp/partitioner.hpp b/cpp/src/pdlp/distributed_pdlp/partitioner.hpp new file mode 100644 index 0000000000..70b2e34c06 --- /dev/null +++ b/cpp/src/pdlp/distributed_pdlp/partitioner.hpp @@ -0,0 +1,70 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + */ + +#pragma once + +#include +#include +#include + +namespace cuopt::linear_programming::detail { + +// Non-owning view of a host CSR matrix (A or A_t). +template +struct csr_host_view_t { + std::vector const* row_offsets{nullptr}; + std::vector const* col_indices{nullptr}; + std::vector const* values{nullptr}; // optional; unused by topology-only partitioners + i_t num_rows{0}; + i_t num_cols{0}; +}; + +// Inputs shared by all distributed-PDLP partitioners. +// Returns a flat vector of length (nb_cstr + nb_vars): constraint part-ids first, +// then variable part-ids, each in [0, nb_parts). +template +struct partitioner_input_t { + i_t nb_cstr{0}; + i_t nb_vars{0}; + i_t nb_parts{0}; + // Number of CPU threads the partitioner may use. Only honored by the + // multi-threaded KaMinPar backend; <= 0 means "auto" (all hardware threads). + // Serial backends (METIS, Dummy) ignore it. + i_t nb_threads{0}; + // Constraint matrix A (rows = constraints, cols = variables). + csr_host_view_t A{}; + // Transpose A_t (rows = variables, cols = constraints). Optional for partitioners + // that build a bipartite graph (e.g. METIS); dummy partitioner ignores both matrices. + csr_host_view_t A_t{}; +}; + +// Dummy: round-robin, no graph (single-shard / debugging). +// Metis: serial METIS_PartGraphKway. +// KaMinPar: multi-threaded KaMinPar (preferred for multi-shard partitioning). +enum class partitioner_kind_t { Dummy, Metis, KaMinPar }; + +template +class partitioner_i { + public: + virtual ~partitioner_i() = default; + virtual std::vector partition(partitioner_input_t const& input) const = 0; +}; + +template +class dummy_partitioner_t : public partitioner_i { + public: + std::vector partition(partitioner_input_t const& input) const override; +}; + +void validate_partition(std::vector const& parts, + int nb_cstr, + int nb_vars, + int nb_parts, + char const* context = "partition"); + +template +std::unique_ptr> make_partitioner(partitioner_kind_t kind); + +} // namespace cuopt::linear_programming::detail diff --git a/cpp/src/pdlp/distributed_pdlp/rank_data.hpp b/cpp/src/pdlp/distributed_pdlp/rank_data.hpp new file mode 100644 index 0000000000..d52d277116 --- /dev/null +++ b/cpp/src/pdlp/distributed_pdlp/rank_data.hpp @@ -0,0 +1,61 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + */ + +#pragma once + +#include +#include + +namespace cuopt::linear_programming::detail { +template +struct rank_data_t { + rank_data_t(std::size_t nb_parts) + : var_send_per_peer(nb_parts), + cstr_send_per_peer(nb_parts), + var_recv_counts(nb_parts, 0), + var_recv_offsets(nb_parts, 0), + cstr_recv_counts(nb_parts, 0), + cstr_recv_offsets(nb_parts, 0) + { + } + + i_t owned_var_size{0}; + i_t total_var_size{0}; + i_t owned_cstr_size{0}; + i_t total_cstr_size{0}; + + // === Ownership === + std::vector owned_var_indices; + std::vector owned_cstr_indices; + + // === Send plan: per peer, indices to gather + send === + std::vector> var_send_per_peer; + std::vector> cstr_send_per_peer; + + // === Recv plan: per peer, contiguous slot in halo region === + std::vector var_recv_counts; + std::vector var_recv_offsets; + std::vector cstr_recv_counts; + std::vector cstr_recv_offsets; + + // === Mappings === + std::unordered_map global_to_local_var; + std::unordered_map global_to_local_cstr; + std::vector local_to_global_var; + std::vector local_to_global_cstr; + + // === Local host CSR matrices === + // A + std::vector h_A_row_offsets; + std::vector h_A_col_indices; + std::vector h_A_values; + std::vector h_A_values_scaled; + // A_t + std::vector h_A_t_row_offsets; + std::vector h_A_t_col_indices; + std::vector h_A_t_values; + std::vector h_A_t_values_scaled; +}; +} // namespace cuopt::linear_programming::detail diff --git a/cpp/src/pdlp/distributed_pdlp/shard.cu b/cpp/src/pdlp/distributed_pdlp/shard.cu new file mode 100644 index 0000000000..356e10a03c --- /dev/null +++ b/cpp/src/pdlp/distributed_pdlp/shard.cu @@ -0,0 +1,242 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include +#include +#include + +#include + +#include +#include + +#include +#include + +namespace cuopt::linear_programming::detail { + +// This must be done in .cu file because the pdlp_solver_t is not already complete in the hpp file +template +pdlp_shard_t::~pdlp_shard_t() = default; + +template +pdlp_shard_t::pdlp_shard_t(int device_id, + rank_data_t&& rd, + ncclComm_t raw_comm, + std::vector const& h_global_obj, + std::vector const& h_global_var_lower, + std::vector const& h_global_var_upper, + std::vector const& h_global_cstr_lower, + std::vector const& h_global_cstr_upper, + std::vector const& h_global_obj_scaled, + std::vector const& h_global_var_lower_scaled, + std::vector const& h_global_var_upper_scaled, + std::vector const& h_global_cstr_lower_scaled, + std::vector const& h_global_cstr_upper_scaled, + std::vector const& h_global_cummulative_cstr_scaling, + std::vector const& h_global_cummulative_var_scaling, + f_t h_bound_rescaling, + f_t h_objective_rescaling, + bool maximize, + f_t objective_offset, + f_t objective_scaling_factor, + pdlp_solver_settings_t const& settings) + : device_id(device_id), + stream(), + handle(stream.view()), + comm(raw_comm, nccl_comm_deleter_t{device_id}), + rank_data(std::move(rd)), + opt_problem(std::nullopt), + sub_problem(std::nullopt), + sub_pdlp(nullptr) +{ + assert(raft::device_setter::get_current_device() == device_id && + "Right device must be set before building the shard"); + + // ---- 1. Gather per-shard host slices using rank_data's index maps. ---- + // All vectors are sized to TOTAL (owned + halo). Owned slots get real + // values; halo slots keep defaults because they should not be accessed + std::vector h_obj(rank_data.total_var_size, f_t{0}); + std::vector h_var_lower(rank_data.total_var_size, -std::numeric_limits::infinity()); + std::vector h_var_upper(rank_data.total_var_size, std::numeric_limits::infinity()); + std::vector h_cstr_lower(rank_data.total_cstr_size, -std::numeric_limits::infinity()); + std::vector h_cstr_upper(rank_data.total_cstr_size, std::numeric_limits::infinity()); + + std::vector h_obj_scaled(rank_data.total_var_size, f_t{0}); + std::vector h_var_lower_scaled(rank_data.total_var_size, + -std::numeric_limits::infinity()); + std::vector h_var_upper_scaled(rank_data.total_var_size, + std::numeric_limits::infinity()); + std::vector h_cstr_lower_scaled(rank_data.total_cstr_size, + -std::numeric_limits::infinity()); + std::vector h_cstr_upper_scaled(rank_data.total_cstr_size, + std::numeric_limits::infinity()); + + for (i_t i = 0; i < rank_data.owned_var_size; ++i) { + const auto g = rank_data.local_to_global_var[i]; + h_obj[i] = h_global_obj[g]; + h_var_lower[i] = h_global_var_lower[g]; + h_var_upper[i] = h_global_var_upper[g]; + h_obj_scaled[i] = h_global_obj_scaled[g]; + h_var_lower_scaled[i] = h_global_var_lower_scaled[g]; + h_var_upper_scaled[i] = h_global_var_upper_scaled[g]; + } + for (i_t i = 0; i < rank_data.owned_cstr_size; ++i) { + const auto g = rank_data.local_to_global_cstr[i]; + h_cstr_lower[i] = h_global_cstr_lower[g]; + h_cstr_upper[i] = h_global_cstr_upper[g]; + h_cstr_lower_scaled[i] = h_global_cstr_lower_scaled[g]; + h_cstr_upper_scaled[i] = h_global_cstr_upper_scaled[g]; + } + + // Get local scaling factors + std::vector h_cstr_scaling_local(rank_data.total_cstr_size, f_t{1}); + std::vector h_var_scaling_local(rank_data.total_var_size, f_t{1}); + for (i_t i = 0; i < rank_data.owned_cstr_size; ++i) { + h_cstr_scaling_local[i] = h_global_cummulative_cstr_scaling[rank_data.local_to_global_cstr[i]]; + } + for (i_t i = 0; i < rank_data.owned_var_size; ++i) { + h_var_scaling_local[i] = h_global_cummulative_var_scaling[rank_data.local_to_global_var[i]]; + } + + // ---- 2. Build optimization_problem_t on this shard's device (UNSCALED). ---- + opt_problem.emplace(&handle); + opt_problem->set_csr_constraint_matrix(rank_data.h_A_values.data(), + static_cast(rank_data.h_A_values.size()), + rank_data.h_A_col_indices.data(), + static_cast(rank_data.h_A_col_indices.size()), + rank_data.h_A_row_offsets.data(), + static_cast(rank_data.h_A_row_offsets.size())); + + // Primal axis: TOTAL (owned + halo). Halo slots have neutral defaults. + opt_problem->set_objective_coefficients(h_obj.data(), rank_data.total_var_size); + opt_problem->set_variable_lower_bounds(h_var_lower.data(), rank_data.total_var_size); + opt_problem->set_variable_upper_bounds(h_var_upper.data(), rank_data.total_var_size); + + // Dual axis: TOTAL (owned + halo). Halo slots have ±inf so trivially satisfied. + opt_problem->set_constraint_lower_bounds(h_cstr_lower.data(), rank_data.total_cstr_size); + opt_problem->set_constraint_upper_bounds(h_cstr_upper.data(), rank_data.total_cstr_size); + + opt_problem->set_maximize(maximize); + opt_problem->set_objective_offset(objective_offset); + opt_problem->set_objective_scaling_factor(objective_scaling_factor); + opt_problem->set_problem_category(problem_category_t::LP); + + // ---- 3. Build problem_t from opt_problem (still UNSCALED). ---- + sub_problem.emplace(*opt_problem); + + // ---- 4. Override reverse_* with the real local A_T from rank_data. ---- + // problem_t's ctor computes the transpose of the LOCAL A, which is wrong + // in multi-GPU: A_local is owned_cstr x total_var, and A_t_local is the + // pre-sliced owned_var x total_cstr matrix we built during partitioning. + auto stream_view = handle.get_stream(); + sub_problem->reverse_offsets.resize(rank_data.h_A_t_row_offsets.size(), stream_view); + sub_problem->reverse_constraints.resize(rank_data.h_A_t_col_indices.size(), stream_view); + sub_problem->reverse_coefficients.resize(rank_data.h_A_t_values.size(), stream_view); + raft::copy(sub_problem->reverse_offsets.data(), + rank_data.h_A_t_row_offsets.data(), + rank_data.h_A_t_row_offsets.size(), + stream_view); + raft::copy(sub_problem->reverse_constraints.data(), + rank_data.h_A_t_col_indices.data(), + rank_data.h_A_t_col_indices.size(), + stream_view); + raft::copy(sub_problem->reverse_coefficients.data(), + rank_data.h_A_t_values.data(), + rank_data.h_A_t_values.size(), + stream_view); + handle.sync_stream(stream_view); + + // ---- 5. Build sub_pdlp (single-GPU mode; multi_gpu flags cleared by caller). ---- + sub_pdlp = std::make_unique>( + *sub_problem, settings, /*is_legacy_batch_mode=*/false); + + sub_pdlp->pdhg_solver_.set_is_multi_gpu(true); + + // Re-inject master-scaled buffers inside sub_pdlp. + // Need to also re-inject the offsets and variables arrays to revert + // the csrsort done by problem_t's constructor. + auto& scaled = sub_pdlp->get_op_problem_scaled(); + raft::copy(scaled.offsets.data(), + rank_data.h_A_row_offsets.data(), + rank_data.h_A_row_offsets.size(), + stream_view); + raft::copy(scaled.variables.data(), + rank_data.h_A_col_indices.data(), + rank_data.h_A_col_indices.size(), + stream_view); + raft::copy(scaled.coefficients.data(), + rank_data.h_A_values_scaled.data(), + rank_data.h_A_values_scaled.size(), + stream_view); + // A_T side: all three arrays were already overridden together from + // rank_data on sub_problem (see step 4 above) and deep-copied into the + // scaled problem, so reverse_offsets / reverse_constraints already match + // h_A_t_values_scaled's order. Only the values need a SCALED swap-in. + raft::copy(scaled.reverse_coefficients.data(), + rank_data.h_A_t_values_scaled.data(), + rank_data.h_A_t_values_scaled.size(), + stream_view); + raft::copy( + scaled.objective_coefficients.data(), h_obj_scaled.data(), h_obj_scaled.size(), stream_view); + raft::copy(scaled.constraint_lower_bounds.data(), + h_cstr_lower_scaled.data(), + h_cstr_lower_scaled.size(), + stream_view); + raft::copy(scaled.constraint_upper_bounds.data(), + h_cstr_upper_scaled.data(), + h_cstr_upper_scaled.size(), + stream_view); + + using f_t2 = typename type_2::type; + std::vector h_var_bounds_scaled_packed(rank_data.total_var_size); + for (i_t i = 0; i < rank_data.total_var_size; ++i) { + h_var_bounds_scaled_packed[i].x = h_var_lower_scaled[i]; + h_var_bounds_scaled_packed[i].y = h_var_upper_scaled[i]; + } + raft::copy(scaled.variable_bounds.data(), + h_var_bounds_scaled_packed.data(), + h_var_bounds_scaled_packed.size(), + stream_view); + + combine_constraint_bounds(scaled, scaled.combined_bounds); + + // Inject master-scaled buffers inside sub_pdlp.initil_strategy + auto& scaling = sub_pdlp->get_initial_scaling_strategy(); + scaling.set_cummulative_scaling(h_cstr_scaling_local, h_var_scaling_local); + scaling.set_h_bound_rescaling(h_bound_rescaling); + scaling.set_h_objective_rescaling(h_objective_rescaling); + + sub_pdlp->pdhg_solver_.get_cusparse_view().create_spmv_op_plans( + /* is_reflected */ true); + // ---- 6. Build per-peer halo-exchange plans (ported from metis_tests). ---- + // For each peer p, we precompute: + // send_indices_d[p] : local indices to gather (uploaded from host send plan) + // send_buf_d[p] : f_t staging buffer sized to match + // Self-peer slot is present but empty (size 0). Used in engine halo exchange. + auto build_send_plan = [&](auto const& send_per_peer, auto& indices_d, auto& buf_d) { + const std::size_t n_peers = send_per_peer.size(); + indices_d.reserve(n_peers); + buf_d.reserve(n_peers); + for (auto const& send_to_peer : send_per_peer) { + rmm::device_uvector idx(send_to_peer.size(), stream_view); + rmm::device_uvector buf(send_to_peer.size(), stream_view); + if (!send_to_peer.empty()) { + raft::copy(idx.data(), send_to_peer.data(), send_to_peer.size(), stream_view); + } + indices_d.emplace_back(std::move(idx)); + buf_d.emplace_back(std::move(buf)); + } + }; + build_send_plan(rank_data.var_send_per_peer, var_send_indices_d, var_send_buf_d); + build_send_plan(rank_data.cstr_send_per_peer, cstr_send_indices_d, cstr_send_buf_d); + + handle.sync_stream(stream_view); +} + +template struct pdlp_shard_t; +template struct pdlp_shard_t; + +} // namespace cuopt::linear_programming::detail diff --git a/cpp/src/pdlp/distributed_pdlp/shard.hpp b/cpp/src/pdlp/distributed_pdlp/shard.hpp new file mode 100644 index 0000000000..35babc12db --- /dev/null +++ b/cpp/src/pdlp/distributed_pdlp/shard.hpp @@ -0,0 +1,99 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + */ +#pragma once + +#include + +#include +#include +#include + +#include +#include +#include +#include + +#include + +#include +#include +#include + +namespace cuopt::linear_programming::detail { + +// Forward-declare to break the cyclic include with pdlp.cuh +// (pdlp.cuh -> multi_gpu_engine.hpp -> shard.hpp -> pdlp.cuh). +// Definitions of out-of-line members live in shard.cu, which includes pdlp.cuh. +template +class pdlp_solver_t; + +// RAII deleter for ncclComm_t; sets the right device before destroy. +struct nccl_comm_deleter_t { + int device_id{-1}; + void operator()(ncclComm* comm) const noexcept + { + if (comm == nullptr) return; + raft::device_setter guard(device_id); + ncclCommDestroy(comm); + } +}; +using nccl_comm_unique_ptr_t = std::unique_ptr; + +template +struct pdlp_shard_t { + // Out-of-line (in shard.cu) because pdlp_solver_t is incomplete here. + ~pdlp_shard_t(); + + // sub worker for distributed pdlp. Owns its own view on scaled problem and unscaled problem + // Owns necessary multi-gpu data (rank_data, device_id, nccl_comm) + pdlp_shard_t(int device_id, + rank_data_t&& rd, + ncclComm_t raw_comm, + std::vector const& h_global_obj, + std::vector const& h_global_var_lower, + std::vector const& h_global_var_upper, + std::vector const& h_global_cstr_lower, + std::vector const& h_global_cstr_upper, + std::vector const& h_global_obj_scaled, + std::vector const& h_global_var_lower_scaled, + std::vector const& h_global_var_upper_scaled, + std::vector const& h_global_cstr_lower_scaled, + std::vector const& h_global_cstr_upper_scaled, + std::vector const& h_global_cummulative_cstr_scaling, + std::vector const& h_global_cummulative_var_scaling, + f_t h_bound_rescaling, + f_t h_objective_rescaling, + bool maximize, + f_t objective_offset, + f_t objective_scaling_factor, + pdlp_solver_settings_t const& settings); + + pdlp_shard_t(const pdlp_shard_t&) = delete; + pdlp_shard_t& operator=(const pdlp_shard_t&) = delete; + // Move ops are implicitly deleted (user-declared dtor + deleted copy). + // Intentional: shard owns device-affine resources and must never move. + // Store as std::unique_ptr in any container. + + int device_id; + rmm::cuda_stream stream; + raft::handle_t handle; + nccl_comm_unique_ptr_t comm; + rank_data_t rank_data; + std::optional> opt_problem; + std::optional> sub_problem; + std::unique_ptr> sub_pdlp; + + // Per-peer halo-exchange state. Inner index = peer rank. + // Slot for self (peer == this rank) is present but unused (size 0). + // var_send_indices_d[peer] : local indices into primal vector to gather and ncclSend + // var_send_buf_d [peer] : staging buffer for outgoing variable values + // cstr_send_indices_d/cstr_send_buf_d : same, for dual vector + std::vector> var_send_indices_d; + std::vector> var_send_buf_d; + std::vector> cstr_send_indices_d; + std::vector> cstr_send_buf_d; +}; + +} // namespace cuopt::linear_programming::detail diff --git a/cpp/src/pdlp/initial_scaling_strategy/initial_scaling.cu b/cpp/src/pdlp/initial_scaling_strategy/initial_scaling.cu index c79249c45d..cb498b3756 100644 --- a/cpp/src/pdlp/initial_scaling_strategy/initial_scaling.cu +++ b/cpp/src/pdlp/initial_scaling_strategy/initial_scaling.cu @@ -142,6 +142,10 @@ void pdlp_initial_scaling_strategy_t::compute_scaling_vectors( { raft::common::nvtx::range fun_scope("compute_scaling_vectors"); + // Skip scaling entirely for a shape-0 problem (distributed PDLP builds the + // master pdlp_solver_t from a shape-0 placeholder) + if (primal_size_h_ == 0 || dual_size_h_ == 0) return; + if (hyper_params_.do_ruiz_scaling) { ruiz_inf_scaling(number_of_ruiz_iterations); } if (hyper_params_.do_pock_chambolle_scaling) { pock_chambolle_scaling(alpha); } } @@ -213,6 +217,72 @@ __global__ void inf_norm_row_and_col_kernel( } } +template +void pdlp_initial_scaling_strategy_t::ruiz_iter_compute_local_iteration_vectors() +{ + // find inf norm over rows and columns of the scaled matrix in given iteration + i_t number_of_blocks = op_problem_scaled_.n_constraints / block_size; + if (op_problem_scaled_.n_constraints % block_size) number_of_blocks++; + i_t number_of_threads = std::min(op_problem_scaled_.n_variables, (i_t)block_size); + inf_norm_row_and_col_kernel<<>>( + op_problem_scaled_.view(), this->view()); + RAFT_CUDA_TRY(cudaPeekAtLastError()); + + if (running_mip_) { reset_integer_variables(); } +} + +template +void pdlp_initial_scaling_strategy_t::ruiz_iter_apply_cumulative_update() +{ + raft::linalg::binaryOp(cummulative_constraint_matrix_scaling_.data(), + cummulative_constraint_matrix_scaling_.data(), + iteration_constraint_matrix_scaling_.data(), + dual_size_h_, + a_divides_sqrt_b_bounded(), + stream_view_); + + raft::linalg::binaryOp(cummulative_variable_scaling_.data(), + cummulative_variable_scaling_.data(), + iteration_variable_scaling_.data(), + primal_size_h_, + a_divides_sqrt_b_bounded(), + stream_view_); + + // Reset the iteration_scaling vectors to all 0 + RAFT_CUDA_TRY(cudaMemsetAsync( + iteration_constraint_matrix_scaling_.data(), 0.0, sizeof(f_t) * dual_size_h_, stream_view_)); + RAFT_CUDA_TRY(cudaMemsetAsync( + iteration_variable_scaling_.data(), 0.0, sizeof(f_t) * primal_size_h_, stream_view_)); +} + +template +void pdlp_initial_scaling_strategy_t::reset_scaling_state_for_distributed() +{ + if (primal_size_h_ == 0 || dual_size_h_ == 0) return; + + // Re-allocate the iteration vectors the ctor shrank to 0 and zero them. + iteration_constraint_matrix_scaling_.resize(static_cast(dual_size_h_), stream_view_); + iteration_variable_scaling_.resize(static_cast(primal_size_h_), stream_view_); + RAFT_CUDA_TRY(cudaMemsetAsync( + iteration_constraint_matrix_scaling_.data(), 0, sizeof(f_t) * dual_size_h_, stream_view_)); + RAFT_CUDA_TRY(cudaMemsetAsync( + iteration_variable_scaling_.data(), 0, sizeof(f_t) * primal_size_h_, stream_view_)); + + // Reset cumulative scaling + rescaling to identity (the ctor's stray + // Pock-Chambolle pass and shard.cu's set_cummulative_scaling left these in + // an arbitrary state; distributed scaling recomputes from a clean slate). + thrust::fill(handle_ptr_->get_thrust_policy(), + cummulative_constraint_matrix_scaling_.begin(), + cummulative_constraint_matrix_scaling_.end(), + f_t(1)); + thrust::fill(handle_ptr_->get_thrust_policy(), + cummulative_variable_scaling_.begin(), + cummulative_variable_scaling_.end(), + f_t(1)); + set_h_bound_rescaling(f_t(1)); + set_h_objective_rescaling(f_t(1)); +} + template void pdlp_initial_scaling_strategy_t::ruiz_inf_scaling(i_t number_of_ruiz_iterations) { @@ -221,36 +291,8 @@ void pdlp_initial_scaling_strategy_t::ruiz_inf_scaling(i_t number_of_r std::cout << "Doing ruiz_inf_scaling" << std::endl; #endif for (int i = 0; i < number_of_ruiz_iterations; i++) { - // find inf norm over rows and columns of the scaled matrix in given iteration (matrix is not - // actually updated, but the scaled value is computed and evaluated) - i_t number_of_blocks = op_problem_scaled_.n_constraints / block_size; - if (op_problem_scaled_.n_constraints % block_size) number_of_blocks++; - i_t number_of_threads = std::min(op_problem_scaled_.n_variables, (i_t)block_size); - inf_norm_row_and_col_kernel<<>>( - op_problem_scaled_.view(), this->view()); - RAFT_CUDA_TRY(cudaPeekAtLastError()); - - if (running_mip_) { reset_integer_variables(); } - - raft::linalg::binaryOp(cummulative_constraint_matrix_scaling_.data(), - cummulative_constraint_matrix_scaling_.data(), - iteration_constraint_matrix_scaling_.data(), - dual_size_h_, - a_divides_sqrt_b_bounded(), - stream_view_); - - raft::linalg::binaryOp(cummulative_variable_scaling_.data(), - cummulative_variable_scaling_.data(), - iteration_variable_scaling_.data(), - primal_size_h_, - a_divides_sqrt_b_bounded(), - stream_view_); - - // Reset the iteration_scaling vectors to all 0 - RAFT_CUDA_TRY(cudaMemsetAsync( - iteration_constraint_matrix_scaling_.data(), 0.0, sizeof(f_t) * dual_size_h_, stream_view_)); - RAFT_CUDA_TRY(cudaMemsetAsync( - iteration_variable_scaling_.data(), 0.0, sizeof(f_t) * primal_size_h_, stream_view_)); + ruiz_iter_compute_local_iteration_vectors(); + ruiz_iter_apply_cumulative_update(); } } @@ -343,8 +385,12 @@ __global__ void pock_chambolle_scaling_kernel_col( if (threadIdx.x == 0) initial_scaling_view.iteration_variable_scaling[col] = accumulated_value; } +// Local half of one Pock-Chambolle pass: writes the per-row and per-column +// sums-of-powers into iteration_constraint_matrix_scaling_ / +// iteration_variable_scaling_ template -void pdlp_initial_scaling_strategy_t::pock_chambolle_scaling(f_t alpha) +void pdlp_initial_scaling_strategy_t::pock_chambolle_compute_local_iteration_vectors( + f_t alpha) { // Reset the iteration_scaling vectors to all 0 RAFT_CUDA_TRY(cudaMemsetAsync( @@ -379,7 +425,12 @@ void pdlp_initial_scaling_strategy_t::pock_chambolle_scaling(f_t alpha A_T_offsets_.data(), A_T_indices_.data()); RAFT_CUDA_TRY(cudaPeekAtLastError()); +} +// Fold half of one Pock-Chambolle pass: cumulative /= sqrt(iteration). +template +void pdlp_initial_scaling_strategy_t::pock_chambolle_apply_cumulative_update() +{ if (running_mip_) { reset_integer_variables(); } // divide the sqrt of the vectors of the sums from above to the respective scaling vectors @@ -398,6 +449,13 @@ void pdlp_initial_scaling_strategy_t::pock_chambolle_scaling(f_t alpha stream_view_); } +template +void pdlp_initial_scaling_strategy_t::pock_chambolle_scaling(f_t alpha) +{ + pock_chambolle_compute_local_iteration_vectors(alpha); + pock_chambolle_apply_cumulative_update(); +} + template __global__ void scale_problem_kernel( const typename pdlp_initial_scaling_strategy_t::view_t initial_scaling_view, @@ -586,7 +644,8 @@ void pdlp_initial_scaling_strategy_t::scale_problem() cuda::std::multiplies{}, stream_view_); - if (hyper_params_.bound_objective_rescaling && !running_mip_) { + if (hyper_params_.bound_objective_rescaling && !running_mip_ && + !skip_distributed_local_rescaling_) { // Coefficients are computed on the already scaled values bound_objective_rescaling(); @@ -899,6 +958,50 @@ const problem_t& pdlp_initial_scaling_strategy_t::get_scaled return op_problem_scaled_; } +template +void pdlp_initial_scaling_strategy_t::apply_distributed_bound_objective_rescaling( + f_t bound_rescaling, f_t objective_rescaling) +{ + using f_t2 = typename type_2::type; + + // constraint bounds *= bound_rescaling (matches scale_problem() bound block) + cub::DeviceTransform::Transform( + cuda::std::make_tuple(op_problem_scaled_.constraint_lower_bounds.data(), + op_problem_scaled_.constraint_upper_bounds.data()), + thrust::make_zip_iterator(op_problem_scaled_.constraint_lower_bounds.data(), + op_problem_scaled_.constraint_upper_bounds.data()), + op_problem_scaled_.constraint_upper_bounds.size(), + [bound_rescaling] __device__(f_t lower, f_t upper) -> thrust::tuple { + return {lower * bound_rescaling, upper * bound_rescaling}; + }, + stream_view_.value()); + + // variable bounds *= bound_rescaling (batch-1 path only; distributed is batch 1) + cub::DeviceTransform::Transform( + op_problem_scaled_.variable_bounds.data(), + op_problem_scaled_.variable_bounds.data(), + op_problem_scaled_.variable_bounds.size(), + [bound_rescaling] __device__(f_t2 variable_bounds) -> f_t2 { + return {variable_bounds.x * bound_rescaling, variable_bounds.y * bound_rescaling}; + }, + stream_view_); + + // objective *= objective_rescaling + cub::DeviceTransform::Transform( + op_problem_scaled_.objective_coefficients.data(), + op_problem_scaled_.objective_coefficients.data(), + op_problem_scaled_.objective_coefficients.size(), + [objective_rescaling] __device__(f_t c) -> f_t { return c * objective_rescaling; }, + stream_view_); + + // Store the factors (sets both host copies and the device rescaling vectors) + // so unscale_solutions() / scale_solutions() apply them consistently. The flag + // hyper_params_.bound_objective_rescaling stays true on shards so those paths + // are active; only scale_problem()'s local recompute is skipped. + set_h_bound_rescaling(bound_rescaling); + set_h_objective_rescaling(objective_rescaling); +} + template const rmm::device_uvector& pdlp_initial_scaling_strategy_t::get_constraint_matrix_scaling_vector() const @@ -913,6 +1016,46 @@ pdlp_initial_scaling_strategy_t::get_variable_scaling_vector() const return cummulative_variable_scaling_; } +template +void pdlp_initial_scaling_strategy_t::set_cummulative_scaling( + const std::vector& h_cummulative_constraint_matrix_scaling, + const std::vector& h_cummulative_variable_scaling) +{ + cuopt_expects(static_cast(h_cummulative_constraint_matrix_scaling.size()) == dual_size_h_, + error_type_t::ValidationError, + "set_cummulative_scaling: host constraint scaling vector size mismatch"); + cuopt_expects(static_cast(h_cummulative_variable_scaling.size()) == primal_size_h_, + error_type_t::ValidationError, + "set_cummulative_scaling: host variable scaling vector size mismatch"); + + raft::copy(cummulative_constraint_matrix_scaling_.data(), + h_cummulative_constraint_matrix_scaling.data(), + h_cummulative_constraint_matrix_scaling.size(), + stream_view_); + raft::copy(cummulative_variable_scaling_.data(), + h_cummulative_variable_scaling.data(), + h_cummulative_variable_scaling.size(), + stream_view_); +} + +template +void pdlp_initial_scaling_strategy_t::set_h_bound_rescaling(f_t value) +{ + std::fill(h_bound_rescaling_.begin(), h_bound_rescaling_.end(), value); + thrust::fill( + handle_ptr_->get_thrust_policy(), bound_rescaling_.begin(), bound_rescaling_.end(), value); +} + +template +void pdlp_initial_scaling_strategy_t::set_h_objective_rescaling(f_t value) +{ + std::fill(h_objective_rescaling_.begin(), h_objective_rescaling_.end(), value); + thrust::fill(handle_ptr_->get_thrust_policy(), + objective_rescaling_.begin(), + objective_rescaling_.end(), + value); +} + template typename pdlp_initial_scaling_strategy_t::view_t pdlp_initial_scaling_strategy_t::view() diff --git a/cpp/src/pdlp/initial_scaling_strategy/initial_scaling.cuh b/cpp/src/pdlp/initial_scaling_strategy/initial_scaling.cuh index 99d1472b6f..13f639079d 100644 --- a/cpp/src/pdlp/initial_scaling_strategy/initial_scaling.cuh +++ b/cpp/src/pdlp/initial_scaling_strategy/initial_scaling.cuh @@ -75,6 +75,11 @@ class pdlp_initial_scaling_strategy_t { rmm::device_uvector& dual_slack) const; void unscale_solutions(solution_t& solution) const; const rmm::device_uvector& get_constraint_matrix_scaling_vector() const; + // Mutable access needed by distributed PDLP to broadcast owned constraint + rmm::device_uvector& get_cummulative_constraint_matrix_scaling() + { + return cummulative_constraint_matrix_scaling_; + } const rmm::device_uvector& get_variable_scaling_vector() const; const problem_t& get_scaled_op_problem(); @@ -85,8 +90,40 @@ class pdlp_initial_scaling_strategy_t { void swap_context(const thrust::universal_host_pinned_vector>& swap_pairs); void resize_context(i_t new_size); + // Inject scaling state computed by another pdlp_initial_scaling_strategy_t + // Needed by distributed PDLP + void set_cummulative_scaling(const std::vector& h_cummulative_constraint_matrix_scaling, + const std::vector& h_cummulative_variable_scaling); + void set_h_bound_rescaling(f_t value); + void set_h_objective_rescaling(f_t value); + void bound_objective_rescaling(); + // Distributed PDLP: apply an externally-computed GLOBAL bound / objective + // rescaling to the already-scaled problem. + void apply_distributed_bound_objective_rescaling(f_t bound_rescaling, f_t objective_rescaling); + + // Distributed PDLP: skip the LOCAL bound/objective rescaling inside + // scale_problem() + void set_skip_distributed_local_rescaling(bool value) + { + skip_distributed_local_rescaling_ = value; + } + + // Public for distributed PDLP + void compute_scaling_vectors(i_t number_of_ruiz_iterations, f_t alpha); + + // ----- Distributed-PDLP hooks ----- + + void ruiz_iter_compute_local_iteration_vectors(); + void ruiz_iter_apply_cumulative_update(); + void pock_chambolle_compute_local_iteration_vectors(f_t alpha); + void pock_chambolle_apply_cumulative_update(); + rmm::device_uvector& get_iteration_variable_scaling() { return iteration_variable_scaling_; } + + // Restore the clean pre-scaling state for the distributed path. + void reset_scaling_state_for_distributed(); + /** * @brief Gets the device-side view (with raw pointers), for ease of access * inside cuda kernels @@ -94,7 +131,6 @@ class pdlp_initial_scaling_strategy_t { view_t view(); private: - void compute_scaling_vectors(i_t number_of_ruiz_iterations, f_t alpha); void ruiz_inf_scaling(i_t number_of_ruiz_iterations); void pock_chambolle_scaling(f_t alpha); void reset_integer_variables(); @@ -124,5 +160,8 @@ class pdlp_initial_scaling_strategy_t { rmm::device_uvector& A_T_indices_; const pdlp_hyper_params::pdlp_hyper_params_t& hyper_params_; bool running_mip_; + // Distributed PDLP: when true, scale_problem() skips its local + // bound/objective rescaling (the global factor is applied separately). + bool skip_distributed_local_rescaling_{false}; }; } // namespace cuopt::linear_programming::detail diff --git a/cpp/src/pdlp/pdhg.cu b/cpp/src/pdlp/pdhg.cu index e88366a295..b1f1a59ada 100644 --- a/cpp/src/pdlp/pdhg.cu +++ b/cpp/src/pdlp/pdhg.cu @@ -5,6 +5,11 @@ */ /* clang-format on */ #include +// pdlp.cuh defines pdlp_solver_t which the engine's compute_A_x/compute_At_y +// template bodies dereference via shard.sub_pdlp->pdhg_solver_. Must be a +// complete type at the point of template instantiation below. +#include +#include #include #include #include @@ -506,6 +511,15 @@ void pdhg_solver_t::compute_At_y() { // A_t @ y + // Multi-GPU dispatch: when the master pdhg has an engine, drive halo + // exchange + per-shard SpMV via the engine. Shards' pdhg_solver_ have no + // engine pointer set, so their compute_At_y falls through to the cusparse + // path below on each shard's local A_t. + if (mgpu_engine_ != nullptr) { + mgpu_engine_->distributed_compute_At_y(); + return; + } + if (!batch_mode_) { if constexpr (std::is_same_v) { if (cusparse_view_.mixed_precision_enabled_) { @@ -555,6 +569,15 @@ template void pdhg_solver_t::compute_A_x() { // A @ x + + // Multi-GPU dispatch: see compute_At_y. The engine halo-updates the + // reflected_primal vector (the buffer this SpMV reads) and then drives + // per-shard local cusparse SpMV. + if (mgpu_engine_ != nullptr) { + mgpu_engine_->distributed_compute_A_x(); + return; + } + if (!batch_mode_) { if constexpr (std::is_same_v) { if (cusparse_view_.mixed_precision_enabled_) { @@ -600,6 +623,50 @@ void pdhg_solver_t::compute_A_x() } } +template +void pdhg_solver_t::spmv_At_into(rmm::device_uvector& in_buf, + cusparseDnVecDescr_t out_desc) +{ + RAFT_CUSPARSE_TRY(cusparseDnVecSetValues(cusparse_view_.dual_solution, in_buf.data())); + RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsespmv(handle_ptr_->get_cusparse_handle(), + CUSPARSE_OPERATION_NON_TRANSPOSE, + reusable_device_scalar_value_1_.data(), + cusparse_view_.A_T, + cusparse_view_.dual_solution, + reusable_device_scalar_value_0_.data(), + out_desc, + CUSPARSE_SPMV_CSR_ALG2, + (f_t*)cusparse_view_.buffer_transpose.data(), + stream_view_)); + // Restore the canonical binding so subsequent code on this shard that reads + // cv.dual_solution sees the dual_solution_ buffer it was constructed with. + RAFT_CUSPARSE_TRY(cusparseDnVecSetValues(cusparse_view_.dual_solution, + current_saddle_point_state_.get_dual_solution().data())); +} + +template +void pdhg_solver_t::spmv_A_into(rmm::device_uvector& in_buf, + cusparseDnVecDescr_t out_desc) +{ + RAFT_CUSPARSE_TRY( + cusparseDnVecSetValues(cusparse_view_.reflected_primal_solution, in_buf.data())); + RAFT_CUSPARSE_TRY( + raft::sparse::detail::cusparsespmv(handle_ptr_->get_cusparse_handle(), + CUSPARSE_OPERATION_NON_TRANSPOSE, + reusable_device_scalar_value_1_.data(), + cusparse_view_.A, + cusparse_view_.reflected_primal_solution, + reusable_device_scalar_value_0_.data(), + out_desc, + CUSPARSE_SPMV_CSR_ALG2, + (f_t*)cusparse_view_.buffer_non_transpose.data(), + stream_view_)); + // Restore the canonical binding so subsequent code on this shard that reads + // cv.reflected_primal_solution sees the reflected_primal_ buffer. + RAFT_CUSPARSE_TRY( + cusparseDnVecSetValues(cusparse_view_.reflected_primal_solution, reflected_primal_.data())); +} + template void pdhg_solver_t::compute_primal_projection_with_gradient( rmm::device_uvector& primal_step_size) @@ -694,6 +761,26 @@ struct primal_reflected_major_projection { const f_t* scalar_; }; +// Pure cub-transform extract — body byte-identical to the non-batch inline +// path in compute_next_primal_dual_solution_reflected. The platform dispatch +// (single-GPU vs per-shard fan-out) lives at the call site, not here. +// Placed after primal_reflected_major_projection so the functor is visible. +template +void pdhg_solver_t::primal_reflected_major_projection_transform( + rmm::device_uvector& primal_step_size) +{ + cub::DeviceTransform::Transform( + cuda::std::make_tuple(current_saddle_point_state_.get_primal_solution().data(), + problem_ptr->objective_coefficients.data(), + current_saddle_point_state_.get_current_AtY().data(), + problem_ptr->variable_bounds.data()), + thrust::make_zip_iterator( + potential_next_primal_solution_.data(), dual_slack_.data(), reflected_primal_.data()), + primal_size_h_, + primal_reflected_major_projection(primal_step_size.data()), + stream_view_.value()); +} + template struct primal_reflected_major_projection_batch { using f_t2 = typename type_2::type; @@ -722,6 +809,21 @@ struct primal_reflected_projection { const f_t* scalar_; }; +template +void pdhg_solver_t::primal_reflected_projection_transform( + rmm::device_uvector& primal_step_size) +{ + cub::DeviceTransform::Transform( + cuda::std::make_tuple(current_saddle_point_state_.get_primal_solution().data(), + problem_ptr->objective_coefficients.data(), + current_saddle_point_state_.get_current_AtY().data(), + problem_ptr->variable_bounds.data()), + reflected_primal_.data(), + primal_size_h_, + primal_reflected_projection(primal_step_size.data()), + stream_view_.value()); +} + template struct primal_reflected_projection_batch { using f_t2 = typename type_2::type; @@ -751,6 +853,21 @@ struct dual_reflected_major_projection { const f_t* scalar_; }; +template +void pdhg_solver_t::dual_reflected_major_projection_transform( + rmm::device_uvector& dual_step_size) +{ + cub::DeviceTransform::Transform( + cuda::std::make_tuple(current_saddle_point_state_.get_dual_solution().data(), + current_saddle_point_state_.get_dual_gradient().data(), + problem_ptr->constraint_lower_bounds.data(), + problem_ptr->constraint_upper_bounds.data()), + thrust::make_zip_iterator(potential_next_dual_solution_.data(), reflected_dual_.data()), + dual_size_h_, + dual_reflected_major_projection(dual_step_size.data()), + stream_view_.value()); +} + template struct dual_reflected_major_projection_batch { HDI thrust::tuple operator()( @@ -779,6 +896,21 @@ struct dual_reflected_projection { const f_t* scalar_; }; +template +void pdhg_solver_t::dual_reflected_projection_transform( + rmm::device_uvector& dual_step_size) +{ + cub::DeviceTransform::Transform( + cuda::std::make_tuple(current_saddle_point_state_.get_dual_solution().data(), + current_saddle_point_state_.get_dual_gradient().data(), + problem_ptr->constraint_lower_bounds.data(), + problem_ptr->constraint_upper_bounds.data()), + reflected_dual_.data(), + dual_size_h_, + dual_reflected_projection(dual_step_size.data()), + stream_view_.value()); +} + template struct dual_reflected_projection_batch { HDI f_t @@ -1113,22 +1245,30 @@ void pdhg_solver_t::compute_next_primal_dual_solution_reflected( using f_t2 = typename type_2::type; + if (mgpu_engine_ != nullptr) { mgpu_engine_->sync_await_shards(stream_view_); } + // Compute next primal solution reflected. if (should_major) { graph_all.run(should_major, [&]() { + // Multi-GPU: splice shard streams into the capture so their kernels and + // NCCL collectives are recorded into the same graph. Without this, work + // issued on shard.stream from inside this lambda would either invalidate + // the capture or run outside the graph, leaving the captured graph + // empty (or broken) -- which produces the cycling/stall behavior we + // observed on larger problems. Mirrors metis_tests bench.cu fork/join. + if (mgpu_engine_ != nullptr) { mgpu_engine_->graph_capture_fork_to_shards(stream_view_); } + compute_At_y(); - if (!batch_mode_) { - cub::DeviceTransform::Transform( - cuda::std::make_tuple(current_saddle_point_state_.get_primal_solution().data(), - problem_ptr->objective_coefficients.data(), - current_saddle_point_state_.get_current_AtY().data(), - problem_ptr->variable_bounds.data()), - thrust::make_zip_iterator( - potential_next_primal_solution_.data(), dual_slack_.data(), reflected_primal_.data()), - primal_size_h_, - primal_reflected_major_projection(primal_step_size.data()), - stream_view_.value()); + if (mgpu_engine_ != nullptr) { + for (auto& shard : mgpu_engine_->shards) { + raft::device_setter guard(shard->device_id); + auto& sub_pdlp = *shard->sub_pdlp; + sub_pdlp.pdhg_solver_.primal_reflected_major_projection_transform( + sub_pdlp.get_primal_step_size()); + } + } else if (!batch_mode_) { + primal_reflected_major_projection_transform(primal_step_size); } else { cub::DeviceFor::Bulk( potential_next_primal_solution_.size(), @@ -1187,16 +1327,15 @@ void pdhg_solver_t::compute_next_primal_dual_solution_reflected( // Compute next dual compute_A_x(); - if (!batch_mode_) { - cub::DeviceTransform::Transform( - cuda::std::make_tuple(current_saddle_point_state_.get_dual_solution().data(), - current_saddle_point_state_.get_dual_gradient().data(), - problem_ptr->constraint_lower_bounds.data(), - problem_ptr->constraint_upper_bounds.data()), - thrust::make_zip_iterator(potential_next_dual_solution_.data(), reflected_dual_.data()), - dual_size_h_, - dual_reflected_major_projection(dual_step_size.data()), - stream_view_.value()); + if (mgpu_engine_ != nullptr) { + for (auto& shard : mgpu_engine_->shards) { + raft::device_setter guard(shard->device_id); + auto& sub_pdlp = *shard->sub_pdlp; + sub_pdlp.pdhg_solver_.dual_reflected_major_projection_transform( + sub_pdlp.get_dual_step_size()); + } + } else if (!batch_mode_) { + dual_reflected_major_projection_transform(dual_step_size); } else { cub::DeviceFor::Bulk( potential_next_dual_solution_.size(), @@ -1217,10 +1356,17 @@ void pdhg_solver_t::compute_next_primal_dual_solution_reflected( print("potential_next_dual_solution_", potential_next_dual_solution_); print("reflected_dual_", reflected_dual_); #endif + + // Multi-GPU: close the fork by joining every shard stream back into + // the master stream so cudaStreamEndCapture sees a single graph + // spanning all streams. + if (mgpu_engine_ != nullptr) { mgpu_engine_->graph_capture_join_from_shards(stream_view_); } }); } else { graph_all.run(should_major, [&]() { + if (mgpu_engine_ != nullptr) { mgpu_engine_->graph_capture_fork_to_shards(stream_view_); } + // Compute next primal compute_At_y(); @@ -1232,16 +1378,15 @@ void pdhg_solver_t::compute_next_primal_dual_solution_reflected( current_saddle_point_state_.get_current_AtY()); #endif - if (!batch_mode_) { - cub::DeviceTransform::Transform( - cuda::std::make_tuple(current_saddle_point_state_.get_primal_solution().data(), - problem_ptr->objective_coefficients.data(), - current_saddle_point_state_.get_current_AtY().data(), - problem_ptr->variable_bounds.data()), - reflected_primal_.data(), - primal_size_h_, - primal_reflected_projection(primal_step_size.data()), - stream_view_.value()); + if (mgpu_engine_ != nullptr) { + for (auto& shard : mgpu_engine_->shards) { + raft::device_setter guard(shard->device_id); + auto& sub_pdlp = *shard->sub_pdlp; + sub_pdlp.pdhg_solver_.primal_reflected_projection_transform( + sub_pdlp.get_primal_step_size()); + } + } else if (!batch_mode_) { + primal_reflected_projection_transform(primal_step_size); } else { cub::DeviceFor::Bulk( reflected_primal_.size(), @@ -1301,16 +1446,14 @@ void pdhg_solver_t::compute_next_primal_dual_solution_reflected( // Compute next dual compute_A_x(); - if (!batch_mode_) { - cub::DeviceTransform::Transform( - cuda::std::make_tuple(current_saddle_point_state_.get_dual_solution().data(), - current_saddle_point_state_.get_dual_gradient().data(), - problem_ptr->constraint_lower_bounds.data(), - problem_ptr->constraint_upper_bounds.data()), - reflected_dual_.data(), - dual_size_h_, - dual_reflected_projection(dual_step_size.data()), - stream_view_.value()); + if (mgpu_engine_ != nullptr) { + for (auto& shard : mgpu_engine_->shards) { + raft::device_setter guard(shard->device_id); + auto& sub_pdlp = *shard->sub_pdlp; + sub_pdlp.pdhg_solver_.dual_reflected_projection_transform(sub_pdlp.get_dual_step_size()); + } + } else if (!batch_mode_) { + dual_reflected_projection_transform(dual_step_size); } else { cub::DeviceFor::Bulk( reflected_dual_.size(), @@ -1328,8 +1471,13 @@ void pdhg_solver_t::compute_next_primal_dual_solution_reflected( #ifdef CUPDLP_DEBUG_MODE print("reflected_dual_", reflected_dual_); #endif + + if (mgpu_engine_ != nullptr) { mgpu_engine_->graph_capture_join_from_shards(stream_view_); } }); } + + // sync to master stream after the graph is captured + if (mgpu_engine_ != nullptr) { mgpu_engine_->sync_await_master(stream_view_); } } template diff --git a/cpp/src/pdlp/pdhg.hpp b/cpp/src/pdlp/pdhg.hpp index 01eabe967a..e4d16360a7 100644 --- a/cpp/src/pdlp/pdhg.hpp +++ b/cpp/src/pdlp/pdhg.hpp @@ -24,6 +24,12 @@ #include namespace cuopt::linear_programming::detail { + +// Forward-declared to avoid include cycle: multi_gpu_engine.hpp itself includes pdhg.hpp +// (engine calls per-shard pdhg compute_*). pdhg.cu does the full include. +template +struct multi_gpu_engine_t; + template class pdhg_solver_t { public: @@ -76,6 +82,53 @@ class pdhg_solver_t { void update_solution(cusparse_view_t& current_op_problem_evaluation_cusparse_view_); void refine_initial_primal_projection(const rmm::device_uvector& bound_rescaling); + // SpMV primitives. Public so the multi-GPU engine can drive them per-shard + // after halo-exchanging the relevant vector. Single-GPU PDLP still calls + // them internally via take_step / compute_next_*. + // + // If set_multi_gpu_engine() has been called, these dispatch to the engine + // (halo exchange + per-shard SpMV). Otherwise they run the single-GPU + // cusparse path on the local matrix. + void compute_At_y(); + void compute_A_x(); + void spmvop_At_y(); + void spmvop_A_x(); + + // Parameterized SpMVs used by the multi-GPU engine. + // Both temporarily hijack a canonical input descriptor in cusparse_view_ + // (cv.dual_solution for At, cv.reflected_primal_solution for A) to point at + // `in_buf.data()`, run the local SpMV into `out_desc`, then restore the + // descriptor to its original buffer so other code on this shard is unaffected. + // No multi-GPU dispatch inside — the engine is the orchestrator. + void spmv_At_into(rmm::device_uvector& in_buf, cusparseDnVecDescr_t out_desc); + void spmv_A_into(rmm::device_uvector& in_buf, cusparseDnVecDescr_t out_desc); + + // Pure cub-transform extractions. Each one is byte-identical to the inline + // cub call it replaces — no platform dispatch inside. Callers handle the + // single-GPU vs per-shard branching at the call site (see the + // "if (mgpu_engine_) for shard..." blocks in compute_next_*). + void primal_reflected_major_projection_transform(rmm::device_uvector& primal_step_size); + void dual_reflected_major_projection_transform(rmm::device_uvector& dual_step_size); + void primal_reflected_projection_transform(rmm::device_uvector& primal_step_size); + void dual_reflected_projection_transform(rmm::device_uvector& dual_step_size); + + // Master PDLP wires up the engine pointer here after the engine is built. + // Shards' pdhg_solver_ leaves this null so each shard runs single-GPU SpMV + // on its local matrix. Also flips is_multi_gpu_ — convenience flag that any + // pdhg participating in a distributed run (master OR shard) carries true. + void set_multi_gpu_engine(multi_gpu_engine_t* engine) + { + mgpu_engine_ = engine; + is_multi_gpu_ = (engine != nullptr); + } + + // Mark a shard's pdhg_solver_ as part of a distributed run without giving it + // an engine (shards don't orchestrate; they only run local SpMV on owned + // rows). Called from shard.cu right after sub_pdlp is constructed. + void set_is_multi_gpu(bool v) { is_multi_gpu_ = v; } + bool is_multi_gpu() const { return is_multi_gpu_; } + multi_gpu_engine_t* get_mgpu_engine() const { return mgpu_engine_; } + i_t total_pdhg_iterations_; private: @@ -93,12 +146,9 @@ class pdhg_solver_t { void compute_primal_projection_with_gradient(rmm::device_uvector& primal_step_size); void compute_primal_projection(rmm::device_uvector& primal_step_size); - void compute_At_y(); - void compute_A_x(); - void spmvop_At_y(); - void spmvop_A_x(); bool batch_mode_{false}; + bool is_multi_gpu_{false}; raft::handle_t const* handle_ptr_{nullptr}; rmm::cuda_stream_view stream_view_; @@ -150,6 +200,10 @@ class pdhg_solver_t { rmm::device_uvector new_bounds_lower_; rmm::device_uvector new_bounds_upper_; cuda::fast_mod_div batch_size_divisor_; + + // Non-owning. Set on the master pdhg_solver_ in distributed mode; null + // (default) means single-GPU path. See compute_At_y / compute_A_x. + multi_gpu_engine_t* mgpu_engine_{nullptr}; }; } // namespace cuopt::linear_programming::detail diff --git a/cpp/src/pdlp/pdlp.cu b/cpp/src/pdlp/pdlp.cu index dadc54cc08..71c6b0a48c 100644 --- a/cpp/src/pdlp/pdlp.cu +++ b/cpp/src/pdlp/pdlp.cu @@ -11,12 +11,16 @@ #include #include +#include +#include #include #include #include +#include #include #include "cuopt/linear_programming/pdlp/solver_solution.hpp" +#include "distributed_pdlp/multi_gpu_engine.hpp" #include #include @@ -40,9 +44,11 @@ #include #include +#include #include #include #include +#include #include namespace cuopt::linear_programming::detail { @@ -371,6 +377,376 @@ pdlp_solver_t::pdlp_solver_t(problem_t& op_problem, } } +// ============================================================================ +// Distributed multi-GPU ctor. +// needs placeholder_problem to be a shape-0 problem +// reads the problem from mps_data_model directly +// builds internal attributes from the placeholder_problem +// builds the engine from the mps_data_model +template +pdlp_solver_t::pdlp_solver_t( + problem_t& placeholder_problem, + cuopt::linear_programming::io::mps_data_model_t const& mps, + pdlp_solver_settings_t const& settings) + // Makes all inner feilds of master 0 size + : pdlp_solver_t(placeholder_problem, settings, /*is_legacy_batch_mode=*/false) +{ + cuopt_expects(placeholder_problem.n_variables == 0 && placeholder_problem.n_constraints == 0 && + placeholder_problem.nnz == 0, + error_type_t::ValidationError, + "Distributed mGPU pdlp_solver_t ctor requires a shape-0 " + "placeholder problem (n_variables == n_constraints == nnz == 0)"); + const int distributed_pdlp_num_gpus = settings.distributed_pdlp_num_gpus; + CUOPT_LOG_INFO("Solving with distributed PDLP on %d GPU (mps direct path)", + distributed_pdlp_num_gpus); + + if constexpr (!std::is_same_v) { + cuopt_expects( + false, error_type_t::ValidationError, "Distributed PDLP currently requires double precision"); + return; + } + // ----- 1. Read problem shape and bulk data directly from mps (host) ----- + const i_t n_vars = static_cast(mps.get_objective_coefficients().size()); + const i_t n_cstr = static_cast(mps.get_constraint_lower_bounds().size()); + const i_t nnz = static_cast(mps.get_constraint_matrix_values().size()); + cuopt_expects(n_vars > 0, + error_type_t::ValidationError, + "Distributed PDLP from mps requires a non-empty objective"); + cuopt_expects(n_cstr > 0, + error_type_t::ValidationError, + "Distributed PDLP from mps requires at least one constraint"); + cuopt_expects(static_cast(mps.get_constraint_matrix_offsets().size()) == n_cstr + 1, + error_type_t::ValidationError, + "mps constraint_matrix_offsets size must equal n_constraints + 1"); + cuopt_expects( + static_cast(mps.get_constraint_matrix_indices().size()) == nnz, + error_type_t::ValidationError, + "mps constraint_matrix_indices size must equal nnz (constraint_matrix_values size)"); + cuopt_expects(static_cast(mps.get_constraint_upper_bounds().size()) == n_cstr, + error_type_t::ValidationError, + "mps constraint_upper_bounds size must equal n_constraints"); + cuopt_expects(static_cast(mps.get_variable_lower_bounds().size()) == n_vars, + error_type_t::ValidationError, + "mps variable_lower_bounds size must equal n_variables"); + cuopt_expects(static_cast(mps.get_variable_upper_bounds().size()) == n_vars, + error_type_t::ValidationError, + "mps variable_upper_bounds size must equal n_variables"); + + const bool maximize = mps.get_sense(); + f_t objective_offset = mps.get_objective_offset(); + f_t objective_scaling_factor = mps.get_objective_scaling_factor(); + + // Objective: copy (mutable so we can negate for maximize, matching + // problem_helpers.cuh::convert_to_maximization_problem). + std::vector h_obj = mps.get_objective_coefficients(); + if (maximize) { + for (auto& v : h_obj) + v = -v; + objective_offset = -objective_offset; + objective_scaling_factor = -objective_scaling_factor; + } + + // Bounds (copy from mps; engine ctor takes by const ref to std::vector). + std::vector h_var_lower = mps.get_variable_lower_bounds(); + std::vector h_var_upper = mps.get_variable_upper_bounds(); + std::vector h_cstr_lower = mps.get_constraint_lower_bounds(); + std::vector h_cstr_upper = mps.get_constraint_upper_bounds(); + + // A (CSR) — mutable copies for the engine + partitioner consumers below. + std::vector h_A_row_offsets = mps.get_constraint_matrix_offsets(); + std::vector h_A_col_indices = mps.get_constraint_matrix_indices(); + std::vector h_A_values = mps.get_constraint_matrix_values(); + + // ----- 2. Transpose A -> A^T on the host (one-shot CSR transpose) ----- + // CSC(A) and CSR(A^T) share the same memory layout, so the CSC produced + // by dual_simplex::csr_matrix_t::to_compressed_col IS the CSR of A^T. + // O(nnz + n_vars) counting sort, same as problem_t::compute_transpose. + namespace ds = cuopt::linear_programming::dual_simplex; + ds::csr_matrix_t A_csr(n_cstr, n_vars, nnz); + A_csr.row_start = h_A_row_offsets; + A_csr.j = h_A_col_indices; + A_csr.x = h_A_values; + ds::csc_matrix_t AT_as_csc(n_vars, n_cstr, nnz); + A_csr.to_compressed_col(AT_as_csc); + std::vector h_A_t_row_offsets = std::move(AT_as_csc.col_start); + std::vector h_A_t_col_indices = std::move(AT_as_csc.i); + std::vector h_A_t_values = std::move(AT_as_csc.x); + + // ----- 3. Identity scaling for V1 ----- + // Real multi-GPU scaling is a TODO; ship the unscaled problem to shards as + // both "unscaled" and "scaled" so the engine and per-shard pdlp_solver_t + // can run end-to-end. Scaling factor vectors are 1.0 everywhere so the + // shard-side unscale at the end is a no-op. + std::vector h_A_values_scaled = h_A_values; + std::vector h_A_t_values_scaled = h_A_t_values; + std::vector h_obj_scaled = h_obj; + std::vector h_var_lower_scaled = h_var_lower; + std::vector h_var_upper_scaled = h_var_upper; + std::vector h_cstr_lower_scaled = h_cstr_lower; + std::vector h_cstr_upper_scaled = h_cstr_upper; + std::vector h_cummulative_cstr_scaling(n_cstr, f_t(1.0)); + std::vector h_cummulative_var_scaling(n_vars, f_t(1.0)); + const f_t h_bound_rescaling = f_t(1.0); + const f_t h_objective_rescaling = f_t(1.0); + + // ----- 4. Partition ----- + std::vector parts; + if (!settings.multi_gpu_partition_file.empty()) { + parts = partition_loader_t::parse_distributed_pdlp_partition_file( + settings.multi_gpu_partition_file); + validate_partition(parts, n_cstr, n_vars, distributed_pdlp_num_gpus, "partition file"); + } else { + partitioner_input_t partition_input; + partition_input.nb_cstr = n_cstr; + partition_input.nb_vars = n_vars; + partition_input.nb_parts = distributed_pdlp_num_gpus; + + // Resolve which partitioner to use. + std::string partitioner_choice = settings.distributed_pdlp_partitioner; + std::transform(partitioner_choice.begin(), + partitioner_choice.end(), + partitioner_choice.begin(), + [](unsigned char c) { return std::tolower(c); }); + partitioner_kind_t kind; + if (partitioner_choice.empty() || partitioner_choice == "auto") { + kind = + (distributed_pdlp_num_gpus == 1) ? partitioner_kind_t::Dummy : partitioner_kind_t::KaMinPar; + } else if (partitioner_choice == "dummy") { + kind = partitioner_kind_t::Dummy; + } else if (partitioner_choice == "metis") { + kind = partitioner_kind_t::Metis; + } else if (partitioner_choice == "kaminpar") { + kind = partitioner_kind_t::KaMinPar; + } else { + cuopt_expects( + false, + error_type_t::ValidationError, + "Unknown distributed_pdlp_partitioner '%s' (expected auto|dummy|metis|kaminpar)", + settings.distributed_pdlp_partitioner.c_str()); + kind = partitioner_kind_t::Dummy; // unreachable; silences -Wmaybe-uninitialized + } + const bool needs_graph = + (kind == partitioner_kind_t::Metis || kind == partitioner_kind_t::KaMinPar); + if (needs_graph) { + // partitioner_input_t holds non-const std::vector* pointers; we + // already have the data in our local mutable buffers above. + partition_input.A.row_offsets = &h_A_row_offsets; + partition_input.A.col_indices = &h_A_col_indices; + partition_input.A.num_rows = n_cstr; + partition_input.A.num_cols = n_vars; + partition_input.A_t.row_offsets = &h_A_t_row_offsets; + partition_input.A_t.col_indices = &h_A_t_col_indices; + partition_input.A_t.num_rows = n_vars; + partition_input.A_t.num_cols = n_cstr; + // 0 => KaMinPar auto-detects and uses all hardware threads (ignored by METIS). + partition_input.nb_threads = 0; + } + const char* kind_name = (kind == partitioner_kind_t::Dummy) ? "dummy" + : (kind == partitioner_kind_t::Metis) ? "metis" + : (kind == partitioner_kind_t::KaMinPar) ? "kaminpar" + : "unknown"; + CUOPT_LOG_INFO( + "Partitioning %d constraints + %d variables into %d part(s) using the %s " + "partitioner", + n_cstr, + n_vars, + distributed_pdlp_num_gpus, + kind_name); + auto partitioner = make_partitioner(kind); + parts = partitioner->partition(partition_input); + } + + // Optionally dump the partition right after computing it (one part-id per line). + if (!settings.multi_gpu_export_partition_file.empty()) { + partition_loader_t::export_distributed_pdlp_partition_file( + settings.multi_gpu_export_partition_file, parts); + std::cout << "Exported " << parts.size() << " part-ids to " + << settings.multi_gpu_export_partition_file << std::endl; + } + + // ----- 5. Build per-rank data ----- + std::vector> sub_pdlp_rank_data = + partition_loader_t::create_rank_data_from_parts(parts, + h_A_row_offsets, + h_A_col_indices, + h_A_values, + h_A_values_scaled, + h_A_t_row_offsets, + h_A_t_col_indices, + h_A_t_values, + h_A_t_values_scaled, + settings.distributed_pdlp_num_gpus, + n_cstr, + n_vars, + nnz); + + // ----- 6. Per-shard settings ----- + pdlp_solver_settings_t sub_pdlp_settings = settings; + sub_pdlp_settings.num_gpus = 1; + sub_pdlp_settings.distributed_pdlp_num_gpus = 1; + sub_pdlp_settings.multi_gpu_partition_file = ""; + sub_pdlp_settings.multi_gpu_export_partition_file = ""; + sub_pdlp_settings.is_distributed_sub_pdlp = true; + sub_pdlp_settings.hyper_params.default_l_inf_ruiz_iterations = 0; + sub_pdlp_settings.hyper_params.default_alpha_pock_chambolle_rescaling = 0.0; + + // ----- 7. Construct the engine: NCCL comms + per-shard pdlp_solver_t ----- + multi_gpu_engine.emplace(std::move(sub_pdlp_rank_data), + h_obj, + h_var_lower, + h_var_upper, + h_cstr_lower, + h_cstr_upper, + h_obj_scaled, + h_var_lower_scaled, + h_var_upper_scaled, + h_cstr_lower_scaled, + h_cstr_upper_scaled, + h_cummulative_cstr_scaling, + h_cummulative_var_scaling, + h_bound_rescaling, + h_objective_rescaling, + maximize, + objective_offset, + objective_scaling_factor, + sub_pdlp_settings); + + // ----- 8 Distributed Scaling ----- + for (auto& shard : multi_gpu_engine->shards) { + raft::device_setter guard(shard->device_id); + shard->sub_pdlp->get_initial_scaling_strategy().reset_scaling_state_for_distributed(); + } + for (auto& shard : multi_gpu_engine->shards) { + raft::device_setter guard(shard->device_id); + shard->stream.synchronize(); + } + + // Distributed scaling + if (settings_.hyper_params.do_ruiz_scaling) { + multi_gpu_engine->distributed_ruiz_inf_scaling( + settings_.hyper_params.default_l_inf_ruiz_iterations, n_vars); + } + // push local scaling to halo + multi_gpu_engine->broadcast_constraint_scaling_to_halo(); + if (settings_.hyper_params.do_pock_chambolle_scaling) { + multi_gpu_engine->distributed_pock_chambolle_scaling( + static_cast(settings_.hyper_params.default_alpha_pock_chambolle_rescaling), n_vars); + } + // Refresh the halo constraint scaling after Pock-Chambolle + multi_gpu_engine->broadcast_constraint_scaling_to_halo(); + + for (auto& shard : multi_gpu_engine->shards) { + raft::device_setter guard(shard->device_id); + auto& scaling = shard->sub_pdlp->get_initial_scaling_strategy(); + // Skip the per-shard local bound/objective rescaling; the global factor is + // applied below. Keeps the unscale path active (flag stays true). + scaling.set_skip_distributed_local_rescaling(true); + scaling.scale_problem(); + + shard->sub_pdlp->pdhg_solver_.get_cusparse_view().create_spmv_op_plans( + /*is_reflected=*/settings_.hyper_params.use_reflected_primal_dual); + } + for (auto& shard : multi_gpu_engine->shards) { + raft::device_setter guard(shard->device_id); + shard->stream.synchronize(); + } + + // Global bound/objective rescaling: allreduce the owned partial squared-norms + if (settings_.hyper_params.bound_objective_rescaling && !inside_mip_) { + multi_gpu_engine->distributed_bound_objective_rescaling( + static_cast(settings_.hyper_params.initial_primal_weight_c_scaling)); + } + + // ----- 8b. Seed initial step-size / primal-weight (distributed, scales to N shards) ----- + constexpr f_t kStepSizeScale = f_t{0.998}; + const f_t sigma_max = multi_gpu_engine->distributed_max_singular_value(n_cstr); + const f_t h_primal_weight = f_t{1}; + const f_t h_step_size = (sigma_max > f_t{0}) ? kStepSizeScale / sigma_max : f_t{1}; + // With primal_weight = 1 the adaptive step-size strategy collapses to + // primal_step_size = step_size / primal_weight = step_size + // dual_step_size = step_size * primal_weight = step_size. + const f_t h_primal_step_size = h_step_size; + const f_t h_dual_step_size = h_step_size; + + // Put the values on master + raft::copy(step_size_.data(), &h_step_size, 1, stream_view_); + raft::copy(primal_weight_.data(), &h_primal_weight, 1, stream_view_); + raft::copy(best_primal_weight_.data(), &h_primal_weight, 1, stream_view_); + raft::copy(primal_step_size_.data(), &h_primal_step_size, 1, stream_view_); + raft::copy(dual_step_size_.data(), &h_dual_step_size, 1, stream_view_); + handle_ptr_->sync_stream(stream_view_); + + // put the values on each shard + for (auto& shard : multi_gpu_engine->shards) { + raft::device_setter guard(shard->device_id); + auto& sub = *shard->sub_pdlp; + raft::copy(sub.step_size_.data(), &h_step_size, 1, shard->stream); + raft::copy(sub.primal_weight_.data(), &h_primal_weight, 1, shard->stream); + raft::copy(sub.best_primal_weight_.data(), &h_primal_weight, 1, shard->stream); + raft::copy(sub.get_primal_step_size().data(), &h_primal_step_size, 1, shard->stream); + raft::copy(sub.get_dual_step_size().data(), &h_dual_step_size, 1, shard->stream); + } + for (auto& shard : multi_gpu_engine->shards) { + raft::device_setter guard(shard->device_id); + shard->stream.synchronize(); + } + + // Wire the engine into master's pdhg_solver_; shards keep mgpu_engine_ == nullptr. + pdhg_solver_.set_multi_gpu_engine(&*multi_gpu_engine); + + // ----- 9. Resize master gather destinations to the full problem size ----- + pdhg_solver_.get_potential_next_primal_solution().resize(n_vars, stream_view_); + pdhg_solver_.get_potential_next_dual_solution().resize(n_cstr, stream_view_); + current_termination_strategy_.get_convergence_information().get_reduced_cost().resize( + n_vars, stream_view_); + primal_size_h_ = n_vars; + dual_size_h_ = n_cstr; + + // Distributed conergence_information::init_l2_norms + for (auto& shard : multi_gpu_engine->shards) { + raft::device_setter guard(shard->device_id); + shard->sub_pdlp->get_current_termination_strategy() + .get_convergence_information() + .compute_owned_reference_norm_partials(shard->rank_data.owned_var_size, + shard->rank_data.owned_cstr_size); + } + multi_gpu_engine->allreduce_sum_inplace([](pdlp_solver_t& sp) -> f_t* { + return sp.get_current_termination_strategy() + .get_convergence_information() + .l2_norm_primal_right_hand_side_data(); + }); + multi_gpu_engine->allreduce_sum_inplace([](pdlp_solver_t& sp) -> f_t* { + return sp.get_current_termination_strategy() + .get_convergence_information() + .l2_norm_primal_linear_objective_data(); + }); + for (auto& shard : multi_gpu_engine->shards) { + raft::device_setter guard(shard->device_id); + shard->sub_pdlp->get_current_termination_strategy() + .get_convergence_information() + .sqrt_reference_norms_inplace(); + shard->stream.synchronize(); + } + // Broadcast the values to the master + { + auto& s0 = *multi_gpu_engine->shards[0]; + auto& s0_conv = s0.sub_pdlp->get_current_termination_strategy().get_convergence_information(); + raft::device_setter guard(s0.device_id); + for (auto* ts : {¤t_termination_strategy_, &average_termination_strategy_}) { + auto& ci = ts->get_convergence_information(); + raft::copy(ci.l2_norm_primal_right_hand_side_data(), + s0_conv.l2_norm_primal_right_hand_side_data(), + 1, + stream_view_); + raft::copy(ci.l2_norm_primal_linear_objective_data(), + s0_conv.l2_norm_primal_linear_objective_data(), + 1, + stream_view_); + } + } + handle_ptr_->sync_stream(stream_view_); +} + template void pdlp_solver_t::set_initial_primal_weight(f_t initial_primal_weight) { @@ -442,6 +818,11 @@ std::optional> pdlp_solver_t RAFT_CUDA_TRY(cudaDeviceSynchronize()); std::cout << "Time Limit reached, returning current solution" << std::endl; #endif + if (auto* engine = pdhg_solver_.get_mgpu_engine()) { + engine->gather_potential_next_solutions_to_master( + pdhg_solver_, + current_termination_strategy_.get_convergence_information().get_reduced_cost()); + } return current_termination_strategy_.fill_return_problem_solution( internal_solver_iterations_, pdhg_solver_, @@ -475,6 +856,11 @@ std::optional> pdlp_solver_t return finalize_batch_return_with_limit_reached(pdlp_termination_status_t::IterationLimit); } + if (auto* engine = pdhg_solver_.get_mgpu_engine()) { + engine->gather_potential_next_solutions_to_master( + pdhg_solver_, + current_termination_strategy_.get_convergence_information().get_reduced_cost()); + } return current_termination_strategy_.fill_return_problem_solution( internal_solver_iterations_, pdhg_solver_, @@ -1152,6 +1538,11 @@ std::optional> pdlp_solver_t #endif print_final_termination_criteria( timer, current_termination_strategy_.get_convergence_information(), termination_current); + if (auto* engine = pdhg_solver_.get_mgpu_engine()) { + engine->gather_potential_next_solutions_to_master( + pdhg_solver_, + current_termination_strategy_.get_convergence_information().get_reduced_cost()); + } return current_termination_strategy_.fill_return_problem_solution( internal_solver_iterations_, pdhg_solver_, @@ -2008,34 +2399,112 @@ void pdlp_solver_t::compute_fixed_error(std::vector& has_restarte // Computing the deltas // TODO batch mdoe: this only works if everyone restarts - cub::DeviceTransform::Transform(cuda::std::make_tuple(pdhg_solver_.get_reflected_primal().data(), - pdhg_solver_.get_primal_solution().data()), - pdhg_solver_.get_saddle_point_state().get_delta_primal().data(), - pdhg_solver_.get_primal_solution().size(), - cuda::std::minus{}, - stream_view_.value()); - cub::DeviceTransform::Transform(cuda::std::make_tuple(pdhg_solver_.get_reflected_dual().data(), - pdhg_solver_.get_dual_solution().data()), - pdhg_solver_.get_saddle_point_state().get_delta_dual().data(), - pdhg_solver_.get_dual_solution().size(), - cuda::std::minus{}, - stream_view_.value()); + if (multi_gpu_engine) { + // Go faire une fonction compute_delta_primal, compute_delta primal ? + for (auto& shard : multi_gpu_engine->shards) { + raft::device_setter guard(shard->device_id); + auto& sub_pdhg = shard->sub_pdlp->pdhg_solver_; + cub::DeviceTransform::Transform(cuda::std::make_tuple(sub_pdhg.get_reflected_primal().data(), + sub_pdhg.get_primal_solution().data()), + sub_pdhg.get_saddle_point_state().get_delta_primal().data(), + sub_pdhg.get_primal_solution().size(), + cuda::std::minus{}, + shard->stream.view()); + cub::DeviceTransform::Transform(cuda::std::make_tuple(sub_pdhg.get_reflected_dual().data(), + sub_pdhg.get_dual_solution().data()), + sub_pdhg.get_saddle_point_state().get_delta_dual().data(), + sub_pdhg.get_dual_solution().size(), + cuda::std::minus{}, + shard->stream.view()); + } + } else { + cub::DeviceTransform::Transform( + cuda::std::make_tuple(pdhg_solver_.get_reflected_primal().data(), + pdhg_solver_.get_primal_solution().data()), + pdhg_solver_.get_saddle_point_state().get_delta_primal().data(), + pdhg_solver_.get_primal_solution().size(), + cuda::std::minus{}, + stream_view_.value()); + cub::DeviceTransform::Transform(cuda::std::make_tuple(pdhg_solver_.get_reflected_dual().data(), + pdhg_solver_.get_dual_solution().data()), + pdhg_solver_.get_saddle_point_state().get_delta_dual().data(), + pdhg_solver_.get_dual_solution().size(), + cuda::std::minus{}, + stream_view_.value()); + } auto& cusparse_view = pdhg_solver_.get_cusparse_view(); - // Sync to make sure all previous cuSparse operations are finished before setting the - // potential_next_dual_solution - RAFT_CUDA_TRY(cudaStreamSynchronize(stream_view_)); - // Make potential_next_dual_solution point towards reflected dual solution to reuse the code - RAFT_CUSPARSE_TRY(cusparseDnVecSetValues(cusparse_view.potential_next_dual_solution, - (void*)pdhg_solver_.get_reflected_dual().data())); + if (multi_gpu_engine) { + // SpMV is the first operation in compute_interaction_and_movement so we can do halo before and + // call it naturally we then reduce the local dot products + multi_gpu_engine->halo_exchange_cstr( + [](auto& pdhg) -> rmm::device_uvector& { return pdhg.get_reflected_dual(); }); - if (batch_mode_) - RAFT_CUSPARSE_TRY(cusparseDnMatSetValues(cusparse_view.batch_potential_next_dual_solution, + for (auto& shard : multi_gpu_engine->shards) { + raft::device_setter guard(shard->device_id); + auto& sub_pdlp = *shard->sub_pdlp; + auto& sub_cv = sub_pdlp.pdhg_solver_.get_cusparse_view(); + + RAFT_CUSPARSE_TRY( + cusparseDnVecSetValues(sub_cv.potential_next_dual_solution, + (void*)sub_pdlp.pdhg_solver_.get_reflected_dual().data())); + + // Ensure norm is on owned size + sub_pdlp.step_size_strategy_.compute_interaction_and_movement( + sub_pdlp.pdhg_solver_.get_primal_tmp_resource(), + sub_cv, + sub_pdlp.pdhg_solver_.get_saddle_point_state(), + shard->rank_data.owned_var_size, + shard->rank_data.owned_cstr_size); + + RAFT_CUSPARSE_TRY(cusparseDnVecSetValues( + sub_cv.potential_next_dual_solution, + (void*)sub_pdlp.pdhg_solver_.get_potential_next_dual_solution().data())); + } + + multi_gpu_engine->allreduce_sum_inplace( + [](auto& sp) -> f_t* { return sp.step_size_strategy_.get_interaction().data(); }); + multi_gpu_engine->allreduce_sum_inplace([](auto& sp) -> f_t* { + return sp.step_size_strategy_.get_norm_squared_delta_primal().data(); + }); + multi_gpu_engine->allreduce_sum_inplace( + [](auto& sp) -> f_t* { return sp.step_size_strategy_.get_norm_squared_delta_dual().data(); }); + + auto& s0 = *multi_gpu_engine->shards[0]; + { + raft::device_setter guard(s0.device_id); + RAFT_CUDA_TRY(cudaStreamSynchronize(s0.stream.view().value())); + } + auto& src_sp = s0.sub_pdlp->step_size_strategy_; + raft::copy(step_size_strategy_.get_interaction().data(), + src_sp.get_interaction().data(), + 1, + stream_view_); + raft::copy(step_size_strategy_.get_norm_squared_delta_primal().data(), + src_sp.get_norm_squared_delta_primal().data(), + 1, + stream_view_); + raft::copy(step_size_strategy_.get_norm_squared_delta_dual().data(), + src_sp.get_norm_squared_delta_dual().data(), + 1, + stream_view_); + } else { + // Sync to make sure all previous cuSparse operations are finished before setting the + // potential_next_dual_solution + RAFT_CUDA_TRY(cudaStreamSynchronize(stream_view_)); + + // Make potential_next_dual_solution point towards reflected dual solution to reuse the code + RAFT_CUSPARSE_TRY(cusparseDnVecSetValues(cusparse_view.potential_next_dual_solution, (void*)pdhg_solver_.get_reflected_dual().data())); - step_size_strategy_.compute_interaction_and_movement( - pdhg_solver_.get_primal_tmp_resource(), cusparse_view, pdhg_solver_.get_saddle_point_state()); + if (batch_mode_) + RAFT_CUSPARSE_TRY(cusparseDnMatSetValues(cusparse_view.batch_potential_next_dual_solution, + (void*)pdhg_solver_.get_reflected_dual().data())); + + step_size_strategy_.compute_interaction_and_movement( + pdhg_solver_.get_primal_tmp_resource(), cusparse_view, pdhg_solver_.get_saddle_point_state()); + } if (batch_mode_) { const auto [grid_size, block_size] = kernel_config_from_batch_size(climber_strategies_.size()); @@ -2066,11 +2535,12 @@ void pdlp_solver_t::compute_fixed_error(std::vector& has_restarte // potential_next_dual_solution RAFT_CUDA_TRY(cudaStreamSynchronize(stream_view_)); - // Put back - RAFT_CUSPARSE_TRY( - cusparseDnVecSetValues(cusparse_view.potential_next_dual_solution, - (void*)pdhg_solver_.get_potential_next_dual_solution().data())); - + // Put back, already done in multi-gpu side + if (!multi_gpu_engine) { + RAFT_CUSPARSE_TRY( + cusparseDnVecSetValues(cusparse_view.potential_next_dual_solution, + (void*)pdhg_solver_.get_potential_next_dual_solution().data())); + } if (batch_mode_) { RAFT_CUSPARSE_TRY( cusparseDnMatSetValues(cusparse_view.batch_potential_next_dual_solution, @@ -2309,245 +2779,250 @@ optimization_problem_solution_t pdlp_solver_t::run_solver(co std::cout << "Starting PDLP loop:" << std::endl; #endif - // TODO handle that properly - if (settings_.hyper_params.compute_initial_step_size_before_scaling && - !settings_.get_initial_step_size().has_value()) - compute_initial_step_size(); - if (settings_.hyper_params.compute_initial_primal_weight_before_scaling && - !settings_.get_initial_primal_weight().has_value()) - compute_initial_primal_weight(); - - initial_scaling_strategy_.scale_problem(); - if constexpr (std::is_same_v) { - if (!batch_mode_ && !pdhg_solver_.get_cusparse_view().mixed_precision_enabled_) { - pdhg_solver_.get_cusparse_view().create_spmv_op_plans( - settings_.hyper_params.use_reflected_primal_dual); - } - } - - // Update FP32 matrix copies for mixed precision SpMV after scaling - pdhg_solver_.get_cusparse_view().update_mixed_precision_matrices(); + bool warm_start_was_given = settings_.get_pdlp_warm_start_data().is_populated(); - // Redirect cuSPARSE descriptors to use the original problem's structural data (offsets, indices), - // then free the duplicated structural vectors from the scaled copy to save device memory. - pdhg_solver_.get_cusparse_view().redirect_cusparse_csr_structure_pointers(*problem_ptr); - op_problem_scaled_.variables.resize(0, stream_view_); - op_problem_scaled_.offsets.resize(0, stream_view_); - op_problem_scaled_.reverse_constraints.resize(0, stream_view_); - op_problem_scaled_.reverse_offsets.resize(0, stream_view_); + // In distributed mode, skip all setup, it is done before + if (!settings_.hyper_params.use_distributed_pdlp) { + // TODO handle that properly + if (settings_.hyper_params.compute_initial_step_size_before_scaling && + !settings_.get_initial_step_size().has_value()) + compute_initial_step_size(); + if (settings_.hyper_params.compute_initial_primal_weight_before_scaling && + !settings_.get_initial_primal_weight().has_value()) + compute_initial_primal_weight(); + + initial_scaling_strategy_.scale_problem(); + if constexpr (std::is_same_v) { + if (!batch_mode_ && !pdhg_solver_.get_cusparse_view().mixed_precision_enabled_) { + pdhg_solver_.get_cusparse_view().create_spmv_op_plans( + settings_.hyper_params.use_reflected_primal_dual); + } + } - if (!settings_.hyper_params.compute_initial_step_size_before_scaling && - !settings_.get_initial_step_size().has_value()) - compute_initial_step_size(); - if (!settings_.hyper_params.compute_initial_primal_weight_before_scaling && - !settings_.get_initial_primal_weight().has_value()) - compute_initial_primal_weight(); + // Update FP32 matrix copies for mixed precision SpMV after scaling + pdhg_solver_.get_cusparse_view().update_mixed_precision_matrices(); + + // Redirect cuSPARSE descriptors to use the original problem's structural data (offsets, + // indices), then free the duplicated structural vectors from the scaled copy to save device + // memory. + pdhg_solver_.get_cusparse_view().redirect_cusparse_csr_structure_pointers(*problem_ptr); + op_problem_scaled_.variables.resize(0, stream_view_); + op_problem_scaled_.offsets.resize(0, stream_view_); + op_problem_scaled_.reverse_constraints.resize(0, stream_view_); + op_problem_scaled_.reverse_offsets.resize(0, stream_view_); + + if (!settings_.hyper_params.compute_initial_step_size_before_scaling && + !settings_.get_initial_step_size().has_value()) + compute_initial_step_size(); + if (!settings_.hyper_params.compute_initial_primal_weight_before_scaling && + !settings_.get_initial_primal_weight().has_value()) + compute_initial_primal_weight(); #ifdef PDLP_DEBUG_MODE - std::cout << "Initial Scaling done" << std::endl; + std::cout << "Initial Scaling done" << std::endl; #endif - - // Needs to be performed here before the below line to make sure the initial primal_weight / step - // size are used as previous point when potentially updating them in this next call - if (settings_.get_initial_step_size().has_value() || initial_step_size_.has_value()) { - if (initial_step_size_.has_value()) - thrust::uninitialized_fill(handle_ptr_->get_thrust_policy(), - step_size_.begin(), - step_size_.end(), - initial_step_size_.value()); - else - thrust::uninitialized_fill(handle_ptr_->get_thrust_policy(), - step_size_.begin(), - step_size_.end(), - settings_.get_initial_step_size().value()); - } - if (settings_.get_initial_primal_weight().has_value() || initial_primal_weight_.has_value()) { - if (initial_primal_weight_.has_value()) { - thrust::uninitialized_fill(handle_ptr_->get_thrust_policy(), - primal_weight_.begin(), - primal_weight_.end(), - initial_primal_weight_.value()); - if (is_cupdlpx_restart(settings_.hyper_params)) + // Needs to be performed here before the below line to make sure the initial primal_weight / + // step size are used as previous point when potentially updating them in this next call + if (settings_.get_initial_step_size().has_value() || initial_step_size_.has_value()) { + if (initial_step_size_.has_value()) + thrust::uninitialized_fill(handle_ptr_->get_thrust_policy(), + step_size_.begin(), + step_size_.end(), + initial_step_size_.value()); + else thrust::uninitialized_fill(handle_ptr_->get_thrust_policy(), - best_primal_weight_.begin(), - best_primal_weight_.end(), + step_size_.begin(), + step_size_.end(), + settings_.get_initial_step_size().value()); + } + if (settings_.get_initial_primal_weight().has_value() || initial_primal_weight_.has_value()) { + if (initial_primal_weight_.has_value()) { + thrust::uninitialized_fill(handle_ptr_->get_thrust_policy(), + primal_weight_.begin(), + primal_weight_.end(), initial_primal_weight_.value()); - } else { - thrust::uninitialized_fill(handle_ptr_->get_thrust_policy(), - primal_weight_.begin(), - primal_weight_.end(), - settings_.get_initial_primal_weight().value()); - if (is_cupdlpx_restart(settings_.hyper_params)) + if (is_cupdlpx_restart(settings_.hyper_params)) + thrust::uninitialized_fill(handle_ptr_->get_thrust_policy(), + best_primal_weight_.begin(), + best_primal_weight_.end(), + initial_primal_weight_.value()); + } else { thrust::uninitialized_fill(handle_ptr_->get_thrust_policy(), - best_primal_weight_.begin(), - best_primal_weight_.end(), + primal_weight_.begin(), + primal_weight_.end(), settings_.get_initial_primal_weight().value()); + if (is_cupdlpx_restart(settings_.hyper_params)) + thrust::uninitialized_fill(handle_ptr_->get_thrust_policy(), + best_primal_weight_.begin(), + best_primal_weight_.end(), + settings_.get_initial_primal_weight().value()); + } + } + if (initial_k_.has_value()) { + pdhg_solver_.total_pdhg_iterations_ = initial_k_.value(); + pdhg_solver_.get_d_total_pdhg_iterations().set_value_async(initial_k_.value(), stream_view_); + } + if (settings_.get_initial_pdlp_iteration().has_value()) { + total_pdlp_iterations_ = settings_.get_initial_pdlp_iteration().value(); + // This is meaningless in batch mode since pdhg step is never used, set it just to avoid + // assertions + pdhg_solver_.get_d_total_pdhg_iterations().set_value_async(total_pdlp_iterations_, + stream_view_); + pdhg_solver_.total_pdhg_iterations_ = total_pdlp_iterations_; + // Reset the fixed point error since at this pdlp iteration it is expected to already be + // initialized to some value + std::fill(restart_strategy_.initial_fixed_point_error_.begin(), + restart_strategy_.initial_fixed_point_error_.end(), + f_t(0.0)); + std::fill(restart_strategy_.fixed_point_error_.begin(), + restart_strategy_.fixed_point_error_.end(), + f_t(0.0)); } - } - if (initial_k_.has_value()) { - pdhg_solver_.total_pdhg_iterations_ = initial_k_.value(); - pdhg_solver_.get_d_total_pdhg_iterations().set_value_async(initial_k_.value(), stream_view_); - } - if (settings_.get_initial_pdlp_iteration().has_value()) { - total_pdlp_iterations_ = settings_.get_initial_pdlp_iteration().value(); - // This is meaningless in batch mode since pdhg step is never used, set it just to avoid - // assertions - pdhg_solver_.get_d_total_pdhg_iterations().set_value_async(total_pdlp_iterations_, - stream_view_); - pdhg_solver_.total_pdhg_iterations_ = total_pdlp_iterations_; - // Reset the fixed point error since at this pdlp iteration it is expected to already be - // initialized to some value - std::fill(restart_strategy_.initial_fixed_point_error_.begin(), - restart_strategy_.initial_fixed_point_error_.end(), - f_t(0.0)); - std::fill(restart_strategy_.fixed_point_error_.begin(), - restart_strategy_.fixed_point_error_.end(), - f_t(0.0)); - } - - // Only the primal_weight_ and step_size_ variables are initialized during the initial phase - // The associated primal/dual step_size (computed using the two firstly mentionned) are not - // initialized. This calls ensures the latter - // In the event of a given primal and dual solutions and if the option is toggled, calling the - // update primal_weight and step_size will also update the associated primal_step_size_, - // dual_step_size_. - // In summary: the below call is only mandatory at the beginning when - // computing/setting the initial primal weight and step size and if they are not recomputed later. - step_size_strategy_.get_primal_and_dual_stepsizes(primal_step_size_, dual_step_size_); + + // Only the primal_weight_ and step_size_ variables are initialized during the initial phase + // The associated primal/dual step_size (computed using the two firstly mentionned) are not + // initialized. This calls ensures the latter + // In the event of a given primal and dual solutions and if the option is toggled, calling the + // update primal_weight and step_size will also update the associated primal_step_size_, + // dual_step_size_. + // In summary: the below call is only mandatory at the beginning when + // computing/setting the initial primal weight and step size and if they are not recomputed + // later. + step_size_strategy_.get_primal_and_dual_stepsizes(primal_step_size_, dual_step_size_); #ifdef CUPDLP_DEBUG_MODE - if (initial_primal_.size() != 0 || initial_dual_.size() != 0) { - std::cout << "Initial primal and dual solution before scaling" << std::endl; - if (initial_primal_.size() != 0) { print("initial_primal_", initial_primal_); } - if (initial_dual_.size() != 0) { print("initial_dual_", initial_dual_); } - } + if (initial_primal_.size() != 0 || initial_dual_.size() != 0) { + std::cout << "Initial primal and dual solution before scaling" << std::endl; + if (initial_primal_.size() != 0) { print("initial_primal_", initial_primal_); } + if (initial_dual_.size() != 0) { print("initial_dual_", initial_dual_); } + } #endif - // If there is an initial primal or dual we should update the restart info as if there was a step - // that has happend - if (initial_primal_.size() != 0 || initial_dual_.size() != 0) { - update_primal_dual_solutions( - (initial_primal_.size() != 0) ? std::make_optional(&initial_primal_) : std::nullopt, - (initial_dual_.size() != 0) ? std::make_optional(&initial_dual_) : std::nullopt); - } + // If there is an initial primal or dual we should update the restart info as if there was a + // step that has happend + if (initial_primal_.size() != 0 || initial_dual_.size() != 0) { + update_primal_dual_solutions( + (initial_primal_.size() != 0) ? std::make_optional(&initial_primal_) : std::nullopt, + (initial_dual_.size() != 0) ? std::make_optional(&initial_dual_) : std::nullopt); + } #ifdef CUPDLP_DEBUG_MODE - std::cout << "Solution before projection" << std::endl; - print("pdhg_solver_.get_primal_solution()", pdhg_solver_.get_primal_solution()); - print("pdhg_solver_.get_dual_solution()", pdhg_solver_.get_dual_solution()); - print("pdhg_solver_.get_potential_next_primal_solution()", - pdhg_solver_.get_potential_next_primal_solution()); - print("pdhg_solver_.get_potential_next_dual_solution()", - pdhg_solver_.get_potential_next_dual_solution()); - print("restart_strategy_.last_restart_duality_gap_.primal_solution_", - restart_strategy_.last_restart_duality_gap_.primal_solution_); - print("restart_strategy_.last_restart_duality_gap_.dual_solution_", - restart_strategy_.last_restart_duality_gap_.dual_solution_); + std::cout << "Solution before projection" << std::endl; + print("pdhg_solver_.get_primal_solution()", pdhg_solver_.get_primal_solution()); + print("pdhg_solver_.get_dual_solution()", pdhg_solver_.get_dual_solution()); + print("pdhg_solver_.get_potential_next_primal_solution()", + pdhg_solver_.get_potential_next_primal_solution()); + print("pdhg_solver_.get_potential_next_dual_solution()", + pdhg_solver_.get_potential_next_dual_solution()); + print("restart_strategy_.last_restart_duality_gap_.primal_solution_", + restart_strategy_.last_restart_duality_gap_.primal_solution_); + print("restart_strategy_.last_restart_duality_gap_.dual_solution_", + restart_strategy_.last_restart_duality_gap_.dual_solution_); #endif - // Project initial primal solution - if (settings_.hyper_params.project_initial_primal) { - using f_t2 = typename type_2::type; - if (batch_mode_) { - // In batch mode variable_bounds are shared and only the bound rescaling is per climber. - // Apply it here too so the initial point is projected into the correct saacled space - cub::DeviceTransform::Transform( - cuda::std::make_tuple( + // Project initial primal solution + if (settings_.hyper_params.project_initial_primal) { + using f_t2 = typename type_2::type; + if (batch_mode_) { + // In batch mode variable_bounds are shared and only the bound rescaling is per climber. + // Apply it here too so the initial point is projected into the correct scaled space. + cub::DeviceTransform::Transform( + cuda::std::make_tuple( + pdhg_solver_.get_primal_solution().data(), + thrust::make_transform_iterator( + thrust::make_zip_iterator( + problem_wrap_container(op_problem_scaled_.variable_bounds), + batch_wrapped_container(initial_scaling_strategy_.get_bound_rescaling_vector(), + primal_size_h_)), + scale_bounds_by_scalar_op{})), pdhg_solver_.get_primal_solution().data(), - thrust::make_transform_iterator( - thrust::make_zip_iterator( - problem_wrap_container(op_problem_scaled_.variable_bounds), - batch_wrapped_container(initial_scaling_strategy_.get_bound_rescaling_vector(), - primal_size_h_)), - scale_bounds_by_scalar_op{})), - pdhg_solver_.get_primal_solution().data(), - pdhg_solver_.get_primal_solution().size(), - clamp(), - stream_view_.value()); - } else { - cub::DeviceTransform::Transform( - cuda::std::make_tuple(pdhg_solver_.get_primal_solution().data(), - problem_wrap_container(op_problem_scaled_.variable_bounds)), - pdhg_solver_.get_primal_solution().data(), - pdhg_solver_.get_primal_solution().size(), - clamp(), - stream_view_.value()); - } + pdhg_solver_.get_primal_solution().size(), + clamp(), + stream_view_.value()); + } else { + cub::DeviceTransform::Transform( + cuda::std::make_tuple(pdhg_solver_.get_primal_solution().data(), + problem_wrap_container(op_problem_scaled_.variable_bounds)), + pdhg_solver_.get_primal_solution().data(), + pdhg_solver_.get_primal_solution().size(), + clamp(), + stream_view_.value()); + } - pdhg_solver_.refine_initial_primal_projection( - initial_scaling_strategy_.get_bound_rescaling_vector()); + pdhg_solver_.refine_initial_primal_projection( + initial_scaling_strategy_.get_bound_rescaling_vector()); - if (!settings_.hyper_params.never_restart_to_average) { - cuopt_expects(!batch_mode_, - cuopt::error_type_t::ValidationError, - "Restart to average not supported in batch mode"); - cub::DeviceTransform::Transform( - cuda::std::make_tuple(unscaled_primal_avg_solution_.data(), - op_problem_scaled_.variable_bounds.data()), - unscaled_primal_avg_solution_.data(), - primal_size_h_, - clamp(), - stream_view_.value()); + if (!settings_.hyper_params.never_restart_to_average) { + cuopt_expects(!batch_mode_, + cuopt::error_type_t::ValidationError, + "Restart to average not supported in batch mode"); + cub::DeviceTransform::Transform( + cuda::std::make_tuple(unscaled_primal_avg_solution_.data(), + op_problem_scaled_.variable_bounds.data()), + unscaled_primal_avg_solution_.data(), + primal_size_h_, + clamp(), + stream_view_.value()); + } } - } #ifdef CUPDLP_DEBUG_MODE - std::cout << "Solution after projection" << std::endl; - print("pdhg_solver_.get_primal_solution()", pdhg_solver_.get_primal_solution()); - print("pdhg_solver_.get_dual_solution()", pdhg_solver_.get_dual_solution()); - print("pdhg_solver_.get_potential_next_primal_solution()", - pdhg_solver_.get_potential_next_primal_solution()); - print("pdhg_solver_.get_potential_next_dual_solution()", - pdhg_solver_.get_potential_next_dual_solution()); - print("restart_strategy_.last_restart_duality_gap_.primal_solution_", - restart_strategy_.last_restart_duality_gap_.primal_solution_); - print("restart_strategy_.last_restart_duality_gap_.dual_solution_", - restart_strategy_.last_restart_duality_gap_.dual_solution_); + std::cout << "Solution after projection" << std::endl; + print("pdhg_solver_.get_primal_solution()", pdhg_solver_.get_primal_solution()); + print("pdhg_solver_.get_dual_solution()", pdhg_solver_.get_dual_solution()); + print("pdhg_solver_.get_potential_next_primal_solution()", + pdhg_solver_.get_potential_next_primal_solution()); + print("pdhg_solver_.get_potential_next_dual_solution()", + pdhg_solver_.get_potential_next_dual_solution()); + print("restart_strategy_.last_restart_duality_gap_.primal_solution_", + restart_strategy_.last_restart_duality_gap_.primal_solution_); + print("restart_strategy_.last_restart_duality_gap_.dual_solution_", + restart_strategy_.last_restart_duality_gap_.dual_solution_); #endif - // Need to to tranpose primal solution to row format as there might be initial values or clamping - // Value may not be all 0 - if (batch_mode_) { - rmm::device_uvector dummy(0, stream_view_); - transpose_primal_dual_to_row( - pdhg_solver_.get_primal_solution(), pdhg_solver_.get_dual_solution(), dummy); - if (settings_.hyper_params.use_reflected_primal_dual) { - transpose_primal_dual_to_row(pdhg_solver_.get_potential_next_primal_solution(), - pdhg_solver_.get_potential_next_dual_solution(), - dummy); - transpose_primal_dual_to_row(restart_strategy_.last_restart_duality_gap_.primal_solution_, - restart_strategy_.last_restart_duality_gap_.dual_solution_, - dummy); + // Need to to tranpose primal solution to row format as there might be initial values or + // clamping Value may not be all 0 + if (batch_mode_) { + rmm::device_uvector dummy(0, stream_view_); + transpose_primal_dual_to_row( + pdhg_solver_.get_primal_solution(), pdhg_solver_.get_dual_solution(), dummy); + if (settings_.hyper_params.use_reflected_primal_dual) { + transpose_primal_dual_to_row(pdhg_solver_.get_potential_next_primal_solution(), + pdhg_solver_.get_potential_next_dual_solution(), + dummy); + transpose_primal_dual_to_row(restart_strategy_.last_restart_duality_gap_.primal_solution_, + restart_strategy_.last_restart_duality_gap_.dual_solution_, + dummy); + } + transpose_problem_fields(/*to_row=*/true); + } + + if (verbose) { + std::cout << "primal_size_h_ " << primal_size_h_ << " dual_size_h_ " << dual_size_h_ + << " nnz " << problem_ptr->nnz << std::endl; + std::cout << "Problem before scaling" << std::endl; + print_problem_info(problem_ptr->coefficients, + problem_ptr->objective_coefficients, + problem_ptr->combined_bounds); + std::cout << "Problem after scaling" << std::endl; + print_problem_info(op_problem_scaled_.coefficients, + op_problem_scaled_.objective_coefficients, + op_problem_scaled_.combined_bounds); + raft::print_device_vector("Initial step_size", step_size_.data(), 1, std::cout); + raft::print_device_vector("Initial primal_weight", primal_weight_.data(), 1, std::cout); + raft::print_device_vector("Initial primal_step_size", primal_step_size_.data(), 1, std::cout); + raft::print_device_vector("Initial dual_step_size", dual_step_size_.data(), 1, std::cout); } - transpose_problem_fields(/*to_row=*/true); - } - - if (verbose) { - std::cout << "primal_size_h_ " << primal_size_h_ << " dual_size_h_ " << dual_size_h_ << " nnz " - << problem_ptr->nnz << std::endl; - std::cout << "Problem before scaling" << std::endl; - print_problem_info( - problem_ptr->coefficients, problem_ptr->objective_coefficients, problem_ptr->combined_bounds); - std::cout << "Problem after scaling" << std::endl; - print_problem_info(op_problem_scaled_.coefficients, - op_problem_scaled_.objective_coefficients, - op_problem_scaled_.combined_bounds); - raft::print_device_vector("Initial step_size", step_size_.data(), 1, std::cout); - raft::print_device_vector("Initial primal_weight", primal_weight_.data(), 1, std::cout); - raft::print_device_vector("Initial primal_step_size", primal_step_size_.data(), 1, std::cout); - raft::print_device_vector("Initial dual_step_size", dual_step_size_.data(), 1, std::cout); - } #ifdef CUPDLP_DEBUG_MODE - raft::print_device_vector("Initial step_size", step_size_.data(), step_size_.size(), std::cout); - raft::print_device_vector( - "Initial primal_weight", primal_weight_.data(), primal_weight_.size(), std::cout); + raft::print_device_vector("Initial step_size", step_size_.data(), step_size_.size(), std::cout); + raft::print_device_vector( + "Initial primal_weight", primal_weight_.data(), primal_weight_.size(), std::cout); #endif - bool warm_start_was_given = settings_.get_pdlp_warm_start_data().is_populated(); - - if (!inside_mip_) { - CUOPT_LOG_INFO( - " Iter Primal Obj. Dual Obj. Gap Primal Res. Dual Res. Time"); + if (!inside_mip_) { + CUOPT_LOG_INFO( + " Iter Primal Obj. Dual Obj. Gap Primal Res. Dual Res. Time"); + } } while (true) { #ifdef CUPDLP_DEBUG_MODE @@ -2593,6 +3068,7 @@ optimization_problem_solution_t pdlp_solver_t::run_solver(co // 1. At the very beginning of the solver, when no steps have been taken yet // 2. After a single step, since average of one step is the same step if (internal_solver_iterations_ <= 1) { + if (multi_gpu_engine) { assert(false && "Not implemented"); } raft::copy(unscaled_primal_avg_solution_.data(), pdhg_solver_.get_primal_solution().data(), primal_size_h_, @@ -2646,10 +3122,21 @@ optimization_problem_solution_t pdlp_solver_t::run_solver(co initial_scaling_strategy_.unscale_solutions(pdhg_solver_.get_primal_solution(), pdhg_solver_.get_dual_solution()); } else { - initial_scaling_strategy_.unscale_solutions( - pdhg_solver_.get_potential_next_primal_solution(), - pdhg_solver_.get_potential_next_dual_solution(), - pdhg_solver_.get_dual_slack()); + if (multi_gpu_engine) { + // The only branch in cuPDLPx (Stable3) + multi_gpu_engine->for_each_shard([&](auto& shard) { + auto& sub = *shard.sub_pdlp; + sub.get_initial_scaling_strategy().unscale_solutions( + sub.pdhg_solver_.get_potential_next_primal_solution(), + sub.pdhg_solver_.get_potential_next_dual_solution(), + sub.pdhg_solver_.get_dual_slack()); + }); + } else { + initial_scaling_strategy_.unscale_solutions( + pdhg_solver_.get_potential_next_primal_solution(), + pdhg_solver_.get_potential_next_dual_solution(), + pdhg_solver_.get_dual_slack()); + } } #ifdef CUPDLP_DEBUG_MODE @@ -2681,10 +3168,21 @@ optimization_problem_solution_t pdlp_solver_t::run_solver(co initial_scaling_strategy_.scale_solutions(pdhg_solver_.get_primal_solution(), pdhg_solver_.get_dual_solution()); } else { - initial_scaling_strategy_.scale_solutions( - pdhg_solver_.get_potential_next_primal_solution(), - pdhg_solver_.get_potential_next_dual_solution(), - pdhg_solver_.get_dual_slack()); + if (multi_gpu_engine) { + // The only branch in cuPDLPx (Stable3) + multi_gpu_engine->for_each_shard([&](auto& shard) { + auto& sub = *shard.sub_pdlp; + sub.get_initial_scaling_strategy().scale_solutions( + sub.pdhg_solver_.get_potential_next_primal_solution(), + sub.pdhg_solver_.get_potential_next_dual_solution(), + sub.pdhg_solver_.get_dual_slack()); + }); + } else { + initial_scaling_strategy_.scale_solutions( + pdhg_solver_.get_potential_next_primal_solution(), + pdhg_solver_.get_potential_next_dual_solution(), + pdhg_solver_.get_dual_slack()); + } } } @@ -2782,13 +3280,23 @@ optimization_problem_solution_t pdlp_solver_t::run_solver(co transpose_problem_fields(/*to_row=*/true); } } - halpern_update(); + if (multi_gpu_engine) { + multi_gpu_engine->for_each_shard([&](auto& shard) { shard.sub_pdlp->halpern_update(); }); + } else { + halpern_update(); + } } ++total_pdlp_iterations_; ++internal_solver_iterations_; - if (settings_.hyper_params.never_restart_to_average) + if (settings_.hyper_params.never_restart_to_average) { restart_strategy_.increment_iteration_since_last_restart(); + if (multi_gpu_engine) { + multi_gpu_engine->for_each_shard([&](auto& shard) { + shard.sub_pdlp->restart_strategy_.increment_iteration_since_last_restart(); + }); + } + } } return optimization_problem_solution_t{pdlp_termination_status_t::NumericalError, stream_view_}; diff --git a/cpp/src/pdlp/pdlp.cuh b/cpp/src/pdlp/pdlp.cuh index 9447eaeaf3..3544de89fa 100644 --- a/cpp/src/pdlp/pdlp.cuh +++ b/cpp/src/pdlp/pdlp.cuh @@ -8,10 +8,12 @@ #pragma once #include +#include #include #include #include +#include #include #include #include @@ -62,6 +64,11 @@ class pdlp_solver_t { pdlp_solver_settings_t const& settings, bool is_batch_mode = false); + // Distributed Solver Constructor + pdlp_solver_t(problem_t& placeholder_problem, + cuopt::linear_programming::io::mps_data_model_t const& mps, + pdlp_solver_settings_t const& settings); + optimization_problem_solution_t run_solver(const timer_t& timer); f_t get_primal_weight_h(i_t id) const; @@ -98,6 +105,25 @@ class pdlp_solver_t { void compute_initial_step_size(); void compute_initial_primal_weight(); + // Needed by multi-GPU to mutate them + problem_t& get_op_problem_scaled() { return op_problem_scaled_; } + detail::pdlp_initial_scaling_strategy_t& get_initial_scaling_strategy() + { + return initial_scaling_strategy_; + } + detail::pdlp_restart_strategy_t& get_restart_strategy() { return restart_strategy_; } + + // Per-shard primal/dual step sizes are private state on pdlp_solver_t but + // are needed inside the multi-GPU dispatch paths that fan out a master cub + // call across all shards' pdhg_solver_t::*_transform methods. + rmm::device_uvector& get_primal_step_size() { return primal_step_size_; } + rmm::device_uvector& get_dual_step_size() { return dual_step_size_; } + // Multi-GPU restart broadcast needs to mirror master's primal_weight / + // best_primal_weight onto every shard after each cuPDLPx restart so that + // downstream shard-side restart machinery stays in sync with master. + rmm::device_uvector& get_primal_weight() { return primal_weight_; } + rmm::device_uvector& get_best_primal_weight() { return best_primal_weight_; } + private: void print_termination_criteria(const timer_t& timer, bool is_average = false); void print_final_termination_criteria( @@ -247,6 +273,11 @@ class pdlp_solver_t { primal_quality_adapter_t best_primal_quality_so_far_; // Flag to indicate if solver is being called from MIP. No logging is done in this case. bool inside_mip_{false}; + + // std::optional because multi_gpu_engine_t is non-default-constructible + // (collectively bootstraps NCCL, owns RMM resources). Stays nullopt in + // single-GPU mode; emplaced by the multi-GPU ctor. + std::optional> multi_gpu_engine; }; } // namespace cuopt::linear_programming::detail diff --git a/cpp/src/pdlp/restart_strategy/pdlp_restart_strategy.cu b/cpp/src/pdlp/restart_strategy/pdlp_restart_strategy.cu index 17c7abcac5..ee1d19b96b 100644 --- a/cpp/src/pdlp/restart_strategy/pdlp_restart_strategy.cu +++ b/cpp/src/pdlp/restart_strategy/pdlp_restart_strategy.cu @@ -8,6 +8,8 @@ #include #include +#include +#include #include #include #include @@ -892,20 +894,64 @@ void pdlp_restart_strategy_t::cupdlpx_restart( "If any, all should be true"); // Computing the deltas - distance_squared_moved_from_last_restart_period( - pdhg_solver.get_potential_next_primal_solution(), - last_restart_duality_gap_.primal_solution_, - pdhg_solver.get_primal_tmp_resource(), - primal_size_h_, - 1, - last_restart_duality_gap_.primal_distance_traveled_); - distance_squared_moved_from_last_restart_period( - pdhg_solver.get_potential_next_dual_solution(), - last_restart_duality_gap_.dual_solution_, - pdhg_solver.get_dual_tmp_resource(), - dual_size_h_, - 1, - last_restart_duality_gap_.dual_distance_traveled_); + if (auto* engine = pdhg_solver.get_mgpu_engine()) { + engine->for_each_shard([&](auto& shard) { + auto& sub = *shard.sub_pdlp; + auto& sub_rest = sub.get_restart_strategy(); + sub_rest.distance_squared_moved_from_last_restart_period( + sub.pdhg_solver_.get_potential_next_primal_solution(), + sub_rest.last_restart_duality_gap_.primal_solution_, + sub.pdhg_solver_.get_primal_tmp_resource(), + shard.rank_data.owned_var_size, + 1, + sub_rest.last_restart_duality_gap_.primal_distance_traveled_); + sub_rest.distance_squared_moved_from_last_restart_period( + sub.pdhg_solver_.get_potential_next_dual_solution(), + sub_rest.last_restart_duality_gap_.dual_solution_, + sub.pdhg_solver_.get_dual_tmp_resource(), + shard.rank_data.owned_cstr_size, + 1, + sub_rest.last_restart_duality_gap_.dual_distance_traveled_); + }); + + engine->allreduce_sum_inplace([](pdlp_solver_t& sp) -> f_t* { + return sp.get_restart_strategy().last_restart_duality_gap_.primal_distance_traveled_.data(); + }); + engine->allreduce_sum_inplace([](pdlp_solver_t& sp) -> f_t* { + return sp.get_restart_strategy().last_restart_duality_gap_.dual_distance_traveled_.data(); + }); + + auto& s0 = *engine->shards[0]; + { + raft::device_setter guard(s0.device_id); + RAFT_CUDA_TRY(cudaStreamSynchronize(s0.stream.view().value())); + } + raft::copy(last_restart_duality_gap_.primal_distance_traveled_.data(), + s0.sub_pdlp->get_restart_strategy() + .last_restart_duality_gap_.primal_distance_traveled_.data(), + 1, + stream_view_); + raft::copy( + last_restart_duality_gap_.dual_distance_traveled_.data(), + s0.sub_pdlp->get_restart_strategy().last_restart_duality_gap_.dual_distance_traveled_.data(), + 1, + stream_view_); + } else { + distance_squared_moved_from_last_restart_period( + pdhg_solver.get_potential_next_primal_solution(), + last_restart_duality_gap_.primal_solution_, + pdhg_solver.get_primal_tmp_resource(), + primal_size_h_, + 1, + last_restart_duality_gap_.primal_distance_traveled_); + distance_squared_moved_from_last_restart_period( + pdhg_solver.get_potential_next_dual_solution(), + last_restart_duality_gap_.dual_solution_, + pdhg_solver.get_dual_tmp_resource(), + dual_size_h_, + 1, + last_restart_duality_gap_.dual_distance_traveled_); + } auto view = make_cupdlpx_restart_view(last_restart_duality_gap_.primal_distance_traveled_, last_restart_duality_gap_.dual_distance_traveled_, @@ -958,24 +1004,71 @@ void pdlp_restart_strategy_t::cupdlpx_restart( best_primal_weight.set_element_async(0, best_primal_weight_value, stream_view_); } + // mGPU: Broadcast all primal-weight / step-size scalars updated by the cuPDLPx + // restart on the master to every shard so the restart-state on + // each shard stays in sync with master. + if (auto* engine = pdhg_solver.get_mgpu_engine()) { + RAFT_CUDA_TRY(cudaStreamSynchronize(stream_view_)); + + f_t h_primal_step_size{}, h_dual_step_size{}; + f_t h_primal_weight{}, h_best_primal_weight{}; + + raft::copy(&h_primal_step_size, primal_step_size.data(), 1, stream_view_); + raft::copy(&h_dual_step_size, dual_step_size.data(), 1, stream_view_); + raft::copy(&h_primal_weight, primal_weight.data(), 1, stream_view_); + raft::copy(&h_best_primal_weight, best_primal_weight.data(), 1, stream_view_); + RAFT_CUDA_TRY(cudaStreamSynchronize(stream_view_)); + + engine->for_each_shard([&](auto& shard) { + auto& sub = *shard.sub_pdlp; + raft::copy(sub.get_primal_step_size().data(), &h_primal_step_size, 1, shard.stream.view()); + raft::copy(sub.get_dual_step_size().data(), &h_dual_step_size, 1, shard.stream.view()); + raft::copy(sub.get_primal_weight().data(), &h_primal_weight, 1, shard.stream.view()); + raft::copy( + sub.get_best_primal_weight().data(), &h_best_primal_weight, 1, shard.stream.view()); + }); + } // TODO later batch mode: remove if you have per climber restart - raft::copy(last_restart_duality_gap_.primal_solution_.data(), - pdhg_solver.get_potential_next_primal_solution().data(), - last_restart_duality_gap_.primal_solution_.size(), - stream_view_); - raft::copy(pdhg_solver.get_primal_solution().data(), - pdhg_solver.get_potential_next_primal_solution().data(), - last_restart_duality_gap_.primal_solution_.size(), - stream_view_); - raft::copy(last_restart_duality_gap_.dual_solution_.data(), - pdhg_solver.get_potential_next_dual_solution().data(), - last_restart_duality_gap_.dual_solution_.size(), - stream_view_); - raft::copy(pdhg_solver.get_dual_solution().data(), - pdhg_solver.get_potential_next_dual_solution().data(), - last_restart_duality_gap_.dual_solution_.size(), - stream_view_); + if (auto* engine = pdhg_solver.get_mgpu_engine()) { + engine->for_each_shard([&](auto& shard) { + auto& sub = *shard.sub_pdlp; + auto& sub_rest = sub.get_restart_strategy(); + raft::copy(sub_rest.last_restart_duality_gap_.primal_solution_.data(), + sub.pdhg_solver_.get_potential_next_primal_solution().data(), + sub_rest.last_restart_duality_gap_.primal_solution_.size(), + shard.stream.view()); + raft::copy(sub.pdhg_solver_.get_primal_solution().data(), + sub.pdhg_solver_.get_potential_next_primal_solution().data(), + sub.pdhg_solver_.get_primal_solution().size(), + shard.stream.view()); + raft::copy(sub_rest.last_restart_duality_gap_.dual_solution_.data(), + sub.pdhg_solver_.get_potential_next_dual_solution().data(), + sub_rest.last_restart_duality_gap_.dual_solution_.size(), + shard.stream.view()); + raft::copy(sub.pdhg_solver_.get_dual_solution().data(), + sub.pdhg_solver_.get_potential_next_dual_solution().data(), + sub.pdhg_solver_.get_dual_solution().size(), + shard.stream.view()); + }); + } else { + raft::copy(last_restart_duality_gap_.primal_solution_.data(), + pdhg_solver.get_potential_next_primal_solution().data(), + last_restart_duality_gap_.primal_solution_.size(), + stream_view_); + raft::copy(pdhg_solver.get_primal_solution().data(), + pdhg_solver.get_potential_next_primal_solution().data(), + last_restart_duality_gap_.primal_solution_.size(), + stream_view_); + raft::copy(last_restart_duality_gap_.dual_solution_.data(), + pdhg_solver.get_potential_next_dual_solution().data(), + last_restart_duality_gap_.dual_solution_.size(), + stream_view_); + raft::copy(pdhg_solver.get_dual_solution().data(), + pdhg_solver.get_potential_next_dual_solution().data(), + last_restart_duality_gap_.dual_solution_.size(), + stream_view_); + } #ifdef CUPDLP_DEBUG_MODE print("New last_restart_duality_gap_.primal_solution_", @@ -990,6 +1083,13 @@ void pdlp_restart_strategy_t::cupdlpx_restart( weighted_average_solution_.iterations_since_last_restart_ = 0; last_trial_fixed_point_error_[i] = std::numeric_limits::infinity(); } + + if (auto* engine = pdhg_solver.get_mgpu_engine()) { + engine->for_each_shard([&](auto& shard) { + shard.sub_pdlp->get_restart_strategy() + .weighted_average_solution_.iterations_since_last_restart_ = 0; + }); + } } template diff --git a/cpp/src/pdlp/saddle_point.cu b/cpp/src/pdlp/saddle_point.cu index f740176a3c..07a5d0146e 100644 --- a/cpp/src/pdlp/saddle_point.cu +++ b/cpp/src/pdlp/saddle_point.cu @@ -38,8 +38,11 @@ saddle_point_state_t::saddle_point_state_t( current_AtY_{batch_size * primal_size, handle_ptr->get_stream()}, next_AtY_{batch_size * primal_size, handle_ptr->get_stream()} { - EXE_CUOPT_EXPECTS(primal_size > 0, "Size of the primal problem must be larger than 0"); - EXE_CUOPT_EXPECTS(dual_size > 0, "Size of the dual problem must be larger than 0"); + // >= 0 (not > 0): distributed PDLP builds the master pdlp_solver_t from a + // shape-0 placeholder problem so the master never materializes per-variable + // / per-constraint vectors; size-0 device_uvectors are valid throughout. + EXE_CUOPT_EXPECTS(primal_size >= 0, "Size of the primal problem must be non-negative"); + EXE_CUOPT_EXPECTS(dual_size >= 0, "Size of the dual problem must be non-negative"); // Starting from all 0 thrust::fill( diff --git a/cpp/src/pdlp/solve.cu b/cpp/src/pdlp/solve.cu index 43d507b2f9..595c06b20a 100644 --- a/cpp/src/pdlp/solve.cu +++ b/cpp/src/pdlp/solve.cu @@ -756,6 +756,9 @@ static optimization_problem_solution_t run_pdlp_solver( const timer_t& timer, bool is_batch_mode) { + detail::pdlp_graph_disabled_flag().store(settings.hyper_params.pdlp_disable_graph, + std::memory_order_relaxed); + if (problem.n_constraints == 0) { CUOPT_LOG_CONDITIONAL_INFO( !settings.inside_mip, @@ -770,6 +773,15 @@ static optimization_problem_solution_t run_pdlp_solver( } } #endif + // Distributed PDLP cannot enter through this path: by the time we have a + // problem_t, the full problem already lives on the master GPU, which defeats + // the purpose of distributed mode. Callers must route to + // solve_lp_distributed_from_mps via solve_lp(mps_data_model, ...). + cuopt_expects(!settings.hyper_params.use_distributed_pdlp, + error_type_t::ValidationError, + "Distributed PDLP must be entered via solve_lp(mps_data_model, ...) " + "so the master GPU never materializes the full problem. Call sites " + "with a problem_t cannot dispatch to distributed mode."); detail::pdlp_solver_t solver(problem, settings, is_batch_mode); if (settings.inside_mip) { solver.set_inside_mip(true); } return solver.run_solver(timer); @@ -2114,8 +2126,110 @@ optimization_problem_solution_t solve_lp( bool problem_checking, bool use_pdlp_solver_mode) { - auto op_problem = mps_data_model_to_optimization_problem(handle_ptr, mps_data_model); - return solve_lp(op_problem, settings, problem_checking, use_pdlp_solver_mode); + cuopt_expects( + settings.hyper_params.use_distributed_pdlp, + error_type_t::ValidationError, + "solve_lp from mps_data_model: settings.hyper_params.use_distributed_pdlp must be true"); + return solve_lp_distributed_from_mps( + handle_ptr, mps_data_model, settings, problem_checking, use_pdlp_solver_mode); +} + +template +optimization_problem_solution_t solve_lp_distributed_from_mps( + raft::handle_t const* handle_ptr, + const cuopt::linear_programming::io::mps_data_model_t& mps_data_model, + pdlp_solver_settings_t const& settings, + bool problem_checking, + bool use_pdlp_solver_mode) +{ + cuopt_expects(handle_ptr != nullptr, + error_type_t::ValidationError, + "solve_lp_distributed_from_mps: handle_ptr must not be null"); + cuopt_expects(settings.hyper_params.use_distributed_pdlp, + error_type_t::ValidationError, + "solve_lp_distributed_from_mps: settings.hyper_params.use_distributed_pdlp " + "must be true"); + cuopt_expects(settings.presolver == cuopt::linear_programming::presolver_t::None, + error_type_t::ValidationError, + "solve_lp_distributed_from_mps: presolve is not yet supported with " + "use_distributed_pdlp; please set settings.presolver = presolver_t::None"); + + pdlp_solver_settings_t settings_resolved = settings; + + detail::pdlp_graph_disabled_flag().store(settings_resolved.hyper_params.pdlp_disable_graph, + std::memory_order_relaxed); + + if (settings_resolved.distributed_pdlp_num_gpus == -1) { + settings_resolved.distributed_pdlp_num_gpus = raft::device_setter::get_device_count(); + CUOPT_LOG_INFO( + "solve_lp_distributed_from_mps: distributed_pdlp_num_gpus == -1, auto-detected " + "%d visible CUDA device(s)", + settings_resolved.distributed_pdlp_num_gpus); + } + // PDLP precision validations (mirror the checks in run_pdlp; distributed + // path only supports the default-precision, non-batch double config). + cuopt_expects(settings_resolved.pdlp_precision == pdlp_precision_t::DefaultPrecision, + error_type_t::ValidationError, + "Distributed PDLP only supports DefaultPrecision (double)."); + cuopt_expects(!settings_resolved.inside_mip, + error_type_t::ValidationError, + "Distributed PDLP is not yet supported from inside MIP."); + + init_logger_t log(settings_resolved.log_file, settings_resolved.log_to_console); + print_version_info(); + init_handler(handle_ptr); + + const i_t n_vars = static_cast(mps_data_model.get_objective_coefficients().size()); + const i_t n_cstr = static_cast(mps_data_model.get_constraint_lower_bounds().size()); + const i_t nnz = static_cast(mps_data_model.get_constraint_matrix_values().size()); + CUOPT_LOG_INFO( + "Solving a problem with %d constraints, %d variables (%d integers), and %d " + "nonzeros (distributed mps-direct path)", + n_cstr, + n_vars, + 0, + nnz); + + auto lp_timer = cuopt::timer_t(settings_resolved.time_limit); + + // Shape-0 placeholder: needed to build an empty pdlp_solver + cuopt::linear_programming::optimization_problem_t placeholder_op(handle_ptr); + { + std::vector empty_offsets = {0}; + placeholder_op.set_csr_constraint_matrix( + nullptr, 0, nullptr, 0, empty_offsets.data(), static_cast(empty_offsets.size())); + } + detail::problem_t placeholder_problem(placeholder_op); + + detail::pdlp_solver_t solver(placeholder_problem, mps_data_model, settings_resolved); + + auto sol = solver.run_solver(lp_timer); + + // Maximization post-processing (matches run_pdlp at solve.cu:835-839): + // PDLP internally solves the negated objective, so flip dual / reduced + // cost signs on the gathered solution before returning. + if (mps_data_model.get_sense()) { + adjust_dual_solution_and_reduced_cost( + sol.get_dual_solution(), sol.get_reduced_cost(), handle_ptr->get_stream()); + handle_ptr->sync_stream(); + } + + sol.set_solve_time(lp_timer.elapsed_time()); + CUOPT_LOG_INFO("PDLP finished"); + if (sol.get_termination_status() != pdlp_termination_status_t::ConcurrentLimit) { + CUOPT_LOG_INFO("Status: %s Objective: %.8e Iterations: %d Time: %.3fs", + sol.get_termination_status_string().c_str(), + sol.get_objective_value(), + sol.get_additional_termination_information().number_of_steps_taken, + sol.get_solve_time()); + } + + if (settings_resolved.sol_file != "") { + CUOPT_LOG_INFO("Writing solution to file %s", settings_resolved.sol_file.c_str()); + sol.write_to_sol_file(settings_resolved.sol_file, handle_ptr->get_stream()); + } + + return sol; } // ============================================================================ @@ -2258,6 +2372,14 @@ std::unique_ptr> solve_lp( template optimization_problem_t mps_data_model_to_optimization_problem( \ raft::handle_t const* handle_ptr, \ const cuopt::linear_programming::io::mps_data_model_t& data_model); \ + \ + template optimization_problem_solution_t solve_lp_distributed_from_mps( \ + raft::handle_t const* handle_ptr, \ + const cuopt::linear_programming::io::mps_data_model_t& mps_data_model, \ + pdlp_solver_settings_t const& settings, \ + bool problem_checking, \ + bool use_pdlp_solver_mode); \ + \ template void set_pdlp_solver_mode(pdlp_solver_settings_t& settings); #if MIP_INSTANTIATE_FLOAT diff --git a/cpp/src/pdlp/solve.cuh b/cpp/src/pdlp/solve.cuh index 90e5e4fe95..160f4602ba 100644 --- a/cpp/src/pdlp/solve.cuh +++ b/cpp/src/pdlp/solve.cuh @@ -32,6 +32,45 @@ cuopt::linear_programming::optimization_problem_solution_t solve_lp_wi const timer_t& timer, bool is_batch_mode = false); +/** + * @brief Distributed-PDLP entry point that consumes the host MPS data model + * directly, without ever materializing the full problem on a single + * (master) GPU. + * + * This is the entry point intended for problems whose `nnz` exceeds the memory + * of a single device. Today (Step 1 of the mGPU memory refactor) it is a thin + * routing shim: it resolves `distributed_pdlp_num_gpus == -1` against the + * visible-device count and delegates to the legacy + * `mps_data_model_to_optimization_problem(...)` + device-side `solve_lp(...)` + * pipeline, which still allocates the full problem on master. The shim exists + * so the public-facing call site is already in place; subsequent commits will + * replace the body with: + * 1. host-side METIS partitioning straight off the MPS CSR + * 2. per-shard host CSR slicing + * 3. construction of an mGPU-native pdlp_solver_t whose master only holds + * scalar metadata + gather buffers (no full A / A^T / scaled copies). + * + * Until then, behaviour and memory footprint are identical to the legacy path. + * + * @param handle_ptr Master raft handle (its stream owns the gather buffers + * and any master-side aggregator allocations). + * @param mps_data_model Host-resident MPS data (CPU vectors only). + * @param settings User-supplied PDLP solver settings; the + * `distributed_pdlp_num_gpus == -1` sentinel is resolved + * here against the visible-device count. + * @param problem_checking Forwarded to the eventual solver. + * @param use_pdlp_solver_mode Forwarded to the eventual solver. + * + * @pre `settings.hyper_params.use_distributed_pdlp == true`. + */ +template +cuopt::linear_programming::optimization_problem_solution_t solve_lp_distributed_from_mps( + raft::handle_t const* handle_ptr, + const cuopt::linear_programming::io::mps_data_model_t& mps_data_model, + pdlp_solver_settings_t const& settings, + bool problem_checking, + bool use_pdlp_solver_mode); + /** * @brief Entry point for batch PDLP. Solves multiple LPs sharing the same constraint * matrix structure in a single batched GPU run. diff --git a/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.cu b/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.cu index 1f137dc9ea..aac777a44e 100644 --- a/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.cu +++ b/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.cu @@ -309,6 +309,24 @@ adaptive_step_size_strategy_t::get_norm_squared_delta_dual() const return norm_squared_delta_dual_; } +template +rmm::device_uvector& adaptive_step_size_strategy_t::get_interaction() +{ + return interaction_; +} + +template +rmm::device_uvector& adaptive_step_size_strategy_t::get_norm_squared_delta_primal() +{ + return norm_squared_delta_primal_; +} + +template +rmm::device_uvector& adaptive_step_size_strategy_t::get_norm_squared_delta_dual() +{ + return norm_squared_delta_dual_; +} + template void adaptive_step_size_strategy_t::set_valid_step_size(i_t valid) { @@ -346,8 +364,16 @@ template void adaptive_step_size_strategy_t::compute_interaction_and_movement( rmm::device_uvector& tmp_primal, cusparse_view_t& cusparse_view, - saddle_point_state_t& current_saddle_point_state) + saddle_point_state_t& current_saddle_point_state, + i_t owned_primal_size, + i_t owned_cstr_size) { + // mGPU needs to know owned size to restrict the reductions to the owned prefix + const i_t reduce_primal_size = + (owned_primal_size >= 0) ? owned_primal_size : current_saddle_point_state.get_primal_size(); + const i_t reduce_dual_size = + (owned_cstr_size >= 0) ? owned_cstr_size : current_saddle_point_state.get_dual_size(); + // QP would need this: // if iszero(problem.objective_matrix) // primal_objective_interaction = 0.0 @@ -426,7 +452,7 @@ void adaptive_step_size_strategy_t::compute_interaction_and_movement( // compute interaction (x'-x) . (A(y'-y)) RAFT_CUBLAS_TRY( raft::linalg::detail::cublasdot(handle_ptr_->get_cublas_handle(), - current_saddle_point_state.get_primal_size(), + reduce_primal_size, tmp_primal.data(), primal_stride, current_saddle_point_state.get_delta_primal().data(), @@ -444,7 +470,7 @@ void adaptive_step_size_strategy_t::compute_interaction_and_movement( // norm(delta_dual) ^ 2; RAFT_CUBLAS_TRY( raft::linalg::detail::cublasdot(handle_ptr_->get_cublas_handle(), - current_saddle_point_state.get_primal_size(), + reduce_primal_size, current_saddle_point_state.get_delta_primal().data(), primal_stride, current_saddle_point_state.get_delta_primal().data(), @@ -454,7 +480,7 @@ void adaptive_step_size_strategy_t::compute_interaction_and_movement( RAFT_CUBLAS_TRY( raft::linalg::detail::cublasdot(handle_ptr_->get_cublas_handle(), - current_saddle_point_state.get_dual_size(), + reduce_dual_size, current_saddle_point_state.get_delta_dual().data(), dual_stride, current_saddle_point_state.get_delta_dual().data(), diff --git a/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.hpp b/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.hpp index 1e969150e7..238735e8ff 100644 --- a/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.hpp +++ b/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.hpp @@ -81,9 +81,20 @@ class adaptive_step_size_strategy_t { const rmm::device_uvector& get_norm_squared_delta_primal() const; const rmm::device_uvector& get_norm_squared_delta_dual() const; + // Mutable overloads — used by the multi-GPU path to NCCL-allreduce the + // per-shard scalar contributions in place and to mirror them back to the + // master step_size_strategy_. + rmm::device_uvector& get_interaction(); + rmm::device_uvector& get_norm_squared_delta_primal(); + rmm::device_uvector& get_norm_squared_delta_dual(); + + // owned_primal_size / owned_cstr_size are mGPU overrides. + // mGPU needs to know owned size to restrict the reductions to the owned prefix void compute_interaction_and_movement(rmm::device_uvector& tmp_primal, cusparse_view_t& cusparse_view, - saddle_point_state_t& current_saddle_point_state); + saddle_point_state_t& current_saddle_point_state, + i_t owned_primal_size = -1, + i_t owned_cstr_size = -1); void swap_context(const thrust::universal_host_pinned_vector>& swap_pairs); void resize_context(i_t new_size); diff --git a/cpp/src/pdlp/termination_strategy/convergence_information.cu b/cpp/src/pdlp/termination_strategy/convergence_information.cu index a6d6d14d96..1dfc8229da 100644 --- a/cpp/src/pdlp/termination_strategy/convergence_information.cu +++ b/cpp/src/pdlp/termination_strategy/convergence_information.cu @@ -5,12 +5,16 @@ */ /* clang-format on */ +#include +#include #include #include #include #include #include +#include + #include #include @@ -208,6 +212,77 @@ void convergence_information_t::init_l2_norms() } } +template +void convergence_information_t::compute_owned_reference_norm_partials(i_t owned_var_size, + i_t owned_cstr_size) +{ + cuopt_assert(!batch_mode_, "owned reference-norm partials only used in non-batch mGPU mode"); + cuopt_assert(owned_var_size <= primal_size_h_, "owned_var_size must be <= primal_size_h_"); + cuopt_assert(owned_cstr_size <= dual_size_h_, "owned_cstr_size must be <= dual_size_h_"); + + // Σ objective[0:owned_var]² + RAFT_CUBLAS_TRY(raft::linalg::detail::cublasdot(handle_ptr_->get_cublas_handle(), + static_cast(owned_var_size), + problem_ptr->objective_coefficients.data(), + 1, + problem_ptr->objective_coefficients.data(), + 1, + l2_norm_primal_linear_objective_.data(), + stream_view_)); + + // rhs_sum_of_squares(lower[0:owned_cstr], upper[0:owned_cstr]) (no sqrt) + { + rmm::device_buffer d_temp_storage; + size_t bytes = 0; + auto zip_begin = thrust::make_zip_iterator(problem_ptr->constraint_lower_bounds.data(), + problem_ptr->constraint_upper_bounds.data()); + cub::DeviceReduce::TransformReduce(nullptr, + bytes, + zip_begin, + l2_norm_primal_right_hand_side_.data(), + static_cast(owned_cstr_size), + cuda::std::plus<>{}, + rhs_sum_of_squares_t{}, + f_t(0), + stream_view_); + d_temp_storage.resize(bytes, stream_view_); + cub::DeviceReduce::TransformReduce(d_temp_storage.data(), + bytes, + zip_begin, + l2_norm_primal_right_hand_side_.data(), + static_cast(owned_cstr_size), + cuda::std::plus<>{}, + rhs_sum_of_squares_t{}, + f_t(0), + stream_view_); + } + RAFT_CUDA_TRY(cudaStreamSynchronize(stream_view_)); +} + +template +void convergence_information_t::sqrt_reference_norms_inplace() +{ + cub::DeviceTransform::Transform(l2_norm_primal_linear_objective_.data(), + l2_norm_primal_linear_objective_.data(), + 1, + sqrt_func_t{}, + stream_view_); + cub::DeviceTransform::Transform(l2_norm_primal_right_hand_side_.data(), + l2_norm_primal_right_hand_side_.data(), + 1, + sqrt_func_t{}, + stream_view_); + // Broadcast slot [0] to all climbers (no-op outside batch mode). + thrust::fill(handle_ptr_->get_thrust_policy(), + l2_norm_primal_linear_objective_.begin(), + l2_norm_primal_linear_objective_.end(), + l2_norm_primal_linear_objective_.element(0, stream_view_)); + thrust::fill(handle_ptr_->get_thrust_policy(), + l2_norm_primal_right_hand_side_.begin(), + l2_norm_primal_right_hand_side_.end(), + l2_norm_primal_right_hand_side_.element(0, stream_view_)); +} + // --------------------------------------------------------------------------- // init_reduction_storage: allocate and size the temporary buffers used by // cub::DeviceReduce and cub::DeviceSegmentedReduce throughout solving. @@ -416,17 +491,88 @@ void convergence_information_t::compute_convergence_information( print("dual_slack", dual_slack); #endif - compute_primal_residual( - op_problem_cusparse_view_, current_pdhg_solver.get_dual_tmp_resource(), dual_iterate); - compute_primal_objective(primal_iterate); + if (current_pdhg_solver.is_multi_gpu()) { + auto* engine = current_pdhg_solver.get_mgpu_engine(); + cuopt_assert(engine != nullptr, + "mGPU branch reached but current_pdhg_solver has no engine (shard pdhg?)"); + cuopt_expects(!settings.per_constraint_residual, + error_type_t::ValidationError, + "per_constraint_residual is not yet supported in multi-GPU mode"); + + // Prepares halo values in potential_next_primal_solution + + engine->halo_exchange_var([](pdhg_solver_t& pdhg) -> rmm::device_uvector& { + return pdhg.get_potential_next_primal_solution(); + }); + + for (auto& shard : engine->shards) { + raft::device_setter guard(shard->device_id); + auto& sub_pdlp = *shard->sub_pdlp; + auto& sub_conv = sub_pdlp.get_current_termination_strategy().get_convergence_information(); + sub_conv.compute_primal_residual(sub_conv.op_problem_cusparse_view_, + sub_pdlp.pdhg_solver_.get_dual_tmp_resource(), + sub_pdlp.pdhg_solver_.get_potential_next_dual_solution()); + sub_conv.compute_primal_objective_owned_partial( + sub_pdlp.pdhg_solver_.get_potential_next_primal_solution(), + shard->rank_data.owned_var_size); + } + + // Reduce all primal objectives across shards + cuopt_assert(!batch_mode_, "multi-GPU PDLP is not supported in batch mode"); + engine->allreduce_sum_inplace([](pdlp_solver_t& sp) -> f_t* { + return sp.get_current_termination_strategy() + .get_convergence_information() + .get_primal_objective() + .data(); + }); + + // Get the reduced primal objective from the shard[0] (arbitrary) + // Sync shards with master stream to avoid race conditions + engine->sync_await_shards(stream_view_); + { + auto& s0 = *engine->shards[0]; + raft::device_setter guard(s0.device_id); + auto& s0_conv = s0.sub_pdlp->get_current_termination_strategy().get_convergence_information(); + raft::copy(primal_objective_.data(), s0_conv.get_primal_objective().data(), 1, stream_view_); + } + apply_primal_objective_scaling_and_offset(); + } else { + compute_primal_residual( + op_problem_cusparse_view_, current_pdhg_solver.get_dual_tmp_resource(), dual_iterate); + compute_primal_objective(primal_iterate); + } #ifdef CUPDLP_DEBUG_MODE print("Primal Residual", primal_residual_); #endif - if (!batch_mode_) + // L2 Norm + if (current_pdhg_solver.is_multi_gpu()) { + auto* engine = current_pdhg_solver.get_mgpu_engine(); + engine->distributed_l2_norm( + [](pdlp_solver_t& sp) -> rmm::device_uvector& { + return sp.get_current_termination_strategy().get_convergence_information().primal_residual_; + }, + [](pdlp_solver_t& sp) -> f_t* { + return sp.get_current_termination_strategy() + .get_convergence_information() + .l2_primal_residual_.data(); + }, + [](pdlp_shard_t& shard) -> i_t { return shard.rank_data.owned_cstr_size; }); + + // distributed L2 norm before copying scalar data out of shard 0. + engine->sync_await_shards(stream_view_); + auto& s0 = *engine->shards[0]; + raft::device_setter guard(s0.device_id); + raft::copy(l2_primal_residual_.data(), + s0.sub_pdlp->get_current_termination_strategy() + .get_convergence_information() + .l2_primal_residual_.data(), + 1, + stream_view_); + } else if (!batch_mode_) { my_l2_norm(primal_residual_, l2_primal_residual_, handle_ptr_); - else { + } else { segmented_sum_handler_.segmented_sum_helper( thrust::make_transform_iterator(primal_residual_.data(), power_two_func_t{}), l2_primal_residual_.data(), @@ -444,6 +590,7 @@ void convergence_information_t::compute_convergence_information( print("Absolute Primal Residual", l2_primal_residual_); #endif // If per_constraint_residual is false we still need to perform the l2 since it's used in kkt + // Not suported in mGPU if (settings.per_constraint_residual) { // Compute the linf of (residual_i - rel * b_i) if (settings.save_best_primal_so_far) { @@ -466,19 +613,108 @@ void convergence_information_t::compute_convergence_information( std::numeric_limits::lowest()); } - compute_dual_residual(op_problem_cusparse_view_, - current_pdhg_solver.get_primal_tmp_resource(), - primal_iterate, - dual_slack); - compute_dual_objective(dual_iterate, primal_iterate, dual_slack); + if (current_pdhg_solver.is_multi_gpu()) { + auto* engine = current_pdhg_solver.get_mgpu_engine(); + + // 1) Halo-exchange potential_next_dual_solution on every shard so the + // A_T_shard @ y SpMV inside compute_dual_residual reads correct values + // in the cstr halo region. The SpMV is driven through the eval view's + // cv.dual_solution descriptor, which (cuPDLPx, see + // cusparse_view.cu:931-937) is bound to _potential_next_dual -- not to + // current.dual_solution. So we must halo-exchange the same buffer. + engine->halo_exchange_cstr([](pdhg_solver_t& pdhg) -> rmm::device_uvector& { + return pdhg.get_potential_next_dual_solution(); + }); + + // 2-3) Per-shard: + // - compute_dual_residual: shard.dual_residual_ has owned-var entries + // correct, halo var entries garbage (their A_T row isn't on this + // shard). + // - compute_dual_objective_owned_partial: writes a *partial* + // dot(slack[0:nv], x[0:nv]) + Σ primal_slack[0:nc] into + // shard.dual_objective_, with NO scaling/offset. Relies on + // primal_slack_ already populated by the per-shard + // compute_primal_residual above. + // + // Same primal_iterate fix as the primal block above: use the shard's + // (fresh, unscaled) potential_next_primal_solution, matching single-GPU + // cuPDLPx (pdlp.cu:1190-1203). The previous code's get_primal_solution() + // would mix scaled x with unscaled dual_slack in the dual_objective + // cublasdot. + for (auto& shard : engine->shards) { + raft::device_setter guard(shard->device_id); + auto& sub_pdlp = *shard->sub_pdlp; + auto& sub_conv = sub_pdlp.get_current_termination_strategy().get_convergence_information(); + sub_conv.compute_dual_residual(sub_conv.op_problem_cusparse_view_, + sub_pdlp.pdhg_solver_.get_primal_tmp_resource(), + sub_pdlp.pdhg_solver_.get_potential_next_primal_solution(), + sub_pdlp.pdhg_solver_.get_dual_slack()); + sub_conv.compute_dual_objective_owned_partial( + sub_pdlp.pdhg_solver_.get_potential_next_primal_solution(), + sub_pdlp.pdhg_solver_.get_dual_slack(), + shard->rank_data.owned_var_size, + shard->rank_data.owned_cstr_size); + } + + // 4) Allreduce dual_objective_ across shards (sum, in place). Same + // offset/scaling-after-allreduce reasoning as primal: applying offset + // per-shard would over-count it Nshards times. + engine->allreduce_sum_inplace([](pdlp_solver_t& sp) -> f_t* { + return sp.get_current_termination_strategy() + .get_convergence_information() + .get_dual_objective() + .data(); + }); + + // Sync shards with master stream to avoid race conditions + engine->sync_await_shards(stream_view_); + { + auto& s0 = *engine->shards[0]; + raft::device_setter guard(s0.device_id); + auto& s0_conv = s0.sub_pdlp->get_current_termination_strategy().get_convergence_information(); + raft::copy(dual_objective_.data(), s0_conv.get_dual_objective().data(), 1, stream_view_); + } + apply_dual_objective_scaling_and_offset(); + } else { + compute_dual_residual(op_problem_cusparse_view_, + current_pdhg_solver.get_primal_tmp_resource(), + primal_iterate, + dual_slack); + compute_dual_objective(dual_iterate, primal_iterate, dual_slack); + } #ifdef CUPDLP_DEBUG_MODE print("Dual Residual", dual_residual_); #endif - if (!batch_mode_) + if (current_pdhg_solver.is_multi_gpu()) { + // Multi-GPU dual residual L2 norm: same pattern as the primal L2 above, + // but the dual residual is var-shaped so we clip to owned_var_size. + auto* engine = current_pdhg_solver.get_mgpu_engine(); + engine->distributed_l2_norm( + [](pdlp_solver_t& sp) -> rmm::device_uvector& { + return sp.get_current_termination_strategy().get_convergence_information().dual_residual_; + }, + [](pdlp_solver_t& sp) -> f_t* { + return sp.get_current_termination_strategy() + .get_convergence_information() + .l2_dual_residual_.data(); + }, + [](pdlp_shard_t& shard) -> i_t { return shard.rank_data.owned_var_size; }); + + // distributed L2 norm before copying scalar data out of shard 0. + engine->sync_await_shards(stream_view_); + auto& s0 = *engine->shards[0]; + raft::device_setter guard(s0.device_id); + raft::copy(l2_dual_residual_.data(), + s0.sub_pdlp->get_current_termination_strategy() + .get_convergence_information() + .l2_dual_residual_.data(), + 1, + stream_view_); + } else if (!batch_mode_) { my_l2_norm(dual_residual_, l2_dual_residual_, handle_ptr_); - else { + } else { segmented_sum_handler_.segmented_sum_helper( thrust::make_transform_iterator(dual_residual_.data(), power_two_func_t{}), l2_dual_residual_.data(), @@ -509,6 +745,8 @@ void convergence_information_t::compute_convergence_information( std::numeric_limits::lowest()); } + // In mGPU, full primal_objective and dual_objective already mirrored to master so no special + // behaviour const auto [grid_size, block_size] = kernel_config_from_batch_size(climber_strategies_.size()); compute_remaining_stats_kernel <<>>(this->view(), climber_strategies_.size()); @@ -615,6 +853,24 @@ __global__ void apply_objective_scaling_and_offset(raft::device_span object objective[idx] = objective_scaling_factor * (objective[idx] + objective_offsets[idx]); } +template +void convergence_information_t::compute_primal_objective_owned_partial( + rmm::device_uvector& primal_solution, i_t n_owned) +{ + raft::common::nvtx::range fun_scope("compute_primal_objective_owned_partial"); + cuopt_assert(!batch_mode_, "owned-partial primal objective is only used in non-batch mGPU mode"); + cuopt_assert(n_owned <= primal_size_h_, + "n_owned must be <= primal_size_h_ (owned slice is a prefix)"); + RAFT_CUBLAS_TRY(raft::linalg::detail::cublasdot(handle_ptr_->get_cublas_handle(), + static_cast(n_owned), + primal_solution.data(), + primal_stride, + problem_ptr->objective_coefficients.data(), + primal_stride, + primal_objective_.data(), + stream_view_)); +} + template void convergence_information_t::compute_primal_objective( rmm::device_uvector& primal_solution) @@ -643,21 +899,25 @@ void convergence_information_t::compute_primal_objective( // Apply per-climber objective scaling and offset. objective_offsets_ is always populated // (defaults to the scalar problem offset replicated, or user-specified per-climber offsets). - { - const auto [grid_size, block_size] = kernel_config_from_batch_size(climber_strategies_.size()); - apply_objective_scaling_and_offset<<>>( - make_span(primal_objective_), - problem_ptr->presolve_data.objective_scaling_factor, - make_span(objective_offsets_), - climber_strategies_.size()); - RAFT_CUDA_TRY(cudaPeekAtLastError()); - } + apply_primal_objective_scaling_and_offset(); #ifdef CUPDLP_DEBUG_MODE print("Primal objective", primal_objective_); #endif } +template +void convergence_information_t::apply_primal_objective_scaling_and_offset() +{ + const auto [grid_size, block_size] = kernel_config_from_batch_size(climber_strategies_.size()); + apply_objective_scaling_and_offset<<>>( + make_span(primal_objective_), + problem_ptr->presolve_data.objective_scaling_factor, + make_span(objective_offsets_), + climber_strategies_.size()); + RAFT_CUDA_TRY(cudaPeekAtLastError()); +} + template void convergence_information_t::compute_dual_residual( cusparse_view_t& cusparse_view, @@ -740,6 +1000,50 @@ void convergence_information_t::compute_dual_residual( } } +template +void convergence_information_t::compute_dual_objective_owned_partial( + rmm::device_uvector& primal_solution, + rmm::device_uvector& dual_slack, + i_t n_owned_var, + i_t n_owned_cstr) +{ + raft::common::nvtx::range fun_scope("compute_dual_objective_owned_partial"); + cuopt_assert(!batch_mode_, "owned-partial dual objective is only used in non-batch mGPU mode"); + cuopt_assert(hyper_params_.use_reflected_primal_dual, + "owned-partial dual objective requires use_reflected_primal_dual"); + cuopt_assert(n_owned_var <= primal_size_h_, + "n_owned_var must be <= primal_size_h_ (owned slice is a prefix)"); + cuopt_assert(n_owned_cstr <= dual_size_h_, + "n_owned_cstr must be <= dual_size_h_ (owned slice is a prefix)"); + + // dual_dot_ = dot(dual_slack[0:n_owned_var], primal_solution[0:n_owned_var]) + RAFT_CUBLAS_TRY(raft::linalg::detail::cublasdot(handle_ptr_->get_cublas_handle(), + static_cast(n_owned_var), + dual_slack.data(), + primal_stride, + primal_solution.data(), + primal_stride, + dual_dot_.data(), + stream_view_)); + + // sum_primal_slack_ = Σ primal_slack_[0:n_owned_cstr] + // primal_slack_ is assumed populated for owned cstrs by a prior + // compute_primal_residual call on this same shard. + cub::DeviceReduce::Sum(rmm_tmp_buffer_.data(), + size_of_buffer_, + primal_slack_.data(), + sum_primal_slack_.data(), + static_cast(n_owned_cstr), + stream_view_); + + // dual_objective_ = dual_dot_ + sum_primal_slack_ (still a partial sum). + cub::DeviceTransform::Transform(cuda::std::make_tuple(dual_dot_.data(), sum_primal_slack_.data()), + dual_objective_.data(), + 1, + cuda::std::plus<>{}, + stream_view_); +} + template void convergence_information_t::compute_dual_objective( rmm::device_uvector& dual_solution, @@ -821,21 +1125,25 @@ void convergence_information_t::compute_dual_objective( } // Apply per-climber objective scaling and offset. - { - const auto [grid_size, block_size] = kernel_config_from_batch_size(climber_strategies_.size()); - apply_objective_scaling_and_offset<<>>( - make_span(dual_objective_), - problem_ptr->presolve_data.objective_scaling_factor, - make_span(objective_offsets_), - climber_strategies_.size()); - RAFT_CUDA_TRY(cudaPeekAtLastError()); - } + apply_dual_objective_scaling_and_offset(); #ifdef CUPDLP_DEBUG_MODE print("Dual objective", dual_objective_); #endif } +template +void convergence_information_t::apply_dual_objective_scaling_and_offset() +{ + const auto [grid_size, block_size] = kernel_config_from_batch_size(climber_strategies_.size()); + apply_objective_scaling_and_offset<<>>( + make_span(dual_objective_), + problem_ptr->presolve_data.objective_scaling_factor, + make_span(objective_offsets_), + climber_strategies_.size()); + RAFT_CUDA_TRY(cudaPeekAtLastError()); +} + template void convergence_information_t::compute_reduced_cost_from_primal_gradient( const rmm::device_uvector& primal_gradient, const rmm::device_uvector& primal_solution) @@ -916,12 +1224,24 @@ const rmm::device_uvector& convergence_information_t::get_primal_ return primal_objective_; } +template +rmm::device_uvector& convergence_information_t::get_primal_objective() +{ + return primal_objective_; +} + template const rmm::device_uvector& convergence_information_t::get_dual_objective() const { return dual_objective_; } +template +rmm::device_uvector& convergence_information_t::get_dual_objective() +{ + return dual_objective_; +} + template const rmm::device_uvector& convergence_information_t::get_l2_dual_residual() const { diff --git a/cpp/src/pdlp/termination_strategy/convergence_information.hpp b/cpp/src/pdlp/termination_strategy/convergence_information.hpp index 2389a60fae..7ff45e46f0 100644 --- a/cpp/src/pdlp/termination_strategy/convergence_information.hpp +++ b/cpp/src/pdlp/termination_strategy/convergence_information.hpp @@ -52,7 +52,10 @@ class convergence_information_t { // Needed for kkt restart & debug prints const rmm::device_uvector& get_primal_objective() const; + // Non-const overload used by the multi-GPU branch to mirror / allreduce. + rmm::device_uvector& get_primal_objective(); const rmm::device_uvector& get_dual_objective() const; + rmm::device_uvector& get_dual_objective(); const rmm::device_uvector& get_l2_primal_residual() const; const rmm::device_uvector& get_l2_dual_residual() const; const rmm::device_uvector& get_relative_linf_primal_residual() const; @@ -66,6 +69,11 @@ class convergence_information_t { const rmm::device_uvector& get_l2_norm_primal_linear_objective() const; const rmm::device_uvector& get_l2_norm_primal_right_hand_side() const; + void compute_owned_reference_norm_partials(i_t owned_var_size, i_t owned_cstr_size); + void sqrt_reference_norms_inplace(); + f_t* l2_norm_primal_right_hand_side_data() { return l2_norm_primal_right_hand_side_.data(); } + f_t* l2_norm_primal_linear_objective_data() { return l2_norm_primal_linear_objective_.data(); } + struct view_t { i_t primal_size; i_t dual_size; @@ -123,12 +131,40 @@ class convergence_information_t { rmm::device_uvector& tmp_dual, [[maybe_unused]] const rmm::device_uvector& dual_iterate); + // Multi-GPU shard helper: writes a partial dot(c[0:n_owned], x[0:n_owned]) + // into primal_objective_ (no scaling, no offset). Master is responsible for + // allreduce SUM across shards and then applying scaling + offset once on the + // reduced value. n_owned must be <= primal_size_h_; pass owned_var_size on + // each shard. + void compute_primal_objective_owned_partial(rmm::device_uvector& primal_solution, + i_t n_owned); + + // Multi-GPU shard helper: writes a partial dual objective into + // dual_objective_ (no scaling, no offset). Computes + // dual_dot_ = dot(dual_slack[0:n_owned_var], primal_solution[0:n_owned_var]) + // sum_primal_slack_ = Σ primal_slack_[0:n_owned_cstr] + // dual_objective_ = dual_dot_ + sum_primal_slack_ + // primal_slack_ is assumed already populated by a prior per-shard + // compute_primal_residual call. Use only in the use_reflected_primal_dual + // path (the multi-GPU mode). + void compute_dual_objective_owned_partial(rmm::device_uvector& primal_solution, + rmm::device_uvector& dual_slack, + i_t n_owned_var, + i_t n_owned_cstr); + void swap_context(const thrust::universal_host_pinned_vector>& swap_pairs); void resize_context(i_t new_size); private: void compute_primal_objective(rmm::device_uvector& primal_solution); + // Applies per-climber objective scaling + offset to primal_objective_. + // Single-GPU path: called from compute_primal_objective right after the dot. + // Multi-GPU path: called on master once after allreduce of partial sums. + void apply_primal_objective_scaling_and_offset(); + // Same as above but for dual_objective_. + void apply_dual_objective_scaling_and_offset(); + void compute_dual_residual(cusparse_view_t& cusparse_view, rmm::device_uvector& tmp_primal, rmm::device_uvector& primal_solution, diff --git a/cpp/src/pdlp/termination_strategy/termination_strategy.cu b/cpp/src/pdlp/termination_strategy/termination_strategy.cu index d1a88799d6..0320b420a8 100644 --- a/cpp/src/pdlp/termination_strategy/termination_strategy.cu +++ b/cpp/src/pdlp/termination_strategy/termination_strategy.cu @@ -195,6 +195,13 @@ pdlp_termination_strategy_t::get_convergence_information() const return convergence_information_; } +template +convergence_information_t& +pdlp_termination_strategy_t::get_convergence_information() +{ + return convergence_information_; +} + template const infeasibility_information_t& pdlp_termination_strategy_t::get_infeasibility_information() const diff --git a/cpp/src/pdlp/termination_strategy/termination_strategy.hpp b/cpp/src/pdlp/termination_strategy/termination_strategy.hpp index 5cd43d7be7..63b2e81ff4 100644 --- a/cpp/src/pdlp/termination_strategy/termination_strategy.hpp +++ b/cpp/src/pdlp/termination_strategy/termination_strategy.hpp @@ -187,6 +187,7 @@ class pdlp_termination_strategy_t { i_t get_optimal_solution_id() const; const convergence_information_t& get_convergence_information() const; + convergence_information_t& get_convergence_information(); const infeasibility_information_t& get_infeasibility_information() const; // Deep copy is used when save best primal so far is toggled diff --git a/cpp/src/pdlp/utilities/mgpu_trace.cuh b/cpp/src/pdlp/utilities/mgpu_trace.cuh new file mode 100644 index 0000000000..d9975d3202 --- /dev/null +++ b/cpp/src/pdlp/utilities/mgpu_trace.cuh @@ -0,0 +1,52 @@ +/* clang-format off */ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + */ +/* clang-format on */ +#pragma once + +// Lightweight env-gated tracing for multi-GPU PDLP diagnosis. +// +// Enable by setting CUOPT_MGPU_TRACE=1 in the environment. +// All prints go to stderr (line-buffered + explicit flush) so they survive +// a CUDA hang and interleave with cuOpt's normal output. +// +// Usage: +// MGPU_TRACE("entering compute_At_y"); +// MGPU_TRACE_FMT("shard %d nnz=%lld", r, (long long)nnz); +// +// The guard reads the env var once on first use (thread-safe via static +// initialization) and the cost when disabled is a single load + branch. + +#include +#include + +namespace cuopt::linear_programming::detail { + +inline bool mgpu_trace_enabled() +{ + static const bool enabled = []() { + const char* v = std::getenv("CUOPT_MGPU_TRACE"); + return v != nullptr && v[0] != '\0' && v[0] != '0'; + }(); + return enabled; +} + +} // namespace cuopt::linear_programming::detail + +#define MGPU_TRACE(msg) \ + do { \ + if (::cuopt::linear_programming::detail::mgpu_trace_enabled()) { \ + std::fprintf(stderr, "[mgpu %s:%d] %s\n", __func__, __LINE__, (msg)); \ + std::fflush(stderr); \ + } \ + } while (0) + +#define MGPU_TRACE_FMT(fmt, ...) \ + do { \ + if (::cuopt::linear_programming::detail::mgpu_trace_enabled()) { \ + std::fprintf(stderr, "[mgpu %s:%d] " fmt "\n", __func__, __LINE__, __VA_ARGS__); \ + std::fflush(stderr); \ + } \ + } while (0) diff --git a/cpp/src/pdlp/utilities/ping_pong_graph.cuh b/cpp/src/pdlp/utilities/ping_pong_graph.cuh index dbc8fe5828..6b527f81b2 100644 --- a/cpp/src/pdlp/utilities/ping_pong_graph.cuh +++ b/cpp/src/pdlp/utilities/ping_pong_graph.cuh @@ -12,10 +12,25 @@ #include +#include #include namespace cuopt::linear_programming::detail { +// Debug/diagnostic toggle: when set, ping_pong_graph_t::run() bypasses CUDA +// graph capture and executes its work eagerly on every iteration. Useful for +// for debugging +inline std::atomic& pdlp_graph_disabled_flag() +{ + static std::atomic s_flag{false}; + return s_flag; +} + +inline bool pdlp_graph_disabled() +{ + return pdlp_graph_disabled_flag().load(std::memory_order_relaxed); +} + // Two-slot CUDA-graph cache for PDLP. PDLP swaps pointers (rather than // copying vectors) at the end of adaptive pdhg step, so the captured graph // topology alternates between two layouts depending on iteration parity. @@ -49,7 +64,7 @@ class ping_pong_graph_t { #ifdef CUPDLP_DEBUG_MODE work(); #else - if (is_legacy_batch_mode_) { + if (is_legacy_batch_mode_ || pdlp_graph_disabled()) { work(); return; } diff --git a/cpp/tests/linear_programming/pdlp_test.cu b/cpp/tests/linear_programming/pdlp_test.cu index d29995efc5..d17cf2af6f 100644 --- a/cpp/tests/linear_programming/pdlp_test.cu +++ b/cpp/tests/linear_programming/pdlp_test.cu @@ -8,12 +8,16 @@ #include #include #include +#include +#include #include #include #include #include #include +#include + #include "utilities/pdlp_test_utilities.cuh" #include "../mip/mip_utils.cuh" @@ -45,11 +49,13 @@ #include #include +#include #include #include #include #include #include +#include #include #include #include @@ -91,6 +97,166 @@ TEST(pdlp_class, run_double) afiro_primal_objective, solution.get_additional_termination_information().primal_objective)); } +// Distributed-PDLP partition round-trip: partition the afiro constraint/variable +// bipartite graph with METIS, write it out, read it back, and confirm the parsed +// vector is identical to what the partitioner produced. +TEST(pdlp_class, distributed_partition_metis_export_import_roundtrip) +{ + using namespace cuopt::linear_programming::detail; + namespace ds = cuopt::linear_programming::dual_simplex; + + auto path = make_path_absolute("linear_programming/afiro_original.mps"); + cuopt::linear_programming::io::mps_data_model_t mps = + cuopt::linear_programming::io::parse_mps(path, true); + + const int n_vars = static_cast(mps.get_objective_coefficients().size()); + const int n_cstr = static_cast(mps.get_constraint_lower_bounds().size()); + const int nnz = static_cast(mps.get_constraint_matrix_values().size()); + + std::vector h_A_row_offsets = mps.get_constraint_matrix_offsets(); + std::vector h_A_col_indices = mps.get_constraint_matrix_indices(); + std::vector h_A_values = mps.get_constraint_matrix_values(); + + // Transpose A -> A^T (CSR of A^T == CSC of A), mirroring solve_lp_distributed_from_mps. + ds::csr_matrix_t A_csr(n_cstr, n_vars, nnz); + A_csr.row_start = h_A_row_offsets; + A_csr.j = h_A_col_indices; + A_csr.x = h_A_values; + ds::csc_matrix_t AT_as_csc(n_vars, n_cstr, nnz); + A_csr.to_compressed_col(AT_as_csc); + std::vector h_A_t_row_offsets = AT_as_csc.col_start; + std::vector h_A_t_col_indices = AT_as_csc.i; + + partitioner_input_t input; + input.nb_cstr = n_cstr; + input.nb_vars = n_vars; + input.nb_parts = 2; + input.A.row_offsets = &h_A_row_offsets; + input.A.col_indices = &h_A_col_indices; + input.A.num_rows = n_cstr; + input.A.num_cols = n_vars; + input.A_t.row_offsets = &h_A_t_row_offsets; + input.A_t.col_indices = &h_A_t_col_indices; + input.A_t.num_rows = n_vars; + input.A_t.num_cols = n_cstr; + + auto partitioner = make_partitioner(partitioner_kind_t::Metis); + std::vector parts = partitioner->partition(input); + ASSERT_EQ(parts.size(), static_cast(n_cstr + n_vars)); + + std::string dir = ::testing::TempDir(); + if (!dir.empty() && dir.back() != '/') { dir.push_back('/'); } + const std::string out_path = dir + "afiro_metis_roundtrip.parts"; + + partition_loader_t::export_distributed_pdlp_partition_file(out_path, parts); + std::vector reloaded = + partition_loader_t::parse_distributed_pdlp_partition_file(out_path); + + EXPECT_EQ(parts, reloaded); +} + +namespace { + +// Solve `mps_rel_path` with the single-GPU PDLP ("base") and with distributed PDLP +// (num_gpus = -1 => auto-detect; 1 GPU is fine), then assert the distributed run +// matches the base run on everything meaningful: termination status, step count +// (within 15%), primal/dual objective, and the full primal/dual solution vectors. +// All value comparisons use a loose relative tolerance. +void expect_distributed_matches_base(raft::handle_t const& handle, + std::string const& mps_rel_path, + bool fixed_mps_format = false) +{ + constexpr double loose_rel = 1e-3; + auto near_rel = [](double a, double b, double rel) { + return std::fabs(a - b) <= rel * (1.0 + std::fabs(a)); + }; + + auto path = make_path_absolute(mps_rel_path); + io::mps_data_model_t problem = io::parse_mps(path, fixed_mps_format); + + // Shared settings: PDLP, no presolve (distributed requires presolver == None, so the + // base run must match to keep the two problems identical). + pdlp_solver_settings_t base_settings{}; + base_settings.method = method_t::PDLP; + base_settings.presolver = presolver_t::None; + + // ----- base: single-GPU PDLP (materialize the full problem on one GPU) ----- + auto base_op = mps_data_model_to_optimization_problem(&handle, problem); + auto base = solve_lp(base_op, base_settings); + + // ----- distributed PDLP (identical settings, only the distributed flags flipped) ----- + pdlp_solver_settings_t dist_settings = base_settings; + dist_settings.hyper_params.use_distributed_pdlp = true; + dist_settings.distributed_pdlp_num_gpus = -1; + auto dist = solve_lp(&handle, problem, dist_settings); + + // ----- termination status ----- + ASSERT_EQ(static_cast(base.get_termination_status()), CUOPT_TERMINATION_STATUS_OPTIMAL) + << mps_rel_path << ": base did not reach optimal"; + EXPECT_EQ(static_cast(dist.get_termination_status()), + static_cast(base.get_termination_status())) + << mps_rel_path << ": distributed termination status differs from base"; + + const auto& base_info = base.get_additional_termination_information(); + const auto& dist_info = dist.get_additional_termination_information(); + + // ----- objectives ----- + EXPECT_TRUE(near_rel(base_info.primal_objective, dist_info.primal_objective, loose_rel)) + << mps_rel_path << ": primal objective base=" << base_info.primal_objective + << " distributed=" << dist_info.primal_objective; + EXPECT_TRUE(near_rel(base_info.dual_objective, dist_info.dual_objective, loose_rel)) + << mps_rel_path << ": dual objective base=" << base_info.dual_objective + << " distributed=" << dist_info.dual_objective; + + // ----- step count: within 15% of the larger of the two ----- + const int base_steps = base_info.number_of_steps_taken; + const int dist_steps = dist_info.number_of_steps_taken; + const int max_steps = std::max(base_steps, dist_steps); + const int step_diff = std::max(base_steps, dist_steps) - std::min(base_steps, dist_steps); + EXPECT_LE(static_cast(step_diff), 0.15 * max_steps) + << mps_rel_path << ": step counts differ by >15% (base=" << base_steps + << ", distributed=" << dist_steps << ")"; + + // ----- primal / dual solution vectors ----- + auto base_primal = cuopt::host_copy(base.get_primal_solution(), handle.get_stream()); + auto dist_primal = cuopt::host_copy(dist.get_primal_solution(), handle.get_stream()); + ASSERT_EQ(base_primal.size(), dist_primal.size()) << mps_rel_path << ": primal size mismatch"; + for (std::size_t i = 0; i < base_primal.size(); ++i) { + EXPECT_TRUE(near_rel(base_primal[i], dist_primal[i], loose_rel)) + << mps_rel_path << ": primal[" << i << "] base=" << base_primal[i] + << " distributed=" << dist_primal[i]; + } + + auto base_dual = cuopt::host_copy(base.get_dual_solution(), handle.get_stream()); + auto dist_dual = cuopt::host_copy(dist.get_dual_solution(), handle.get_stream()); + ASSERT_EQ(base_dual.size(), dist_dual.size()) << mps_rel_path << ": dual size mismatch"; + for (std::size_t i = 0; i < base_dual.size(); ++i) { + EXPECT_TRUE(near_rel(base_dual[i], dist_dual[i], loose_rel)) + << mps_rel_path << ": dual[" << i << "] base=" << base_dual[i] + << " distributed=" << dist_dual[i]; + } +} + +} // namespace + +TEST(pdlp_class, distributed_parity_afiro) +{ + const raft::handle_t handle{}; + expect_distributed_matches_base(handle, "linear_programming/afiro_original.mps", true); +} + +TEST(pdlp_class, distributed_parity_square41) +{ + const raft::handle_t handle{}; + expect_distributed_matches_base(handle, "linear_programming/neos3/neos3.mps"); +} + +TEST(pdlp_class, distributed_parity_a2864) +{ + const raft::handle_t handle{}; + expect_distributed_matches_base(handle, "linear_programming/a2864/a2864.mps"); +} + TEST(pdlp_class, precision_mixed) { using namespace cuopt::linear_programming::detail;