NVIDIA · Bubullzz · May 7, 2026 · May 7, 2026 · May 7, 2026 · May 10, 2026
@@ -314,6 +314,81 @@ create_logger_macros(CUOPT "cuopt::default_logger()" include/cuopt)
 
 find_package(CUDSS REQUIRED)
 
+# ##################################################################################################
+# - NCCL (multi-GPU distributed PDLP) -------------------------------------------------------------
+# The conda-provided NCCL has no canonical CMake config target, so the library and header are
+# looked up by name in the standard search paths, with CONDA_PREFIX (when set) added as a hint.
+set(NCCL_HINT_PREFIXES "")
+if (NOT "$ENV{CONDA_PREFIX}" STREQUAL "")
+    list(APPEND NCCL_HINT_PREFIXES "$ENV{CONDA_PREFIX}")
+endif ()
+find_library(NCCL_LIBRARY
+    NAMES nccl
+    HINTS ${NCCL_HINT_PREFIXES}
+    PATH_SUFFIXES lib lib64
+)
+find_path(NCCL_INCLUDE_DIR
+    NAMES nccl.h
+    HINTS ${NCCL_HINT_PREFIXES}
+    PATH_SUFFIXES include
+)
+# Both the header directory and the library are mandatory; stop configuring if either is missing.
+if (NOT (NCCL_INCLUDE_DIR AND NCCL_LIBRARY))
+    message(FATAL_ERROR "NCCL not found. Looked in ${NCCL_HINT_PREFIXES}. Install nccl-dev / libnccl-dev in the active env.")
+endif ()
+add_library(nccl_external UNKNOWN IMPORTED GLOBAL)
+# The imported target carries the library location plus its include directory for consumers.
+set_property(TARGET nccl_external PROPERTY IMPORTED_LOCATION "${NCCL_LIBRARY}")
+set_property(TARGET nccl_external PROPERTY INTERFACE_INCLUDE_DIRECTORIES "${NCCL_INCLUDE_DIR}")
+message(STATUS "Using NCCL: ${NCCL_LIBRARY}")
+
+# ##################################################################################################
+# - METIS (graph partitioning for distributed PDLP) -----------------------------------------------
+# Hint order: CUOPT_METIS_ROOT (CMake variable, then environment variable) comes first so that an
+# explicitly requested prefix is not shadowed by a METIS in the active env; CONDA_PREFIX is last.
+set(METIS_HINT_PREFIXES "")
+if (DEFINED CUOPT_METIS_ROOT AND NOT "${CUOPT_METIS_ROOT}" STREQUAL "")
+    list(APPEND METIS_HINT_PREFIXES "${CUOPT_METIS_ROOT}")
+endif ()
+if (DEFINED ENV{CUOPT_METIS_ROOT} AND NOT "$ENV{CUOPT_METIS_ROOT}" STREQUAL "")
+    list(APPEND METIS_HINT_PREFIXES "$ENV{CUOPT_METIS_ROOT}")
+endif ()
+if (DEFINED ENV{CONDA_PREFIX} AND NOT "$ENV{CONDA_PREFIX}" STREQUAL "")
+    list(APPEND METIS_HINT_PREFIXES "$ENV{CONDA_PREFIX}")
+endif ()
+find_path(METIS_INCLUDE_DIR
+    NAMES metis.h
+    HINTS ${METIS_HINT_PREFIXES}
+    PATH_SUFFIXES include
+)
+find_library(METIS_LIBRARY
+    NAMES metis libmetis
+    HINTS ${METIS_HINT_PREFIXES}
+    PATH_SUFFIXES lib lib64
+)
+if (NOT METIS_INCLUDE_DIR OR NOT METIS_LIBRARY)
+    message(FATAL_ERROR "METIS not found. Looked in: ${METIS_HINT_PREFIXES}. "
+                        "Install it via 'conda install -c conda-forge metis' in the active env, "
+                        "or set CUOPT_METIS_ROOT to a prefix containing include/metis.h and lib/libmetis.{so,a}.")
+endif ()
+add_library(metis_external UNKNOWN IMPORTED GLOBAL)
+set_target_properties(metis_external PROPERTIES
+    IMPORTED_LOCATION "${METIS_LIBRARY}"
+    INTERFACE_INCLUDE_DIRECTORIES "${METIS_INCLUDE_DIR}"
+)
+message(STATUS "Using METIS: ${METIS_LIBRARY}")
+
+# ##################################################################################################
+# - KaMinPar (multi-threaded partitioning for distributed PDLP) ------------------------------------
+# get_kaminpar.cmake goes through rapids_cpm_find: an installed KaMinPar (deb/rpm/conda, located
+# via its CMake config) is used if present, otherwise the pinned source is built via CPM.
+include(cmake/thirdparty/get_kaminpar.cmake)
+if (TARGET KaMinPar::KaMinPar)
+    message(STATUS "Using KaMinPar (distributed PDLP prefers KaMinPar over METIS)")
+else ()
+    message(FATAL_ERROR "KaMinPar::KaMinPar was not made available by get_kaminpar.cmake")
+endif ()
+
 # ##################################################################################################
 # - gRPC and Protobuf setup -----------------------------------------------------------------------
 
@@ -576,6 +651,9 @@ target_link_libraries(cuopt
         ${CUDSS_LIB_FILE}
         PRIVATE
         ${CUOPT_PRIVATE_CUDA_LIBS}
+        nccl_external
+        metis_external
+        KaMinPar::KaMinPar
         $<$<BOOL:${CUOPT_ENABLE_GRPC}>:protobuf::libprotobuf>
         $<$<BOOL:${CUOPT_ENABLE_GRPC}>:gRPC::grpc++>
 )

@@ -0,0 +1,48 @@
+# cmake-format: off
+# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+# cmake-format: on
+
+# find_and_configure_kaminpar(VERSION <version> PINNED_TAG <git-tag>)
+# Provides KaMinPar::KaMinPar, the multi-threaded graph partitioner for distributed PDLP.
+# rapids_cpm_find uses a system / conda / .deb install of KaMinPar (found through its CMake
+# config package) when available; otherwise PINNED_TAG is cloned and built via CPM. KaMinPar
+# depends on TBB, which cuOpt already requires (see find_package(TBB) for papilo).
+function(find_and_configure_kaminpar)
+    cmake_parse_arguments(PKG "" "VERSION;PINNED_TAG" "" ${ARGN})
+    if ("${PKG_VERSION}" STREQUAL "" OR "${PKG_PINNED_TAG}" STREQUAL "")
+        message(FATAL_ERROR "find_and_configure_kaminpar requires VERSION and PINNED_TAG")
+    endif()
+    rapids_cpm_find(KaMinPar ${PKG_VERSION}
+        GLOBAL_TARGETS KaMinPar::KaMinPar
+        CPM_ARGS
+        GIT_REPOSITORY https://github.com/KaHIP/KaMinPar.git
+        GIT_TAG ${PKG_PINNED_TAG}
+        # EXCLUDE_FROM_ALL is a one-value CPM keyword: written bare it parses as "not set".
+        EXCLUDE_FROM_ALL TRUE
+        OPTIONS
+            "KAMINPAR_BUILD_APPS OFF"
+            "KAMINPAR_BUILD_TOOLS OFF"
+            "KAMINPAR_BUILD_TESTS OFF"
+            "KAMINPAR_BUILD_BENCHMARKS OFF"
+            "KAMINPAR_BUILD_EXAMPLES OFF"
+            "KAMINPAR_BUILD_DISTRIBUTED OFF"
+            # Timers use global state and force single-threaded use of the library interface.
+            "KAMINPAR_ENABLE_TIMERS OFF"
+            # Avoid an extra hard dependency on Google Sparsehash.
+            "KAMINPAR_BUILD_WITH_SPARSEHASH OFF"
+            # cuOpt's TBB is discovered via a legacy find that only exposes TBB::tbb
+            # (no TBB::tbbmalloc target); disable KaMinPar's optional tbbmalloc use.
+            "KAMINPAR_ENABLE_TBB_MALLOC OFF"
+            # Large LP constraint graphs can exceed 2^31 directed edges.
+            "KAMINPAR_64BIT_EDGE_IDS ON"
+            "INSTALL_KAMINPAR OFF"
+    )
+    if(KaMinPar_ADDED)
+        message(VERBOSE "CUOPT: Using KaMinPar located in ${KaMinPar_SOURCE_DIR}")
+    else()
+        message(VERBOSE "CUOPT: Using KaMinPar located in ${KaMinPar_DIR}")
+    endif()
+endfunction()
+
+find_and_configure_kaminpar(VERSION 3.7.3 PINNED_TAG v3.7.3)
@@ -176,7 +176,12 @@ int run_single_file(const std::string& file_path,
       auto solution = cuopt::linear_programming::solve_mip(problem_interface.get(), mip_settings);
     } else {
       auto& lp_settings = settings.get_pdlp_settings();
-      auto solution     = cuopt::linear_programming::solve_lp(problem_interface.get(), lp_settings);
+
+      if (lp_settings.hyper_params.use_distributed_pdlp) {
+        cuopt::linear_programming::solve_lp(handle_ptr.get(), mps_data_model, lp_settings);
+      } else {
+        cuopt::linear_programming::solve_lp(problem_interface.get(), lp_settings);
+      }
     }
   } catch (const std::exception& e) {
     fprintf(stderr, "cuopt_cli error: %s\n", e.what());
@@ -426,10 +431,21 @@ int main(int argc, char* argv[])
   std::vector<rmm::mr::cuda_async_memory_resource> memory_resources;
 
   if (memory_backend == cuopt::linear_programming::memory_backend_t::GPU) {
-    const int num_gpus = settings.get_parameter<int>(CUOPT_NUM_GPUS);
+    // Distributed PDLP scales one shard per GPU and uses its own knob; everything else
+    // (concurrent, batch, MIP) uses num_gpus which is capped at 2.
+    // For distributed PDLP, -1 means "auto-detect": resolve to the visible device
+    // count so the RMM memory pools match what solve.cu will eventually dispatch.
+    const bool use_distributed_pdlp = settings.get_parameter<bool>(CUOPT_USE_DISTRIBUTED_PDLP);
+    int requested_gpus              = use_distributed_pdlp
+                                        ? settings.get_parameter<int>(CUOPT_DISTRIBUTED_PDLP_NUM_GPUS)
+                                        : settings.get_parameter<int>(CUOPT_NUM_GPUS);
+    if (use_distributed_pdlp && requested_gpus == -1) {
+      requested_gpus = raft::device_setter::get_device_count();
+    }
+    const int provisioned_gpus = std::min(raft::device_setter::get_device_count(), requested_gpus);
 
-    memory_resources.reserve(std::min(raft::device_setter::get_device_count(), num_gpus));
-    for (int i = 0; i < std::min(raft::device_setter::get_device_count(), num_gpus); ++i) {
+    memory_resources.reserve(provisioned_gpus);
+    for (int i = 0; i < provisioned_gpus; ++i) {
       RAFT_CUDA_TRY(cudaSetDevice(i));
       memory_resources.emplace_back();
       rmm::mr::set_per_device_resource(rmm::cuda_device_id{i}, memory_resources.back());

@@ -80,14 +80,20 @@
 #define CUOPT_MIP_STRONG_BRANCHING_SIMPLEX_ITERATION_LIMIT \
   "mip_strong_branching_simplex_iteration_limit"
 
-#define CUOPT_SOLUTION_FILE            "solution_file"
-#define CUOPT_NUM_CPU_THREADS          "num_cpu_threads"
-#define CUOPT_NUM_GPUS                 "num_gpus"
-#define CUOPT_USER_PROBLEM_FILE        "user_problem_file"
-#define CUOPT_PRESOLVE_FILE            "presolve_file"
-#define CUOPT_RANDOM_SEED              "random_seed"
-#define CUOPT_PDLP_PRECISION           "pdlp_precision"
-#define CUOPT_MIP_SEMICONTINUOUS_BIG_M "mip_semi_continuous_big_m"
+#define CUOPT_SOLUTION_FILE                   "solution_file"
+#define CUOPT_NUM_CPU_THREADS                 "num_cpu_threads"
+#define CUOPT_NUM_GPUS                        "num_gpus"
+#define CUOPT_DISTRIBUTED_PDLP_NUM_GPUS       "distributed_pdlp_num_gpus"
+#define CUOPT_MULTI_GPU_PARTITION_FILE        "multi_gpu_partition_file"
+#define CUOPT_MULTI_GPU_EXPORT_PARTITION_FILE "multi_gpu_export_partition_file"
+#define CUOPT_DISTRIBUTED_PDLP_PARTITIONER    "distributed_pdlp_partitioner"
+#define CUOPT_USE_DISTRIBUTED_PDLP            "use_distributed_pdlp"
+#define CUOPT_PDLP_DISABLE_GRAPH              "pdlp_disable_graph"
+#define CUOPT_USER_PROBLEM_FILE               "user_problem_file"
+#define CUOPT_PRESOLVE_FILE                   "presolve_file"
+#define CUOPT_RANDOM_SEED                     "random_seed"
+#define CUOPT_PDLP_PRECISION                  "pdlp_precision"
+#define CUOPT_MIP_SEMICONTINUOUS_BIG_M        "mip_semi_continuous_big_m"
 
 #define CUOPT_MIP_HYPER_HEURISTIC_POPULATION_SIZE     "mip_hyper_heuristic_population_size"
 #define CUOPT_MIP_HYPER_HEURISTIC_NUM_CPUFJ_THREADS   "mip_hyper_heuristic_num_cpufj_threads"

@@ -47,12 +47,16 @@ struct pdlp_hyper_params_t {
   bool bound_objective_rescaling                                  = true;
   bool use_reflected_primal_dual                                  = true;
   bool use_fixed_point_error                                      = true;
-  double reflection_coefficient                                   = 1.0;
-  double restart_k_p                                              = 0.99;
-  double restart_k_i                                              = 0.01;
-  double restart_k_d                                              = 0.0;
-  double restart_i_smooth                                         = 0.3;
-  bool use_conditional_major                                      = true;
+  bool use_distributed_pdlp                                       = false;
+  // Debug/diagnostic knob: when true, PDLP bypasses CUDA-graph capture in
+  // ping_pong_graph_t and executes each iteration eagerly
+  bool pdlp_disable_graph       = false;
+  double reflection_coefficient = 1.0;
+  double restart_k_p            = 0.99;
+  double restart_k_i            = 0.01;
+  double restart_k_d            = 0.0;
+  double restart_i_smooth       = 0.3;
+  bool use_conditional_major    = true;
 };
 
 // TODO most likely we want to get rid of pdlp_solver_mode and just have prebuilt

@@ -307,6 +307,25 @@ class pdlp_solver_settings_t {
   presolver_t presolver{presolver_t::Default};
   bool dual_postsolve{true};
   int num_gpus{1};
+  // Number of GPUs to use specifically for distributed PDLP (use_distributed_pdlp=true).
+  // -1 means auto-detect
+  int distributed_pdlp_num_gpus{-1};
+  std::string multi_gpu_partition_file{""};
+  // If non-empty, the partition computed for distributed PDLP is written to this
+  // path (one part-id per line) right after partitioning. The file can be fed
+  // back via multi_gpu_partition_file. Exposed as the multi_gpu_export_partition_file
+  // parameter (CLI: --multi-gpu-export-partition-file <path>).
+  std::string multi_gpu_export_partition_file{""};
+  // Which graph partitioner distributed PDLP uses. One of:
+  //   "auto"     - 1 GPU => Dummy; otherwise KaMinPar
+  //   "dummy"    - round-robin, no graph (trivial)
+  //   "metis"    - serial METIS_PartGraphKway
+  //   "kaminpar" - multi-threaded KaMinPar
+  // Exposed as the distributed_pdlp_partitioner parameter
+  // (CLI: --distributed-pdlp-partitioner <auto|dummy|metis|kaminpar>).
+  std::string distributed_pdlp_partitioner{"auto"};
+  // Set to true inside the shards
+  bool is_distributed_sub_pdlp{false};
   method_t method{method_t::Concurrent};
   bool inside_mip{false};
   // For concurrent termination

@@ -140,6 +140,7 @@ solver_settings_t<i_t, f_t>::solver_settings_t() : pdlp_settings(), mip_settings
     {CUOPT_MIP_REDUCED_COST_STRENGTHENING, &mip_settings.reduced_cost_strengthening, -1, std::numeric_limits<i_t>::max(), -1},
     {CUOPT_NUM_GPUS, &pdlp_settings.num_gpus, 1, 2, 1},
     {CUOPT_NUM_GPUS, &mip_settings.num_gpus, 1, 2, 1},
+    {CUOPT_DISTRIBUTED_PDLP_NUM_GPUS, &pdlp_settings.distributed_pdlp_num_gpus, -1, 576, -1},
     {CUOPT_MIP_BATCH_PDLP_STRONG_BRANCHING, &mip_settings.mip_batch_pdlp_strong_branching, 0, 2, 0},
     {CUOPT_MIP_BATCH_PDLP_RELIABILITY_BRANCHING, &mip_settings.mip_batch_pdlp_reliability_branching, 0, 2, 0},
     {CUOPT_MIP_STRONG_BRANCHING_SIMPLEX_ITERATION_LIMIT, &mip_settings.strong_branching_simplex_iteration_limit, -1,std::numeric_limits<i_t>::max(), -1},
@@ -177,6 +178,8 @@ solver_settings_t<i_t, f_t>::solver_settings_t() : pdlp_settings(), mip_settings
     {CUOPT_DUAL_POSTSOLVE, &pdlp_settings.dual_postsolve, true},
     {CUOPT_BARRIER_ITERATIVE_REFINEMENT, &pdlp_settings.barrier_iterative_refinement, true},
     {CUOPT_MIP_PROBING, &mip_settings.probing, true},
+    {CUOPT_USE_DISTRIBUTED_PDLP, &pdlp_settings.hyper_params.use_distributed_pdlp, false},
+    {CUOPT_PDLP_DISABLE_GRAPH, &pdlp_settings.hyper_params.pdlp_disable_graph, false},
   };
   // String parameters
   string_parameters = {
@@ -187,7 +190,10 @@ solver_settings_t<i_t, f_t>::solver_settings_t() : pdlp_settings(), mip_settings
     {CUOPT_USER_PROBLEM_FILE, &mip_settings.user_problem_file, ""},
     {CUOPT_USER_PROBLEM_FILE, &pdlp_settings.user_problem_file, ""},
     {CUOPT_PRESOLVE_FILE, &mip_settings.presolve_file, ""},
-    {CUOPT_PRESOLVE_FILE, &pdlp_settings.presolve_file, ""}
+    {CUOPT_PRESOLVE_FILE, &pdlp_settings.presolve_file, ""},
+    {CUOPT_MULTI_GPU_PARTITION_FILE, &pdlp_settings.multi_gpu_partition_file, ""},
+    {CUOPT_MULTI_GPU_EXPORT_PARTITION_FILE, &pdlp_settings.multi_gpu_export_partition_file, ""},
+    {CUOPT_DISTRIBUTED_PDLP_PARTITIONER, &pdlp_settings.distributed_pdlp_partitioner, "auto"},
   };
   // clang-format on
 }

@@ -29,6 +29,12 @@ set(LP_CORE_FILES
   ${CMAKE_CURRENT_SOURCE_DIR}/termination_strategy/convergence_information.cu
   ${CMAKE_CURRENT_SOURCE_DIR}/optimal_batch_size_handler/optimal_batch_size_handler.cu
   ${CMAKE_CURRENT_SOURCE_DIR}/utilities/ping_pong_graph.cu
+  ${CMAKE_CURRENT_SOURCE_DIR}/distributed_pdlp/shard.cu
+  ${CMAKE_CURRENT_SOURCE_DIR}/distributed_pdlp/multi_gpu_engine.cu
+  ${CMAKE_CURRENT_SOURCE_DIR}/distributed_pdlp/partition_loader.cu
+  ${CMAKE_CURRENT_SOURCE_DIR}/distributed_pdlp/partitioner.cu
+  ${CMAKE_CURRENT_SOURCE_DIR}/distributed_pdlp/metis_partitioner.cu
+  ${CMAKE_CURRENT_SOURCE_DIR}/distributed_pdlp/kaminpar_partitioner.cpp
 )
 
 # C and Python adapter files

@@ -498,14 +498,17 @@ cusparse_view_t<i_t, f_t>::cusparse_view_t(
   // setup cusparse view
   A.create(op_problem_scaled.n_constraints,
            op_problem_scaled.n_variables,
-           op_problem_scaled.nnz,
+           static_cast<int64_t>(A_.size()),
            const_cast<i_t*>(op_problem_scaled.offsets.data()),
            const_cast<i_t*>(op_problem_scaled.variables.data()),
            const_cast<f_t*>(op_problem_scaled.coefficients.data()));
 
+  // In multi-GPU shards A_T can have a different nnz than A:
+  // A holds just what is needed to compute A_x for the owned constraints, while
+  // A_T holds just what is needed to compute A_T_y for the owned variables.
   A_T.create(op_problem_scaled.n_variables,
              op_problem_scaled.n_constraints,
-             op_problem_scaled.nnz,
+             static_cast<int64_t>(A_T_.size()),
              const_cast<i_t*>(A_T_offsets_.data()),
              const_cast<i_t*>(A_T_indices_.data()),
              const_cast<f_t*>(A_T_.data()));
@@ -914,14 +917,14 @@ cusparse_view_t<i_t, f_t>::cusparse_view_t(
   // setup cusparse view
   A.create(op_problem.n_constraints,
            op_problem.n_variables,
-           op_problem.nnz,
+           static_cast<int64_t>(A_.size()),
            const_cast<i_t*>(op_problem.offsets.data()),
            const_cast<i_t*>(op_problem.variables.data()),
            const_cast<f_t*>(op_problem.coefficients.data()));
 
   A_T.create(op_problem.n_variables,
              op_problem.n_constraints,
-             op_problem.nnz,
+             static_cast<int64_t>(A_T_.size()),
              const_cast<i_t*>(A_T_offsets_.data()),
              const_cast<i_t*>(A_T_indices_.data()),
              const_cast<f_t*>(A_T_.data()));
@@ -1129,16 +1132,18 @@ cusparse_view_t<i_t, f_t>::cusparse_view_t(
   // Copying them from the existing cuSparse view is a bad practice and creates segfault post
   // CUDA 12.4 Using the saved pointer of the existing cusparse view to make sure we capture the
   // correct pointer
+  // See comment in the PDHG cusparse_view_t ctor: bind the descriptor nnz to
+  // the actual value-buffer length so A and A_T stay symmetric and shard-safe.
   A.create(op_problem.n_constraints,
            op_problem.n_variables,
-           op_problem.nnz,
+           static_cast<int64_t>(A_.size()),
            const_cast<i_t*>(A_offsets_.data()),
            const_cast<i_t*>(A_indices_.data()),
            const_cast<f_t*>(A_.data()));
 
   A_T.create(op_problem.n_variables,
              op_problem.n_constraints,
-             op_problem.nnz,
+             static_cast<int64_t>(existing_cusparse_view.A_T_.size()),
              const_cast<i_t*>(existing_cusparse_view.A_T_offsets_.data()),
              const_cast<i_t*>(existing_cusparse_view.A_T_indices_.data()),
              const_cast<f_t*>(existing_cusparse_view.A_T_.data()));