-
Notifications
You must be signed in to change notification settings - Fork 191
CuD-PDLP #1391
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
CuD-PDLP #1391
Changes from all commits
1e0bd53
978d17b
dd0c0ef
2037eca
0f62eff
d89c85a
5534ff0
dd935c5
09eb20b
b5ebfd2
0965a60
d4d1cab
6659dd9
b2ed271
50d16ce
359d9f4
910a49a
76c0b3f
5ec7138
1f02afd
de19f38
0030a6c
172ebc2
23d0798
30881ce
c33faf2
98e0ce6
84128bf
abe4dd2
5c41497
37b1fda
57c7061
9f78d05
c484485
fc46080
6538382
a88285a
b34c5f6
b784a44
ca7d7a9
0310d50
f811bc8
4d7e2fc
7ad4606
03d1259
cdc912b
04d22cf
b41df45
a1ffe1d
c9394d9
4faa7df
61acddb
b8b59bf
7d74e74
859a299
7daa740
8a39e8c
5a3b9ce
e4739b5
1903f4b
0aacb4f
6df8145
df9f793
4c8bcd1
a8a8054
5abcd2e
0b0ce2c
91b1ae5
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,48 @@ | ||
| # cmake-format: off | ||
| # SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. | ||
| # SPDX-License-Identifier: Apache-2.0 | ||
| # cmake-format: on | ||
|
|
||
| # Multi-threaded graph partitioner for distributed PDLP. | ||
| # Uses rapids_cpm_find so a system / conda / .deb install of KaMinPar (which ships a | ||
| # CMake config package exporting KaMinPar::KaMinPar) is used when available, and | ||
| # otherwise the pinned source is cloned and built via CPM. KaMinPar depends on TBB, | ||
| # which cuOpt already requires (see find_package(TBB) for papilo). | ||
| function(find_and_configure_kaminpar) | ||
| set(oneValueArgs VERSION PINNED_TAG) | ||
| cmake_parse_arguments(PKG "" "${oneValueArgs}" "" ${ARGN}) | ||
|
|
||
| rapids_cpm_find(KaMinPar ${PKG_VERSION} | ||
| GLOBAL_TARGETS KaMinPar::KaMinPar | ||
| CPM_ARGS | ||
| GIT_REPOSITORY https://github.com/KaHIP/KaMinPar.git | ||
| GIT_TAG ${PKG_PINNED_TAG} | ||
| EXCLUDE_FROM_ALL | ||
| OPTIONS | ||
| "KAMINPAR_BUILD_APPS OFF" | ||
| "KAMINPAR_BUILD_TOOLS OFF" | ||
| "KAMINPAR_BUILD_TESTS OFF" | ||
| "KAMINPAR_BUILD_BENCHMARKS OFF" | ||
| "KAMINPAR_BUILD_EXAMPLES OFF" | ||
| "KAMINPAR_BUILD_DISTRIBUTED OFF" | ||
| # Timers use global state and force single-threaded use of the library | ||
| # interface; disable so cuOpt can call the partitioner freely. | ||
| "KAMINPAR_ENABLE_TIMERS OFF" | ||
| # Avoid an extra hard dependency on Google Sparsehash. | ||
| "KAMINPAR_BUILD_WITH_SPARSEHASH OFF" | ||
| # cuOpt's TBB is discovered via a legacy find that only exposes TBB::tbb | ||
| # (no TBB::tbbmalloc target); disable KaMinPar's optional tbbmalloc use. | ||
| "KAMINPAR_ENABLE_TBB_MALLOC OFF" | ||
| # Large LP constraint graphs can exceed 2^31 directed edges. | ||
| "KAMINPAR_64BIT_EDGE_IDS ON" | ||
| "INSTALL_KAMINPAR OFF" | ||
| ) | ||
|
|
||
| if(KaMinPar_ADDED) | ||
| message(VERBOSE "CUOPT: Using KaMinPar located in ${KaMinPar_SOURCE_DIR}") | ||
| else() | ||
| message(VERBOSE "CUOPT: Using KaMinPar located in ${KaMinPar_DIR}") | ||
| endif() | ||
| endfunction() | ||
|
|
||
| find_and_configure_kaminpar(VERSION 3.7.3 PINNED_TAG v3.7.3) |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -176,7 +176,12 @@ int run_single_file(const std::string& file_path, | |
| auto solution = cuopt::linear_programming::solve_mip(problem_interface.get(), mip_settings); | ||
| } else { | ||
| auto& lp_settings = settings.get_pdlp_settings(); | ||
| auto solution = cuopt::linear_programming::solve_lp(problem_interface.get(), lp_settings); | ||
|
|
||
| if (lp_settings.hyper_params.use_distributed_pdlp) { | ||
| cuopt::linear_programming::solve_lp(handle_ptr.get(), mps_data_model, lp_settings); | ||
| } else { | ||
| cuopt::linear_programming::solve_lp(problem_interface.get(), lp_settings); | ||
| } | ||
| } | ||
| } catch (const std::exception& e) { | ||
| fprintf(stderr, "cuopt_cli error: %s\n", e.what()); | ||
|
|
@@ -426,10 +431,21 @@ int main(int argc, char* argv[]) | |
| std::vector<rmm::mr::cuda_async_memory_resource> memory_resources; | ||
|
|
||
| if (memory_backend == cuopt::linear_programming::memory_backend_t::GPU) { | ||
| const int num_gpus = settings.get_parameter<int>(CUOPT_NUM_GPUS); | ||
| // Distributed PDLP scales one shard per GPU and uses its own knob; everything else | ||
| // (concurrent, batch, MIP) uses num_gpus which is capped at 2. | ||
| // For distributed PDLP, -1 means "auto-detect": resolve to the visible device | ||
| // count so the RMM memory pools match what solve.cu will eventually dispatch. | ||
| const bool use_distributed_pdlp = settings.get_parameter<bool>(CUOPT_USE_DISTRIBUTED_PDLP); | ||
| int requested_gpus = use_distributed_pdlp | ||
| ? settings.get_parameter<int>(CUOPT_DISTRIBUTED_PDLP_NUM_GPUS) | ||
| : settings.get_parameter<int>(CUOPT_NUM_GPUS); | ||
| if (use_distributed_pdlp && requested_gpus == -1) { | ||
| requested_gpus = raft::device_setter::get_device_count(); | ||
| } | ||
| const int provisioned_gpus = std::min(raft::device_setter::get_device_count(), requested_gpus); | ||
|
|
||
| memory_resources.reserve(std::min(raft::device_setter::get_device_count(), num_gpus)); | ||
| for (int i = 0; i < std::min(raft::device_setter::get_device_count(), num_gpus); ++i) { | ||
| memory_resources.reserve(provisioned_gpus); | ||
|
Comment on lines
+439
to
+447
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Validate GPU count before provisioning memory resources.
Suggested fix- if (use_distributed_pdlp && requested_gpus == -1) {
- requested_gpus = raft::device_setter::get_device_count();
- }
- const int provisioned_gpus = std::min(raft::device_setter::get_device_count(), requested_gpus);
+ const int visible_gpus = raft::device_setter::get_device_count();
+ if (use_distributed_pdlp && requested_gpus == -1) {
+ requested_gpus = visible_gpus;
+ }
+ if (requested_gpus <= 0) {
+ std::cerr << "Invalid GPU count: " << requested_gpus
+ << " (must be > 0, or -1 only with distributed PDLP)." << std::endl;
+ return 1;
+ }
+ const int provisioned_gpus = std::min(visible_gpus, requested_gpus);
+ if (provisioned_gpus <= 0) {
+ std::cerr << "No visible GPUs available for GPU backend." << std::endl;
+ return 1;
+ }🤖 Prompt for AI Agents |
||
| for (int i = 0; i < provisioned_gpus; ++i) { | ||
| RAFT_CUDA_TRY(cudaSetDevice(i)); | ||
| memory_resources.emplace_back(); | ||
| rmm::mr::set_per_device_resource(rmm::cuda_device_id{i}, memory_resources.back()); | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -498,14 +498,17 @@ cusparse_view_t<i_t, f_t>::cusparse_view_t( | |
| // setup cusparse view | ||
| A.create(op_problem_scaled.n_constraints, | ||
| op_problem_scaled.n_variables, | ||
| op_problem_scaled.nnz, | ||
| static_cast<int64_t>(A_.size()), | ||
| const_cast<i_t*>(op_problem_scaled.offsets.data()), | ||
| const_cast<i_t*>(op_problem_scaled.variables.data()), | ||
| const_cast<f_t*>(op_problem_scaled.coefficients.data())); | ||
|
|
||
| // A_T can have a different nnz than A in multi-GPU shards | ||
| // A is just what is needed to compute A_x for owned constraints | ||
| // A_T is just what is needed to compute A_T_y for owned variables | ||
| A_T.create(op_problem_scaled.n_variables, | ||
| op_problem_scaled.n_constraints, | ||
| op_problem_scaled.nnz, | ||
| static_cast<int64_t>(A_T_.size()), | ||
|
Comment on lines
+501
to
+511
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Use shard-local nnz in the mixed-precision setup too. Now that this ctor allows Suggested follow-up- A_float_.resize(op_problem_scaled.nnz, handle_ptr->get_stream());
- A_T_float_.resize(op_problem_scaled.nnz, handle_ptr->get_stream());
+ A_float_.resize(A_.size(), handle_ptr->get_stream());
+ A_T_float_.resize(A_T_.size(), handle_ptr->get_stream());
- RAFT_CUDA_TRY(cub::DeviceTransform::Transform(op_problem_scaled.coefficients.data(),
+ RAFT_CUDA_TRY(cub::DeviceTransform::Transform(A_.data(),
A_float_.data(),
- op_problem_scaled.nnz,
+ A_.size(),
double_to_float_functor{},
handle_ptr->get_stream().value()));
RAFT_CUDA_TRY(cub::DeviceTransform::Transform(A_T_.data(),
A_T_float_.data(),
- op_problem_scaled.nnz,
+ A_T_.size(),
double_to_float_functor{},
handle_ptr->get_stream().value()));
A_mixed_.create(op_problem_scaled.n_constraints,
op_problem_scaled.n_variables,
- op_problem_scaled.nnz,
+ static_cast<int64_t>(A_.size()),
const_cast<i_t*>(op_problem_scaled.offsets.data()),
const_cast<i_t*>(op_problem_scaled.variables.data()),
A_float_.data());
A_T_mixed_.create(op_problem_scaled.n_variables,
op_problem_scaled.n_constraints,
- op_problem_scaled.nnz,
+ static_cast<int64_t>(A_T_.size()),
const_cast<i_t*>(A_T_offsets_.data()),
const_cast<i_t*>(A_T_indices_.data()),
A_T_float_.data());As per coding guidelines, "Prevent invalid memory access (out-of-bounds, use-after-free, host/device confusion) in GPU code." 🤖 Prompt for AI Agents |
||
| const_cast<i_t*>(A_T_offsets_.data()), | ||
| const_cast<i_t*>(A_T_indices_.data()), | ||
| const_cast<f_t*>(A_T_.data())); | ||
|
|
@@ -914,14 +917,14 @@ cusparse_view_t<i_t, f_t>::cusparse_view_t( | |
| // setup cusparse view | ||
| A.create(op_problem.n_constraints, | ||
| op_problem.n_variables, | ||
| op_problem.nnz, | ||
| static_cast<int64_t>(A_.size()), | ||
| const_cast<i_t*>(op_problem.offsets.data()), | ||
| const_cast<i_t*>(op_problem.variables.data()), | ||
| const_cast<f_t*>(op_problem.coefficients.data())); | ||
|
|
||
| A_T.create(op_problem.n_variables, | ||
| op_problem.n_constraints, | ||
| op_problem.nnz, | ||
| static_cast<int64_t>(A_T_.size()), | ||
| const_cast<i_t*>(A_T_offsets_.data()), | ||
| const_cast<i_t*>(A_T_indices_.data()), | ||
| const_cast<f_t*>(A_T_.data())); | ||
|
|
@@ -1129,16 +1132,18 @@ cusparse_view_t<i_t, f_t>::cusparse_view_t( | |
| // Copying them from the existing cuSparse view is a bad practice and creates segfault post | ||
| // CUDA 12.4 Using the saved pointer of the existing cusparse view to make sure we capture the | ||
| // correct pointer | ||
| // See comment in the PDHG cusparse_view_t ctor: bind the descriptor nnz to | ||
| // the actual value-buffer length so A and A_T stay symmetric and shard-safe. | ||
| A.create(op_problem.n_constraints, | ||
| op_problem.n_variables, | ||
| op_problem.nnz, | ||
| static_cast<int64_t>(A_.size()), | ||
| const_cast<i_t*>(A_offsets_.data()), | ||
| const_cast<i_t*>(A_indices_.data()), | ||
| const_cast<f_t*>(A_.data())); | ||
|
|
||
| A_T.create(op_problem.n_variables, | ||
| op_problem.n_constraints, | ||
| op_problem.nnz, | ||
| static_cast<int64_t>(existing_cusparse_view.A_T_.size()), | ||
| const_cast<i_t*>(existing_cusparse_view.A_T_offsets_.data()), | ||
| const_cast<i_t*>(existing_cusparse_view.A_T_indices_.data()), | ||
| const_cast<f_t*>(existing_cusparse_view.A_T_.data())); | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Guard distributed solve path against null GPU handle.
When distributed PDLP is enabled, this branch can pass a null
handle_ptrif the selected memory backend is not GPU. Fail fast before calling the distributed overload.Suggested fix
🤖 Prompt for AI Agents