diff --git a/conda/environments/all_cuda-129_arch-aarch64.yaml b/conda/environments/all_cuda-129_arch-aarch64.yaml index 145850d4d..acc652407 100644 --- a/conda/environments/all_cuda-129_arch-aarch64.yaml +++ b/conda/environments/all_cuda-129_arch-aarch64.yaml @@ -35,6 +35,7 @@ dependencies: - libcusolver-dev - libcusparse-dev - libgrpc >=1.78.0,<1.80.0a0 +- libnvjitlink-dev - libprotobuf - libraft-headers==26.6.*,>=0.0.0a0 - librmm==26.6.*,>=0.0.0a0 diff --git a/conda/environments/all_cuda-129_arch-x86_64.yaml b/conda/environments/all_cuda-129_arch-x86_64.yaml index 293b49fbe..ae6772cca 100644 --- a/conda/environments/all_cuda-129_arch-x86_64.yaml +++ b/conda/environments/all_cuda-129_arch-x86_64.yaml @@ -35,6 +35,7 @@ dependencies: - libcusolver-dev - libcusparse-dev - libgrpc >=1.78.0,<1.80.0a0 +- libnvjitlink-dev - libprotobuf - libraft-headers==26.6.*,>=0.0.0a0 - librmm==26.6.*,>=0.0.0a0 diff --git a/conda/environments/all_cuda-132_arch-aarch64.yaml b/conda/environments/all_cuda-132_arch-aarch64.yaml index fa8844a1f..17d986fee 100644 --- a/conda/environments/all_cuda-132_arch-aarch64.yaml +++ b/conda/environments/all_cuda-132_arch-aarch64.yaml @@ -35,6 +35,7 @@ dependencies: - libcusolver-dev - libcusparse-dev - libgrpc >=1.78.0,<1.80.0a0 +- libnvjitlink-dev - libprotobuf - libraft-headers==26.6.*,>=0.0.0a0 - librmm==26.6.*,>=0.0.0a0 diff --git a/conda/environments/all_cuda-132_arch-x86_64.yaml b/conda/environments/all_cuda-132_arch-x86_64.yaml index a37d8718c..62d876e71 100644 --- a/conda/environments/all_cuda-132_arch-x86_64.yaml +++ b/conda/environments/all_cuda-132_arch-x86_64.yaml @@ -35,6 +35,7 @@ dependencies: - libcusolver-dev - libcusparse-dev - libgrpc >=1.78.0,<1.80.0a0 +- libnvjitlink-dev - libprotobuf - libraft-headers==26.6.*,>=0.0.0a0 - librmm==26.6.*,>=0.0.0a0 diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 7e9dd1371..395f36480 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -439,7 +439,10 @@ set_target_properties(cuopt 
CXX_SCAN_FOR_MODULES OFF
 )
 
-target_compile_definitions(cuopt PUBLIC "CUOPT_LOG_ACTIVE_LEVEL=RAPIDS_LOGGER_LOG_LEVEL_${LIBCUOPT_LOGGING_LEVEL}")
+target_compile_definitions(cuopt
+  PUBLIC "CUOPT_LOG_ACTIVE_LEVEL=RAPIDS_LOGGER_LOG_LEVEL_${LIBCUOPT_LOGGING_LEVEL}"
+  PUBLIC CUSPARSE_ENABLE_EXPERIMENTAL_API
+)
 
 target_compile_options(cuopt
   PRIVATE "$<$:${CUOPT_CXX_FLAGS}>"
diff --git a/cpp/src/pdlp/cusparse_view.cu b/cpp/src/pdlp/cusparse_view.cu
index 359bb7e92..18d2650cf 100644
--- a/cpp/src/pdlp/cusparse_view.cu
+++ b/cpp/src/pdlp/cusparse_view.cu
@@ -153,6 +153,126 @@ cusparse_dn_mat_descr_wrapper_t::operator cusparseDnMatDescr_t() const
   return descr_;
 }
 
+#if CUDA_VER_13_2_UP
+// Holder for a cusparseSpMVOpDescr_t handle. need_destruction_ records whether this instance
+// owns the handle; a default-constructed wrapper holds nullptr and owns nothing.
+cusparse_spmvop_descr_wrapper_t::cusparse_spmvop_descr_wrapper_t()
+  : descr_(nullptr), need_destruction_(false)
+{
+}
+
+// Destroys the descriptor only when this instance owns it (NO_THROW variant of the check macro).
+cusparse_spmvop_descr_wrapper_t::~cusparse_spmvop_descr_wrapper_t()
+{
+  if (need_destruction_) { RAFT_CUSPARSE_TRY_NO_THROW(cusparseSpMVOp_destroyDescr(descr_)); }
+}
+
+// Copy construction yields a non-owning alias: the handle value is shared but is never
+// destroyed by the copy.
+cusparse_spmvop_descr_wrapper_t::cusparse_spmvop_descr_wrapper_t(
+  const cusparse_spmvop_descr_wrapper_t& other)
+  : descr_(other.descr_), need_destruction_(false)
+{
+}
+
+// Move assignment takes over the handle and its ownership flag; `other` keeps the handle value
+// but will no longer destroy it.
+cusparse_spmvop_descr_wrapper_t& cusparse_spmvop_descr_wrapper_t::operator=(
+  cusparse_spmvop_descr_wrapper_t&& other)
+{
+  // Self-move would destroy the descriptor and then keep the dangling handle.
+  if (this == &other) { return *this; }
+  if (need_destruction_) { RAFT_CUSPARSE_TRY(cusparseSpMVOp_destroyDescr(descr_)); }
+  descr_                  = other.descr_;
+  need_destruction_       = other.need_destruction_;
+  other.need_destruction_ = false;
+  return *this;
+}
+
+// (Re)creates the descriptor, destroying a previously owned one first. All arguments are
+// forwarded unchanged to cusparseSpMVOp_createDescr.
+void cusparse_spmvop_descr_wrapper_t::create(cusparseHandle_t handle,
+                                             cusparseOperation_t opA,
+                                             cusparseSpMatDescr_t matA,
+                                             cusparseDnVecDescr_t vecX,
+                                             cusparseDnVecDescr_t vecY,
+                                             cusparseDnVecDescr_t vecZ,
+                                             cudaDataType computeType,
+                                             void* buffer)
+{
+  if (need_destruction_) {
+    RAFT_CUSPARSE_TRY(cusparseSpMVOp_destroyDescr(descr_));
+    // Drop ownership before the create call below: if it fails, the destructor must not
+    // destroy the already-released descriptor a second time.
+    need_destruction_ = false;
+    descr_            = nullptr;
+  }
+  RAFT_CUSPARSE_TRY(
+    cusparseSpMVOp_createDescr(handle, &descr_, opA, matA, vecX, vecY, vecZ, computeType, buffer));
+  need_destruction_ = true;
+}
+
+// Implicit conversion so the wrapper can be passed where the raw handle is expected.
+cusparse_spmvop_descr_wrapper_t::operator cusparseSpMVOpDescr_t() const { return descr_; }
+
+// Holder for a cusparseSpMVOpPlan_t handle. need_destruction_ records whether this instance
+// owns the handle; a default-constructed wrapper holds nullptr and owns nothing.
+cusparse_spmvop_plan_wrapper_t::cusparse_spmvop_plan_wrapper_t()
+  : plan_(nullptr), need_destruction_(false)
+{
+}
+
+// Destroys the plan only when this instance owns it (NO_THROW variant of the check macro).
+cusparse_spmvop_plan_wrapper_t::~cusparse_spmvop_plan_wrapper_t()
+{
+  if (need_destruction_) { RAFT_CUSPARSE_TRY_NO_THROW(cusparseSpMVOp_destroyPlan(plan_)); }
+}
+
+// Copy construction yields a non-owning alias: the handle value is shared but is never
+// destroyed by the copy.
+cusparse_spmvop_plan_wrapper_t::cusparse_spmvop_plan_wrapper_t(
+  const cusparse_spmvop_plan_wrapper_t& other)
+  : plan_(other.plan_), need_destruction_(false)
+{
+}
+
+// Move assignment takes over the handle and its ownership flag; `other` keeps the handle value
+// but will no longer destroy it.
+cusparse_spmvop_plan_wrapper_t& cusparse_spmvop_plan_wrapper_t::operator=(
+  cusparse_spmvop_plan_wrapper_t&& other)
+{
+  // Self-move would destroy the plan and then keep the dangling handle.
+  if (this == &other) { return *this; }
+  if (need_destruction_) { RAFT_CUSPARSE_TRY(cusparseSpMVOp_destroyPlan(plan_)); }
+  plan_                   = other.plan_;
+  need_destruction_       = other.need_destruction_;
+  other.need_destruction_ = false;
+  return *this;
+}
+
+// (Re)creates the plan for `descr`, destroying a previously owned plan first. All arguments
+// are forwarded unchanged to cusparseSpMVOp_createPlan.
+void cusparse_spmvop_plan_wrapper_t::create(cusparseHandle_t handle,
+                                            cusparseSpMVOpDescr_t descr,
+                                            char* lto_buffer,
+                                            size_t lto_buffer_size)
+{
+  if (need_destruction_) {
+    RAFT_CUSPARSE_TRY(cusparseSpMVOp_destroyPlan(plan_));
+    // Drop ownership before the create call below: if it fails, the destructor must not
+    // destroy the already-released plan a second time.
+    need_destruction_ = false;
+    plan_             = nullptr;
+  }
+  RAFT_CUSPARSE_TRY(cusparseSpMVOp_createPlan(handle, descr, &plan_, lto_buffer, lto_buffer_size));
+  need_destruction_ = true;
+}
+
+// Implicit conversion so the wrapper can be passed where the raw handle is expected.
+cusparse_spmvop_plan_wrapper_t::operator cusparseSpMVOpPlan_t() const { return plan_; }
+
+#endif
+
 #if CUDA_VER_12_4_UP
 struct dynamic_load_runtime {
   static void* get_cusparse_runtime_handle()
@@ -304,6 +424,8 @@ cusparse_view_t::cusparse_view_t(
     A_T_indices_{op_problem_scaled.reverse_constraints},
     buffer_non_transpose{0, handle_ptr->get_stream()},
     buffer_transpose{0, handle_ptr->get_stream()},
+    buffer_non_transpose_spmvop{0, handle_ptr->get_stream()},
+    buffer_transpose_spmvop{0, handle_ptr->get_stream()},
     buffer_transpose_batch{0, handle_ptr->get_stream()},
     buffer_non_transpose_batch{0, handle_ptr->get_stream()},
     buffer_transpose_batch_row_row_{0, handle_ptr->get_stream()},
@@ -717,6 +839,8 @@ cusparse_view_t::cusparse_view_t(
     A_T_indices_{_A_T_indices},
     buffer_non_transpose{0, handle_ptr->get_stream()},
     buffer_transpose{0, handle_ptr->get_stream()},
+    buffer_non_transpose_spmvop{0, handle_ptr->get_stream()},
+    buffer_transpose_spmvop{0, handle_ptr->get_stream()},
     buffer_transpose_batch{0, handle_ptr->get_stream()},
     buffer_non_transpose_batch{0, handle_ptr->get_stream()},
     buffer_transpose_batch_row_row_{0, handle_ptr->get_stream()},
@@ -926,6 +1050,8 @@ cusparse_view_t::cusparse_view_t(
     tmp_dual(existing_cusparse_view.tmp_dual),
     buffer_non_transpose{0, handle_ptr->get_stream()},
     buffer_transpose{0, handle_ptr->get_stream()},
+    buffer_non_transpose_spmvop{0, handle_ptr->get_stream()},
+    buffer_transpose_spmvop{0, handle_ptr->get_stream()},
     buffer_transpose_batch{0, handle_ptr->get_stream()},
     buffer_non_transpose_batch{0, handle_ptr->get_stream()},
     buffer_transpose_batch_row_row_{0, handle_ptr->get_stream()},
@@ -1041,6 +1167,8 @@ cusparse_view_t::cusparse_view_t(
   : handle_ptr_(handle_ptr),
     buffer_non_transpose{0, handle_ptr->get_stream()},
     buffer_transpose{0, handle_ptr->get_stream()},
+    buffer_non_transpose_spmvop{0, handle_ptr->get_stream()},
+    buffer_transpose_spmvop{0, handle_ptr->get_stream()},
     buffer_transpose_batch{0, handle_ptr->get_stream()},
     buffer_non_transpose_batch{0, handle_ptr->get_stream()},
     buffer_transpose_batch_row_row_{0, handle_ptr->get_stream()},
@@ -1182,6 +1310,72 @@ bool is_cusparse_runtime_mixed_precision_supported()
   return (major > 12) || (major == 12 && minor >= 5);
 }
 
+// Creates SpMVOp plans. Must be called after scale_problem() so plans use the scaled matrix.
+// Always builds the descriptor/plan pair for A_T (vecX = dual_solution, vecY = vecZ =
+// current_AtY); builds the pair for A (vecX = reflected_primal_solution, vecY = vecZ =
+// dual_gradient) only when is_reflected is true. The compute type is hard-coded to CUDA_R_64F,
+// so this must only be invoked for the double-precision instantiation. No-op before CUDA 13.2.
+template
+void cusparse_view_t::create_spmv_op_plans([[maybe_unused]] bool is_reflected)
+{
+#if CUDA_VER_13_2_UP
+  RAFT_CUSPARSE_TRY(
+    cusparseSetStream(handle_ptr_->get_cusparse_handle(), handle_ptr_->get_stream()));
+  // Prepare buffers for At_y SpMVOp
+  size_t buffer_size_transpose = 0;
+  RAFT_CUSPARSE_TRY(cusparseSpMVOp_bufferSize(handle_ptr_->get_cusparse_handle(),
+                                              CUSPARSE_OPERATION_NON_TRANSPOSE,
+                                              A_T,
+                                              dual_solution,
+                                              current_AtY,
+                                              current_AtY,
+                                              CUDA_R_64F,
+                                              &buffer_size_transpose));
+  buffer_transpose_spmvop.resize(buffer_size_transpose, handle_ptr_->get_stream());
+
+  spmv_op_descr_A_t_.create(handle_ptr_->get_cusparse_handle(),
+                            CUSPARSE_OPERATION_NON_TRANSPOSE,
+                            A_T,
+                            dual_solution,
+                            current_AtY,
+                            current_AtY,
+                            CUDA_R_64F,
+                            buffer_transpose_spmvop.data());
+
+  // No LTO buffer is supplied: both plans are created with a null buffer of size 0.
+  char* lto_buffer       = nullptr;
+  size_t lto_buffer_size = 0;
+  spmv_op_plan_A_t_.create(
+    handle_ptr_->get_cusparse_handle(), spmv_op_descr_A_t_, lto_buffer, lto_buffer_size);
+
+  // Only prepare buffers for A_x if we are using reflected_halpern
+  if (is_reflected) {
+    size_t buffer_size_non_transpose = 0;
+    RAFT_CUSPARSE_TRY(cusparseSpMVOp_bufferSize(handle_ptr_->get_cusparse_handle(),
+                                                CUSPARSE_OPERATION_NON_TRANSPOSE,
+                                                A,
+                                                reflected_primal_solution,
+                                                dual_gradient,
+                                                dual_gradient,
+                                                CUDA_R_64F,
+                                                &buffer_size_non_transpose));
+    buffer_non_transpose_spmvop.resize(buffer_size_non_transpose, handle_ptr_->get_stream());
+
+    spmv_op_descr_A_.create(handle_ptr_->get_cusparse_handle(),
+                            CUSPARSE_OPERATION_NON_TRANSPOSE,
+                            A,
+                            reflected_primal_solution,
+                            dual_gradient,
+                            dual_gradient,
+                            CUDA_R_64F,
+                            buffer_non_transpose_spmvop.data());
+
+    spmv_op_plan_A_.create(
+      handle_ptr_->get_cusparse_handle(), spmv_op_descr_A_, lto_buffer, lto_buffer_size);
+  }
+#endif
+}
+
 #if MIP_INSTANTIATE_FLOAT || PDLP_INSTANTIATE_FLOAT
 template class cusparse_sp_mat_descr_wrapper_t;
 template class cusparse_dn_vec_descr_wrapper_t;
diff --git a/cpp/src/pdlp/cusparse_view.hpp b/cpp/src/pdlp/cusparse_view.hpp
index
c6d0ddea6..23e28e11b 100644
--- a/cpp/src/pdlp/cusparse_view.hpp
+++ b/cpp/src/pdlp/cusparse_view.hpp
@@ -20,6 +20,8 @@
 
 #include
 
+#define CUDA_VER_13_2_UP (CUDART_VERSION >= 13020)  // true when built against CUDA runtime 13.2+
+
 namespace cuopt::linear_programming::detail {
 
 template
@@ -79,6 +81,54 @@ class cusparse_dn_mat_descr_wrapper_t {
   bool need_destruction_;
 };
 
+#if CUDA_VER_13_2_UP
+class cusparse_spmvop_descr_wrapper_t {  // holds a cusparseSpMVOpDescr_t plus an ownership flag
+ public:
+  cusparse_spmvop_descr_wrapper_t();   // starts with a null handle that is not owned
+  ~cusparse_spmvop_descr_wrapper_t();  // destroys the handle only if this instance owns it
+
+  cusparse_spmvop_descr_wrapper_t(const cusparse_spmvop_descr_wrapper_t& other);  // non-owning copy
+  cusparse_spmvop_descr_wrapper_t& operator=(cusparse_spmvop_descr_wrapper_t&& other);  // takes handle + ownership
+  cusparse_spmvop_descr_wrapper_t& operator=(const cusparse_spmvop_descr_wrapper_t& other) = delete;  // no copy-assign
+
+  void create(cusparseHandle_t handle,  // builds the descriptor via cusparseSpMVOp_createDescr
+              cusparseOperation_t opA,
+              cusparseSpMatDescr_t matA,
+              cusparseDnVecDescr_t vecX,
+              cusparseDnVecDescr_t vecY,
+              cusparseDnVecDescr_t vecZ,
+              cudaDataType computeType,
+              void* buffer);
+
+  operator cusparseSpMVOpDescr_t() const;  // implicit access to the raw handle
+
+ private:
+  cusparseSpMVOpDescr_t descr_;  // raw cuSPARSE handle, nullptr until create() is called
+  bool need_destruction_;        // true when this instance must destroy descr_
+};
+
+class cusparse_spmvop_plan_wrapper_t {  // holds a cusparseSpMVOpPlan_t plus an ownership flag
+ public:
+  cusparse_spmvop_plan_wrapper_t();   // starts with a null handle that is not owned
+  ~cusparse_spmvop_plan_wrapper_t();  // destroys the handle only if this instance owns it
+
+  cusparse_spmvop_plan_wrapper_t(const cusparse_spmvop_plan_wrapper_t& other);  // non-owning copy
+  cusparse_spmvop_plan_wrapper_t& operator=(cusparse_spmvop_plan_wrapper_t&& other);  // takes handle + ownership
+  cusparse_spmvop_plan_wrapper_t& operator=(const cusparse_spmvop_plan_wrapper_t& other) = delete;  // no copy-assign
+
+  void create(cusparseHandle_t handle,  // builds the plan via cusparseSpMVOp_createPlan
+              cusparseSpMVOpDescr_t descr,
+              char* lto_buffer,
+              size_t lto_buffer_size);
+
+  operator cusparseSpMVOpPlan_t() const;  // implicit access to the raw handle
+
+ private:
+  cusparseSpMVOpPlan_t plan_;  // raw cuSPARSE handle, nullptr until create() is called
+  bool need_destruction_;      // true when this instance must destroy plan_
+};
+#endif
+
 template
 class cusparse_view_t {
  public:
@@ -172,6 +222,17 @@ class cusparse_view_t {
 
   rmm::device_uvector buffer_non_transpose;
   rmm::device_uvector buffer_transpose;
+  // SpMVOp buffers for A and A_T
+  rmm::device_uvector buffer_non_transpose_spmvop{0, handle_ptr_->get_stream()};  // workspace handed to spmv_op_descr_A_.create(); NOTE(review): the constructors touched by this patch also initialize it explicitly
+  rmm::device_uvector buffer_transpose_spmvop{0, handle_ptr_->get_stream()};  // workspace handed to spmv_op_descr_A_t_.create()
+
+#if CUDA_VER_13_2_UP
+  // SpMVOp descriptors and plans for A and A_T (descr before plan so dtor destroys plan first)
+  cusparse_spmvop_descr_wrapper_t spmv_op_descr_A_;    // created only when create_spmv_op_plans(true) is called
+  cusparse_spmvop_plan_wrapper_t spmv_op_plan_A_;      // plan built from spmv_op_descr_A_
+  cusparse_spmvop_descr_wrapper_t spmv_op_descr_A_t_;  // created by every create_spmv_op_plans() call
+  cusparse_spmvop_plan_wrapper_t spmv_op_plan_A_t_;    // plan built from spmv_op_descr_A_t_
+#endif
   // reuse buffers for cusparse spmm
   rmm::device_uvector buffer_transpose_batch;
   rmm::device_uvector buffer_non_transpose_batch;
@@ -212,6 +273,8 @@ class cusparse_view_t {
   // Redirects the cuSPARSE CSR structure pointers from op_problem_scaled_ to the original problem
   // so the duplicated row/column buffers can be freed.
   void redirect_cusparse_csr_structure_pointers(const problem_t& original_problem);
+  // Creates SpMVOp plans. Must be called after scale_problem() so plans use the scaled matrix.
+  void create_spmv_op_plans(bool is_reflected);  // is_reflected: also build the descriptor/plan for A
 };
 
 // Mixed precision SpMV: FP32 matrix with FP64 vectors and FP64 compute type
diff --git a/cpp/src/pdlp/pdhg.cu b/cpp/src/pdlp/pdhg.cu
index d9dbb083f..ddc60c5e2 100644
--- a/cpp/src/pdlp/pdhg.cu
+++ b/cpp/src/pdlp/pdhg.cu
@@ -444,6 +444,59 @@ void pdhg_solver_t::compute_next_dual_solution(rmm::device_uvector
+void pdhg_solver_t::spmvop_At_y()  // sparse mat-vec of A_T with dual_solution, output in current_AtY
+{
+#if CUDA_VER_13_2_UP
+  RAFT_CUSPARSE_TRY(cusparseSetStream(handle_ptr_->get_cusparse_handle(), stream_view_.value()));  // run on this solver's stream
+  RAFT_CUSPARSE_TRY(cusparseSpMVOp(handle_ptr_->get_cusparse_handle(),
+                                   cusparse_view_.spmv_op_plan_A_t_,  // NOTE(review): only set up by create_spmv_op_plans() - confirm it ran for every caller
+                                   reusable_device_scalar_value_1_.data(),  // alpha (presumably 1, judging by the name)
+                                   reusable_device_scalar_value_0_.data(),  // beta (presumably 0, judging by the name)
+                                   cusparse_view_.dual_solution,
+                                   cusparse_view_.current_AtY,
+                                   cusparse_view_.current_AtY));
+#else  // older toolkits: same product through the generic cusparsespmv wrapper
+  RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsespmv(handle_ptr_->get_cusparse_handle(),
+                                                       CUSPARSE_OPERATION_NON_TRANSPOSE,
+                                                       reusable_device_scalar_value_1_.data(),
+                                                       cusparse_view_.A_T,
+                                                       cusparse_view_.dual_solution,
+                                                       reusable_device_scalar_value_0_.data(),
+                                                       cusparse_view_.current_AtY,
+                                                       CUSPARSE_SPMV_CSR_ALG2,
+                                                       (f_t*)cusparse_view_.buffer_transpose.data(),
+                                                       stream_view_));
+#endif
+}
+
+template
+void pdhg_solver_t::spmvop_A_x()  // sparse mat-vec of A with reflected_primal_solution, output in dual_gradient
+{
+#if CUDA_VER_13_2_UP
+  RAFT_CUSPARSE_TRY(cusparseSetStream(handle_ptr_->get_cusparse_handle(), stream_view_.value()));  // run on this solver's stream
+  RAFT_CUSPARSE_TRY(cusparseSpMVOp(handle_ptr_->get_cusparse_handle(),
+                                   cusparse_view_.spmv_op_plan_A_,  // NOTE(review): only set up by create_spmv_op_plans(true) - confirm for every caller
+                                   reusable_device_scalar_value_1_.data(),  // alpha (presumably 1, judging by the name)
+                                   reusable_device_scalar_value_0_.data(),  // beta (presumably 0, judging by the name)
+                                   cusparse_view_.reflected_primal_solution,
+                                   cusparse_view_.dual_gradient,
+                                   cusparse_view_.dual_gradient));
+#else  // older toolkits: same product through the generic cusparsespmv wrapper
+  RAFT_CUSPARSE_TRY(
+    raft::sparse::detail::cusparsespmv(handle_ptr_->get_cusparse_handle(),
+                                       CUSPARSE_OPERATION_NON_TRANSPOSE,
+                                       reusable_device_scalar_value_1_.data(),
+                                       cusparse_view_.A,
+                                       cusparse_view_.reflected_primal_solution,
+                                       reusable_device_scalar_value_0_.data(),
+                                       cusparse_view_.dual_gradient,
+                                       CUSPARSE_SPMV_CSR_ALG2,
+                                       (f_t*)cusparse_view_.buffer_non_transpose.data(),
+                                       stream_view_));
+#endif
+}
+
 template
 void pdhg_solver_t::compute_At_y()
 {
@@ -462,9 +515,10 @@ void pdhg_solver_t::compute_At_y()
                            CUSPARSE_SPMV_CSR_ALG2,
                            cusparse_view_.buffer_transpose_mixed_.data(),
                            stream_view_);
+    } else {  // otherwise take the SpMVOp helper (cusparsespmv fallback before CUDA 13.2)
+      spmvop_At_y();
     }
-  }
-  if (!cusparse_view_.mixed_precision_enabled_) {
+  } else {  // the pre-existing cusparsespmv call now sits on the outer else branch
     RAFT_CUSPARSE_TRY(
       raft::sparse::detail::cusparsespmv(handle_ptr_->get_cusparse_handle(),
                                          CUSPARSE_OPERATION_NON_TRANSPOSE,
@@ -510,9 +564,10 @@ void pdhg_solver_t::compute_A_x()
                            CUSPARSE_SPMV_CSR_ALG2,
                            cusparse_view_.buffer_non_transpose_mixed_.data(),
                            stream_view_);
+    } else {  // otherwise take the SpMVOp helper (cusparsespmv fallback before CUDA 13.2)
+      spmvop_A_x();
     }
-  }
-  if (!cusparse_view_.mixed_precision_enabled_) {
+  } else {  // the pre-existing cusparsespmv call now sits on the outer else branch
     RAFT_CUSPARSE_TRY(
       raft::sparse::detail::cusparsespmv(handle_ptr_->get_cusparse_handle(),
                                          CUSPARSE_OPERATION_NON_TRANSPOSE,
diff --git a/cpp/src/pdlp/pdhg.hpp b/cpp/src/pdlp/pdhg.hpp
index 52f45dc83..d16400bd3 100644
--- a/cpp/src/pdlp/pdhg.hpp
+++ b/cpp/src/pdlp/pdhg.hpp
@@ -95,7
+95,9 @@ class pdhg_solver_t {
   void compute_primal_projection(rmm::device_uvector& primal_step_size);
   void compute_At_y();
   void compute_A_x();
+  void spmvop_At_y();  // A_T * dual_solution; SpMVOp on CUDA 13.2+, cusparsespmv otherwise
+  void spmvop_A_x();   // A * reflected_primal_solution; SpMVOp on CUDA 13.2+, cusparsespmv otherwise
 
   bool batch_mode_{false};
   raft::handle_t const* handle_ptr_{nullptr};
   rmm::cuda_stream_view stream_view_;
diff --git a/cpp/src/pdlp/pdlp.cu b/cpp/src/pdlp/pdlp.cu
index 49c77e44d..fd0cc9ffc 100644
--- a/cpp/src/pdlp/pdlp.cu
+++ b/cpp/src/pdlp/pdlp.cu
@@ -2316,6 +2316,13 @@ optimization_problem_solution_t pdlp_solver_t::run_solver(co
 
   compute_initial_primal_weight();
   initial_scaling_strategy_.scale_problem();
+  // Build the SpMVOp plans on the scaled matrix; skipped in batch mode and with mixed precision.
+  if constexpr (std::is_same_v) {
+    if (!batch_mode_ && !pdhg_solver_.get_cusparse_view().mixed_precision_enabled_) {
+      pdhg_solver_.get_cusparse_view().create_spmv_op_plans(
+        settings_.hyper_params.use_reflected_primal_dual);
+    }
+  }
 
   // Update FP32 matrix copies for mixed precision SpMV after scaling
   pdhg_solver_.get_cusparse_view().update_mixed_precision_matrices();
diff --git a/dependencies.yaml b/dependencies.yaml
index 1601058cf..b7463fe8c 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -733,6 +733,7 @@ dependencies:
           - libcusolver-dev
           - libcusparse-dev
           - cuda-nvtx-dev
+          - libnvjitlink-dev
   cuda_wheels: