Allow to instantiate DenseKKTSystem on CUDA GPU #73

Merged: 5 commits (Oct 8, 2021)
Changes from all commits
9 changes: 7 additions & 2 deletions lib/MadNLPGPU/Project.toml
@@ -5,16 +5,21 @@ version = "0.1.0"
 
 [deps]
 CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
+CUDAKernels = "72cfdca4-0801-4ab0-bf6a-d52aa10adc57"
+KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 MadNLP = "2621e9c9-9eb4-46b1-8089-e8c72242dfb6"
 
 [compat]
 CUDA = "~2,~3"
+CUDAKernels = "0.3.0"
+KernelAbstractions = "0.7"
 MadNLP = "~0.2"
 julia = "1.5"
 
 [extras]
-Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
+MadNLPTests = "b52a2a03-04ab-4a5f-9698-6a2deff93217"
+Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 
 [targets]
-test = ["Test","MadNLPTests"]
+test = ["Test", "MadNLPTests"]
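Side note: with MadNLPTests declared under [extras] and the test target, the GPU test suite runs through the package manager in the usual way (assuming a CUDA-capable machine):

    using Pkg
    Pkg.test("MadNLPGPU")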
13 changes: 12 additions & 1 deletion lib/MadNLPGPU/src/MadNLPGPU.jl
@@ -1,12 +1,23 @@
 module MadNLPGPU
 
-import CUDA: CUBLAS, CUSOLVER, CuVector, CuMatrix, toolkit_version, R_64F, has_cuda
 import LinearAlgebra
+# CUDA
+import CUDA: CUBLAS, CUSOLVER, CuVector, CuMatrix, CuArray, toolkit_version, R_64F, has_cuda
+# Kernels
+import KernelAbstractions: @kernel, @index, wait
+import CUDAKernels: CUDADevice
 
 import MadNLP
 
+import MadNLP:
+    @kwdef, Logger, @debug, @warn, @error,
+    AbstractOptions, AbstractLinearSolver, set_options!, MadNLPLapackCPU,
+    SymbolicException, FactorizationException, SolveException, InertiaException,
+    introduce, factorize!, solve!, improve!, is_inertia, inertia, tril_to_full!
+
 
+include("kernels.jl")
 
 if has_cuda()
     include("lapackgpu.jl")
     export MadNLPLapackGPU
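The new imports set up the kernel-programming pattern used throughout kernels.jl below: declare a portable kernel with @kernel, instantiate it on CUDADevice(), launch it over an ndrange, and wait on the returned event. A minimal sketch of that pattern (the _scale! kernel is illustrative only, not part of this PR):

    import CUDA: CuVector
    import KernelAbstractions: @kernel, @index, wait
    import CUDAKernels: CUDADevice

    @kernel function _scale!(x, alpha)
        i = @index(Global)  # global linear index of the current work-item
        x[i] *= alpha
    end

    x = CuVector{Float64}(undef, 128)
    fill!(x, 2.0)
    ev = _scale!(CUDADevice())(x, 0.5, ndrange=length(x))  # asynchronous launch
    wait(ev)  # synchronize before reading x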
147 changes: 147 additions & 0 deletions lib/MadNLPGPU/src/kernels.jl
@@ -0,0 +1,147 @@
#=
MadNLP utils
=#

@kernel function _copy_diag!(dest, src)
    i = @index(Global)
    dest[i] = src[i, i]
end

function MadNLP.diag!(dest::CuVector{T}, src::CuMatrix{T}) where T
    @assert length(dest) == size(src, 1)
    ev = _copy_diag!(CUDADevice())(dest, src, ndrange=length(dest))
    wait(ev)
end

@kernel function _add_diagonal!(dest, src1, src2)
    i = @index(Global)
    dest[i, i] = src1[i] + src2[i]
end

function MadNLP.diag_add!(dest::CuMatrix, src1::CuVector, src2::CuVector)
    ev = _add_diagonal!(CUDADevice())(dest, src1, src2, ndrange=size(dest, 1))
    wait(ev)
end

#=
MadNLP kernels
=#

# Overload is_valid to avoid falling back to the default implementation, which is slow on the GPU
MadNLP.is_valid(src::CuArray) = true

# Constraint scaling
function MadNLP.set_con_scale!(con_scale::AbstractVector, jac::CuMatrix, nlp_scaling_max_gradient)
    # Compute the reduction on the GPU with a built-in CUDA.jl function
    d_con_scale = maximum(abs, jac, dims=2)
    copyto!(con_scale, d_con_scale)
    con_scale .= min.(1.0, nlp_scaling_max_gradient ./ con_scale)
end

@kernel function _treat_fixed_variable_kernell!(dest, ind_fixed)
    k, j = @index(Global, NTuple)
    i = ind_fixed[k]

    if i == j
        dest[i, i] = 1.0
    else
        dest[i, j] = 0.0
        dest[j, i] = 0.0
    end
end

function MadNLP.treat_fixed_variable!(kkt::MadNLP.AbstractKKTSystem{T, MT}) where {T, MT<:CuMatrix{T}}
    length(kkt.ind_fixed) == 0 && return
    aug = kkt.aug_com
    d_ind_fixed = kkt.ind_fixed |> CuVector # TODO: allocate ind_fixed directly on the GPU
    ndrange = (length(d_ind_fixed), size(aug, 1))
    ev = _treat_fixed_variable_kernell!(CUDADevice())(aug, d_ind_fixed, ndrange=ndrange)
    wait(ev)
end

#=
DenseKKTSystem kernels
=#
function MadNLP.mul!(y::AbstractVector, kkt::MadNLP.DenseKKTSystem{T, VT, MT}, x::AbstractVector) where {T, VT<:CuVector{T}, MT<:CuMatrix{T}}
    # Load buffers
    haskey(kkt.etc, :hess_w1) || (kkt.etc[:hess_w1] = CuVector{T}(undef, size(kkt.aug_com, 1)))
    haskey(kkt.etc, :hess_w2) || (kkt.etc[:hess_w2] = CuVector{T}(undef, size(kkt.aug_com, 1)))

    d_x = kkt.etc[:hess_w1]::VT
    d_y = kkt.etc[:hess_w2]::VT

    # x and y can be host arrays. Copy them to the device to avoid side effects.
    copyto!(d_x, x)
    LinearAlgebra.mul!(d_y, kkt.aug_com, d_x)
    copyto!(y, d_y)
end

function MadNLP.jtprod!(y::AbstractVector, kkt::MadNLP.DenseKKTSystem{T, VT, MT}, x::AbstractVector) where {T, VT<:CuVector{T}, MT<:CuMatrix{T}}
    # Load buffers
    haskey(kkt.etc, :jac_w1) || (kkt.etc[:jac_w1] = CuVector{T}(undef, size(kkt.jac, 1)))
    haskey(kkt.etc, :jac_w2) || (kkt.etc[:jac_w2] = CuVector{T}(undef, size(kkt.jac, 2)))

    d_x = kkt.etc[:jac_w1]::VT
    d_y = kkt.etc[:jac_w2]::VT

    # x and y can be host arrays. Copy them to the device to avoid side effects.
    copyto!(d_x, x)
    LinearAlgebra.mul!(d_y, kkt.jac', d_x)
    copyto!(y, d_y)
end

function MadNLP.set_aug_diagonal!(kkt::MadNLP.DenseKKTSystem{T, VT, MT}, ips::MadNLP.InteriorPointSolver) where {T, VT<:CuVector{T}, MT<:CuMatrix{T}}
    haskey(kkt.etc, :pr_diag_host) || (kkt.etc[:pr_diag_host] = Vector{T}(undef, length(kkt.pr_diag)))
    pr_diag_h = kkt.etc[:pr_diag_host]::Vector{T}
    # Broadcasting directly does not work, as the MadNLP arrays are allocated
    # on the CPU whereas pr_diag is allocated on the GPU.
    pr_diag_h .= ips.zl ./ (ips.x .- ips.xl) .+ ips.zu ./ (ips.xu .- ips.x)
    copyto!(kkt.pr_diag, pr_diag_h)
    fill!(kkt.du_diag, 0.0)
end

@kernel function _build_dense_kkt_system_kernel!(
    dest, hess, jac, pr_diag, du_diag, diag_hess, n, m, ns
)
    i, j = @index(Global, NTuple)
    if (i <= n)
        # Transfer Hessian
        if (i == j)
            dest[i, i] = pr_diag[i] + diag_hess[i]
        elseif j <= n
            dest[i, j] = hess[i, j]
            dest[j, i] = hess[j, i]
        end
    elseif i <= n + ns
        # Transfer slack diagonal
        dest[i, i] = pr_diag[i]
    elseif i <= n + ns + m
        # Transfer Jacobian
        i_ = i - n - ns
        dest[i, j] = jac[i_, j]
        dest[j, i] = jac[i_, j]
        # Transfer dual regularization
        dest[i, i] = du_diag[i_]
    end
end

function MadNLP._build_dense_kkt_system!(
    dest::CuMatrix, hess::CuMatrix, jac::CuMatrix,
    pr_diag::CuVector, du_diag::CuVector, diag_hess::CuVector, n, m, ns
)
    ndrange = (n + m + ns, n + ns)
    ev = _build_dense_kkt_system_kernel!(CUDADevice())(dest, hess, jac, pr_diag, du_diag, diag_hess, n, m, ns, ndrange=ndrange)
    wait(ev)
end

function MadNLP.compress_jacobian!(kkt::MadNLP.DenseKKTSystem{T, VT, MT}) where {T, VT<:CuVector{T}, MT<:CuMatrix{T}}
    m = size(kkt.jac, 1)
    n = size(kkt.hess, 1)
    # Extract the linear indexes of the slack diagonal, shifted past the n variable columns
    index = (LinearAlgebra.diagind(kkt.jac) .+ n * m)[kkt.ind_ineq]
    # Set the slack entries to -1
    kkt.jac[index] .= -one(T)
    # Scale the Jacobian
    kkt.jac .*= kkt.jacobian_scaling
    return
end
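Once MadNLPGPU is loaded, these overloads are picked up by ordinary dispatch on the CUDA array types. As a quick sanity check, the diagonal extraction can be exercised directly on device arrays (illustrative usage, not part of the PR):

    using CUDA, LinearAlgebra
    import MadNLP, MadNLPGPU

    A = CUDA.rand(Float64, 4, 4)
    d = CuVector{Float64}(undef, 4)
    MadNLP.diag!(d, A)                  # runs the _copy_diag! kernel on the device
    @assert Array(d) == diag(Array(A))  # compare against the CPU result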
24 changes: 12 additions & 12 deletions lib/MadNLPGPU/src/lapackgpu.jl
@@ -23,8 +23,8 @@ const INPUT_MATRIX_TYPE = :dense
     lapackgpu_algorithm::Algorithms = BUNCHKAUFMAN
 end
 
-mutable struct Solver <: AbstractLinearSolver
-    dense::Matrix{Float64}
+mutable struct Solver{MT} <: AbstractLinearSolver
+    dense::MT
     fact::CuMatrix{Float64}
     rhs::CuVector{Float64}
     work::CuVector{Float64}
@@ -37,10 +37,10 @@ mutable struct Solver <: AbstractLinearSolver
     logger::Logger
 end
 
-function Solver(dense::Matrix{Float64};
+function Solver(dense::MT;
                 option_dict::Dict{Symbol,Any}=Dict{Symbol,Any}(),
                 opt=Options(),logger=Logger(),
-                kwargs...)
+                kwargs...) where {MT <: AbstractMatrix}
 
     set_options!(opt,option_dict,kwargs...)
     fact = CuMatrix{Float64}(undef,size(dense))
@@ -88,10 +88,10 @@ introduce(M::Solver) = "Lapack-GPU ($(M.opt.lapackgpu_algorithm))"
 if toolkit_version() >= v"11.3.1"
 
     is_inertia(M::Solver) = M.opt.lapackgpu_algorithm == CHOLESKY # TODO: implement inertia(M::Solver) for BUNCHKAUFMAN
 
     function factorize_bunchkaufman!(M::Solver)
-        haskey(M.etc,:ipiv) || (M.etc[:ipiv] = CuVector{Int32}(undef,size(M.dense,1)))
-        haskey(M.etc,:ipiv64) || (M.etc[:ipiv64] = CuVector{Int64}(undef,length(M.etc[:ipiv])))
+        haskey(M.etc,:ipiv) || (M.etc[:ipiv] = CuVector{Int32}(undef,size(M.dense,1)))
+        haskey(M.etc,:ipiv64) || (M.etc[:ipiv64] = CuVector{Int64}(undef,length(M.etc[:ipiv])))
 
         copyto!(M.fact,M.dense)
         cusolverDnDsytrf_bufferSize(
@@ -105,7 +105,7 @@ if toolkit_version() >= v"11.3.1"
     end
 
     function solve_bunchkaufman!(M::Solver,x)
-
+
         copyto!(M.etc[:ipiv64],M.etc[:ipiv])
         copyto!(M.rhs,x)
         ccall((:cusolverDnXsytrs_bufferSize, libcusolver()), cusolverStatus_t,
@@ -126,12 +126,12 @@ if toolkit_version() >= v"11.3.1"
             size(M.fact,1),1,R_64F,M.fact,size(M.fact,2),
             M.etc[:ipiv64],R_64F,M.rhs,length(M.rhs),M.work,M.lwork[],M.work_host,M.lwork_host[],M.info)
         copyto!(x,M.rhs)
-
+
         return x
     end
 else
     is_inertia(M::Solver) =
-        M.opt.lapackgpu_algorithm == CHOLESKY || M.opt.lapackgpu_algorithm == CHOLESKY
+        M.opt.lapackgpu_algorithm == CHOLESKY || M.opt.lapackgpu_algorithm == CHOLESKY
 
     function factorize_bunchkaufman!(M::Solver)
         haskey(M.etc,:ipiv) || (M.etc[:ipiv] = CuVector{Int32}(undef,size(M.dense,1)))
@@ -162,7 +162,7 @@ else
             Cvoid,
             (Ref{Cchar},Ref{Int},Ref{Int},Ptr{Cdouble},Ref{Int},Ptr{Int},Ptr{Cdouble},Ref{Int},Ptr{Int}),
             'L',size(M.fact,1),1,M.etc[:fact_cpu],size(M.fact,2),M.etc[:ipiv_cpu],x,length(x),[1])
-
+
         return x
     end
 end
@@ -242,7 +242,7 @@ function inertia(M::Solver)
     if M.opt.lapackgpu_algorithm == BUNCHKAUFMAN
         inertia(M.etc[:fact_cpu],M.etc[:ipiv_cpu],M.etc[:info_cpu][])
     elseif M.opt.lapackgpu_algorithm == CHOLESKY
-        sum(M.info) == 0 ? (size(M.fact,1),0,0) : (0,size(M.fact,1),0)
+        sum(M.info) == 0 ? (size(M.fact,1),0,0) : (0,size(M.fact,1),0)
     else
         error(LOGGER,"Invalid lapackcpu_algorithm")
     end
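With Solver now parameterized on MT, the wrapper accepts a matrix that already lives on the GPU as well as a host Matrix{Float64}, which lets a GPU-resident KKT system hand its buffers to the factorization without a host round-trip. A sketch of direct usage, assuming the standard MadNLP linear-solver interface (factorize!, solve!) that this module extends:

    using CUDA, MadNLP, MadNLPGPU

    B = CUDA.rand(Float64, 8, 8)
    A = B + B'                       # symmetric indefinite matrix on the device
    M = MadNLPLapackGPU.Solver(A)    # dense::MT now admits a CuMatrix{Float64}
    MadNLP.factorize!(M)             # Bunch-Kaufman by default
    b = rand(8)
    MadNLP.solve!(M, b)              # b is a host vector; copies happen internally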
44 changes: 44 additions & 0 deletions lib/MadNLPGPU/test/densekkt_gpu.jl
@@ -0,0 +1,44 @@

using CUDA
using MadNLPTests

function _compare_gpu_with_cpu(n, m, ind_fixed)
    madnlp_options = Dict{Symbol, Any}(
        :kkt_system=>MadNLP.DENSE_KKT_SYSTEM,
        :linear_solver=>MadNLPLapackGPU,
        :print_level=>MadNLP.ERROR,
    )

    nlp = MadNLPTests.DenseDummyQP(; n=n, m=m, fixed_variables=ind_fixed)

    h_ips = MadNLP.InteriorPointSolver(nlp; option_dict=copy(madnlp_options))
    MadNLP.optimize!(h_ips)

    # Reinit the NonlinearProgram to avoid side effects from the first solve
    ind_cons = MadNLP.get_index_constraints(nlp)
    ns = length(ind_cons.ind_ineq)

    # Init the KKT system on the GPU
    TKKTGPU = MadNLP.DenseKKTSystem{Float64, CuVector{Float64}, CuMatrix{Float64}}
    opt = MadNLP.Options(; madnlp_options...)
    # Instantiate the solver with the KKT system on the GPU
    d_ips = MadNLP.InteriorPointSolver{TKKTGPU}(nlp, opt; option_linear_solver=copy(madnlp_options))
    MadNLP.optimize!(d_ips)

    # Check that both solves give the same result, up to a tight tolerance
    @test h_ips.cnt.k == d_ips.cnt.k
    @test h_ips.obj_val ≈ d_ips.obj_val atol=1e-10
    @test h_ips.x ≈ d_ips.x atol=1e-10
    @test h_ips.l ≈ d_ips.l atol=1e-10
end

@testset "MadNLP: dense versus sparse" begin
    @testset "Size: ($n, $m)" for (n, m) in [(10, 0), (10, 5), (50, 10)]
        _compare_gpu_with_cpu(n, m, Int[])
    end
    @testset "Fixed variables" begin
        n, m = 10, 5
        _compare_gpu_with_cpu(n, m, Int[1, 2])
    end
end
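The key detail in this test is that GPU storage is selected purely through DenseKKTSystem's type parameters (VT, MT); dispatch then routes assembly and products to the kernels in kernels.jl. The host solve built above through the default constructor should be equivalent to requesting the CPU storage types explicitly (an assumption, mirroring the GPU construction):

    TKKTCPU = MadNLP.DenseKKTSystem{Float64, Vector{Float64}, Matrix{Float64}}
    h_ips = MadNLP.InteriorPointSolver{TKKTCPU}(nlp, opt; option_linear_solver=copy(madnlp_options))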

5 changes: 4 additions & 1 deletion lib/MadNLPGPU/test/runtests.jl
@@ -39,10 +39,13 @@ testset = [
     ],
 ]
 
-@testset "MadNLPGPU test" begin
+# Test LapackGPU wrapper
+@testset "LapackGPU test" begin
     for (name,optimizer_constructor,exclude) in testset
         test_madnlp(name,optimizer_constructor,exclude)
     end
 end
 
+# Test DenseKKTSystem on GPU
+include("densekkt_gpu.jl")
 
5 changes: 4 additions & 1 deletion lib/MadNLPTests/Project.toml
@@ -6,8 +6,11 @@ version = "0.1.0"
 [deps]
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 JuMP = "4076af6c-e467-56ae-b986-b466b2749572"
+MadNLP = "2621e9c9-9eb4-46b1-8089-e8c72242dfb6"
 NLPModels = "a4795742-8479-5a88-8948-cc11e1c8c1a6"
+Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 
 [compat]
-julia = "1.3"
+JuMP = "~0.21"
+julia = "1.3"