support DenseKKTSystem on GPUs
* add dedicated GPU kernels for KKT operations
* add proper tests for DenseKKTSystem on GPU
* move the build_qp_dense function into MadNLPTests
frapac committed Sep 10, 2021
1 parent 01bc23e commit 6bc1867
Showing 11 changed files with 371 additions and 170 deletions.
9 changes: 7 additions & 2 deletions lib/MadNLPGPU/Project.toml
@@ -5,16 +5,21 @@ version = "0.1.0"

[deps]
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
CUDAKernels = "72cfdca4-0801-4ab0-bf6a-d52aa10adc57"
KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
MadNLP = "2621e9c9-9eb4-46b1-8089-e8c72242dfb6"

[compat]
CUDA = "~2,~3"
CUDAKernels = "0.3.0"
KernelAbstractions = "0.7"
MadNLP = "~0.2"
julia = "1.5"

[extras]
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
MadNLPTests = "b52a2a03-04ab-4a5f-9698-6a2deff93217"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"

[targets]
test = ["Test","MadNLPTests"]
test = ["Test", "MadNLPTests"]
12 changes: 11 additions & 1 deletion lib/MadNLPGPU/src/MadNLPGPU.jl
@@ -1,12 +1,22 @@
module MadNLPGPU

import CUDA: CUBLAS, CUSOLVER, CuVector, CuMatrix, toolkit_version, R_64F, has_cuda
# CUDA
import CUDA: CUBLAS, CUSOLVER, CuVector, CuMatrix, CuArray, toolkit_version, R_64F, has_cuda
# Kernels
using KernelAbstractions
using CUDAKernels
using LinearAlgebra
using MadNLP

import MadNLP:
@kwdef, Logger, @debug, @warn, @error,
AbstractOptions, AbstractLinearSolver, set_options!, MadNLPLapackCPU,
SymbolicException,FactorizationException,SolveException,InertiaException,
introduce, factorize!, solve!, improve!, is_inertia, inertia, tril_to_full!


include("kernels.jl")

if has_cuda()
include("lapackgpu.jl")
export MadNLPLapackGPU
123 changes: 123 additions & 0 deletions lib/MadNLPGPU/src/kernels.jl
@@ -0,0 +1,123 @@
#=
MadNLP utils
=#

@kernel function _copy_diag!(dest, src)
i = @index(Global)
dest[i] = src[i, i]
end

function MadNLP.diag!(dest::CuVector{T}, src::CuMatrix{T}) where T
@assert length(dest) == size(src, 1)
ev = _copy_diag!(CUDADevice())(dest, src, ndrange=length(dest))
wait(ev)
end

@kernel function _add_diagonal!(dest, src1, src2)
i = @index(Global)
dest[i, i] = src1[i] + src2[i]
end

function MadNLP.diag_add!(dest::CuMatrix, src1::CuVector, src2::CuVector)
ev = _add_diagonal!(CUDADevice())(dest, src1, src2, ndrange=size(dest, 1))
wait(ev)
end
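
# A minimal usage sketch of the launch pattern above, assuming a CUDA device is
# available: a kernel is instantiated on a device, launched over an `ndrange`,
# and returns an event that must be waited on before the result is used.
#
#     d = CUDA.zeros(Float64, 16)
#     A = CUDA.rand(Float64, 16, 16)
#     MadNLP.diag!(d, A)  # copy the diagonal of A into d, entirely on the GPU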

#=
MadNLP kernels
=#

# Overload is_valid to avoid falling back to the default implementation, which is slow on the GPU
MadNLP.is_valid(src::CuArray) = true

# Constraint scaling
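# Each row i of the Jacobian is scaled by min(1, nlp_scaling_max_gradient / max_j |jac[i, j]|),
# so that no scaled constraint gradient exceeds nlp_scaling_max_gradient in magnitude.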
function MadNLP.set_con_scale!(con_scale::AbstractVector, jac::CuMatrix, nlp_scaling_max_gradient)
# Compute the reduction on the GPU with the built-in CUDA.jl function
d_con_scale = maximum(abs, jac, dims=2)
copyto!(con_scale, d_con_scale)
con_scale .= min.(1.0, nlp_scaling_max_gradient ./ con_scale)
end

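# For each fixed variable i, zero out row i and column i of the augmented matrix
# and set a unit coefficient on the diagonal, pinning the variable at its bound.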
@kernel function _treat_fixed_variable_kernel!(dest, ind_fixed)
k, j = @index(Global, NTuple)
i = ind_fixed[k]

if i == j
dest[i, i] = 1.0
else
dest[i, j] = 0.0
dest[j, i] = 0.0
end
end

function MadNLP.treat_fixed_variable!(kkt::MadNLP.AbstractKKTSystem{T, MT}) where {T, MT<:CuMatrix{T}}
length(kkt.ind_fixed) == 0 && return
aug = kkt.aug_com
d_ind_fixed = kkt.ind_fixed |> CuVector # TODO: allocate ind_fixed directly on the GPU
ndrange = (length(d_ind_fixed), size(aug, 1))
ev = _treat_fixed_variable_kernel!(CUDADevice())(aug, d_ind_fixed, ndrange=ndrange)
wait(ev)
end

#=
DenseKKTSystem kernels
=#

function MadNLP.mul!(y::AbstractVector, kkt::MadNLP.DenseKKTSystem{T, VT, MT}, x::AbstractVector) where {T, VT<:CuVector{T}, MT<:CuMatrix{T}}
@assert length(kkt._w2) == length(x)
# x and y can be host arrays. Copy them to the device to avoid side effects.
copyto!(kkt._w2, x)
mul!(kkt._w1, kkt.aug_com, kkt._w2)
copyto!(y, kkt._w1)
end

function MadNLP.jtprod!(y::AbstractVector, kkt::MadNLP.DenseKKTSystem{T, VT, MT}, x::AbstractVector) where {T, VT<:CuVector{T}, MT<:CuMatrix{T}}
# x and y can be host arrays. Copy them to the device to avoid side effects.
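# Note: unlike mul! above, which reuses the preallocated buffers kkt._w1 and
# kkt._w2, this allocates fresh device copies of x and y on every call.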
d_x = x |> CuArray
d_y = y |> CuArray
mul!(d_y, kkt.jac', d_x)
copyto!(y, d_y)
end

function MadNLP.set_aug_diagonal!(kkt::MadNLP.DenseKKTSystem{T, VT, MT}, ips::MadNLP.Solver) where {T, VT<:CuVector{T}, MT<:CuMatrix{T}}
# Broadcasting does not work here: MadNLP's arrays are allocated on the CPU,
# whereas pr_diag is allocated on the GPU.
copyto!(kkt.pr_diag, ips.zl./(ips.x.-ips.xl) .+ ips.zu./(ips.xu.-ips.x))
fill!(kkt.du_diag, 0.0)
end

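# The kernel below assembles the dense augmented KKT matrix, whose block
# structure (following the index tests on i and j) is
#
#   [ H + Σ_x    0      J_x' ]   rows 1:n             (original variables)
#   [   0       Σ_s     J_s' ]   rows n+1:n+ns        (slack variables)
#   [  J_x      J_s      Δ   ]   rows n+ns+1:n+ns+m   (constraints)
#
# where pr_diag stacks [Σ_x; Σ_s], jac = [J_x J_s] has size m × (n + ns), and
# Δ is the dual regularization stored in du_diag.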
@kernel function _build_dense_kkt_system_kernel!(
dest, hess, jac, pr_diag, du_diag, diag_hess, n, m, ns
)
i, j = @index(Global, NTuple)
if (i <= n)
# Transfer Hessian
if (i == j)
dest[i, i] = pr_diag[i] + diag_hess[i]
elseif j <= n
dest[i, j] = hess[i, j]
dest[j, i] = hess[j, i]
end
elseif i <= n + ns
# Transfer slack diagonal
dest[i, i] = pr_diag[i]
elseif i <= n + ns + m
# Transfer Jacobian
i_ = i - n - ns
dest[i, j] = jac[i_, j]
dest[j, i] = jac[i_, j]
# Transfer dual regularization
dest[i, i] = du_diag[i_]
end
end

function MadNLP._build_dense_kkt_system!(
dest::CuMatrix, hess::CuMatrix, jac::CuMatrix,
pr_diag::CuVector, du_diag::CuVector, diag_hess::CuVector, n, m, ns
)
ndrange = (n+m+ns, n+ns)
ev = _build_dense_kkt_system_kernel!(CUDADevice())(dest, hess, jac, pr_diag, du_diag, diag_hess, n, m, ns, ndrange=ndrange)
wait(ev)
end

20 changes: 10 additions & 10 deletions lib/MadNLPGPU/src/lapackgpu.jl
@@ -21,8 +21,8 @@ const INPUT_MATRIX_TYPE = :dense
lapackgpu_algorithm::Algorithms = BUNCHKAUFMAN
end

mutable struct Solver <: AbstractLinearSolver
dense::Matrix{Float64}
mutable struct Solver{MT} <: AbstractLinearSolver
dense::MT
fact::CuMatrix{Float64}
rhs::CuVector{Float64}
work::CuVector{Float64}
@@ -35,10 +35,10 @@ mutable struct Solver <: AbstractLinearSolver
logger::Logger
end

function Solver(dense::Matrix{Float64};
function Solver(dense::MT;
option_dict::Dict{Symbol,Any}=Dict{Symbol,Any}(),
opt=Options(),logger=Logger(),
kwargs...)
kwargs...) where {MT <: AbstractMatrix}

set_options!(opt,option_dict,kwargs...)
fact = CuMatrix{Float64}(undef,size(dense))
@@ -82,10 +82,10 @@ introduce(M::Solver) = "Lapack-GPU ($(M.opt.lapackgpu_algorithm))"
if toolkit_version() >= v"11.3.1"

is_inertia(M::Solver) = false # TODO: implement inertia(M::Solver) for BUNCHKAUFMAN

function factorize_bunchkaufman!(M::Solver)
haskey(M.etc,:ipiv) || (M.etc[:ipiv] = CuVector{Int32}(undef,size(M.dense,1)))
haskey(M.etc,:ipiv64) || (M.etc[:ipiv64] = CuVector{Int64}(undef,length(M.etc[:ipiv])))
haskey(M.etc,:ipiv) || (M.etc[:ipiv] = CuVector{Int32}(undef,size(M.dense,1)))
haskey(M.etc,:ipiv64) || (M.etc[:ipiv64] = CuVector{Int64}(undef,length(M.etc[:ipiv])))

copyto!(M.fact,M.dense)
cusolverDnDsytrf_bufferSize(
@@ -99,7 +99,7 @@ if toolkit_version() >= v"11.3.1"
end

function solve_bunchkaufman!(M::Solver,x)

copyto!(M.etc[:ipiv64],M.etc[:ipiv])
copyto!(M.rhs,x)
ccall((:cusolverDnXsytrs_bufferSize, libcusolver()), cusolverStatus_t,
@@ -120,7 +120,7 @@
size(M.fact,1),1,R_64F,M.fact,size(M.fact,2),
M.etc[:ipiv64],R_64F,M.rhs,length(M.rhs),M.work,M.lwork[],M.work_host,M.lwork_host[],M.info)
copyto!(x,M.rhs)

return x
end
else
@@ -156,7 +156,7 @@ else
Cvoid,
(Ref{Cchar},Ref{Int},Ref{Int},Ptr{Cdouble},Ref{Int},Ptr{Int},Ptr{Cdouble},Ref{Int},Ptr{Int}),
'L',size(M.fact,1),1,M.etc[:fact_cpu],size(M.fact,2),M.etc[:ipiv_cpu],x,length(x),[1])

return x
end
end
47 changes: 47 additions & 0 deletions lib/MadNLPGPU/test/densekkt_gpu.jl
@@ -0,0 +1,47 @@

using CUDA
using MadNLPTests

function _compare_gpu_with_cpu(n, m, ind_fixed)
madnlp_options = Dict{Symbol, Any}(
:kkt_system=>MadNLP.DENSE_KKT_SYSTEM,
:linear_solver=>MadNLPLapackGPU,
:print_level=>MadNLP.ERROR,
)

nlp = MadNLPTests.build_qp_test(; n=n, m=m, fixed_variables=ind_fixed)
x, l = copy(nlp.x), copy(nlp.l)

h_ips = MadNLP.Solver(nlp; option_dict=copy(madnlp_options))
MadNLP.optimize!(h_ips)

# Reinit NonlinearProgram to avoid side effects
nlp = MadNLPTests.build_qp_test(; n=n, m=m, fixed_variables=ind_fixed)
ind_cons = MadNLP.get_index_constraints(nlp)
ns = length(ind_cons.ind_ineq)

# Init KKT on the GPU
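# (buffer_size is presumably the full KKT dimension n + m + ns, so that the
# device work vectors used by mul! can hold a complete primal-dual vector)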
kkt = MadNLP.DenseKKTSystem{Float64, CuVector{Float64}, CuMatrix{Float64}}(
nlp, ind_cons; buffer_size=(nlp.n+nlp.m+ns),
)
# Instantiate Solver with KKT on the GPU
d_ips = MadNLP.Solver(nlp, kkt; option_dict=copy(madnlp_options))
MadNLP.optimize!(d_ips)

# Check that the CPU and GPU results match within tight tolerances
@test h_ips.cnt.k == d_ips.cnt.k
@test h_ips.obj_val ≈ d_ips.obj_val atol=1e-10
@test h_ips.x ≈ d_ips.x atol=1e-10
@test h_ips.l ≈ d_ips.l atol=1e-10
end

@testset "MadNLP: dense versus sparse" begin
@testset "Size: ($n, $m)" for (n, m) in [(10, 0), (10, 5), (50, 10)]
_compare_gpu_with_cpu(n, m, Int[])
end
@testset "Fixed variables" begin
n, m = 10, 5
_compare_gpu_with_cpu(n, m, Int[1, 2])
end
end

5 changes: 4 additions & 1 deletion lib/MadNLPGPU/test/runtests.jl
@@ -30,10 +30,13 @@ testset = [
],
]

@testset "MadNLPGPU test" begin
# Test LapackGPU wrapper
@testset "LapackGPU test" begin
for (name,optimizer_constructor,exclude) in testset
test_madnlp(name,optimizer_constructor,exclude)
end
end

# Test DenseKKTSystem on GPU
include("densekkt_gpu.jl")

5 changes: 4 additions & 1 deletion lib/MadNLPTests/Project.toml
@@ -6,8 +6,11 @@ version = "0.1.0"
[deps]
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
JuMP = "4076af6c-e467-56ae-b986-b466b2749572"
MadNLP = "2621e9c9-9eb4-46b1-8089-e8c72242dfb6"
NLPModels = "a4795742-8479-5a88-8948-cc11e1c8c1a6"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"

[compat]
JuMP = "~0.21"
julia = "1.3"
julia = "1.3"
