In [1]:
using TopOpt.GPUUtils, TopOpt.MMA, CUDAnative, CuArrays, LinearAlgebra, SparseArrays, Random, Optim, Suppressor
import CUDAdrv

In [2]:
struct Loss{TV, TM} <: Function
    B::TM
    c::TV
    r::TV
end
function (f::Loss)(w, grad)
    #if w isa Array
    #    sleep(0.1)
    #end
    mul!(f.r, f.B, w)
    grad .= 2 .* f.r .- 2 .* f.c
    return dot(f.r, w) - 2 * dot(f.c, w) + dot(w, w)    
end
GPUUtils.whichdevice(l::Loss) = GPUUtils.whichdevice(l.B);

In [3]:
struct HeterLoss{TV, TV2, TM} <: Function
    B::TM
    c::TV
    r::TV
    w::TV2
    grad::TV2
end
function HeterLoss(B, y, r)
    w = CuArray(r)
    grad = similar(w)
    return HeterLoss(B, c, r, w, grad)
end
function (f::HeterLoss)(w, grad)
    copyto!(f.w, w)
    mul!(f.r, f.B, f.w)
    f.grad .= 2 .* f.r .- 2 .* f.c
    copyto!(grad, f.grad)
    
    return dot(f.r, f.w) - 2 * dot(f.c, f.w) + dot(f.w, f.w)
end
GPUUtils.whichdevice(l::Loss) = GPUUtils.CPU();

In [4]:
nparams = 4000;
npoints = 10;

In [5]:
struct LowerBound{T} <: Function
    v::T
end
struct UpperBound{T} <: Function
    v::T
end
(b::LowerBound)(w, grad) = onenormlower(w, grad, b.v)
(b::UpperBound)(w, grad) = onenormupper(w, grad, b.v)

function onenormupper(w, grad, limit)
    grad .= sign.(w)
    return norm(w, 1) - limit
end
function onenormlower(w, grad, limit)
    grad .= -sign.(w)
    return limit - norm(w, 1)
end

onenormlower (generic function with 1 method)

In [6]:
Random.seed!(1);
lb = 5.0;
ub = 50.0;

In [7]:
X_cpu = sprand(npoints, nparams, 5/nparams);
B_cpu = X_cpu' * X_cpu
true_w_cpu = Vector(sprand(nparams, (lb+ub)/2/nparams));
y_cpu = X_cpu * true_w_cpu;
c_cpu = X_cpu' * y_cpu
r_cpu = similar(c_cpu)
w0_cpu = rand(nparams);
cpuloss = Loss(B_cpu, c_cpu, r_cpu)

(::Loss{Array{Float64,1},SparseMatrixCSC{Float64,Int64}}) (generic function with 1 method)

In [12]:
m_cpu = MMAModel(nparams, cpuloss, xtol = 1e-4, maxiter = 200);
box!(m_cpu, 0.0, 1.0);
ineq_constraint!(m_cpu, UpperBound(ub));
#ineq_constraint!(m_cpu, LowerBound(lb));

@time @suppress MMA.optimize(m_cpu, w0_cpu, MMA.MMA87(), Optim.ConjugateGradient(), dual_caps = (0.0, 100.0));

  0.085705 seconds (42.79 k allocations: 3.000 MiB)


In [14]:
B_gpu = CuArrays.CUSPARSE.CuSparseMatrixCSC(B_cpu);
c_gpu = CuArray(c_cpu)
r_gpu = CuArray(r_cpu)
w0_gpu = CuArray(w0_cpu);
gpuloss = Loss(B_gpu, c_gpu, r_gpu);

In [15]:
heterloss = HeterLoss(X_gpu, y_gpu);
m_heter = MMAModel(nparams, heterloss, xtol = 1e-4, maxiter = 200);
box!(m_heter, 0.0, 1.0);
ineq_constraint!(m_heter, UpperBound(ub));
#ineq_constraint!(m_cpu, LowerBound(lb));

@time @suppress MMA.optimize(m_heter, w0_cpu, MMA.MMA87(), Optim.ConjugateGradient(), dual_caps = (0.0, 100.0));

UndefVarError: UndefVarError: X_gpu not defined

In [26]:
m_gpu = MMAModel(nparams, gpuloss, xtol = 1e-4, maxiter = 200);
box!(m_gpu, 0.0, 1.0);
ineq_constraint!(m_gpu, UpperBound(ub));
#ineq_constraint!(m_gpu, LowerBound(lb));

@time @suppress MMA.optimize(m_gpu, w0_gpu, MMA.MMA87(), Optim.ConjugateGradient(), dual_caps = (0.0, 100.0));

  7.517029 seconds (3.02 M allocations: 192.312 MiB, 0.50% gc time)


In [13]:
grad_cpu = similar(w0_cpu)
@time cpuloss(w0_cpu, grad_cpu);

  0.205466 seconds (347.82 k allocations: 17.369 MiB, 2.72% gc time)


In [31]:
grad_gpu = similar(w0_gpu)
@time gpuloss(w0_gpu, grad_gpu);

  0.002568 seconds (62 allocations: 3.016 KiB)
