This repository has been archived by the owner on Mar 12, 2021. It is now read-only.

Support for early-freeing GPU memory #275

Merged
merged 8 commits on Feb 8, 2019
21 changes: 20 additions & 1 deletion src/array.jl
@@ -21,8 +21,13 @@ CuVector{T} = CuArray{T,1}
CuMatrix{T} = CuArray{T,2}
CuVecOrMat{T} = Union{CuVector{T},CuMatrix{T}}

function unsafe_free!(xs::CuArray)
const INVALID = Mem.alloc(0)

function unsafe_free!(xs::CuArray{<:Any,N}) where {N}
xs.buf === INVALID && return
Mem.release(xs.buf) && dealloc(xs.buf, prod(xs.dims)*sizeof(eltype(xs)))
xs.dims = Tuple(0 for _ in 1:N)
xs.buf = INVALID
return
end

@@ -44,6 +49,20 @@ CuArray{T}(::UndefInitializer, dims::Integer...) where {T} =
# empty vector constructor
CuArray{T,1}() where {T} = CuArray{T,1}(undef, 0)

# do-block constructors
for (ctor, tvars) in (:CuArray => (), :(CuArray{T}) => (:T,), :(CuArray{T,N}) => (:T, :N))
@eval begin
function $ctor(f::Function, args...) where {$(tvars...)}
xs = $ctor(args...)
try
f(xs)
finally
unsafe_free!(xs)
end
end
end
end


Base.similar(a::CuArray{T,N}) where {T,N} = CuArray{T,N}(undef, size(a))
Base.similar(a::CuArray{T}, dims::Base.Dims{N}) where {T,N} = CuArray{T,N}(undef, dims)
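Reviewer note: the new do-block constructors give temporaries a deterministic lifetime, mirroring Base's `open(f, path)` idiom: the block's value is returned and the buffer goes back to the pool in a `finally`, even if the body throws. A minimal usage sketch (hypothetical example, not part of this diff):

```julia
using CuArrays

# Buffer is freed as soon as the block exits; no waiting for the GC.
total = CuArray{Float32}(undef, 1024) do xs
    fill!(xs, 1f0)
    sum(xs)                    # the block's value becomes `total`
end

# Double-freeing is safe: after the first call the buffer is swapped
# for the 0-byte INVALID sentinel and later calls return early.
ys = CuArray{Float32}(undef, 16)
CuArrays.unsafe_free!(ys)
CuArrays.unsafe_free!(ys)      # no-op
```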
2 changes: 1 addition & 1 deletion src/blas/CUBLAS.jl
@@ -4,7 +4,7 @@ import CUDAdrv: CUDAdrv, CuContext, CuStream_t, CuPtr, PtrOrCuPtr, CU_NULL
import CUDAapi

using ..CuArrays
using ..CuArrays: libcublas, active_context
using ..CuArrays: libcublas, active_context, unsafe_free!

using LinearAlgebra

48 changes: 38 additions & 10 deletions src/blas/wrappers.jl
@@ -221,7 +221,7 @@ for (fname, elty) in ((:cublasIdamax_v2,:Float64),
dx::CuArray{$elty},
incx::Integer)
result = Ref{Cint}()
$fname(handle(), n, dx, incx, result)
return result[]
end
end
@@ -836,7 +836,7 @@ for (fname, elty) in
B::Array{CuMatrix{$elty},1},
beta::($elty),
C::Array{CuMatrix{$elty},1})
if( length(A) != length(B) || length(A) != length(C) )
if length(A) != length(B) || length(A) != length(C)
throw(DimensionMismatch(""))
end
for (As,Bs,Cs) in zip(A,B,C)
@@ -847,6 +847,7 @@ for (fname, elty) in
throw(DimensionMismatch(""))
end
end

m = size(A[1], transA == 'N' ? 1 : 2)
k = size(A[1], transA == 'N' ? 2 : 1)
n = size(B[1], transB == 'N' ? 2 : 1)
@@ -860,6 +861,10 @@ for (fname, elty) in
Cptrs = device_batch(C)
$fname(handle(), cutransA,cutransB, m, n, k, [alpha], Aptrs, lda, Bptrs,
ldb, [beta], Cptrs, ldc, length(A))
unsafe_free!(Cptrs)
unsafe_free!(Bptrs)
unsafe_free!(Aptrs)

C
end
function gemm_batched(transA::Char,
@@ -1346,7 +1351,7 @@ for (fname, elty) in
cuuplo = cublasfill(uplo)
cutransa = cublasop(transa)
cudiag = cublasdiag(diag)
if( length(A) != length(B) )
if length(A) != length(B)
throw(DimensionMismatch(""))
end
for (As,Bs) in zip(A,B)
@@ -1355,12 +1360,16 @@ for (fname, elty) in
if mA != nA throw(DimensionMismatch("A must be square")) end
if nA != (side == 'L' ? m : n) throw(DimensionMismatch("trsm_batched!")) end
end

m,n = size(B[1])
lda = max(1,stride(A[1],2))
ldb = max(1,stride(B[1],2))
Aptrs = device_batch(A)
Bptrs = device_batch(B)
$fname(handle(), cuside, cuuplo, cutransa, cudiag, m, n, [alpha], Aptrs, lda, Bptrs, ldb, length(A))
unsafe_free!(Bptrs)
unsafe_free!(Aptrs)

B
end
function trsm_batched(side::Char,
@@ -1453,13 +1462,16 @@ for (fname, elty) in
throw(DimensionMismatch("All matrices must be square!"))
end
end

m,n = size(A[1])
lda = max(1,stride(A[1],2))
Aptrs = device_batch(A)
info = CuArray{Cint}(undef, length(A))
pivotArray = Pivot ? CuArray{Int32}(undef, (n, length(A))) : CU_NULL
$fname(handle(), n, Aptrs, lda, pivotArray, info, length(A))
if( !Pivot )
unsafe_free!(Aptrs)

if !Pivot
pivotArray = CuArray(zeros(Cint, (n, length(A))))
end
pivotArray, info, A
@@ -1493,6 +1505,7 @@ for (fname, elty) in
throw(DimensionMismatch("All A matrices must be square!"))
end
end

C = CuMatrix{$elty}[similar(A[1]) for i in 1:length(A)]
n = size(A[1])[1]
lda = max(1,stride(A[1],2))
@@ -1501,6 +1514,9 @@
Cptrs = device_batch(C)
info = CuArray(zeros(Cint,length(A)))
$fname(handle(), n, Aptrs, lda, pivotArray, Cptrs, ldc, info, length(A))
unsafe_free!(Cptrs)
unsafe_free!(Aptrs)

pivotArray, info, C
end
end
@@ -1528,6 +1544,7 @@ for (fname, elty) in
throw(ArgumentError("matinv requires all matrices be smaller than 32 x 32"))
end
end

C = CuMatrix{$elty}[similar(A[1]) for i in 1:length(A)]
n = size(A[1])[1]
lda = max(1,stride(A[1],2))
@@ -1536,6 +1553,9 @@
Cptrs = device_batch(C)
info = CuArray(zeros(Cint,length(A)))
$fname(handle(), n, Aptrs, lda, Cptrs, ldc, info, length(A))
unsafe_free!(Cptrs)
unsafe_free!(Aptrs)

info, C
end
end
@@ -1560,14 +1580,17 @@
hTauArray = [zeros($elty, min(m,n)) for i in 1:length(A)]
TauArray = CuArray{$elty,1}[]
for i in 1:length(A)
push!(TauArray,CuArray(hTauArray[i]))
push!(TauArray, CuArray(hTauArray[i]))
end
Tauptrs = device_batch(TauArray)
info = zero(Cint)
$fname(handle(), m, n, Aptrs, lda, Tauptrs, [info], length(A))
if( info != 0 )
unsafe_free!(Tauptrs)

if info != 0
throw(ArgumentError(string("Invalid value at ", -info)))
end

TauArray, A
end
function geqrf_batched(A::Array{CuMatrix{$elty},1})
@@ -1593,20 +1616,21 @@
A::Array{CuMatrix{$elty},1},
C::Array{CuMatrix{$elty},1})
cutrans = cublasop(trans)
if( length(A) != length(C) )
if length(A) != length(C)
throw(DimensionMismatch(""))
end
for (As,Cs) in zip(A,C)
m,n = size(As)
mC,nC = size(Cs)
if( n != mC )
if n != mC
throw(DimensionMismatch(""))
end
end
m,n = size(A[1])
if( m < n )
if m < n
throw(ArgumentError("System must be overdetermined"))
end

nrhs = size(C[1])[2]
lda = max(1,stride(A[1],2))
ldc = max(1,stride(A[1],2))
Expand All @@ -1615,9 +1639,13 @@ for (fname, elty) in
info = zero(Cint)
infoarray = CuArray(zeros(Cint, length(A)))
$fname(handle(), cutrans, m, n, nrhs, Aptrs, lda, Cptrs, ldc, [info], infoarray, length(A))
if( info != 0 )
unsafe_free!(Cptrs)
unsafe_free!(Aptrs)

if info != 0
throw(ArgumentError(string("Invalid value at ", -info)))
end

A, C, infoarray
end
function gels_batched(trans::Char,
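Reviewer note: every batched wrapper touched in this file now follows the same shape: upload temporary pointer arrays with `device_batch`, make the library call, then release those temporaries immediately. A schematic of the pattern (`some_batched_ccall` is a hypothetical stand-in for the CUBLAS call; `device_batch` and `unsafe_free!` are the real helpers used above):

```julia
# Schematic of the resource pattern applied throughout this file.
function batched_wrapper(A::Vector{<:CuMatrix}, B::Vector{<:CuMatrix})
    Aptrs = device_batch(A)    # temporary device array of pointers into A
    Bptrs = device_batch(B)
    some_batched_ccall(Aptrs, Bptrs, length(A))   # hypothetical
    unsafe_free!(Bptrs)        # pointer arrays die here, not at the next GC
    unsafe_free!(Aptrs)
    return B                   # the result matrices themselves stay alive
end
```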
2 changes: 1 addition & 1 deletion src/dnn/CUDNN.jl
@@ -5,7 +5,7 @@ import CUDAapi
import CUDAdrv: CUDAdrv, CuContext, CuPtr, CU_NULL

using ..CuArrays
using ..CuArrays: libcudnn, active_context, configured
using ..CuArrays: libcudnn, active_context, configured, unsafe_free!

include("libcudnn_types.jl")
include("error.jl")
3 changes: 0 additions & 3 deletions src/dnn/libcudnn.jl
@@ -245,7 +245,6 @@ function cudnnConvolutionBiasActivationForward(y::CuArray{T,N}, x::CuArray{T,N},
end

function cudnnConvolutionForward(alpha, xDesc, x, wDesc, w, convDesc, algo, workspace, workspace_size, beta, yDesc, y)
workspace = something(workspace, CU_NULL)
@check ccall((:cudnnConvolutionForward, libcudnn),
cudnnStatus_t,
(cudnnHandle_t, Ptr{Nothing}, cudnnTensorDescriptor_t, CuPtr{Nothing},
@@ -285,7 +284,6 @@ function cudnnGetConvolutionForwardWorkspaceSize(y::CuArray{T,N}, x::CuArray{T,N
end

function cudnnConvolutionBackwardData(alpha, wDesc, w, dyDesc, dy, convDesc, algo, workspace, workspace_size, beta, dxDesc, dx)
workspace = something(workspace, CU_NULL)
@check ccall((:cudnnConvolutionBackwardData, libcudnn),
cudnnStatus_t,
(cudnnHandle_t, Ptr{Nothing}, cudnnFilterDescriptor_t, CuPtr{Nothing},
@@ -325,7 +323,6 @@ function cudnnGetConvolutionBackwardDataWorkspaceSize(dx::CuArray{T,N}, w::CuArr
end

function cudnnConvolutionBackwardFilter(alpha, xDesc, x, dyDesc, dy, convDesc, algo, workspace, workspace_size, beta, dwDesc, dw)
workspace = something(workspace, CU_NULL)
@check ccall((:cudnnConvolutionBackwardFilter, libcudnn),
cudnnStatus_t,
(cudnnHandle_t, Ptr{Nothing}, cudnnTensorDescriptor_t, CuPtr{Nothing},
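Reviewer note: these three deletions are the counterpart of the `nnlib.jl` rewrite below. The convolution entry points now always pass a concrete (possibly zero-byte) `CuVector{UInt8}` workspace, so the `something(workspace, CU_NULL)` fallback for a `nothing` workspace is dead code.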
76 changes: 30 additions & 46 deletions src/dnn/nnlib.jl
@@ -5,6 +5,7 @@ import NNlib: conv!, ∇conv_filter!, ∇conv_data!,
import ..CuArrays: CuVecOrMat, CuVector
using CUDAnative


# Softmax

const CUDNNFloat = Union{Float16,Float32,Float64}
@@ -36,73 +37,56 @@ end
∇logsoftmax(Δ::CuVecOrMat{T}, xs::CuVecOrMat{T}) where T<:CUDNNFloat =
∇logsoftmax!(similar(xs), Δ, xs)

# Convolution

const _conv_workspace = Ref{CuVector{UInt8}}()

function conv_workspace(bytes)
global _conv_workspace
if isassigned(_conv_workspace) && bytes <= length(_conv_workspace[])
_conv_workspace[]
else
_conv_workspace[] = CuVector{UInt8}(undef, bytes)
end
end
# Convolution

function conv!(y::CuArray{T}, x::CuArray{T}, w::CuArray{T};
pad=0, stride=1, flipkernel=0, alpha=1, dilation=1,
workspace::Union{CuVector, Nothing}=nothing, algo=0) where T<:CUDNNFloat
pad=0, stride=1, flipkernel=0, alpha=1, dilation=1, algo=0) where T<:CUDNNFloat
if version() < v"6"
all(x -> x == 1, dilation) || error("Only dilation = 1 is supported in cuDNN version < 6")
end
if workspace === nothing
workspace_size =
cudnnGetConvolutionForwardWorkspaceSize(y, x, w, padding=pad, stride=stride, dilation=dilation,
algo=algo, mode=flipkernel)
workspace = workspace_size != 0 ? conv_workspace(workspace_size) : workspace
else
workspace_size = length(workspace[])

workspace_size =
cudnnGetConvolutionForwardWorkspaceSize(y, x, w, padding=pad, stride=stride, dilation=dilation,
algo=algo, mode=flipkernel)

CuVector{UInt8}(undef, workspace_size) do workspace
cudnnConvolutionForward(y, x, w, padding=pad, stride=stride, dilation=dilation, mode=flipkernel,
alpha=alpha, algo=algo, workspace=workspace, workspace_size=workspace_size)
end
cudnnConvolutionForward(y, x, w, padding=pad, stride=stride, dilation=dilation, mode=flipkernel,
alpha=alpha, algo=algo, workspace=workspace, workspace_size=workspace_size)
end

function ∇conv_filter!(dw::CuArray{T}, dy::CuArray{T}, x::CuArray{T};
pad=0, stride=1, flipkernel=0, alpha=1, dilation=1,
workspace::Union{CuVector, Nothing}=nothing, algo=0) where T<:CUDNNFloat
pad=0, stride=1, flipkernel=0, alpha=1, dilation=1, algo=0) where T<:CUDNNFloat
if version() < v"6"
all(x -> x == 1, dilation) || error("Only dilation = 1 is supported in cuDNN version < 6")
end
if workspace === nothing
workspace_size =
cudnnGetConvolutionBackwardFilterWorkspaceSize(dw, x, dy, padding=pad, stride=stride,
dilation=dilation, algo=algo, mode=flipkernel)
workspace = workspace_size != 0 ? conv_workspace(workspace_size) : workspace
else
workspace_size = length(workspace[])

workspace_size =
cudnnGetConvolutionBackwardFilterWorkspaceSize(dw, x, dy, padding=pad, stride=stride,
dilation=dilation, algo=algo, mode=flipkernel)

CuVector{UInt8}(undef, workspace_size) do workspace
cudnnConvolutionBackwardFilter(dw, x, dy, padding=pad, stride=stride, dilation=dilation,
mode=flipkernel, alpha=alpha, algo=algo, workspace=workspace,
workspace_size=workspace_size)
end
cudnnConvolutionBackwardFilter(dw, x, dy, padding=pad, stride=stride, dilation=dilation,
mode=flipkernel, alpha=alpha, algo=algo, workspace=workspace,
workspace_size=workspace_size)
end

function ∇conv_data!(dx::CuArray{T}, dy::CuArray{T}, w::CuArray{T};
pad=0, stride=1, flipkernel=0, alpha=1, dilation=1,
workspace::Union{CuVector, Nothing}=nothing, algo=0) where T<:CUDNNFloat
pad=0, stride=1, flipkernel=0, alpha=1, dilation=1, algo=0) where T<:CUDNNFloat
if version() < v"6"
all(x -> x == 1, dilation) || error("Only dilation = 1 is supported in cuDNN version < 6")
end
if workspace === nothing
workspace_size =
cudnnGetConvolutionBackwardDataWorkspaceSize(dx, w, dy, padding=pad, stride=stride,
dilation=dilation, algo=algo, mode=flipkernel)
workspace = workspace_size != 0 ? conv_workspace(workspace_size) : workspace
else
workspace_size = length(workspace[])

workspace_size =
cudnnGetConvolutionBackwardDataWorkspaceSize(dx, w, dy, padding=pad, stride=stride,
dilation=dilation, algo=algo, mode=flipkernel)
CuVector{UInt8}(undef, workspace_size) do workspace
cudnnConvolutionBackwardData(dx, w, dy, padding=pad, stride=stride, dilation=dilation,
mode=flipkernel, alpha=alpha, algo=algo, workspace=workspace,
workspace_size=workspace_size)
end
cudnnConvolutionBackwardData(dx, w, dy, padding=pad, stride=stride, dilation=dilation,
mode=flipkernel, alpha=alpha, algo=algo, workspace=workspace,
workspace_size=workspace_size)
end

∇conv_bias!(db::CuArray{T}, dy::CuArray{T}; alpha=1, beta=0) where T<:CUDNNFloat =
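Reviewer note: this is the heart of the cuDNN side of the change. The old code cached a global `_conv_workspace` that only ever grew and pinned GPU memory for the life of the process; the new code sizes a workspace per call and routes it through the do-block constructor from `src/array.jl`, so it is freed as soon as the call returns. A simplified sketch of the control flow (`query_workspace_size` and `launch_conv` are hypothetical stand-ins for the cuDNN wrappers):

```julia
function conv_sketch(y, x, w)
    bytes = query_workspace_size(y, x, w)       # hypothetical
    CuVector{UInt8}(undef, bytes) do workspace
        # workspace lives exactly as long as this block; the do-block
        # constructor calls unsafe_free! in a `finally` on exit
        launch_conv(y, x, w, workspace, bytes)  # hypothetical
    end
    return y
end
```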
2 changes: 1 addition & 1 deletion src/memory.jl
@@ -302,7 +302,7 @@ end

function dealloc(buf, bytes)
# 0-byte allocations shouldn't hit the pool
bytes == 0 && return Mem.alloc(0)
bytes == 0 && return

stats.req_nfree += 1
stats.user_free += bytes
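Reviewer note: the early `return` keeps zero-byte buffers out of the pool's accounting, which the `INVALID` sentinel in `src/array.jl` relies on: a freed array holds a 0-byte buffer, so repeated frees must never reach the pool. A sketch of the interaction, assuming the semantics shown in these hunks:

```julia
xs = CuArray{Float32}(undef, 0)   # 0-byte allocation
CuArrays.unsafe_free!(xs)         # dealloc sees bytes == 0 and returns at once
CuArrays.unsafe_free!(xs)         # xs.buf === INVALID, returns even earlier
```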
2 changes: 1 addition & 1 deletion src/solver/CUSOLVER.jl
@@ -4,7 +4,7 @@ import CUDAdrv: CUDAdrv, CuContext, CuStream_t, CuPtr, PtrOrCuPtr, CU_NULL
import CUDAapi

using ..CuArrays
using ..CuArrays: libcusolver, active_context, _getindex
using ..CuArrays: libcusolver, active_context, _getindex, unsafe_free!

using LinearAlgebra
using SparseArrays