This repository has been archived by the owner on Mar 12, 2021. It is now read-only.

Support for early-freeing GPU memory #275

Merged
merged 8 commits on Feb 8, 2019
21 changes: 20 additions & 1 deletion src/array.jl
@@ -21,8 +21,13 @@ CuVector{T} = CuArray{T,1}
CuMatrix{T} = CuArray{T,2}
CuVecOrMat{T} = Union{CuVector{T},CuMatrix{T}}

function unsafe_free!(xs::CuArray)
const INVALID = Mem.alloc(0)

function unsafe_free!(xs::CuArray{<:Any,N}) where {N}
xs.buf === INVALID && return
Mem.release(xs.buf) && dealloc(xs.buf, prod(xs.dims)*sizeof(eltype(xs)))
xs.dims = Tuple(0 for _ in 1:N)
xs.buf = INVALID
return
end

@@ -44,6 +49,20 @@ CuArray{T}(::UndefInitializer, dims::Integer...) where {T} =
# empty vector constructor
CuArray{T,1}() where {T} = CuArray{T,1}(undef, 0)

# do-block constructors
for (ctor, tvars) in (:CuArray => (), :(CuArray{T}) => (:T,), :(CuArray{T,N}) => (:T, :N))
@eval begin
function $ctor(f::Function, args...) where {$(tvars...)}
xs = $ctor(args...)
try
f(xs)
finally
unsafe_free!(xs)
end
end
end
end


Base.similar(a::CuArray{T,N}) where {T,N} = CuArray{T,N}(undef, size(a))
Base.similar(a::CuArray{T}, dims::Base.Dims{N}) where {T,N} = CuArray{T,N}(undef, dims)
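Reviewer note: the new do-block constructors give temporaries a deterministic lifetime, mirroring Base's `open(f, path)` idiom: the block's value is returned and the buffer goes back to the pool in a `finally`, even if the body throws. A minimal usage sketch (hypothetical example, not part of this diff):

```julia
using CuArrays

# Buffer is freed as soon as the block exits; no waiting for the GC.
total = CuArray{Float32}(undef, 1024) do xs
    fill!(xs, 1f0)
    sum(xs)                    # the block's value becomes `total`
end

# Double-freeing is safe: after the first call the buffer is swapped
# for the 0-byte INVALID sentinel and later calls return early.
ys = CuArray{Float32}(undef, 16)
CuArrays.unsafe_free!(ys)
CuArrays.unsafe_free!(ys)      # no-op
```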
2 changes: 1 addition & 1 deletion src/blas/CUBLAS.jl
@@ -4,7 +4,7 @@ import CUDAdrv: CUDAdrv, CuContext, CuStream_t, CuPtr, PtrOrCuPtr, CU_NULL
import CUDAapi

using ..CuArrays
using ..CuArrays: libcublas, active_context
using ..CuArrays: libcublas, active_context, unsafe_free!

using LinearAlgebra

48 changes: 38 additions & 10 deletions src/blas/wrappers.jl
@@ -221,7 +221,7 @@ for (fname, elty) in ((:cublasIdamax_v2,:Float64),
dx::CuArray{$elty},
incx::Integer)
result = Ref{Cint}()
$fname(handle(), n, dx, incx, result)
return result[]
end
end
@@ -836,7 +836,7 @@ for (fname, elty) in
B::Array{CuMatrix{$elty},1},
beta::($elty),
C::Array{CuMatrix{$elty},1})
if( length(A) != length(B) || length(A) != length(C) )
if length(A) != length(B) || length(A) != length(C)
throw(DimensionMismatch(""))
end
for (As,Bs,Cs) in zip(A,B,C)
@@ -847,6 +847,7 @@ for (fname, elty) in
throw(DimensionMismatch(""))
end
end

m = size(A[1], transA == 'N' ? 1 : 2)
k = size(A[1], transA == 'N' ? 2 : 1)
n = size(B[1], transB == 'N' ? 2 : 1)
@@ -860,6 +861,10 @@ for (fname, elty) in
Cptrs = device_batch(C)
$fname(handle(), cutransA,cutransB, m, n, k, [alpha], Aptrs, lda, Bptrs,
ldb, [beta], Cptrs, ldc, length(A))
unsafe_free!(Cptrs)
unsafe_free!(Bptrs)
unsafe_free!(Aptrs)

C
end
function gemm_batched(transA::Char,
@@ -1346,7 +1351,7 @@ for (fname, elty) in
cuuplo = cublasfill(uplo)
cutransa = cublasop(transa)
cudiag = cublasdiag(diag)
if( length(A) != length(B) )
if length(A) != length(B)
throw(DimensionMismatch(""))
end
for (As,Bs) in zip(A,B)
@@ -1355,12 +1360,16 @@ for (fname, elty) in
if mA != nA throw(DimensionMismatch("A must be square")) end
if nA != (side == 'L' ? m : n) throw(DimensionMismatch("trsm_batched!")) end
end

m,n = size(B[1])
lda = max(1,stride(A[1],2))
ldb = max(1,stride(B[1],2))
Aptrs = device_batch(A)
Bptrs = device_batch(B)
$fname(handle(), cuside, cuuplo, cutransa, cudiag, m, n, [alpha], Aptrs, lda, Bptrs, ldb, length(A))
unsafe_free!(Bptrs)
unsafe_free!(Aptrs)

B
end
function trsm_batched(side::Char,
@@ -1453,13 +1462,16 @@ for (fname, elty) in
throw(DimensionMismatch("All matrices must be square!"))
end
end

m,n = size(A[1])
lda = max(1,stride(A[1],2))
Aptrs = device_batch(A)
info = CuArray{Cint}(undef, length(A))
pivotArray = Pivot ? CuArray{Int32}(undef, (n, length(A))) : CU_NULL
$fname(handle(), n, Aptrs, lda, pivotArray, info, length(A))
if( !Pivot )
unsafe_free!(Aptrs)

if !Pivot
pivotArray = CuArray(zeros(Cint, (n, length(A))))
end
pivotArray, info, A
@@ -1493,6 +1505,7 @@ for (fname, elty) in
throw(DimensionMismatch("All A matrices must be square!"))
end
end

C = CuMatrix{$elty}[similar(A[1]) for i in 1:length(A)]
n = size(A[1])[1]
lda = max(1,stride(A[1],2))
@@ -1501,6 +1514,9 @@
Cptrs = device_batch(C)
info = CuArray(zeros(Cint,length(A)))
$fname(handle(), n, Aptrs, lda, pivotArray, Cptrs, ldc, info, length(A))
unsafe_free!(Cptrs)
unsafe_free!(Aptrs)

pivotArray, info, C
end
end
@@ -1528,6 +1544,7 @@ for (fname, elty) in
throw(ArgumentError("matinv requires all matrices be smaller than 32 x 32"))
end
end

C = CuMatrix{$elty}[similar(A[1]) for i in 1:length(A)]
n = size(A[1])[1]
lda = max(1,stride(A[1],2))
@@ -1536,6 +1553,9 @@
Cptrs = device_batch(C)
info = CuArray(zeros(Cint,length(A)))
$fname(handle(), n, Aptrs, lda, Cptrs, ldc, info, length(A))
unsafe_free!(Cptrs)
unsafe_free!(Aptrs)

info, C
end
end
@@ -1560,14 +1580,17 @@
hTauArray = [zeros($elty, min(m,n)) for i in 1:length(A)]
TauArray = CuArray{$elty,1}[]
for i in 1:length(A)
push!(TauArray,CuArray(hTauArray[i]))
push!(TauArray, CuArray(hTauArray[i]))
end
Tauptrs = device_batch(TauArray)
info = zero(Cint)
$fname(handle(), m, n, Aptrs, lda, Tauptrs, [info], length(A))
if( info != 0 )
unsafe_free!(Tauptrs)

if info != 0
throw(ArgumentError(string("Invalid value at ", -info)))
end

TauArray, A
end
function geqrf_batched(A::Array{CuMatrix{$elty},1})
@@ -1593,20 +1616,21 @@
A::Array{CuMatrix{$elty},1},
C::Array{CuMatrix{$elty},1})
cutrans = cublasop(trans)
if( length(A) != length(C) )
if length(A) != length(C)
throw(DimensionMismatch(""))
end
for (As,Cs) in zip(A,C)
m,n = size(As)
mC,nC = size(Cs)
if( n != mC )
if n != mC
throw(DimensionMismatch(""))
end
end
m,n = size(A[1])
if( m < n )
if m < n
throw(ArgumentError("System must be overdetermined"))
end

nrhs = size(C[1])[2]
lda = max(1,stride(A[1],2))
ldc = max(1,stride(A[1],2))
Expand All @@ -1615,9 +1639,13 @@ for (fname, elty) in
info = zero(Cint)
infoarray = CuArray(zeros(Cint, length(A)))
$fname(handle(), cutrans, m, n, nrhs, Aptrs, lda, Cptrs, ldc, [info], infoarray, length(A))
if( info != 0 )
unsafe_free!(Cptrs)
unsafe_free!(Aptrs)

if info != 0
throw(ArgumentError(string("Invalid value at ", -info)))
end

A, C, infoarray
end
function gels_batched(trans::Char,
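Reviewer note: every batched wrapper touched in this file now follows the same shape: upload temporary pointer arrays with `device_batch`, make the library call, then release those temporaries immediately. A schematic of the pattern (`some_batched_ccall` is a hypothetical stand-in for the CUBLAS call; `device_batch` and `unsafe_free!` are the real helpers used above):

```julia
# Schematic of the resource pattern applied throughout this file.
function batched_wrapper(A::Vector{<:CuMatrix}, B::Vector{<:CuMatrix})
    Aptrs = device_batch(A)    # temporary device array of pointers into A
    Bptrs = device_batch(B)
    some_batched_ccall(Aptrs, Bptrs, length(A))   # hypothetical
    unsafe_free!(Bptrs)        # pointer arrays die here, not at the next GC
    unsafe_free!(Aptrs)
    return B                   # the result matrices themselves stay alive
end
```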
2 changes: 1 addition & 1 deletion src/dnn/CUDNN.jl
@@ -5,7 +5,7 @@ import CUDAapi
import CUDAdrv: CUDAdrv, CuContext, CuPtr, CU_NULL

using ..CuArrays
using ..CuArrays: libcudnn, active_context, configured
using ..CuArrays: libcudnn, active_context, configured, unsafe_free!

include("libcudnn_types.jl")
include("error.jl")
3 changes: 0 additions & 3 deletions src/dnn/libcudnn.jl
@@ -245,7 +245,6 @@ function cudnnConvolutionBiasActivationForward(y::CuArray{T,N}, x::CuArray{T,N},
end

function cudnnConvolutionForward(alpha, xDesc, x, wDesc, w, convDesc, algo, workspace, workspace_size, beta, yDesc, y)
workspace = something(workspace, CU_NULL)
@check ccall((:cudnnConvolutionForward, libcudnn),
cudnnStatus_t,
(cudnnHandle_t, Ptr{Nothing}, cudnnTensorDescriptor_t, CuPtr{Nothing},
@@ -285,7 +284,6 @@ function cudnnGetConvolutionForwardWorkspaceSize(y::CuArray{T,N}, x::CuArray{T,N
end

function cudnnConvolutionBackwardData(alpha, wDesc, w, dyDesc, dy, convDesc, algo, workspace, workspace_size, beta, dxDesc, dx)
workspace = something(workspace, CU_NULL)
@check ccall((:cudnnConvolutionBackwardData, libcudnn),
cudnnStatus_t,
(cudnnHandle_t, Ptr{Nothing}, cudnnFilterDescriptor_t, CuPtr{Nothing},
@@ -325,7 +323,6 @@ function cudnnGetConvolutionBackwardDataWorkspaceSize(dx::CuArray{T,N}, w::CuArr
end

function cudnnConvolutionBackwardFilter(alpha, xDesc, x, dyDesc, dy, convDesc, algo, workspace, workspace_size, beta, dwDesc, dw)
workspace = something(workspace, CU_NULL)
@check ccall((:cudnnConvolutionBackwardFilter, libcudnn),
cudnnStatus_t,
(cudnnHandle_t, Ptr{Nothing}, cudnnTensorDescriptor_t, CuPtr{Nothing},
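Reviewer note: these three deletions are the counterpart of the `nnlib.jl` rewrite below. The convolution entry points now always pass a concrete (possibly zero-byte) `CuVector{UInt8}` workspace, so the `something(workspace, CU_NULL)` fallback for a `nothing` workspace is dead code.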
76 changes: 30 additions & 46 deletions src/dnn/nnlib.jl
@@ -5,6 +5,7 @@ import NNlib: conv!, ∇conv_filter!, ∇conv_data!,
import ..CuArrays: CuVecOrMat, CuVector
using CUDAnative


# Softmax

const CUDNNFloat = Union{Float16,Float32,Float64}
@@ -36,73 +37,56 @@ end
∇logsoftmax(Δ::CuVecOrMat{T}, xs::CuVecOrMat{T}) where T<:CUDNNFloat =
∇logsoftmax!(similar(xs), Δ, xs)

# Convolution

const _conv_workspace = Ref{CuVector{UInt8}}()

function conv_workspace(bytes)
global _conv_workspace
if isassigned(_conv_workspace) && bytes <= length(_conv_workspace[])
_conv_workspace[]
else
_conv_workspace[] = CuVector{UInt8}(undef, bytes)
end
end
# Convolution

function conv!(y::CuArray{T}, x::CuArray{T}, w::CuArray{T};
pad=0, stride=1, flipkernel=0, alpha=1, dilation=1,
workspace::Union{CuVector, Nothing}=nothing, algo=0) where T<:CUDNNFloat
pad=0, stride=1, flipkernel=0, alpha=1, dilation=1, algo=0) where T<:CUDNNFloat
if version() < v"6"
all(x -> x == 1, dilation) || error("Only dilation = 1 is supported in cuDNN version < 6")
end
if workspace === nothing
workspace_size =
cudnnGetConvolutionForwardWorkspaceSize(y, x, w, padding=pad, stride=stride, dilation=dilation,
algo=algo, mode=flipkernel)
workspace = workspace_size != 0 ? conv_workspace(workspace_size) : workspace
else
workspace_size = length(workspace[])

workspace_size =
cudnnGetConvolutionForwardWorkspaceSize(y, x, w, padding=pad, stride=stride, dilation=dilation,
algo=algo, mode=flipkernel)

CuVector{UInt8}(undef, workspace_size) do workspace
cudnnConvolutionForward(y, x, w, padding=pad, stride=stride, dilation=dilation, mode=flipkernel,
alpha=alpha, algo=algo, workspace=workspace, workspace_size=workspace_size)
end
cudnnConvolutionForward(y, x, w, padding=pad, stride=stride, dilation=dilation, mode=flipkernel,
alpha=alpha, algo=algo, workspace=workspace, workspace_size=workspace_size)
end

function ∇conv_filter!(dw::CuArray{T}, dy::CuArray{T}, x::CuArray{T};
pad=0, stride=1, flipkernel=0, alpha=1, dilation=1,
workspace::Union{CuVector, Nothing}=nothing, algo=0) where T<:CUDNNFloat
pad=0, stride=1, flipkernel=0, alpha=1, dilation=1, algo=0) where T<:CUDNNFloat
if version() < v"6"
all(x -> x == 1, dilation) || error("Only dilation = 1 is supported in cuDNN version < 6")
end
if workspace === nothing
workspace_size =
cudnnGetConvolutionBackwardFilterWorkspaceSize(dw, x, dy, padding=pad, stride=stride,
dilation=dilation, algo=algo, mode=flipkernel)
workspace = workspace_size != 0 ? conv_workspace(workspace_size) : workspace
else
workspace_size = length(workspace[])

workspace_size =
cudnnGetConvolutionBackwardFilterWorkspaceSize(dw, x, dy, padding=pad, stride=stride,
dilation=dilation, algo=algo, mode=flipkernel)

CuVector{UInt8}(undef, workspace_size) do workspace
cudnnConvolutionBackwardFilter(dw, x, dy, padding=pad, stride=stride, dilation=dilation,
mode=flipkernel, alpha=alpha, algo=algo, workspace=workspace,
workspace_size=workspace_size)
end
cudnnConvolutionBackwardFilter(dw, x, dy, padding=pad, stride=stride, dilation=dilation,
mode=flipkernel, alpha=alpha, algo=algo, workspace=workspace,
workspace_size=workspace_size)
end

function ∇conv_data!(dx::CuArray{T}, dy::CuArray{T}, w::CuArray{T};
pad=0, stride=1, flipkernel=0, alpha=1, dilation=1,
workspace::Union{CuVector, Nothing}=nothing, algo=0) where T<:CUDNNFloat
pad=0, stride=1, flipkernel=0, alpha=1, dilation=1, algo=0) where T<:CUDNNFloat
if version() < v"6"
all(x -> x == 1, dilation) || error("Only dilation = 1 is supported in cuDNN version < 6")
end
if workspace === nothing
workspace_size =
cudnnGetConvolutionBackwardDataWorkspaceSize(dx, w, dy, padding=pad, stride=stride,
dilation=dilation, algo=algo, mode=flipkernel)
workspace = workspace_size != 0 ? conv_workspace(workspace_size) : workspace
else
workspace_size = length(workspace[])

workspace_size =
cudnnGetConvolutionBackwardDataWorkspaceSize(dx, w, dy, padding=pad, stride=stride,
dilation=dilation, algo=algo, mode=flipkernel)
CuVector{UInt8}(undef, workspace_size) do workspace
cudnnConvolutionBackwardData(dx, w, dy, padding=pad, stride=stride, dilation=dilation,
mode=flipkernel, alpha=alpha, algo=algo, workspace=workspace,
workspace_size=workspace_size)
end
cudnnConvolutionBackwardData(dx, w, dy, padding=pad, stride=stride, dilation=dilation,
mode=flipkernel, alpha=alpha, algo=algo, workspace=workspace,
workspace_size=workspace_size)
end

∇conv_bias!(db::CuArray{T}, dy::CuArray{T}; alpha=1, beta=0) where T<:CUDNNFloat =
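Reviewer note: this is the heart of the cuDNN side of the change. The old code cached a global `_conv_workspace` that only ever grew and pinned GPU memory for the life of the process; the new code sizes a workspace per call and routes it through the do-block constructor from `src/array.jl`, so it is freed as soon as the call returns. A simplified sketch of the control flow (`query_workspace_size` and `launch_conv` are hypothetical stand-ins for the cuDNN wrappers):

```julia
function conv_sketch(y, x, w)
    bytes = query_workspace_size(y, x, w)       # hypothetical
    CuVector{UInt8}(undef, bytes) do workspace
        # workspace lives exactly as long as this block; the do-block
        # constructor calls unsafe_free! in a `finally` on exit
        launch_conv(y, x, w, workspace, bytes)  # hypothetical
    end
    return y
end
```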
2 changes: 1 addition & 1 deletion src/memory.jl
@@ -302,7 +302,7 @@ end

function dealloc(buf, bytes)
# 0-byte allocations shouldn't hit the pool
bytes == 0 && return Mem.alloc(0)
bytes == 0 && return

stats.req_nfree += 1
stats.user_free += bytes
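Reviewer note: the early `return` keeps zero-byte buffers out of the pool's accounting, which the `INVALID` sentinel in `src/array.jl` relies on: a freed array holds a 0-byte buffer, so repeated frees must never reach the pool. A sketch of the interaction, assuming the semantics shown in these hunks:

```julia
xs = CuArray{Float32}(undef, 0)   # 0-byte allocation
CuArrays.unsafe_free!(xs)         # dealloc sees bytes == 0 and returns at once
CuArrays.unsafe_free!(xs)         # xs.buf === INVALID, returns even earlier
```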
2 changes: 1 addition & 1 deletion src/solver/CUSOLVER.jl
@@ -4,7 +4,7 @@ import CUDAdrv: CUDAdrv, CuContext, CuStream_t, CuPtr, PtrOrCuPtr, CU_NULL
import CUDAapi

using ..CuArrays
using ..CuArrays: libcusolver, active_context, _getindex
using ..CuArrays: libcusolver, active_context, _getindex, unsafe_free!

using LinearAlgebra
using SparseArrays