diff --git a/.gitignore b/.gitignore
index 8c960ec..6cb042d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,5 @@
 *.jl.cov
 *.jl.*.cov
 *.jl.mem
+*~
+.juliahistory
\ No newline at end of file
diff --git a/src/CuConv.jl b/src/CuConv.jl
index 0c463c1..b91fe02 100644
--- a/src/CuConv.jl
+++ b/src/CuConv.jl
@@ -1,5 +1,5 @@
 module CuConv
 
-# package code goes here
+include("core.jl")
 
 end # module
diff --git a/src/conv.jl b/src/conv.jl
new file mode 100644
index 0000000..ae161cc
--- /dev/null
+++ b/src/conv.jl
@@ -0,0 +1,240 @@
+
+function conv4{T}(w::CuArray{T},x::CuArray{T};
+                  handle=cudnnhandle(), algo=0, workSpace=C_NULL, workSpaceSizeInBytes=0, alpha=1,
+                  o...) # padding=0, stride=1, upscale=1, mode=0
+    y = similar(x, cdims(w,x;o...))
+    beta = 0 # nonzero beta does not make sense when we create y
+    @cuda(cudnn, cudnnConvolutionForward,
+          (Cptr,Ptr{T},Cptr,Ptr{T},Cptr,Ptr{T},Cptr,UInt32,Cptr,Csize_t,Ptr{T},Cptr,Ptr{T}),
+          handle,Ref(T(alpha)),TD(x),x.ptr,FD(w),w.ptr,CD(w,x;o...),
+          algo,workSpace,workSpaceSizeInBytes,Ref(T(beta)),TD(y),y.ptr)
+    return y
+end
+
+function conv4x{T}(w::CuArray{T},x::CuArray{T},dy::CuArray{T};
+                   handle=cudnnhandle(), alpha=1, algo=0, workSpace=C_NULL, workSpaceSizeInBytes=0,
+                   o...) # padding=0, stride=1, upscale=1, mode=0
+    beta = 0
+    dx = similar(x)
+    if cudnnVersion >= 4000
+        @cuda(cudnn,cudnnConvolutionBackwardData,
+              (Cptr,Ptr{T},Cptr,Ptr{T},Cptr,Ptr{T},Cptr,UInt32,Cptr,Csize_t,Ptr{T},Cptr,Ptr{T}),
+              handle,Ref(T(alpha)),FD(w),w.ptr,TD(dy),dy.ptr,CD(w,x;o...),
+              algo,workSpace,workSpaceSizeInBytes,Ref(T(beta)),TD(dx),dx.ptr)
+    elseif cudnnVersion >= 3000
+        @cuda(cudnn,cudnnConvolutionBackwardData_v3,
+              (Cptr,Ptr{T},Cptr,Ptr{T},Cptr,Ptr{T},Cptr,UInt32,Cptr,Csize_t,Ptr{T},Cptr,Ptr{T}),
+              handle,Ref(T(alpha)),FD(w),w.ptr,TD(dy),dy.ptr,CD(w,x;o...),
+              algo,workSpace,workSpaceSizeInBytes,Ref(T(beta)),TD(dx),dx.ptr)
+    else
+        @cuda(cudnn,cudnnConvolutionBackwardData,
+              (Cptr,Ptr{T},Cptr,Ptr{T},Cptr,Ptr{T},Cptr,Ptr{T},Cptr,Ptr{T}),
+              handle,Ref(T(alpha)),FD(w),w.ptr,TD(dy),dy.ptr,CD(w,x;o...),
+              Ref(T(beta)),TD(dx),dx.ptr)
+    end
+    return dx
+end
+
+function conv4w{T}(w::CuArray{T},x::CuArray{T},dy::CuArray{T};
+                   handle=cudnnhandle(), alpha=1, algo=0, workSpace=C_NULL, workSpaceSizeInBytes=0,
+                   o...) # padding=0, stride=1, upscale=1, mode=0
+    beta = 0
+    dw = similar(w)
+    if cudnnVersion >= 4000
+        @cuda(cudnn,cudnnConvolutionBackwardFilter,
+              (Cptr,Ptr{T},Cptr,Ptr{T},Cptr,Ptr{T},Cptr,UInt32,Cptr,Csize_t,Ptr{T},Cptr,Ptr{T}),
+              handle,Ref(T(alpha)),TD(x),x.ptr,TD(dy),dy.ptr,CD(w,x;o...),
+              algo,workSpace,workSpaceSizeInBytes,Ref(T(beta)),FD(dw),dw.ptr)
+    elseif cudnnVersion >= 3000
+        @cuda(cudnn,cudnnConvolutionBackwardFilter_v3,
+              (Cptr,Ptr{T},Cptr,Ptr{T},Cptr,Ptr{T},Cptr,UInt32,Cptr,Csize_t,Ptr{T},Cptr,Ptr{T}),
+              handle,Ref(T(alpha)),TD(x),x.ptr,TD(dy),dy.ptr,CD(w,x;o...),
+              algo,workSpace,workSpaceSizeInBytes,Ref(T(beta)),FD(dw),dw.ptr)
+    else
+        @cuda(cudnn,cudnnConvolutionBackwardFilter,
+              (Cptr,Ptr{T},Cptr,Ptr{T},Cptr,Ptr{T},Cptr,Ptr{T},Cptr,Ptr{T}),
+              handle,Ref(T(alpha)),TD(x),x.ptr,TD(dy),dy.ptr,CD(w,x;o...),
+              Ref(T(beta)),FD(dw),dw.ptr)
+    end
+    return dw
+end
+
+
+
+
+
+mutable struct TD; ptr
+    function TD(a::CuArray)
+        d = Cptr[0]
+        @cuda(cudnn,cudnnCreateTensorDescriptor,(Ptr{Cptr},),d)
+        n = ndims(a)
+        sz = [Cint(size(a,n-i+1)) for i=1:n]
+        st = [Cint(stride(a,n-i+1)) for i=1:n]
+        @cuda(cudnn,cudnnSetTensorNdDescriptor,
+              (Cptr,UInt32,Cint,Ptr{Cint},Ptr{Cint}),
+              d[1], DT(a), n, sz, st)
+        td = new(d[1])
+        finalizer(td, x->@cuda(cudnn,cudnnDestroyTensorDescriptor,(Cptr,),x.ptr))
+        return td
+    end
+end
+
+mutable struct FD; ptr
+    function FD(a::CuArray)
+        d = Cptr[0]
+        @cuda(cudnn,cudnnCreateFilterDescriptor,(Ptr{Cptr},),d)
+        n = ndims(a)
+        sz = [Cint(size(a,n-i+1)) for i=1:n]
+        if cudnnVersion >= 5000
+            @cuda(cudnn,cudnnSetFilterNdDescriptor,
+                  (Cptr,UInt32,UInt32,Cint,Ptr{Cint}),
+                  d[1], DT(a), 0, n, sz)
+        elseif cudnnVersion >= 4000
+            @cuda(cudnn,cudnnSetFilterNdDescriptor_v4,
+                  (Cptr,UInt32,UInt32,Cint,Ptr{Cint}),
+                  d[1], DT(a), 0, n, sz)
+        else
+            @cuda(cudnn,cudnnSetFilterNdDescriptor,
+                  (Cptr,UInt32,Cint,Ptr{Cint}),
+                  d[1], DT(a), n, sz)
+        end
+        fd = new(d[1])
+        finalizer(fd, x->@cuda(cudnn,cudnnDestroyFilterDescriptor,(Cptr,),x.ptr))
+        return fd
+    end
+end
+
+mutable struct CD; ptr
+    function CD(w::CuArray,x::CuArray; padding=0, stride=1, upscale=1, mode=0)
+        d = Cptr[0]
+        @cuda(cudnn,cudnnCreateConvolutionDescriptor,(Ptr{Cptr},),d)
+        nd = ndims(x)-2
+        if cudnnVersion >= 4000
+            @cuda(cudnn,cudnnSetConvolutionNdDescriptor,
+                  (Cptr,Cint,Ptr{Cint},Ptr{Cint},Ptr{Cint},UInt32,UInt32),
+                  d[1],nd,cdsize(padding,nd),cdsize(stride,nd),cdsize(upscale,nd),mode,DT(x))
+        elseif cudnnVersion > 3000 # does not work when cudnnVersion==3000
+            @cuda(cudnn,cudnnSetConvolutionNdDescriptor_v3,
+                  (Cptr,Cint,Ptr{Cint},Ptr{Cint},Ptr{Cint},UInt32,UInt32),
+                  d[1],nd,cdsize(padding,nd),cdsize(stride,nd),cdsize(upscale,nd),mode,DT(x))
+        else
+            @cuda(cudnn,cudnnSetConvolutionNdDescriptor,
+                  (Cptr,Cint,Ptr{Cint},Ptr{Cint},Ptr{Cint},UInt32),
+                  d[1],nd,cdsize(padding,nd),cdsize(stride,nd),cdsize(upscale,nd),mode)
+        end
+        cd = new(d[1])
+        finalizer(cd, x->@cuda(cudnn,cudnnDestroyConvolutionDescriptor,(Cptr,),x.ptr))
+        return cd
+    end
+end
+
+mutable struct PD; ptr
+    function PD(x::CuArray; window=2, padding=0, stride=window, mode=0, maxpoolingNanOpt=0)
+        d = Cptr[0]
+        @cuda(cudnn,cudnnCreatePoolingDescriptor,(Ptr{Cptr},),d)
+        nd = ndims(x)-2
+        if cudnnVersion >= 5000
+            @cuda(cudnn,cudnnSetPoolingNdDescriptor,
+                  (Cptr,UInt32,UInt32,Cint,Ptr{Cint},Ptr{Cint},Ptr{Cint}),
+                  d[1],mode,maxpoolingNanOpt,nd,cdsize(window,nd),cdsize(padding,nd),cdsize(stride,nd))
+        elseif cudnnVersion >= 4000
+            @cuda(cudnn,cudnnSetPoolingNdDescriptor_v4,
+                  (Cptr,UInt32,UInt32,Cint,Ptr{Cint},Ptr{Cint},Ptr{Cint}),
+                  d[1],mode,maxpoolingNanOpt,nd,cdsize(window,nd),cdsize(padding,nd),cdsize(stride,nd))
+        else
+            @cuda(cudnn,cudnnSetPoolingNdDescriptor,
+                  (Cptr,UInt32,Cint,Ptr{Cint},Ptr{Cint},Ptr{Cint}),
+                  d[1],mode,nd,cdsize(window,nd),cdsize(padding,nd),cdsize(stride,nd))
+        end
+        pd = new(d[1])
+        finalizer(pd, x->@cuda(cudnn,cudnnDestroyPoolingDescriptor,(Cptr,),x.ptr))
+        return pd
+    end
+end
+
+import Base: unsafe_convert
+unsafe_convert(::Type{Cptr}, td::TD)=td.ptr
+unsafe_convert(::Type{Cptr}, fd::FD)=fd.ptr
+unsafe_convert(::Type{Cptr}, cd::CD)=cd.ptr
+unsafe_convert(::Type{Cptr}, pd::PD)=pd.ptr
+
+# fill and reverse Cint array with padding etc. for cudnn calls
+function cdsize(w, nd)
+    if isa(w,Number)
+        fill(Cint(w),nd)
+    elseif length(w)==nd
+        [ Cint(w[nd-i+1]) for i=1:nd ]
+    else
+        throw(DimensionMismatch("$w $nd"))
+    end
+end
+
+# convert padding etc. size to an Int array of the right dimension
+function psize(p, x)
+    nd = ndims(x)-2
+    if isa(p,Number)
+        fill(Int(p),nd)
+    elseif length(p)==nd
+        collect(Int,p)
+    else
+        throw(DimensionMismatch("psize: $p $nd"))
+    end
+end
+
+DT(::CuArray{Float32})=UInt32(0)
+DT(::CuArray{Float64})=UInt32(1)
+DT(::CuArray{Float16})=UInt32(2)
+
+function cdims(w,x; padding=0, stride=1, o...)
+    N = ndims(x)
+    ntuple(N) do i
+        if i < N-1
+            pi = (if isa(padding,Number); padding; else padding[i]; end)
+            si = (if isa(stride,Number); stride; else stride[i]; end)
+            1 + div(size(x,i) - size(w,i) + 2*pi, si)
+        elseif i == N-1
+            size(w,N)
+        else # i == N
+            size(x,N)
+        end
+    end
+end
+
+function pdims(x; window=2, padding=0, stride=window, o...)
+    N = ndims(x)
+    ntuple(N) do i
+        if i < N-1
+            wi = (if isa(window,Number); window; else window[i]; end)
+            pi = (if isa(padding,Number); padding; else padding[i]; end)
+            si = (if isa(stride,Number); stride; else stride[i]; end)
+            1 + div(size(x,i) + 2*pi - wi, si)
+        else
+            size(x,i)
+        end
+    end
+end
+
+function dcdims(w,x; padding=0, stride=1, o...)
+    N = ndims(x)
+    ntuple(N) do i
+        if i < N-1
+            pi = (if isa(padding,Number); padding; else padding[i]; end)
+            si = (if isa(stride,Number); stride; else stride[i]; end)
+            si*(size(x,i)-1) + size(w,i) - 2*pi
+        elseif i == N-1
+            size(w,N)
+        else
+            size(x,N)
+        end
+    end
+end
+
+function updims(x; window=2, padding=0, stride=window, o...)
+    window = psize(window,x)
+    stride = psize(stride,x)
+    padding = psize(padding,x)
+    N = ndims(x)
+    ntuple(N) do i
+        if i < N-1
+            (size(x,i)-1)*stride[i]+window[i]-2*padding[i]
+        else
+            size(x,i)
+        end
+    end
+end
+
+# convolution padding size that preserves the input size when filter size is odd and stride=1
+padsize(w)=ntuple(i->div(size(w,i)-1,2), ndims(w)-2)
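
A hypothetical usage sketch, not part of the patch: it assumes a working CUDA device with cuDNN, that CuArrays can construct device arrays from host arrays, and it calls the unexported functions by qualified name (CuConv.conv4, CuConv.padsize). Per the cdims formula above, each spatial output dimension is 1 + div(W - F + 2P, S):

    using CuArrays, CuConv
    w = CuArray(randn(Float32, 3, 3, 1, 8))     # 3x3 filters, 1 input channel, 8 output channels
    x = CuArray(randn(Float32, 28, 28, 1, 16))  # 28x28 images, 1 channel, minibatch of 16
    CuConv.padsize(w)                           # (1,1): padding that preserves 28x28 for a 3x3 filter
    y = CuConv.conv4(w, x; padding=1)           # 28x28x8x16: 1 + div(28 - 3 + 2*1, 1) == 28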
diff --git a/src/core.jl b/src/core.jl
new file mode 100644
index 0000000..aea7aef
--- /dev/null
+++ b/src/core.jl
@@ -0,0 +1,24 @@
+
+
+using CuArrays
+
+const Cptr = Ptr{Void}
+# debugging aid: flip the condition to true to synchronize after every library call
+macro gs(); if false; esc(:(ccall(("cudaDeviceSynchronize","libcudart"),UInt32,()))); end; end
+
+include("gpu.jl")
+include("conv.jl")
+
+
+
+
+
+# See if we have a gpu at initialization:
+function __init__()
+    try
+        r = gpu(true)
+        # info(r >= 0 ? "CuConv using GPU $r" : "No GPU found, CuConv using the CPU")
+    catch e
+        gpu(false)
+        # warn("CuConv using the CPU: $e")
+    end
+end
diff --git a/src/gpu.jl b/src/gpu.jl
new file mode 100644
index 0000000..d85f3fc
--- /dev/null
+++ b/src/gpu.jl
@@ -0,0 +1,181 @@
+macro gpu(_ex); if gpu()>=0; esc(_ex); end; end
+
+macro cuda(lib,fun,x...) # give an error if library missing, or if error code != 0
+    if Libdl.find_library(["lib$lib"], []) != ""
+        if VERSION >= v"0.6.0"
+            fx = Expr(:call, :ccall, ("$fun","lib$lib"), :UInt32, x...)
+        else
+            fx = Expr(:ccall, ("$fun","lib$lib"), :UInt32, x...)
+        end
+        msg = "$lib.$fun error "
+        err = gensym()
+        # esc(:(if ($err=$fx) != 0; warn($msg, $err); Base.show_backtrace(STDOUT, backtrace()); end))
+        esc(:(if ($err=$fx) != 0; error($msg, $err); end; @gs))
+    else
+        Expr(:call,:error,"Cannot find lib$lib, please install it and rerun Pkg.build(\"CuConv\").")
+    end
+end
+
+macro cuda1(lib,fun,x...) # return -1 if library missing, error code if run
+    if Libdl.find_library(["lib$lib"], []) != ""
+        if VERSION >= v"0.6.0"
+            fx = Expr(:call, :ccall, ("$fun","lib$lib"), :UInt32, x...)
+        else
+            fx = Expr(:ccall, ("$fun","lib$lib"), :UInt32, x...)
+        end
+        err = gensym()
+        esc(:($err=$fx; @gs; $err))
+    else
+        -1
+    end
+end
+
+macro knet8(fun,x...) # error if libknet8 missing, nothing if run
+    if libknet8 != ""
+        if VERSION >= v"0.6.0"
+            fx = Expr(:call, :ccall, ("$fun",libknet8), :Void, x...)
+        else
+            fx = Expr(:ccall, ("$fun",libknet8), :Void, x...)
+        end
+        err = gensym()
+        esc(:($err=$fx; @gs; $err))
+    else
+        Expr(:call,:error,"Cannot find libknet8, please rerun Pkg.build(\"Knet\").")
+    end
+end
+
+const Cptr = Ptr{Void}
+
+"""
+
+`gpu()` returns the id of the active GPU device, or -1 if none is
+active.
+
+`gpu(true)` resets all GPU devices and activates the one with the most
+available memory.
+
+`gpu(false)` resets and deactivates all GPU devices.
+
+`gpu(d::Int)` activates the GPU device `d` if `0 <= d < gpuCount()`;
+otherwise it deactivates all devices.
+
+`gpu(true/false)` resets all devices. If there are any allocated
+CuArrays their pointers will be left dangling, so
+`gpu(true/false)` should only be used during startup. If you want to
+suspend GPU use temporarily, use `gpu(-1)`.
+
+`gpu(d::Int)` does not reset the devices. You can select a previous
+device and find its allocated memory preserved. However, trying to
+operate on arrays of an inactive device will result in an error.
+
+"""
+function gpu end
+
+let GPU=-1, GPUCNT=-1, CUBLAS=nothing, CUDNN=nothing
+    global gpu, gpuCount, cublashandle, cudnnhandle, cudaRuntimeVersion, cudaDriverVersion
+
+    gpu()=GPU
+
+    function gpuCount() # should not bomb when there is no gpu or nvidia libs
+        if GPUCNT == -1
+            GPUCNT = try
+                p=Cuint[0]
+                # @cuda does not stay quiet so we use @cuda1 here
+                # This code is only run once if successful, so nvmlInit here is ok
+                @cuda1("nvidia-ml",nvmlInit,())
+                @cuda1("nvidia-ml",nvmlDeviceGetCount,(Ptr{Cuint},),p)
+                # Let us keep nvml initialized for future ops such as meminfo
+                # @cuda1("nvidia-ml",nvmlShutdown,())
+                Int(p[1])
+            catch
+                0
+            end
+        end
+        return GPUCNT
+    end
+
+    function gpu(i::Int)
+        (GPU == i) && return i
+        if 0 <= i < gpuCount()
+            @cuda(cudart,cudaSetDevice, (Cint,), i)
+            cudaRuntimeVersion = (p=Cint[0];@cuda(cudart,cudaRuntimeGetVersion,(Ptr{Cint},),p);Int(p[1]))
+            cudaDriverVersion  = (p=Cint[0];@cuda(cudart,cudaDriverGetVersion, (Ptr{Cint},),p);Int(p[1]))
+        else
+            i = -1
+            # @cuda(cudart,cudaDeviceReset,()) # may still go back and use arrays allocated in a previous gpu
+        end
+        return (GPU = i)
+    end
+
+    function gpu(usegpu::Bool)
+        if usegpu && gpuCount() > 0
+            pick = free = same = -1
+            for i=0:gpuCount()-1
+                mem = nvmlDeviceGetMemoryInfo(i) # mem[2] is free memory
+                if mem[2] > free
+                    pick = i
+                    free = mem[2]
+                    same = 1
+                elseif mem[2] == free
+                    # pick one of equal devices randomly
+                    rand(1:(same+=1)) == 1 && (pick = i)
+                end
+            end
+            gpu(pick)
+        else
+            for i=0:gpuCount()-1
+                @cuda(cudart,cudaSetDevice, (Cint,), i) # select each device so the reset below hits it
+                @cuda(cudart,cudaDeviceReset,())
+            end
+            gpu(-1)
+        end
+    end
+
+    function cublashandle(dev=gpu())
+        if dev==-1; error("No cublashandle for CPU"); end
+        i = dev+2
+        if CUBLAS == nothing; CUBLAS=Array{Any}(gpuCount()+1); end
+        if !isassigned(CUBLAS,i); CUBLAS[i]=cublasCreate(); end
+        return CUBLAS[i]
+    end
+
+    function cudnnhandle(dev=gpu())
+        if dev==-1; error("No cudnnhandle for CPU"); end
+        i = dev+2
+        if CUDNN == nothing; CUDNN=Array{Any}(gpuCount()+1); end
+        if !isassigned(CUDNN,i); CUDNN[i]=cudnnCreate(); end
+        return CUDNN[i]
+    end
+end
+
+# cudaGetDeviceCount is deprecated, use gpuCount instead:
+cudaGetDeviceCount()=(try; p=Cint[0]; eval(:(ccall(("cudaGetDeviceCount","libcudart"),UInt32,(Ptr{Cint},),$p))); p[1]; catch; 0; end) # will not bomb when there is no gpu
+cudaGetDevice()=(d=Cint[-1];@cuda(cudart,cudaGetDevice,(Ptr{Cint},),d);d[1])
+cudaGetMemInfo()=(f=Csize_t[0];m=Csize_t[0]; @cuda(cudart,cudaMemGetInfo,(Ptr{Csize_t},Ptr{Csize_t}),f,m); (Int(f[1]),Int(m[1])))
+cudaDeviceSynchronize()=@cuda(cudart,cudaDeviceSynchronize,())
+
+function nvmlDeviceGetMemoryInfo(i=gpu()) # returns (total, free, used) bytes for device i
+    0 <= i < gpuCount() || return nothing
+    dev = Cptr[0]
+    mem = Array{Culonglong}(3)
+    @cuda("nvidia-ml","nvmlDeviceGetHandleByIndex",(Cuint,Ptr{Cptr}),i,dev)
+    @cuda("nvidia-ml","nvmlDeviceGetMemoryInfo",(Cptr,Ptr{Culonglong}),dev[1],mem)
+    ntuple(i->Int(mem[i]),length(mem))
+end
+
+function cublasCreate()
+    handleP = Cptr[0]
+    @cuda(cublas,cublasCreate_v2, (Ptr{Cptr},), handleP)
+    handle = handleP[1]
+    atexit(()->@cuda(cublas,cublasDestroy_v2, (Cptr,), handle))
+    global cublasVersion = (p=Cint[0];@cuda(cublas,cublasGetVersion_v2,(Cptr,Ptr{Cint}),handle,p);Int(p[1]))
+    return handle
+end
+
+function cudnnCreate()
+    handleP = Cptr[0]
+    @cuda(cudnn,cudnnCreate,(Ptr{Cptr},), handleP)
+    handle = handleP[1]
+    atexit(()->@cuda(cudnn,cudnnDestroy,(Cptr,), handle))
+    global cudnnVersion = Int(ccall((:cudnnGetVersion,:libcudnn),Csize_t,()))
+    return handle
+end
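
A hypothetical session sketch, not part of the patch, showing how the device-management API in gpu.jl is meant to be driven. It assumes at least one CUDA device and the NVML library are visible, and calls the unexported functions by qualified name:

    using CuConv
    CuConv.gpuCount()     # number of visible devices (0 when no GPU or no NVML)
    CuConv.gpu(true)      # reset all devices, activate the one with the most free memory
    CuConv.gpu()          # id of the active device, or -1
    CuConv.cudnnhandle()  # lazily created cuDNN handle for the active device
    CuConv.gpu(-1)        # suspend GPU use without resetting (allocations stay valid)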