Skip to content

Commit

Permalink
conv4 and friends on CuArray
Browse files Browse the repository at this point in the history
  • Loading branch information
dfdx committed Oct 3, 2017
1 parent 54a57ef commit c0bc866
Show file tree
Hide file tree
Showing 5 changed files with 448 additions and 1 deletion.
2 changes: 2 additions & 0 deletions .gitignore
@@ -1,3 +1,5 @@
*.jl.cov
*.jl.*.cov
*.jl.mem
*~
.juliahistory
2 changes: 1 addition & 1 deletion src/CuConv.jl
@@ -1,5 +1,5 @@
module CuConv

# package code goes here
include("core.jl")

end # module
240 changes: 240 additions & 0 deletions src/conv.jl
@@ -0,0 +1,240 @@

# conv4(w, x) -> y
#
# Forward convolution via cudnnConvolutionForward.  `w` is the filter tensor
# and `x` the input; a fresh output of shape cdims(w,x;o...) is allocated.
# `algo` is the cudnn algorithm index; workSpace/workSpaceSizeInBytes supply
# optional scratch memory for that algorithm.  Remaining keywords (padding,
# stride, upscale, mode) are consumed by the CD convolution descriptor.
# `alpha` scales the result (y = alpha*conv(w,x), since beta is fixed at 0).
function conv4{T}(w::CuArray{T},x::CuArray{T};
handle=cudnnhandle(), algo=0, workSpace=C_NULL, workSpaceSizeInBytes=0, alpha=1,
o...) # padding=0, stride=1, upscale=1, mode=0
y = similar(x, cdims(w,x;o...))
beta=0 # nonzero beta does not make sense when we create y
@cuda(cudnn, cudnnConvolutionForward,
(Cptr,Ptr{T},Cptr,Ptr{T},Cptr,Ptr{T},Cptr, UInt32,Cptr, Csize_t, Ptr{T},Cptr,Ptr{T}),
handle,Ref(T(alpha)),TD(x),x.ptr,FD(w),w.ptr,CD(w,x;o...),algo,workSpace,workSpaceSizeInBytes,Ref(T(beta)),TD(y),y.ptr)
return y
end

# conv4x(w, x, dy) -> dx
#
# Gradient of conv4 with respect to the input `x` (cudnn "backward data").
# `dy` is the loss gradient w.r.t. the conv4 output; returns `dx` with the
# same shape as `x`.  The cudnnVersion branches pick the symbol name the
# installed cudnn release exports: before v3 there were no algo/workspace
# arguments, and in v3 the extended entry point carried a _v3 suffix.
function conv4x{T}(w::CuArray{T},x::CuArray{T},dy::CuArray{T};
handle=cudnnhandle(), alpha=1, algo=0, workSpace=C_NULL, workSpaceSizeInBytes=0,
o...) # padding=0, stride=1, upscale=1, mode=0
beta = 0 # dx is freshly allocated, so nothing to accumulate into
dx = similar(x)
if cudnnVersion >= 4000
@cuda(cudnn,cudnnConvolutionBackwardData,
(Cptr,Ptr{T},Cptr,Ptr{T},Cptr,Ptr{T},Cptr, UInt32,Cptr, Csize_t, Ptr{T},Cptr,Ptr{T}),
handle,Ref(T(alpha)),FD(w),w.ptr,TD(dy),dy.ptr,CD(w,x;o...),algo,workSpace,workSpaceSizeInBytes,Ref(T(beta)),TD(dx),dx.ptr)
elseif cudnnVersion >= 3000
@cuda(cudnn,cudnnConvolutionBackwardData_v3,
(Cptr,Ptr{T},Cptr,Ptr{T},Cptr,Ptr{T},Cptr, UInt32,Cptr, Csize_t, Ptr{T},Cptr,Ptr{T}),
handle,Ref(T(alpha)),FD(w),w.ptr,TD(dy),dy.ptr,CD(w,x;o...),algo,workSpace,workSpaceSizeInBytes,Ref(T(beta)),TD(dx),dx.ptr)
else
# pre-v3 signature: no algorithm selection, no workspace
@cuda(cudnn,cudnnConvolutionBackwardData,
(Cptr,Ptr{T},Cptr,Ptr{T},Cptr,Ptr{T},Cptr, Ptr{T},Cptr,Ptr{T}),
handle,Ref(T(alpha)),FD(w),w.ptr,TD(dy),dy.ptr,CD(w,x;o...),Ref(T(beta)),TD(dx),dx.ptr)
end
return dx
end

# conv4w(w, x, dy) -> dw
#
# Gradient of conv4 with respect to the filter `w` (cudnn "backward filter").
# `dy` is the loss gradient w.r.t. the conv4 output; returns `dw` with the
# same shape as `w`.  Version branching mirrors conv4x: the extended
# algo/workspace signature appeared in cudnn v3 (as ..._v3) and became the
# plain entry point in v4.
function conv4w{T}(w::CuArray{T},x::CuArray{T},dy::CuArray{T};
handle=cudnnhandle(), alpha=1, algo=0, workSpace=C_NULL, workSpaceSizeInBytes=0,
o...) # padding=0, stride=1, upscale=1, mode=0
beta = 0 # dw is freshly allocated, so nothing to accumulate into
dw = similar(w)
if cudnnVersion >= 4000
@cuda(cudnn,cudnnConvolutionBackwardFilter,
(Cptr,Ptr{T},Cptr,Ptr{T},Cptr,Ptr{T},Cptr, UInt32,Cptr, Csize_t, Ptr{T},Cptr,Ptr{T}),
handle,Ref(T(alpha)),TD(x),x.ptr,TD(dy),dy.ptr,CD(w,x;o...),algo,workSpace,workSpaceSizeInBytes,Ref(T(beta)),FD(dw),dw.ptr)
elseif cudnnVersion >= 3000
@cuda(cudnn,cudnnConvolutionBackwardFilter_v3,
(Cptr,Ptr{T},Cptr,Ptr{T},Cptr,Ptr{T},Cptr, UInt32,Cptr, Csize_t, Ptr{T},Cptr,Ptr{T}),
handle,Ref(T(alpha)),TD(x),x.ptr,TD(dy),dy.ptr,CD(w,x;o...),algo,workSpace,workSpaceSizeInBytes,Ref(T(beta)),FD(dw),dw.ptr)
else
# pre-v3 signature: no algorithm selection, no workspace
@cuda(cudnn,cudnnConvolutionBackwardFilter,
(Cptr,Ptr{T},Cptr,Ptr{T},Cptr,Ptr{T},Cptr, Ptr{T},Cptr,Ptr{T}),
handle,Ref(T(alpha)),TD(x),x.ptr,TD(dy),dy.ptr,CD(w,x;o...),Ref(T(beta)),FD(dw),dw.ptr)
end
return dw
end





# Wrapper owning a cudnn tensor descriptor handle; a finalizer destroys the
# descriptor when the wrapper is garbage collected.
mutable struct TD; ptr
# Build an Nd tensor descriptor describing CuArray `a`.  Sizes and strides
# are listed in reverse dimension order (Julia arrays are column-major while
# cudnn expects dimensions outermost-first).
function TD(a::CuArray)
d = Cptr[0] # out-parameter receiving the new handle
@cuda(cudnn,cudnnCreateTensorDescriptor,(Ptr{Cptr},),d)
n = ndims(a)
sz = [Cint(size(a,n-i+1)) for i=1:n]
st = [Cint(stride(a,n-i+1)) for i=1:n]
@cuda(cudnn,cudnnSetTensorNdDescriptor,
(Cptr,UInt32,Cint,Ptr{Cint},Ptr{Cint}),
d[1], DT(a), n, sz, st)
td = new(d[1])
# Julia 0.6 finalizer(obj, f) argument order.
finalizer(td, x->@cuda(cudnn,cudnnDestroyTensorDescriptor,(Cptr,),x.ptr))
return td
end
end

# Wrapper owning a cudnn filter descriptor handle; destroyed by finalizer.
mutable struct FD; ptr
# Filter descriptor for CuArray `a`; dims reversed as in TD.  cudnn v4
# introduced a _v4 setter taking an extra tensor-format argument (the 0
# here — presumably CUDNN_TENSOR_NCHW; confirm against cudnn headers),
# which became the plain cudnnSetFilterNdDescriptor in v5.
function FD(a::CuArray)
d = Cptr[0] # out-parameter receiving the new handle
@cuda(cudnn,cudnnCreateFilterDescriptor,(Ptr{Cptr},),d)
n = ndims(a)
sz = [Cint(size(a,n-i+1)) for i=1:n]
if cudnnVersion >= 5000
@cuda(cudnn,cudnnSetFilterNdDescriptor,
(Cptr,UInt32,UInt32,Cint,Ptr{Cint}),
d[1], DT(a), 0, n, sz)
elseif cudnnVersion >= 4000
@cuda(cudnn,cudnnSetFilterNdDescriptor_v4,
(Cptr,UInt32,UInt32,Cint,Ptr{Cint}),
d[1], DT(a), 0, n, sz)
else
# pre-v4 signature has no tensor-format argument
@cuda(cudnn,cudnnSetFilterNdDescriptor,
(Cptr,UInt32,Cint,Ptr{Cint}),
d[1], DT(a), n, sz)
end
fd = new(d[1])
# Julia 0.6 finalizer(obj, f) argument order.
finalizer(fd, x->@cuda(cudnn,cudnnDestroyFilterDescriptor,(Cptr,),x.ptr))
return fd
end
end

# Wrapper owning a cudnn convolution descriptor handle; destroyed by finalizer.
mutable struct CD; ptr
# Convolution descriptor for filter `w` applied to input `x`.  padding,
# stride and upscale may each be a scalar or a per-spatial-dimension
# collection; cdsize expands and reverses them into Cint vectors.  `mode`
# is cudnn's convolution-mode enum passed through unchanged.
function CD(w::CuArray,x::CuArray; padding=0, stride=1, upscale=1, mode=0)
d = Cptr[0] # out-parameter receiving the new handle
@cuda(cudnn,cudnnCreateConvolutionDescriptor,(Ptr{Cptr},),d)
nd = ndims(x)-2 # number of spatial dimensions (last two are channel, batch)
if cudnnVersion >= 4000
@cuda(cudnn,cudnnSetConvolutionNdDescriptor,
(Cptr,Cint,Ptr{Cint},Ptr{Cint},Ptr{Cint},UInt32,UInt32),
d[1],nd,cdsize(padding,nd),cdsize(stride,nd),cdsize(upscale,nd),mode,DT(x))
elseif cudnnVersion > 3000 # does not work when cudnnVersion==3000
@cuda(cudnn,cudnnSetConvolutionNdDescriptor_v3,
(Cptr,Cint,Ptr{Cint},Ptr{Cint},Ptr{Cint},UInt32,UInt32),
d[1],nd,cdsize(padding,nd),cdsize(stride,nd),cdsize(upscale,nd),mode,DT(x))
else
# oldest signature: no explicit data-type argument
@cuda(cudnn,cudnnSetConvolutionNdDescriptor,
(Cptr,Cint,Ptr{Cint},Ptr{Cint},Ptr{Cint},UInt32),
d[1],nd,cdsize(padding,nd),cdsize(stride,nd),cdsize(upscale,nd),mode)
end
cd = new(d[1])
# Julia 0.6 finalizer(obj, f) argument order.
finalizer(cd, x->@cuda(cudnn,cudnnDestroyConvolutionDescriptor,(Cptr,),x.ptr))
return cd
end
end

# Wrapper owning a cudnn pooling descriptor handle; destroyed by finalizer.
mutable struct PD; ptr
# Pooling descriptor for input `x`.  window/padding/stride may be scalars
# or per-spatial-dimension collections (expanded and reversed by cdsize);
# stride defaults to the window size, i.e. non-overlapping pooling.  `mode`
# and `maxpoolingNanOpt` are cudnn enums passed through unchanged; the NaN
# option only exists in the v4+ setter signatures.
function PD(x::CuArray; window=2, padding=0, stride=window, mode=0, maxpoolingNanOpt=0)
d = Cptr[0] # out-parameter receiving the new handle
@cuda(cudnn,cudnnCreatePoolingDescriptor,(Ptr{Cptr},),d)
nd = ndims(x)-2 # number of spatial dimensions
if cudnnVersion >= 5000
@cuda(cudnn,cudnnSetPoolingNdDescriptor,
(Cptr,UInt32,UInt32,Cint,Ptr{Cint},Ptr{Cint},Ptr{Cint}),
d[1],mode,maxpoolingNanOpt,nd,cdsize(window,nd),cdsize(padding,nd),cdsize(stride,nd))
elseif cudnnVersion >= 4000
@cuda(cudnn,cudnnSetPoolingNdDescriptor_v4,
(Cptr,UInt32,UInt32,Cint,Ptr{Cint},Ptr{Cint},Ptr{Cint}),
d[1],mode,maxpoolingNanOpt,nd,cdsize(window,nd),cdsize(padding,nd),cdsize(stride,nd))
else
# pre-v4 signature has no NaN-propagation argument
@cuda(cudnn,cudnnSetPoolingNdDescriptor,
(Cptr,UInt32,Cint,Ptr{Cint},Ptr{Cint},Ptr{Cint}),
d[1],mode,nd,cdsize(window,nd),cdsize(padding,nd),cdsize(stride,nd))
end
pd = new(d[1])
# Julia 0.6 finalizer(obj, f) argument order.
finalizer(pd, x->@cuda(cudnn,cudnnDestroyPoolingDescriptor,(Cptr,),x.ptr))
return pd
end
end

import Base: unsafe_convert
# Let descriptor wrappers be passed directly where a raw Cptr is expected,
# e.g. as Cptr-typed arguments inside the @cuda ccall tuples above.
unsafe_convert(::Type{Cptr}, td::TD)=td.ptr
unsafe_convert(::Type{Cptr}, fd::FD)=fd.ptr
unsafe_convert(::Type{Cptr}, cd::CD)=cd.ptr
unsafe_convert(::Type{Cptr}, pd::PD)=pd.ptr

# fill and reverse Cint array with padding etc. for cudnn calls
# Expand a scalar or length-`nd` collection `w` into a Cint vector in
# reversed dimension order, as required by the cudnn Nd-descriptor calls.
# Throws DimensionMismatch when a collection has the wrong length.
function cdsize(w, nd)
    isa(w, Number) && return fill(Cint(w), nd)
    length(w) == nd || throw(DimensionMismatch("$w $nd"))
    return Cint[w[j] for j in nd:-1:1]
end

# convert padding etc. size to an Int array of the right dimension
# Normalize a size spec `p` (scalar or per-dimension collection) into an Int
# vector with one entry per spatial dimension of `x` (ndims(x)-2 of them).
function psize(p, x)
    nd = ndims(x) - 2
    p isa Number && return fill(Int(p), nd)
    length(p) == nd && return collect(Int, p)
    throw(DimensionMismatch("psize: $p $nd"))
end

# Map the array element type to cudnn's data-type enum value.
# NOTE(review): 0/1/2 presumably correspond to CUDNN_DATA_FLOAT /
# CUDNN_DATA_DOUBLE / CUDNN_DATA_HALF — confirm against cudnn.h.
DT(::CuArray{Float32})=UInt32(0)
DT(::CuArray{Float64})=UInt32(1)
DT(::CuArray{Float16})=UInt32(2)

# Output shape of conv4: each spatial dim shrinks according to filter size,
# padding and stride; dim N-1 becomes the filter count (size(w,N)) and the
# last dim keeps the batch size.  padding/stride may be scalars or
# per-dimension collections.
function cdims(w,x; padding=0, stride=1, o...)
    N = ndims(x)
    ntuple(N) do d
        if d == N
            size(x, N)                       # batch dimension passes through
        elseif d == N - 1
            size(w, N)                       # one output channel per filter
        else
            p = isa(padding, Number) ? padding : padding[d]
            s = isa(stride, Number) ? stride : stride[d]
            1 + div(size(x, d) - size(w, d) + 2 * p, s)
        end
    end
end

# Output shape of pooling on `x`: spatial dims shrink by window/stride with
# padding; channel (N-1) and batch (N) dims are unchanged.  stride defaults
# to the window size (non-overlapping pooling).
function pdims(x; window=2, padding=0, stride=window, o...)
    N = ndims(x)
    ntuple(N) do d
        d >= N - 1 && return size(x, d)      # channel and batch unchanged
        win = isa(window, Number) ? window : window[d]
        pad = isa(padding, Number) ? padding : padding[d]
        st = isa(stride, Number) ? stride : stride[d]
        return 1 + div(size(x, d) + 2 * pad - win, st)
    end
end

# Output shape of the transposed (de)convolution: the inverse of cdims, so
# spatial dims grow as stride*(size-1) + filter - 2*padding; dim N-1 becomes
# the filter count and the last dim keeps the batch size.
function dcdims(w,x; padding=0, stride=1, o...)
    N = ndims(x)
    ntuple(N) do d
        if d >= N - 1
            d == N ? size(x, N) : size(w, N)
        else
            pad = isa(padding, Number) ? padding : padding[d]
            st = isa(stride, Number) ? stride : stride[d]
            st * (size(x, d) - 1) + size(w, d) - 2 * pad
        end
    end
end

# Output shape of unpooling: the inverse of pdims, so spatial dims grow as
# stride*(size-1) + window - 2*padding; trailing channel/batch dims are kept.
function updims(x; window=2, padding=0, stride=window, o...)
    nd = ndims(x) - 2
    # Same normalization psize performs: scalar -> repeated Int vector,
    # length-nd collection -> Int vector, anything else -> DimensionMismatch.
    expand(p) = isa(p, Number) ? fill(Int(p), nd) :
        length(p) == nd ? collect(Int, p) :
        throw(DimensionMismatch("psize: $p $nd"))
    win, st, pad = expand(window), expand(stride), expand(padding)
    N = ndims(x)
    ntuple(i -> i < N - 1 ? (size(x, i) - 1) * st[i] + win[i] - 2 * pad[i] : size(x, i), N)
end

# Padding per spatial dimension that preserves the input size when the
# filter size is odd and stride is 1 ("same" convolution).
function padsize(w)
    return ntuple(d -> div(size(w, d) - 1, 2), ndims(w) - 2)
end
24 changes: 24 additions & 0 deletions src/core.jl
@@ -0,0 +1,24 @@


using CuArrays

# Raw cudnn/cuda object handles are passed around as untyped void pointers.
# (Ptr{Void} is the Julia 0.6 spelling of Ptr{Cvoid}.)
const Cptr = Ptr{Void}
# Debug hook: expands to nothing by default; flipping the `false` to `true`
# makes every @gs() call site synchronize the device, which helps localize
# async CUDA errors during debugging.
macro gs(); if false; esc(:(ccall(("cudaDeviceSynchronize","libcudart"),UInt32,()))); end; end

include("gpu.jl")
include("conv.jl")





# Probe for a usable GPU once, when the package is loaded.
function __init__()
try
r = gpu(true)
# info(r >= 0 ? "Knet using GPU $r" : "No GPU found, Knet using the CPU")
catch e
# Swallowing the exception is deliberate: loading the package on a machine
# without CUDA must still succeed; gpu(false) records that we run on CPU.
gpu(false)
# warn("Knet using the CPU: $e")
end
end

0 comments on commit c0bc866

Please sign in to comment.