Skip to content

Commit

Permalink
conv4 and friends on CuArray
Browse files Browse the repository at this point in the history
  • Loading branch information
dfdx committed Oct 3, 2017
1 parent 54a57ef commit c0bc866
Show file tree
Hide file tree
Showing 5 changed files with 448 additions and 1 deletion.
2 changes: 2 additions & 0 deletions .gitignore
@@ -1,3 +1,5 @@
*.jl.cov
*.jl.*.cov
*.jl.mem
*~
.juliahistory
2 changes: 1 addition & 1 deletion src/CuConv.jl
@@ -1,5 +1,5 @@
module CuConv

# package code goes here
include("core.jl")

end # module
240 changes: 240 additions & 0 deletions src/conv.jl
@@ -0,0 +1,240 @@

# conv4(w, x) -> y
#
# Forward convolution via cudnnConvolutionForward.  `w` is the filter tensor
# and `x` the input; a fresh output of shape cdims(w,x;o...) is allocated.
# `algo` is the cudnn algorithm index; workSpace/workSpaceSizeInBytes supply
# optional scratch memory for that algorithm.  Remaining keywords (padding,
# stride, upscale, mode) are consumed by the CD convolution descriptor.
# `alpha` scales the result (y = alpha*conv(w,x), since beta is fixed at 0).
function conv4{T}(w::CuArray{T},x::CuArray{T};
handle=cudnnhandle(), algo=0, workSpace=C_NULL, workSpaceSizeInBytes=0, alpha=1,
o...) # padding=0, stride=1, upscale=1, mode=0
y = similar(x, cdims(w,x;o...))
beta=0 # nonzero beta does not make sense when we create y
@cuda(cudnn, cudnnConvolutionForward,
(Cptr,Ptr{T},Cptr,Ptr{T},Cptr,Ptr{T},Cptr, UInt32,Cptr, Csize_t, Ptr{T},Cptr,Ptr{T}),
handle,Ref(T(alpha)),TD(x),x.ptr,FD(w),w.ptr,CD(w,x;o...),algo,workSpace,workSpaceSizeInBytes,Ref(T(beta)),TD(y),y.ptr)
return y
end

# conv4x(w, x, dy) -> dx
#
# Gradient of conv4 with respect to the input `x` (cudnn "backward data").
# `dy` is the loss gradient w.r.t. the conv4 output; returns `dx` with the
# same shape as `x`.  The cudnnVersion branches pick the symbol name the
# installed cudnn release exports: before v3 there were no algo/workspace
# arguments, and in v3 the extended entry point carried a _v3 suffix.
function conv4x{T}(w::CuArray{T},x::CuArray{T},dy::CuArray{T};
handle=cudnnhandle(), alpha=1, algo=0, workSpace=C_NULL, workSpaceSizeInBytes=0,
o...) # padding=0, stride=1, upscale=1, mode=0
beta = 0 # dx is freshly allocated, so nothing to accumulate into
dx = similar(x)
if cudnnVersion >= 4000
@cuda(cudnn,cudnnConvolutionBackwardData,
(Cptr,Ptr{T},Cptr,Ptr{T},Cptr,Ptr{T},Cptr, UInt32,Cptr, Csize_t, Ptr{T},Cptr,Ptr{T}),
handle,Ref(T(alpha)),FD(w),w.ptr,TD(dy),dy.ptr,CD(w,x;o...),algo,workSpace,workSpaceSizeInBytes,Ref(T(beta)),TD(dx),dx.ptr)
elseif cudnnVersion >= 3000
@cuda(cudnn,cudnnConvolutionBackwardData_v3,
(Cptr,Ptr{T},Cptr,Ptr{T},Cptr,Ptr{T},Cptr, UInt32,Cptr, Csize_t, Ptr{T},Cptr,Ptr{T}),
handle,Ref(T(alpha)),FD(w),w.ptr,TD(dy),dy.ptr,CD(w,x;o...),algo,workSpace,workSpaceSizeInBytes,Ref(T(beta)),TD(dx),dx.ptr)
else
# pre-v3 signature: no algorithm selection, no workspace
@cuda(cudnn,cudnnConvolutionBackwardData,
(Cptr,Ptr{T},Cptr,Ptr{T},Cptr,Ptr{T},Cptr, Ptr{T},Cptr,Ptr{T}),
handle,Ref(T(alpha)),FD(w),w.ptr,TD(dy),dy.ptr,CD(w,x;o...),Ref(T(beta)),TD(dx),dx.ptr)
end
return dx
end

# conv4w(w, x, dy) -> dw
#
# Gradient of conv4 with respect to the filter `w` (cudnn "backward filter").
# `dy` is the loss gradient w.r.t. the conv4 output; returns `dw` with the
# same shape as `w`.  Version branching mirrors conv4x: the extended
# algo/workspace signature appeared in cudnn v3 (as ..._v3) and became the
# plain entry point in v4.
function conv4w{T}(w::CuArray{T},x::CuArray{T},dy::CuArray{T};
handle=cudnnhandle(), alpha=1, algo=0, workSpace=C_NULL, workSpaceSizeInBytes=0,
o...) # padding=0, stride=1, upscale=1, mode=0
beta = 0 # dw is freshly allocated, so nothing to accumulate into
dw = similar(w)
if cudnnVersion >= 4000
@cuda(cudnn,cudnnConvolutionBackwardFilter,
(Cptr,Ptr{T},Cptr,Ptr{T},Cptr,Ptr{T},Cptr, UInt32,Cptr, Csize_t, Ptr{T},Cptr,Ptr{T}),
handle,Ref(T(alpha)),TD(x),x.ptr,TD(dy),dy.ptr,CD(w,x;o...),algo,workSpace,workSpaceSizeInBytes,Ref(T(beta)),FD(dw),dw.ptr)
elseif cudnnVersion >= 3000
@cuda(cudnn,cudnnConvolutionBackwardFilter_v3,
(Cptr,Ptr{T},Cptr,Ptr{T},Cptr,Ptr{T},Cptr, UInt32,Cptr, Csize_t, Ptr{T},Cptr,Ptr{T}),
handle,Ref(T(alpha)),TD(x),x.ptr,TD(dy),dy.ptr,CD(w,x;o...),algo,workSpace,workSpaceSizeInBytes,Ref(T(beta)),FD(dw),dw.ptr)
else
# pre-v3 signature: no algorithm selection, no workspace
@cuda(cudnn,cudnnConvolutionBackwardFilter,
(Cptr,Ptr{T},Cptr,Ptr{T},Cptr,Ptr{T},Cptr, Ptr{T},Cptr,Ptr{T}),
handle,Ref(T(alpha)),TD(x),x.ptr,TD(dy),dy.ptr,CD(w,x;o...),Ref(T(beta)),FD(dw),dw.ptr)
end
return dw
end





# Wrapper owning a cudnn tensor descriptor handle; a finalizer destroys the
# descriptor when the wrapper is garbage collected.
mutable struct TD; ptr
# Build an Nd tensor descriptor describing CuArray `a`.  Sizes and strides
# are listed in reverse dimension order (Julia arrays are column-major while
# cudnn expects dimensions outermost-first).
function TD(a::CuArray)
d = Cptr[0] # out-parameter receiving the new handle
@cuda(cudnn,cudnnCreateTensorDescriptor,(Ptr{Cptr},),d)
n = ndims(a)
sz = [Cint(size(a,n-i+1)) for i=1:n]
st = [Cint(stride(a,n-i+1)) for i=1:n]
@cuda(cudnn,cudnnSetTensorNdDescriptor,
(Cptr,UInt32,Cint,Ptr{Cint},Ptr{Cint}),
d[1], DT(a), n, sz, st)
td = new(d[1])
# Julia 0.6 finalizer(obj, f) argument order.
finalizer(td, x->@cuda(cudnn,cudnnDestroyTensorDescriptor,(Cptr,),x.ptr))
return td
end
end

# Wrapper owning a cudnn filter descriptor handle; destroyed by finalizer.
mutable struct FD; ptr
# Filter descriptor for CuArray `a`; dims reversed as in TD.  cudnn v4
# introduced a _v4 setter taking an extra tensor-format argument (the 0
# here — presumably CUDNN_TENSOR_NCHW; confirm against cudnn headers),
# which became the plain cudnnSetFilterNdDescriptor in v5.
function FD(a::CuArray)
d = Cptr[0] # out-parameter receiving the new handle
@cuda(cudnn,cudnnCreateFilterDescriptor,(Ptr{Cptr},),d)
n = ndims(a)
sz = [Cint(size(a,n-i+1)) for i=1:n]
if cudnnVersion >= 5000
@cuda(cudnn,cudnnSetFilterNdDescriptor,
(Cptr,UInt32,UInt32,Cint,Ptr{Cint}),
d[1], DT(a), 0, n, sz)
elseif cudnnVersion >= 4000
@cuda(cudnn,cudnnSetFilterNdDescriptor_v4,
(Cptr,UInt32,UInt32,Cint,Ptr{Cint}),
d[1], DT(a), 0, n, sz)
else
# pre-v4 signature has no tensor-format argument
@cuda(cudnn,cudnnSetFilterNdDescriptor,
(Cptr,UInt32,Cint,Ptr{Cint}),
d[1], DT(a), n, sz)
end
fd = new(d[1])
# Julia 0.6 finalizer(obj, f) argument order.
finalizer(fd, x->@cuda(cudnn,cudnnDestroyFilterDescriptor,(Cptr,),x.ptr))
return fd
end
end

# Wrapper owning a cudnn convolution descriptor handle; destroyed by finalizer.
mutable struct CD; ptr
# Convolution descriptor for filter `w` applied to input `x`.  padding,
# stride and upscale may each be a scalar or a per-spatial-dimension
# collection; cdsize expands and reverses them into Cint vectors.  `mode`
# is cudnn's convolution-mode enum passed through unchanged.
function CD(w::CuArray,x::CuArray; padding=0, stride=1, upscale=1, mode=0)
d = Cptr[0] # out-parameter receiving the new handle
@cuda(cudnn,cudnnCreateConvolutionDescriptor,(Ptr{Cptr},),d)
nd = ndims(x)-2 # number of spatial dimensions (last two are channel, batch)
if cudnnVersion >= 4000
@cuda(cudnn,cudnnSetConvolutionNdDescriptor,
(Cptr,Cint,Ptr{Cint},Ptr{Cint},Ptr{Cint},UInt32,UInt32),
d[1],nd,cdsize(padding,nd),cdsize(stride,nd),cdsize(upscale,nd),mode,DT(x))
elseif cudnnVersion > 3000 # does not work when cudnnVersion==3000
@cuda(cudnn,cudnnSetConvolutionNdDescriptor_v3,
(Cptr,Cint,Ptr{Cint},Ptr{Cint},Ptr{Cint},UInt32,UInt32),
d[1],nd,cdsize(padding,nd),cdsize(stride,nd),cdsize(upscale,nd),mode,DT(x))
else
# oldest signature: no explicit data-type argument
@cuda(cudnn,cudnnSetConvolutionNdDescriptor,
(Cptr,Cint,Ptr{Cint},Ptr{Cint},Ptr{Cint},UInt32),
d[1],nd,cdsize(padding,nd),cdsize(stride,nd),cdsize(upscale,nd),mode)
end
cd = new(d[1])
# Julia 0.6 finalizer(obj, f) argument order.
finalizer(cd, x->@cuda(cudnn,cudnnDestroyConvolutionDescriptor,(Cptr,),x.ptr))
return cd
end
end

# Wrapper owning a cudnn pooling descriptor handle; destroyed by finalizer.
mutable struct PD; ptr
# Pooling descriptor for input `x`.  window/padding/stride may be scalars
# or per-spatial-dimension collections (expanded and reversed by cdsize);
# stride defaults to the window size, i.e. non-overlapping pooling.  `mode`
# and `maxpoolingNanOpt` are cudnn enums passed through unchanged; the NaN
# option only exists in the v4+ setter signatures.
function PD(x::CuArray; window=2, padding=0, stride=window, mode=0, maxpoolingNanOpt=0)
d = Cptr[0] # out-parameter receiving the new handle
@cuda(cudnn,cudnnCreatePoolingDescriptor,(Ptr{Cptr},),d)
nd = ndims(x)-2 # number of spatial dimensions
if cudnnVersion >= 5000
@cuda(cudnn,cudnnSetPoolingNdDescriptor,
(Cptr,UInt32,UInt32,Cint,Ptr{Cint},Ptr{Cint},Ptr{Cint}),
d[1],mode,maxpoolingNanOpt,nd,cdsize(window,nd),cdsize(padding,nd),cdsize(stride,nd))
elseif cudnnVersion >= 4000
@cuda(cudnn,cudnnSetPoolingNdDescriptor_v4,
(Cptr,UInt32,UInt32,Cint,Ptr{Cint},Ptr{Cint},Ptr{Cint}),
d[1],mode,maxpoolingNanOpt,nd,cdsize(window,nd),cdsize(padding,nd),cdsize(stride,nd))
else
# pre-v4 signature has no NaN-propagation argument
@cuda(cudnn,cudnnSetPoolingNdDescriptor,
(Cptr,UInt32,Cint,Ptr{Cint},Ptr{Cint},Ptr{Cint}),
d[1],mode,nd,cdsize(window,nd),cdsize(padding,nd),cdsize(stride,nd))
end
pd = new(d[1])
# Julia 0.6 finalizer(obj, f) argument order.
finalizer(pd, x->@cuda(cudnn,cudnnDestroyPoolingDescriptor,(Cptr,),x.ptr))
return pd
end
end

import Base: unsafe_convert
# Let descriptor wrappers be passed directly where a raw Cptr is expected,
# e.g. as Cptr-typed arguments inside the @cuda ccall tuples above.
unsafe_convert(::Type{Cptr}, td::TD)=td.ptr
unsafe_convert(::Type{Cptr}, fd::FD)=fd.ptr
unsafe_convert(::Type{Cptr}, cd::CD)=cd.ptr
unsafe_convert(::Type{Cptr}, pd::PD)=pd.ptr

# fill and reverse Cint array with padding etc. for cudnn calls
# Expand a scalar or length-`nd` collection `w` into a Cint vector in
# reversed dimension order, as required by the cudnn Nd-descriptor calls.
# Throws DimensionMismatch when a collection has the wrong length.
function cdsize(w, nd)
    isa(w, Number) && return fill(Cint(w), nd)
    length(w) == nd || throw(DimensionMismatch("$w $nd"))
    return Cint[w[j] for j in nd:-1:1]
end

# convert padding etc. size to an Int array of the right dimension
# Normalize a size spec `p` (scalar or per-dimension collection) into an Int
# vector with one entry per spatial dimension of `x` (ndims(x)-2 of them).
function psize(p, x)
    nd = ndims(x) - 2
    p isa Number && return fill(Int(p), nd)
    length(p) == nd && return collect(Int, p)
    throw(DimensionMismatch("psize: $p $nd"))
end

# Map the array element type to cudnn's data-type enum value.
# NOTE(review): 0/1/2 presumably correspond to CUDNN_DATA_FLOAT /
# CUDNN_DATA_DOUBLE / CUDNN_DATA_HALF — confirm against cudnn.h.
DT(::CuArray{Float32})=UInt32(0)
DT(::CuArray{Float64})=UInt32(1)
DT(::CuArray{Float16})=UInt32(2)

# Output shape of conv4: each spatial dim shrinks according to filter size,
# padding and stride; dim N-1 becomes the filter count (size(w,N)) and the
# last dim keeps the batch size.  padding/stride may be scalars or
# per-dimension collections.
function cdims(w,x; padding=0, stride=1, o...)
    N = ndims(x)
    ntuple(N) do d
        if d == N
            size(x, N)                       # batch dimension passes through
        elseif d == N - 1
            size(w, N)                       # one output channel per filter
        else
            p = isa(padding, Number) ? padding : padding[d]
            s = isa(stride, Number) ? stride : stride[d]
            1 + div(size(x, d) - size(w, d) + 2 * p, s)
        end
    end
end

# Output shape of pooling on `x`: spatial dims shrink by window/stride with
# padding; channel (N-1) and batch (N) dims are unchanged.  stride defaults
# to the window size (non-overlapping pooling).
function pdims(x; window=2, padding=0, stride=window, o...)
    N = ndims(x)
    ntuple(N) do d
        d >= N - 1 && return size(x, d)      # channel and batch unchanged
        win = isa(window, Number) ? window : window[d]
        pad = isa(padding, Number) ? padding : padding[d]
        st = isa(stride, Number) ? stride : stride[d]
        return 1 + div(size(x, d) + 2 * pad - win, st)
    end
end

# Output shape of the transposed (de)convolution: the inverse of cdims, so
# spatial dims grow as stride*(size-1) + filter - 2*padding; dim N-1 becomes
# the filter count and the last dim keeps the batch size.
function dcdims(w,x; padding=0, stride=1, o...)
    N = ndims(x)
    ntuple(N) do d
        if d >= N - 1
            d == N ? size(x, N) : size(w, N)
        else
            pad = isa(padding, Number) ? padding : padding[d]
            st = isa(stride, Number) ? stride : stride[d]
            st * (size(x, d) - 1) + size(w, d) - 2 * pad
        end
    end
end

# Output shape of unpooling: the inverse of pdims, so spatial dims grow as
# stride*(size-1) + window - 2*padding; trailing channel/batch dims are kept.
function updims(x; window=2, padding=0, stride=window, o...)
    nd = ndims(x) - 2
    # Same normalization psize performs: scalar -> repeated Int vector,
    # length-nd collection -> Int vector, anything else -> DimensionMismatch.
    expand(p) = isa(p, Number) ? fill(Int(p), nd) :
        length(p) == nd ? collect(Int, p) :
        throw(DimensionMismatch("psize: $p $nd"))
    win, st, pad = expand(window), expand(stride), expand(padding)
    N = ndims(x)
    ntuple(i -> i < N - 1 ? (size(x, i) - 1) * st[i] + win[i] - 2 * pad[i] : size(x, i), N)
end

# Padding per spatial dimension that preserves the input size when the
# filter size is odd and stride is 1 ("same" convolution).
function padsize(w)
    return ntuple(d -> div(size(w, d) - 1, 2), ndims(w) - 2)
end
24 changes: 24 additions & 0 deletions src/core.jl
@@ -0,0 +1,24 @@


using CuArrays

# Raw cudnn/cuda object handles are passed around as untyped void pointers.
# (Ptr{Void} is the Julia 0.6 spelling of Ptr{Cvoid}.)
const Cptr = Ptr{Void}
# Debug hook: expands to nothing by default; flipping the `false` to `true`
# makes every @gs() call site synchronize the device, which helps localize
# async CUDA errors during debugging.
macro gs(); if false; esc(:(ccall(("cudaDeviceSynchronize","libcudart"),UInt32,()))); end; end

include("gpu.jl")
include("conv.jl")





# Probe for a usable GPU once, when the package is loaded.
function __init__()
try
r = gpu(true)
# info(r >= 0 ? "Knet using GPU $r" : "No GPU found, Knet using the CPU")
catch e
# Swallowing the exception is deliberate: loading the package on a machine
# without CUDA must still succeed; gpu(false) records that we run on CPU.
gpu(false)
# warn("Knet using the CPU: $e")
end
end

0 comments on commit c0bc866

Please sign in to comment.