diff --git a/.gitignore b/.gitignore
index 8c960ec..6cb042d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,5 @@
 *.jl.cov
 *.jl.*.cov
 *.jl.mem
+*~
+.juliahistory
\ No newline at end of file
diff --git a/src/CuConv.jl b/src/CuConv.jl
index 0c463c1..b91fe02 100644
--- a/src/CuConv.jl
+++ b/src/CuConv.jl
@@ -1,5 +1,5 @@
 module CuConv
 
-# package code goes here
+include("core.jl")
 
 end # module
diff --git a/src/conv.jl b/src/conv.jl
new file mode 100644
index 0000000..ae161cc
--- /dev/null
+++ b/src/conv.jl
@@ -0,0 +1,240 @@
+
+function conv4{T}(w::CuArray{T},x::CuArray{T};
+                  handle=cudnnhandle(), algo=0, workSpace=C_NULL, workSpaceSizeInBytes=0, alpha=1,
+                  o...) # padding=0, stride=1, upscale=1, mode=0
+    y = similar(x, cdims(w,x;o...))
+    beta = 0 # nonzero beta does not make sense when we create y
+    @cuda(cudnn, cudnnConvolutionForward,
+          (Cptr,Ptr{T},Cptr,Ptr{T},Cptr,Ptr{T},Cptr,UInt32,Cptr,Csize_t,Ptr{T},Cptr,Ptr{T}),
+          handle,Ref(T(alpha)),TD(x),x.ptr,FD(w),w.ptr,CD(w,x;o...),
+          algo,workSpace,workSpaceSizeInBytes,Ref(T(beta)),TD(y),y.ptr)
+    return y
+end
+
+function conv4x{T}(w::CuArray{T},x::CuArray{T},dy::CuArray{T};
+                   handle=cudnnhandle(), alpha=1, algo=0, workSpace=C_NULL, workSpaceSizeInBytes=0,
+                   o...) # padding=0, stride=1, upscale=1, mode=0
+    beta = 0
+    dx = similar(x)
+    if cudnnVersion >= 4000
+        @cuda(cudnn,cudnnConvolutionBackwardData,
+              (Cptr,Ptr{T},Cptr,Ptr{T},Cptr,Ptr{T},Cptr,UInt32,Cptr,Csize_t,Ptr{T},Cptr,Ptr{T}),
+              handle,Ref(T(alpha)),FD(w),w.ptr,TD(dy),dy.ptr,CD(w,x;o...),
+              algo,workSpace,workSpaceSizeInBytes,Ref(T(beta)),TD(dx),dx.ptr)
+    elseif cudnnVersion >= 3000
+        @cuda(cudnn,cudnnConvolutionBackwardData_v3,
+              (Cptr,Ptr{T},Cptr,Ptr{T},Cptr,Ptr{T},Cptr,UInt32,Cptr,Csize_t,Ptr{T},Cptr,Ptr{T}),
+              handle,Ref(T(alpha)),FD(w),w.ptr,TD(dy),dy.ptr,CD(w,x;o...),
+              algo,workSpace,workSpaceSizeInBytes,Ref(T(beta)),TD(dx),dx.ptr)
+    else
+        @cuda(cudnn,cudnnConvolutionBackwardData,
+              (Cptr,Ptr{T},Cptr,Ptr{T},Cptr,Ptr{T},Cptr,Ptr{T},Cptr,Ptr{T}),
+              handle,Ref(T(alpha)),FD(w),w.ptr,TD(dy),dy.ptr,CD(w,x;o...),
+              Ref(T(beta)),TD(dx),dx.ptr)
+    end
+    return dx
+end
+
+function conv4w{T}(w::CuArray{T},x::CuArray{T},dy::CuArray{T};
+                   handle=cudnnhandle(), alpha=1, algo=0, workSpace=C_NULL, workSpaceSizeInBytes=0,
+                   o...) # padding=0, stride=1, upscale=1, mode=0
+    beta = 0
+    dw = similar(w)
+    if cudnnVersion >= 4000
+        @cuda(cudnn,cudnnConvolutionBackwardFilter,
+              (Cptr,Ptr{T},Cptr,Ptr{T},Cptr,Ptr{T},Cptr,UInt32,Cptr,Csize_t,Ptr{T},Cptr,Ptr{T}),
+              handle,Ref(T(alpha)),TD(x),x.ptr,TD(dy),dy.ptr,CD(w,x;o...),
+              algo,workSpace,workSpaceSizeInBytes,Ref(T(beta)),FD(dw),dw.ptr)
+    elseif cudnnVersion >= 3000
+        @cuda(cudnn,cudnnConvolutionBackwardFilter_v3,
+              (Cptr,Ptr{T},Cptr,Ptr{T},Cptr,Ptr{T},Cptr,UInt32,Cptr,Csize_t,Ptr{T},Cptr,Ptr{T}),
+              handle,Ref(T(alpha)),TD(x),x.ptr,TD(dy),dy.ptr,CD(w,x;o...),
+              algo,workSpace,workSpaceSizeInBytes,Ref(T(beta)),FD(dw),dw.ptr)
+    else
+        @cuda(cudnn,cudnnConvolutionBackwardFilter,
+              (Cptr,Ptr{T},Cptr,Ptr{T},Cptr,Ptr{T},Cptr,Ptr{T},Cptr,Ptr{T}),
+              handle,Ref(T(alpha)),TD(x),x.ptr,TD(dy),dy.ptr,CD(w,x;o...),
+              Ref(T(beta)),FD(dw),dw.ptr)
+    end
+    return dw
+end
+
+
+
+
+
+mutable struct TD; ptr
+    function TD(a::CuArray)
+        d = Cptr[0]
+        @cuda(cudnn,cudnnCreateTensorDescriptor,(Ptr{Cptr},),d)
+        n = ndims(a)
+        sz = [Cint(size(a,n-i+1)) for i=1:n]
+        st = [Cint(stride(a,n-i+1)) for i=1:n]
+        @cuda(cudnn,cudnnSetTensorNdDescriptor,
+              (Cptr,UInt32,Cint,Ptr{Cint},Ptr{Cint}),
+              d[1], DT(a), n, sz, st)
+        td = new(d[1])
+        finalizer(td, x->@cuda(cudnn,cudnnDestroyTensorDescriptor,(Cptr,),x.ptr))
+        return td
+    end
+end
+
+mutable struct FD; ptr
+    function FD(a::CuArray)
+        d = Cptr[0]
+        @cuda(cudnn,cudnnCreateFilterDescriptor,(Ptr{Cptr},),d)
+        n = ndims(a)
+        sz = [Cint(size(a,n-i+1)) for i=1:n]
+        if cudnnVersion >= 5000
+            @cuda(cudnn,cudnnSetFilterNdDescriptor,
+                  (Cptr,UInt32,UInt32,Cint,Ptr{Cint}),
+                  d[1], DT(a), 0, n, sz)
+        elseif cudnnVersion >= 4000
+            @cuda(cudnn,cudnnSetFilterNdDescriptor_v4,
+                  (Cptr,UInt32,UInt32,Cint,Ptr{Cint}),
+                  d[1], DT(a), 0, n, sz)
+        else
+            @cuda(cudnn,cudnnSetFilterNdDescriptor,
+                  (Cptr,UInt32,Cint,Ptr{Cint}),
+                  d[1], DT(a), n, sz)
+        end
+        fd = new(d[1])
+        finalizer(fd, x->@cuda(cudnn,cudnnDestroyFilterDescriptor,(Cptr,),x.ptr))
+        return fd
+    end
+end
+
+mutable struct CD; ptr
+    function CD(w::CuArray,x::CuArray; padding=0, stride=1, upscale=1, mode=0)
+        d = Cptr[0]
+        @cuda(cudnn,cudnnCreateConvolutionDescriptor,(Ptr{Cptr},),d)
+        nd = ndims(x)-2
+        if cudnnVersion >= 4000
+            @cuda(cudnn,cudnnSetConvolutionNdDescriptor,
+                  (Cptr,Cint,Ptr{Cint},Ptr{Cint},Ptr{Cint},UInt32,UInt32),
+                  d[1],nd,cdsize(padding,nd),cdsize(stride,nd),cdsize(upscale,nd),mode,DT(x))
+        elseif cudnnVersion > 3000 # does not work when cudnnVersion==3000
+            @cuda(cudnn,cudnnSetConvolutionNdDescriptor_v3,
+                  (Cptr,Cint,Ptr{Cint},Ptr{Cint},Ptr{Cint},UInt32,UInt32),
+                  d[1],nd,cdsize(padding,nd),cdsize(stride,nd),cdsize(upscale,nd),mode,DT(x))
+        else
+            @cuda(cudnn,cudnnSetConvolutionNdDescriptor,
+                  (Cptr,Cint,Ptr{Cint},Ptr{Cint},Ptr{Cint},UInt32),
+                  d[1],nd,cdsize(padding,nd),cdsize(stride,nd),cdsize(upscale,nd),mode)
+        end
+        cd = new(d[1])
+        finalizer(cd, x->@cuda(cudnn,cudnnDestroyConvolutionDescriptor,(Cptr,),x.ptr))
+        return cd
+    end
+end
+
+mutable struct PD; ptr
+    function PD(x::CuArray; window=2, padding=0, stride=window, mode=0, maxpoolingNanOpt=0)
+        d = Cptr[0]
+        @cuda(cudnn,cudnnCreatePoolingDescriptor,(Ptr{Cptr},),d)
+        nd = ndims(x)-2
+        if cudnnVersion >= 5000
+            @cuda(cudnn,cudnnSetPoolingNdDescriptor,
+                  (Cptr,UInt32,UInt32,Cint,Ptr{Cint},Ptr{Cint},Ptr{Cint}),
+                  d[1],mode,maxpoolingNanOpt,nd,cdsize(window,nd),cdsize(padding,nd),cdsize(stride,nd))
+        elseif cudnnVersion >= 4000
+            @cuda(cudnn,cudnnSetPoolingNdDescriptor_v4,
+                  (Cptr,UInt32,UInt32,Cint,Ptr{Cint},Ptr{Cint},Ptr{Cint}),
+                  d[1],mode,maxpoolingNanOpt,nd,cdsize(window,nd),cdsize(padding,nd),cdsize(stride,nd))
+        else
+            @cuda(cudnn,cudnnSetPoolingNdDescriptor,
+                  (Cptr,UInt32,Cint,Ptr{Cint},Ptr{Cint},Ptr{Cint}),
+                  d[1],mode,nd,cdsize(window,nd),cdsize(padding,nd),cdsize(stride,nd))
+        end
+        pd = new(d[1])
+        finalizer(pd, x->@cuda(cudnn,cudnnDestroyPoolingDescriptor,(Cptr,),x.ptr))
+        return pd
+    end
+end
+
+import Base: unsafe_convert
+unsafe_convert(::Type{Cptr}, td::TD)=td.ptr
+unsafe_convert(::Type{Cptr}, fd::FD)=fd.ptr
+unsafe_convert(::Type{Cptr}, cd::CD)=cd.ptr
+unsafe_convert(::Type{Cptr}, pd::PD)=pd.ptr
+
+# fill and reverse Cint array with padding etc. for cudnn calls
+function cdsize(w, nd)
+    if isa(w,Number)
+        fill(Cint(w),nd)
+    elseif length(w)==nd
+        [ Cint(w[nd-i+1]) for i=1:nd ]
+    else
+        throw(DimensionMismatch("$w $nd"))
+    end
+end
+
+# convert padding etc. size to an Int array of the right dimension
+function psize(p, x)
+    nd = ndims(x)-2
+    if isa(p,Number)
+        fill(Int(p),nd)
+    elseif length(p)==nd
+        collect(Int,p)
+    else
+        throw(DimensionMismatch("psize: $p $nd"))
+    end
+end
+
+DT(::CuArray{Float32})=UInt32(0)
+DT(::CuArray{Float64})=UInt32(1)
+DT(::CuArray{Float16})=UInt32(2)
+
+function cdims(w,x; padding=0, stride=1, o...)
+    N = ndims(x)
+    ntuple(N) do i
+        if i < N-1
+            pi = (if isa(padding,Number); padding; else padding[i]; end)
+            si = (if isa(stride,Number); stride; else stride[i]; end)
+            1 + div(size(x,i) - size(w,i) + 2*pi, si)
+        elseif i == N-1
+            size(w,N)
+        else # i == N
+            size(x,N)
+        end
+    end
+end
+
+function pdims(x; window=2, padding=0, stride=window, o...)
+    N = ndims(x)
+    ntuple(N) do i
+        if i < N-1
+            wi = (if isa(window,Number); window; else window[i]; end)
+            pi = (if isa(padding,Number); padding; else padding[i]; end)
+            si = (if isa(stride,Number); stride; else stride[i]; end)
+            1 + div(size(x,i) + 2*pi - wi, si)
+        else
+            size(x,i)
+        end
+    end
+end
+
+function dcdims(w,x; padding=0, stride=1, o...)
+    N = ndims(x)
+    ntuple(N) do i
+        if i < N-1
+            pi = (if isa(padding,Number); padding; else padding[i]; end)
+            si = (if isa(stride,Number); stride; else stride[i]; end)
+            si*(size(x,i)-1) + size(w,i) - 2*pi
+        elseif i == N-1
+            size(w,N)
+        else
+            size(x,N)
+        end
+    end
+end
+
+function updims(x; window=2, padding=0, stride=window, o...)
+    window = psize(window,x)
+    stride = psize(stride,x)
+    padding = psize(padding,x)
+    N = ndims(x)
+    ntuple(N) do i
+        if i < N-1
+            (size(x,i)-1)*stride[i]+window[i]-2*padding[i]
+        else
+            size(x,i)
+        end
+    end
+end
+
+# convolution padding size that preserves the input size when filter size is odd and stride=1
+padsize(w)=ntuple(i->div(size(w,i)-1,2), ndims(w)-2)
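
A hypothetical usage sketch, not part of the patch: it assumes a working CUDA device with cuDNN, that CuArrays can construct device arrays from host arrays, and it calls the unexported functions by qualified name (CuConv.conv4, CuConv.padsize). Per the cdims formula above, each spatial output dimension is 1 + div(W - F + 2P, S):

    using CuArrays, CuConv
    w = CuArray(randn(Float32, 3, 3, 1, 8))     # 3x3 filters, 1 input channel, 8 output channels
    x = CuArray(randn(Float32, 28, 28, 1, 16))  # 28x28 images, 1 channel, minibatch of 16
    CuConv.padsize(w)                           # (1,1): padding that preserves 28x28 for a 3x3 filter
    y = CuConv.conv4(w, x; padding=1)           # 28x28x8x16: 1 + div(28 - 3 + 2*1, 1) == 28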
diff --git a/src/core.jl b/src/core.jl
new file mode 100644
index 0000000..aea7aef
--- /dev/null
+++ b/src/core.jl
@@ -0,0 +1,24 @@
+
+
+using CuArrays
+
+const Cptr = Ptr{Void}
+# debugging aid: flip the condition to true to synchronize after every library call
+macro gs(); if false; esc(:(ccall(("cudaDeviceSynchronize","libcudart"),UInt32,()))); end; end
+
+include("gpu.jl")
+include("conv.jl")
+
+
+
+
+
+# See if we have a gpu at initialization:
+function __init__()
+    try
+        r = gpu(true)
+        # info(r >= 0 ? "CuConv using GPU $r" : "No GPU found, CuConv using the CPU")
+    catch e
+        gpu(false)
+        # warn("CuConv using the CPU: $e")
+    end
+end
diff --git a/src/gpu.jl b/src/gpu.jl
new file mode 100644
index 0000000..d85f3fc
--- /dev/null
+++ b/src/gpu.jl
@@ -0,0 +1,181 @@
+macro gpu(_ex); if gpu()>=0; esc(_ex); end; end
+
+macro cuda(lib,fun,x...) # give an error if library missing, or if error code != 0
+    if Libdl.find_library(["lib$lib"], []) != ""
+        if VERSION >= v"0.6.0"
+            fx = Expr(:call, :ccall, ("$fun","lib$lib"), :UInt32, x...)
+        else
+            fx = Expr(:ccall, ("$fun","lib$lib"), :UInt32, x...)
+        end
+        msg = "$lib.$fun error "
+        err = gensym()
+        # esc(:(if ($err=$fx) != 0; warn($msg, $err); Base.show_backtrace(STDOUT, backtrace()); end))
+        esc(:(if ($err=$fx) != 0; error($msg, $err); end; @gs))
+    else
+        Expr(:call,:error,"Cannot find lib$lib, please install it and rerun Pkg.build(\"CuConv\").")
+    end
+end
+
+macro cuda1(lib,fun,x...) # return -1 if library missing, error code if run
+    if Libdl.find_library(["lib$lib"], []) != ""
+        if VERSION >= v"0.6.0"
+            fx = Expr(:call, :ccall, ("$fun","lib$lib"), :UInt32, x...)
+        else
+            fx = Expr(:ccall, ("$fun","lib$lib"), :UInt32, x...)
+        end
+        err = gensym()
+        esc(:($err=$fx; @gs; $err))
+    else
+        -1
+    end
+end
+
+macro knet8(fun,x...) # error if libknet8 missing, nothing if run
+    if libknet8 != ""
+        if VERSION >= v"0.6.0"
+            fx = Expr(:call, :ccall, ("$fun",libknet8), :Void, x...)
+        else
+            fx = Expr(:ccall, ("$fun",libknet8), :Void, x...)
+        end
+        err = gensym()
+        esc(:($err=$fx; @gs; $err))
+    else
+        Expr(:call,:error,"Cannot find libknet8, please rerun Pkg.build(\"Knet\").")
+    end
+end
+
+const Cptr = Ptr{Void}
+
+"""
+
+`gpu()` returns the id of the active GPU device, or -1 if none is
+active.
+
+`gpu(true)` resets all GPU devices and activates the one with the most
+available memory.
+
+`gpu(false)` resets and deactivates all GPU devices.
+
+`gpu(d::Int)` activates the GPU device `d` if `0 <= d < gpuCount()`;
+otherwise it deactivates all devices.
+
+`gpu(true/false)` resets all devices. If there are any allocated
+CuArrays their pointers will be left dangling, so
+`gpu(true/false)` should only be used during startup. If you want to
+suspend GPU use temporarily, use `gpu(-1)`.
+
+`gpu(d::Int)` does not reset the devices. You can select a previous
+device and find its allocated memory preserved. However, trying to
+operate on arrays of an inactive device will result in an error.
+
+"""
+function gpu end
+
+let GPU=-1, GPUCNT=-1, CUBLAS=nothing, CUDNN=nothing
+    global gpu, gpuCount, cublashandle, cudnnhandle, cudaRuntimeVersion, cudaDriverVersion
+
+    gpu()=GPU
+
+    function gpuCount() # should not bomb when there is no gpu or nvidia libs
+        if GPUCNT == -1
+            GPUCNT = try
+                p=Cuint[0]
+                # @cuda does not stay quiet so we use @cuda1 here
+                # This code is only run once if successful, so nvmlInit here is ok
+                @cuda1("nvidia-ml",nvmlInit,())
+                @cuda1("nvidia-ml",nvmlDeviceGetCount,(Ptr{Cuint},),p)
+                # Let us keep nvml initialized for future ops such as meminfo
+                # @cuda1("nvidia-ml",nvmlShutdown,())
+                Int(p[1])
+            catch
+                0
+            end
+        end
+        return GPUCNT
+    end
+
+    function gpu(i::Int)
+        (GPU == i) && return i
+        if 0 <= i < gpuCount()
+            @cuda(cudart,cudaSetDevice, (Cint,), i)
+            cudaRuntimeVersion = (p=Cint[0];@cuda(cudart,cudaRuntimeGetVersion,(Ptr{Cint},),p);Int(p[1]))
+            cudaDriverVersion  = (p=Cint[0];@cuda(cudart,cudaDriverGetVersion, (Ptr{Cint},),p);Int(p[1]))
+        else
+            i = -1
+            # @cuda(cudart,cudaDeviceReset,()) # may still go back and use arrays allocated in a previous gpu
+        end
+        return (GPU = i)
+    end
+
+    function gpu(usegpu::Bool)
+        if usegpu && gpuCount() > 0
+            pick = free = same = -1
+            for i=0:gpuCount()-1
+                mem = nvmlDeviceGetMemoryInfo(i) # mem[2] is free memory
+                if mem[2] > free
+                    pick = i
+                    free = mem[2]
+                    same = 1
+                elseif mem[2] == free
+                    # pick one of equal devices randomly
+                    rand(1:(same+=1)) == 1 && (pick = i)
+                end
+            end
+            gpu(pick)
+        else
+            for i=0:gpuCount()-1
+                @cuda(cudart,cudaSetDevice, (Cint,), i) # select each device so the reset below hits it
+                @cuda(cudart,cudaDeviceReset,())
+            end
+            gpu(-1)
+        end
+    end
+
+    function cublashandle(dev=gpu())
+        if dev==-1; error("No cublashandle for CPU"); end
+        i = dev+2
+        if CUBLAS == nothing; CUBLAS=Array{Any}(gpuCount()+1); end
+        if !isassigned(CUBLAS,i); CUBLAS[i]=cublasCreate(); end
+        return CUBLAS[i]
+    end
+
+    function cudnnhandle(dev=gpu())
+        if dev==-1; error("No cudnnhandle for CPU"); end
+        i = dev+2
+        if CUDNN == nothing; CUDNN=Array{Any}(gpuCount()+1); end
+        if !isassigned(CUDNN,i); CUDNN[i]=cudnnCreate(); end
+        return CUDNN[i]
+    end
+end
+
+# cudaGetDeviceCount is deprecated, use gpuCount instead:
+cudaGetDeviceCount()=(try; p=Cint[0]; eval(:(ccall(("cudaGetDeviceCount","libcudart"),UInt32,(Ptr{Cint},),$p))); p[1]; catch; 0; end) # will not bomb when there is no gpu
+cudaGetDevice()=(d=Cint[-1];@cuda(cudart,cudaGetDevice,(Ptr{Cint},),d);d[1])
+cudaGetMemInfo()=(f=Csize_t[0];m=Csize_t[0]; @cuda(cudart,cudaMemGetInfo,(Ptr{Csize_t},Ptr{Csize_t}),f,m); (Int(f[1]),Int(m[1])))
+cudaDeviceSynchronize()=@cuda(cudart,cudaDeviceSynchronize,())
+
+function nvmlDeviceGetMemoryInfo(i=gpu()) # returns (total, free, used) bytes for device i
+    0 <= i < gpuCount() || return nothing
+    dev = Cptr[0]
+    mem = Array{Culonglong}(3)
+    @cuda("nvidia-ml","nvmlDeviceGetHandleByIndex",(Cuint,Ptr{Cptr}),i,dev)
+    @cuda("nvidia-ml","nvmlDeviceGetMemoryInfo",(Cptr,Ptr{Culonglong}),dev[1],mem)
+    ntuple(i->Int(mem[i]),length(mem))
+end
+
+function cublasCreate()
+    handleP = Cptr[0]
+    @cuda(cublas,cublasCreate_v2, (Ptr{Cptr},), handleP)
+    handle = handleP[1]
+    atexit(()->@cuda(cublas,cublasDestroy_v2, (Cptr,), handle))
+    global cublasVersion = (p=Cint[0];@cuda(cublas,cublasGetVersion_v2,(Cptr,Ptr{Cint}),handle,p);Int(p[1]))
+    return handle
+end
+
+function cudnnCreate()
+    handleP = Cptr[0]
+    @cuda(cudnn,cudnnCreate,(Ptr{Cptr},), handleP)
+    handle = handleP[1]
+    atexit(()->@cuda(cudnn,cudnnDestroy,(Cptr,), handle))
+    global cudnnVersion = Int(ccall((:cudnnGetVersion,:libcudnn),Csize_t,()))
+    return handle
+end
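
A hypothetical session sketch, not part of the patch, showing how the device-management API in gpu.jl is meant to be driven. It assumes at least one CUDA device and the NVML library are visible, and calls the unexported functions by qualified name:

    using CuConv
    CuConv.gpuCount()     # number of visible devices (0 when no GPU or no NVML)
    CuConv.gpu(true)      # reset all devices, activate the one with the most free memory
    CuConv.gpu()          # id of the active device, or -1
    CuConv.cudnnhandle()  # lazily created cuDNN handle for the active device
    CuConv.gpu(-1)        # suspend GPU use without resetting (allocations stay valid)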