# interfacing with NNlib.jl

import NNlib: conv!, ∇conv_filter!, ∇conv_data!, stride, dilation, flipkernel,
              maxpool!, meanpool!, ∇maxpool!, ∇meanpool!, spatial_dims, padding, kernel_size,
              softmax, softmax!, ∇softmax!, logsoftmax, logsoftmax!, ∇logsoftmax,
              conv_bias_act!
# Softmax
const CUDNNFloat = Union{Float16,Float32,Float64}
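# cuDNN kernels operate on 4D tensors, so 1D and 2D inputs are padded with
# singleton dimensions before being handed to cuDNN.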
reshape4D(x::AbstractVector) = reshape(x, 1, 1, length(x), 1)
reshape4D(x::AbstractMatrix) = reshape(x, 1, 1, size(x)...)
function softmax(xs::CuVecOrMat{T}; dims=1) where T<:CUDNNFloat
    out = similar(xs)
    softmax!(out, xs, dims=dims)
    return out
end
function softmax!(out::CuVecOrMat{T}, xs::CuVecOrMat{T}; dims=1) where T<:CUDNNFloat
    # prefer the fast algorithm over the accurate one when fast math is enabled
    if Base.JLOptions().fast_math == 1
        algorithm = CUDNN_SOFTMAX_FAST
    else
        algorithm = CUDNN_SOFTMAX_ACCURATE
    end
    # dims=1 maps to CUDNN_SOFTMAX_MODE_INSTANCE (0), dims=2 to CUDNN_SOFTMAX_MODE_CHANNEL (1)
    cudnnSoftmaxForward(reshape4D(xs), reshape4D(out), algorithm=algorithm,
                        mode=cudnnSoftmaxMode_t(dims-1))
    return out
end
function ∇softmax!(out::CuVecOrMat{T}, Δ::CuVecOrMat{T}, xs::CuVecOrMat{T}) where T<:CUDNNFloat
    cudnnSoftmaxBackward(reshape4D(softmax(xs)), reshape4D(Δ), reshape4D(out))
    return out
end
function logsoftmax!(out::CuVecOrMat{T}, xs::CuVecOrMat{T}) where T<:CUDNNFloat
    cudnnSoftmaxForward(reshape4D(xs), reshape4D(out), algorithm=CUDNN_SOFTMAX_LOG)
    return out
end
function ∇logsoftmax!(out::CuVecOrMat{T}, Δ::CuVecOrMat{T}, xs::CuVecOrMat{T}) where T<:CUDNNFloat
    cudnnSoftmaxBackward(reshape4D(logsoftmax(xs)), reshape4D(Δ), reshape4D(out);
                         algorithm=CUDNN_SOFTMAX_LOG)
    return out
end
∇logsoftmax(Δ::CuVecOrMat{T}, xs::CuVecOrMat{T}) where T<:CUDNNFloat =
    ∇logsoftmax!(similar(xs), Δ, xs)
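# A minimal usage sketch (assuming the CUDA stack is set up and a GPU is
# available); softmax normalizes each column to a probability distribution:
#
#   xs = CuArray(rand(Float32, 10, 32))  # 10 classes × 32 samples
#   ps = softmax(xs)                     # dims=1: softmax down each column
#   @assert all(Array(sum(ps; dims=1)) .≈ 1f0)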
# Convolution

# cuDNN does not support 1D convolution, so a 1-dimensional Conv in Flux would
# raise a CUDNNError. As a workaround, we reshape the CuArray/PoolDims/DenseConvDims
# to 4D, adding a singleton spatial dimension, before feeding them to cuDNN.
fix1d(x) = x

fix1d(x::CuArray{T, 3}) where T = reshape(x, size(x, 1), 1, size(x, 2), size(x, 3))

fix1d(cdims::DenseConvDims{1,K,C_in,C_out,S,P,D,F}) where {K,C_in,C_out,S,P,D,F} =
    DenseConvDims{2,(K...,1),C_in,C_out,(S...,1),(P...,0,0),(D...,1),F}((cdims.I...,1))

fix1d(pdims::PoolDims{1,K,S,P,D}) where {K,S,P,D} =
    PoolDims{2,(K...,1),(S...,1),(P...,0,0),(D...,1)}((pdims.I..., 1), pdims.C_in)
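# For example, a 1D input of size (width, channels, batch) = (28, 3, 16) becomes
# a 4D array of size (28, 1, 3, 16), i.e. a height-1 image, so cuDNN's 2D
# convolution routines can be applied unchanged.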
function conv_bias_act!(y::CuArray{T}, x::CuArray{T}, w::CuArray{T}, cdims::DenseConvDims, b::CuArray{T}, σ;
                        z::CuArray{T}=y, alpha1=1, alpha2=0, algo=-1) where T<:CUDNNFloat
    if version() < v"6"
        all(x -> x == 1, dilation(cdims)) || error("Only dilation = 1 is supported in cuDNN version < 6")
    end
    # only relu and identity are fused by cuDNN; other activations are applied afterwards
    if σ == NNlib.relu
        if algo < 0
            # pick the algorithm via the heuristics-based v7 query, which returns
            # candidates sorted by expected performance
            # algo = UInt32(cudnnGetConvolutionForwardAlgorithm(fix1d(y), fix1d(x), fix1d(w), fix1d(cdims))) # will be removed in cuDNN 8
            returnedAlgoCount, perfResults = cudnnGetConvolutionForwardAlgorithm_v7(fix1d(y), fix1d(x), fix1d(w), fix1d(cdims))
            # returnedAlgoCount, perfResults = cudnnFindConvolutionForwardAlgorithm(fix1d(y), fix1d(x), fix1d(w), fix1d(cdims))
            # returnedAlgoCount, perfResults = cudnnFindConvolutionForwardAlgorithmEx(fix1d(y), fix1d(x), fix1d(w), fix1d(cdims))
            algo = UInt32(perfResults[1].algo)
        end
        cudnnConvolutionBiasActivationForward(fix1d(y), fix1d(x), fix1d(w), fix1d(z), fix1d(b),
                                              fix1d(cdims), algo=algo, alpha1=alpha1, alpha2=alpha2,
                                              activationMode=CUDNN_ACTIVATION_RELU, activationCoeff=0.0)
    else
        # algo must be CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM (1)
        # when activationMode is CUDNN_ACTIVATION_IDENTITY
        cudnnConvolutionBiasActivationForward(fix1d(y), fix1d(x), fix1d(w), fix1d(z), fix1d(b),
                                              fix1d(cdims), algo=1, alpha1=alpha1, alpha2=alpha2,
                                              activationMode=CUDNN_ACTIVATION_IDENTITY, activationCoeff=0.0)
        if σ != NNlib.identity
            y .= σ.(y)
        end
    end
    return y
end
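# A minimal sketch of a fused convolution + bias + relu call (assuming
# WHCN-ordered CuArrays and NNlib's DenseConvDims/output_size helpers):
#
#   x = CuArray(rand(Float32, 28, 28, 3, 16))  # 28×28 input, 3 channels, batch of 16
#   w = CuArray(rand(Float32, 5, 5, 3, 8))     # 5×5 kernel, 3 → 8 channels
#   b = CuArray(zeros(Float32, 1, 1, 8, 1))    # one bias per output channel
#   cdims = DenseConvDims(x, w)
#   y = similar(x, NNlib.output_size(cdims)..., 8, 16)
#   conv_bias_act!(y, x, w, cdims, b, NNlib.relu)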
function conv!(y::CuArray{T}, x::CuArray{T}, w::CuArray{T}, cdims::DenseConvDims;
               alpha=1, algo=-1) where T<:CUDNNFloat
    if version() < v"6"
        all(x -> x == 1, dilation(cdims)) || error("Only dilation = 1 is supported in cuDNN version < 6")
    end
    if algo < 0
        # algo = UInt32(cudnnGetConvolutionForwardAlgorithm(fix1d(y), fix1d(x), fix1d(w), fix1d(cdims))) # will be removed in cuDNN 8
        returnedAlgoCount, perfResults = cudnnGetConvolutionForwardAlgorithm_v7(fix1d(y), fix1d(x), fix1d(w), fix1d(cdims))
        # returnedAlgoCount, perfResults = cudnnFindConvolutionForwardAlgorithm(fix1d(y), fix1d(x), fix1d(w), fix1d(cdims))
        # returnedAlgoCount, perfResults = cudnnFindConvolutionForwardAlgorithmEx(fix1d(y), fix1d(x), fix1d(w), fix1d(cdims))
        algo = UInt32(perfResults[1].algo)
    end
    cudnnConvolutionForward(fix1d(y), fix1d(x), fix1d(w), fix1d(cdims), alpha=alpha, algo=algo)
    return y
end
function ∇conv_filter!(dw::CuArray{T}, x::CuArray{T}, dy::CuArray{T},
                       cdims::DenseConvDims; alpha=1, algo=0) where T<:CUDNNFloat
    if version() < v"6"
        all(x -> x == 1, dilation(cdims)) || error("Only dilation = 1 is supported in cuDNN version < 6")
    end
    cudnnConvolutionBackwardFilter(fix1d(dw), fix1d(x), fix1d(dy), fix1d(cdims), alpha=alpha, algo=algo)
    return dw
end
function ∇conv_data!(dx::CuArray{T}, dy::CuArray{T}, w::CuArray{T},
                     cdims::DenseConvDims; alpha=1, algo=0) where T<:CUDNNFloat
    if version() < v"6"
        all(x -> x == 1, dilation(cdims)) || error("Only dilation = 1 is supported in cuDNN version < 6")
    end
    cudnnConvolutionBackwardData(fix1d(dx), fix1d(w), fix1d(dy), fix1d(cdims), alpha=alpha, algo=algo)
    return dx
end
∇conv_bias!(db::CuArray{T}, dy::CuArray{T}; alpha=1, beta=0) where T<:CUDNNFloat =
    (cudnnConvolutionBackwardBias(fix1d(db), fix1d(dy), alpha=alpha, beta=beta); return db)
# Pooling
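# cuDNN pooling modes: 0 = CUDNN_POOLING_MAX,
# 1 = CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING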
maxpool!(y::CuArray{T}, x::CuArray{T}, pdims::PoolDims) where T<:CUDNNFloat =
    (cudnnPoolingForward(fix1d(y), fix1d(x), fix1d(pdims), mode=0); return y)

∇maxpool!(dx::CuArray{T}, dy::CuArray{T}, y::CuArray{T}, x::CuArray{T},
          pdims::PoolDims) where T<:CUDNNFloat =
    (cudnnPoolingBackward(fix1d(dx), fix1d(dy), fix1d(x), fix1d(y), fix1d(pdims), mode=0); return dx)

meanpool!(y::CuArray{T}, x::CuArray{T}, pdims::PoolDims) where T<:CUDNNFloat =
    (cudnnPoolingForward(fix1d(y), fix1d(x), fix1d(pdims), mode=1); return y)

∇meanpool!(dx::CuArray{T}, dy::CuArray{T}, y::CuArray{T}, x::CuArray{T},
           pdims::PoolDims) where T<:CUDNNFloat =
    (cudnnPoolingBackward(fix1d(dx), fix1d(dy), fix1d(x), fix1d(y), fix1d(pdims), mode=1); return dx)
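# A minimal sketch of 2×2 max-pooling (assuming NNlib's PoolDims and
# output_size helpers):
#
#   x = CuArray(rand(Float32, 28, 28, 3, 16))
#   pdims = PoolDims(x, 2)                 # 2×2 window, stride 2
#   y = similar(x, NNlib.output_size(pdims)..., 3, 16)
#   maxpool!(y, x, pdims)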
# Activation
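# Broadcasting these NNlib activations over a CuArray dispatches to cuDNN's
# fused activation kernels instead of the generic GPU broadcast, e.g.
# `NNlib.relu.(x)` for `x::CuArray{Float32}` hits one of the methods below.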
Base.broadcasted(::typeof(NNlib.σ), x::CuArray{T}) where {T<:CUDNNFloat} =
    cudnnActivationForward(similar(x), x; mode=CUDNN_ACTIVATION_SIGMOID, coeff=0.0,
                           reluNanOpt=CUDNN_NOT_PROPAGATE_NAN, alpha=1, beta=0)

Base.broadcasted(::typeof(NNlib.relu), x::CuArray{T}) where {T<:CUDNNFloat} =
    cudnnActivationForward(similar(x), x; mode=CUDNN_ACTIVATION_RELU, coeff=0.0,
                           reluNanOpt=CUDNN_NOT_PROPAGATE_NAN, alpha=1, beta=0)

Base.broadcasted(::typeof(NNlib.tanh), x::CuArray{T}) where {T<:CUDNNFloat} =
    cudnnActivationForward(similar(x), x; mode=CUDNN_ACTIVATION_TANH, coeff=0.0,
                           reluNanOpt=CUDNN_NOT_PROPAGATE_NAN, alpha=1, beta=0)

Base.broadcasted(::typeof(NNlib.trelu), x::CuArray{T}) where {T<:CUDNNFloat} =
    cudnnActivationForward(similar(x), x; mode=CUDNN_ACTIVATION_CLIPPED_RELU, coeff=0.0,
                           reluNanOpt=CUDNN_NOT_PROPAGATE_NAN, alpha=1, beta=0)

Base.broadcasted(::typeof(NNlib.elu), x::CuArray{T}) where {T<:CUDNNFloat} =
    cudnnActivationForward(similar(x), x; mode=CUDNN_ACTIVATION_ELU, coeff=0.0,
                           reluNanOpt=CUDNN_NOT_PROPAGATE_NAN, alpha=1, beta=0)
# CUDNN_ACTIVATION_IDENTITY is not supported by cudnnActivationForward,
# so no method is provided for identity.