Merge pull request #69 from LuxDL/ap/mark_inactive
[Enzyme] Mark certain operations as Enzyme inactive
avik-pal committed May 12, 2024
2 parents ea65d23 + bb79996 commit e829b63
Showing 11 changed files with 30 additions and 288 deletions.
8 changes: 4 additions & 4 deletions Project.toml
@@ -1,15 +1,15 @@
name = "LuxLib"
uuid = "82251201-b29d-42c6-8e01-566dec8acb11"
authors = ["Avik Pal <avikpal@mit.edu> and contributors"]
version = "0.3.22"
version = "0.3.23"

[deps]
ArrayInterface = "4fba245c-0d91-5ea0-9b3e-6abc04ee57a9"
ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
EnzymeCore = "f151be2c-9106-41f4-ab19-57ee4f262869"
FastBroadcast = "7034ab61-46d4-4ed7-9d0f-46aef9175898"
FastClosures = "9aa1b823-49e4-5ca5-8b0f-3971ec8bab6a"
GPUArraysCore = "46192b85-c4d5-4398-a991-12ede77f4527"
KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
LuxCore = "bb33d45b-7691-41d6-9220-0943567d0623"
Markdown = "d6f4376e-aef5-505a-96c1-9c027394607a"
@@ -44,19 +44,19 @@ ArrayInterface = "7.9"
CUDA = "5.3.2"
ChainRulesCore = "1.23"
ComponentArrays = "0.15.8"
EnzymeCore = "0.7"
ExplicitImports = "1.4.1"
FastBroadcast = "0.2.8"
FastClosures = "0.3.2"
ForwardDiff = "0.10.36"
GPUArraysCore = "0.1.6"
KernelAbstractions = "0.9.15"
LinearAlgebra = "1.10"
LuxAMDGPU = "0.2.1"
LuxCUDA = "0.3.1"
LuxCore = "0.1.13"
LuxTestUtils = "0.1.15"
Markdown = "1.10"
NNlib = "0.9.10"
NNlib = "0.9.13"
PrecompileTools = "1.2"
Random = "1.10"
ReTestItems = "1.23.1"
2 changes: 0 additions & 2 deletions ext/LuxLibReverseDiffExt.jl
@@ -21,8 +21,6 @@ end
@grad_from_chainrules LuxLib._copy_autodiff_barrier(x::TrackedArray)
@grad_from_chainrules LuxLib._copy_autodiff_barrier(x::TrackedReal)

LuxLib._get_backend(x::TrackedArray) = LuxLib._get_backend(ReverseDiff.value(x))

# api/dropout.jl
LuxLib._dropout_fptype(x::TrackedArray) = LuxLib._dropout_fptype(ReverseDiff.value(x))

13 changes: 0 additions & 13 deletions ext/LuxLibTrackerExt.jl
@@ -41,20 +41,7 @@ function LuxLib._copy_autodiff_barrier(x::Union{TrackedArray, TrackedReal})
return LuxLib._copy_autodiff_barrier(Tracker.data(x))
end

LuxLib._get_backend(x::TrackedArray) = LuxLib._get_backend(Tracker.data(x))

# api/dropout.jl
LuxLib._dropout_fptype(x::TrackedArray) = LuxLib._dropout_fptype(Tracker.data(x))

# api/groupnorm.jl
for T1 in (:TrackedArray, :AbstractArray),
T2 in (:TrackedVector, :AbstractVector),
T3 in (:TrackedVector, :AbstractVector)

LuxLib.__is_tracked(T1, T2, T3) || continue

@eval Tracker.@grad_from_chainrules LuxLib.__fast_groupnorm(
x::$T1, groups, scale::$T2, bias::$T3, epsilon::Real)
end

end
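
The methods kept in these extensions all follow the same unwrapping pattern: a non-differentiable utility forwards the tracked value to its underlying data so the plain-array method handles it. A minimal sketch of that pattern, using a hypothetical helper named `_mask_fptype` (not part of the package):

```julia
using Tracker: Tracker, TrackedArray

# Hypothetical utility that only inspects its input's element type;
# its output carries no gradient, so no custom rule is required.
@inline _mask_fptype(x) = float(real(eltype(x)))

# Unwrap the tracked array so the plain-array method above is reused.
_mask_fptype(x::TrackedArray) = _mask_fptype(Tracker.data(x))
```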
6 changes: 2 additions & 4 deletions src/LuxLib.jl
@@ -5,28 +5,26 @@ using PrecompileTools: @recompile_invalidations
@recompile_invalidations begin
using ArrayInterface: ArrayInterface
using ChainRulesCore: ChainRulesCore, NoTangent
using EnzymeCore: EnzymeCore, EnzymeRules
using FastBroadcast: @..
using FastClosures: @closure
using GPUArraysCore: GPUArraysCore, AnyGPUArray
using KernelAbstractions: KernelAbstractions, @Const, @index, @kernel
using LinearAlgebra: LinearAlgebra, BLAS, mul!
using LuxCore: LuxCore
using Markdown: @doc_str
using NNlib: NNlib
using Random: Random, AbstractRNG, rand!
using Reexport: @reexport
using Statistics: Statistics, mean, std, var
using Statistics: Statistics, mean, var
end

@reexport using NNlib

const CRC = ChainRulesCore
const KA = KernelAbstractions

include("utils.jl")

# Low-Level Implementations
include("impl/groupnorm.jl")
include("impl/normalization.jl")
include("impl/fused_dense.jl")
include("impl/fused_conv.jl")
4 changes: 4 additions & 0 deletions src/api/dropout.jl
@@ -130,6 +130,7 @@ end
@inline _dropout_fptype(x) = float(real(eltype(x)))

CRC.@non_differentiable _dropout_fptype(::Any...)
EnzymeRules.inactive_noinl(::typeof(_dropout_fptype), ::Any...) = nothing

@inline function _alpha_dropout_noise(rng, x)
rng = LuxCore.replicate(rng)
@@ -139,6 +140,7 @@ CRC.@non_differentiable _alpha_dropout_noise(::Any...)
end

CRC.@non_differentiable _alpha_dropout_noise(::Any...)
EnzymeRules.inactive_noinl(::typeof(_alpha_dropout_noise), ::Any...) = nothing

@inline function _generate_dropout_mask(rng::AbstractRNG, x, p, invp; dims)
realfptype = _dropout_fptype(x)
@@ -148,4 +150,6 @@ CRC.@non_differentiable _alpha_dropout_noise(::Any...)
end

CRC.@non_differentiable _generate_dropout_mask(::Any...)
EnzymeRules.inactive_noinl(::typeof(_generate_dropout_mask), ::Any...) = nothing
CRC.@non_differentiable _dropout_shape(::Any...)
EnzymeRules.inactive_noinl(::typeof(_dropout_shape), ::Any...) = nothing
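
The heart of the change is visible above: each helper whose output carries no gradient information was already marked `CRC.@non_differentiable` for ChainRules-based backends, and this commit adds a matching `EnzymeRules.inactive_noinl` method so Enzyme also skips it. A minimal standalone sketch of the pattern, with a hypothetical helper named `_mask_fptype` (not part of the package):

```julia
using ChainRulesCore: ChainRulesCore
using EnzymeCore: EnzymeRules

const CRC = ChainRulesCore

# Hypothetical helper: picks the float type used for a dropout mask.
# Its result is type information only, so no AD backend should differentiate it.
@inline _mask_fptype(x) = float(real(eltype(x)))

# ChainRules-based backends (Zygote, ReverseDiff, ...) treat the call as constant.
CRC.@non_differentiable _mask_fptype(::Any...)

# Enzyme marks the call as inactive and generates no adjoint code for it.
EnzymeRules.inactive_noinl(::typeof(_mask_fptype), ::Any...) = nothing
```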
40 changes: 1 addition & 39 deletions src/api/groupnorm.jl
@@ -21,39 +21,11 @@ statistics.
The normalized array is returned.
## Performance Considerations
The most common case of this Op -- `x` is a 4D array -- is optimized using
KernelAbstractions and has a fast custom backwards pass implemented. All other cases have a
fallback implementation which is not especially optimized.
We have tested the code path for `Float16` and it works, but gradient accumulation is
extremely fragile. Hence, for `Float16` inputs, it uses the fallback implementation.
If the batch size is small (< 16), then the fallback implementation will be faster than the
KA version. However, this customization is not possible using the direct `groupnorm`
interface.
## References
[1] Wu, Yuxin, and Kaiming He. "Group normalization." Proceedings of the European conference
on computer vision (ECCV). 2018.
"""
function groupnorm(x::AbstractArray{<:Union{Float32, Float64}, 4},
scale::AbstractVector{<:Union{Float32, Float64}},
bias::AbstractVector{<:Union{Float32, Float64}},
groups::Int, σ::F=identity, epsilon::Real=1.0f-5) where {F}
_test_valid_groupnorm_arguments(x, scale, bias, groups)
# FIXME: We need to fuse the activation function into the kernel for optimal performance
return fast_activation!!(σ, __fast_groupnorm(x, groups, scale, bias, epsilon))
end

# Separate this out for a cleaner rrule later on
@inline function __fast_groupnorm(x, groups, scale, bias, epsilon)
return first(_groupnorm(x, groups, scale, bias, epsilon))
end

# Slow Fallback (without custom Pullback Implementation)
function groupnorm(x::AbstractArray{<:Real, N}, scale::Union{Nothing, <:AbstractVector},
bias::Union{Nothing, <:AbstractVector}, groups::Int,
σ::F=identity, epsilon::Real=1.0f-5) where {F, N}
@@ -71,19 +43,8 @@ end
return :($(Val(Tuple(collect(1:(N - 1))))))
end

# Custom Pullbacks
function CRC.rrule(::typeof(__fast_groupnorm), x, groups, scale, bias, epsilon)
y, μ, σ⁻¹ = _groupnorm(x, groups, scale, bias, epsilon)
∇groupnorm = @closure Δ -> begin
∂x, ∂scale, ∂bias = _∇groupnorm(Δ, y, x, groups, scale, bias, μ, σ⁻¹)
return NoTangent(), ∂x, NoTangent(), ∂scale, ∂bias, NoTangent()
end
return y, ∇groupnorm
end

function _test_valid_groupnorm_arguments(
x::AbstractArray{T, N}, scale, bias, groups) where {T, N}
_assert_same_backend(x, scale, bias)
if scale !== nothing && bias !== nothing && length(scale) != length(bias) != size(x, 3)
throw(ArgumentError("Length of `scale` and `bias` must be equal to the number of \
channels (N - 1 dim of the input array)."))
@@ -95,3 +56,4 @@ function _test_valid_groupnorm_arguments(
end

CRC.@non_differentiable _test_valid_groupnorm_arguments(::Any...)
EnzymeRules.inactive_noinl(::typeof(_test_valid_groupnorm_arguments), ::Any...) = nothing
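
With the KernelAbstractions path removed, inputs go through the generic method whose signature appears in the diff above. A usage sketch, assuming that positional signature (`groupnorm(x, scale, bias, groups, σ, epsilon)`) is the one exported by this version:

```julia
using LuxLib

x     = randn(Float32, 8, 8, 6, 4)  # W × H × C × N layout with C = 6 channels
scale = ones(Float32, 6)
bias  = zeros(Float32, 6)

# 3 groups of 2 channels each; activation and epsilon keep their defaults.
y = groupnorm(x, scale, bias, 3)
@assert size(y) == size(x)          # group normalization preserves the shape
```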
1 change: 1 addition & 0 deletions src/api/instancenorm.jl
@@ -47,3 +47,4 @@ function _test_valid_instancenorm_arguments(::AbstractArray{T, N}) where {T, N}
end

CRC.@non_differentiable _test_valid_instancenorm_arguments(::Any...)
EnzymeRules.inactive_noinl(::typeof(_test_valid_instancenorm_arguments), ::Any...) = nothing
113 changes: 0 additions & 113 deletions src/impl/groupnorm.jl

This file was deleted.

1 change: 1 addition & 0 deletions src/impl/normalization.jl
@@ -20,6 +20,7 @@ end
@inline __accum_size(x, ::Val{dims}) where {dims} = prod(Base.Fix1(size, x), dims)

CRC.@non_differentiable __accum_size(::Any...)
EnzymeRules.inactive_noinl(::typeof(__accum_size), ::Any...) = nothing

@inline function _get_batch_statistics(x::AbstractArray, ::Nothing, ::Nothing,
::Val{rdims}, ::Val{false}, momentum) where {rdims}

2 comments on commit e829b63

@avik-pal (Member Author)


@JuliaRegistrator

Registration pull request created: JuliaRegistries/General/106664

Tip: Release Notes

Did you know you can add release notes too? Just add markdown formatted text underneath the comment after the text
"Release notes:" and it will be added to the registry PR, and if TagBot is installed it will also be added to the
release that TagBot creates. i.e.

@JuliaRegistrator register

Release notes:

## Breaking changes

- blah

To add them here just re-invoke and the PR will be updated.

Tagging

After the above pull request is merged, it is recommended that a tag is created on this repository for the registered package version.

This will be done automatically if the Julia TagBot GitHub Action is installed, or can be done manually through the GitHub interface, or via:

git tag -a v0.3.23 -m "<description of version>" e829b63957b383eed6f0360b6c02232c57a23195
git push origin v0.3.23
