From 6aafdde1f6ae44985e5d83dacaea203f94ddd1ab Mon Sep 17 00:00:00 2001 From: Takafumi Arakaki Date: Sat, 8 Aug 2020 17:49:48 -0700 Subject: [PATCH 01/14] Add a specialized fold for CartesianIndices (#400) The implementation of `foldl` for `CartesianIndices` is straightforward because I can just redirect this to the one for `Iterators.product` (i.e., `_foldl_product`). Most of the changes are for making sure that the re-transformation `rf = Map(CartesianIndex)'(rf0)` preserves the correct SIMD flag. --- src/processes.jl | 7 +++++++ src/simd.jl | 4 ++++ test/test_processes.jl | 10 ++++++++++ test/test_simd.jl | 16 ++++++++++++++++ 4 files changed, 37 insertions(+) diff --git a/src/processes.jl b/src/processes.jl index 1e7f062f4d..018561341d 100644 --- a/src/processes.jl +++ b/src/processes.jl @@ -263,6 +263,13 @@ end ) end +@inline function __foldl__(rf0, init, cartesian::CartesianIndices) + rf = Map(CartesianIndex)'(rf0) + val = _foldl_product(rf, init, (), cartesian.indices...) + val isa Reduced && return val + return complete(rf, val) +end + @inline function __foldl__( rf, init, prod::Iterators.ProductIterator{<:Tuple{Any,Any,Vararg{Any}}}) diff --git a/src/simd.jl b/src/simd.jl index cbf0bac35d..36ee9af64d 100644 --- a/src/simd.jl +++ b/src/simd.jl @@ -7,6 +7,10 @@ The reducible can support it using `@simd_if`. struct UseSIMD{ivdep} <: Transducer end next(rf::R_{UseSIMD}, result, input) = next(inner(rf), result, input) +# Keep `UseSIMD` as the outer-most transducer when appropriate: +reducingfunction(xf::Transducer, step::R_{UseSIMD}) = + usesimd(Reduction(xf, inner(step)), xform(step)) + # Make sure UseSIMD is the outer-most transducer when UseSIMD is used # via Cat. 
skipcomplete(rf::R_{UseSIMD}) = diff --git a/test/test_processes.jl b/test/test_processes.jl index c298a66d26..206c53133e 100644 --- a/test/test_processes.jl +++ b/test/test_processes.jl @@ -60,6 +60,16 @@ include("preamble.jl") end end + @testset "CartesianIndices" begin + @testset for cartesian in [ + CartesianIndices((1:2,)), + CartesianIndices((1:2, 3:5)), + CartesianIndices((1:2, 3:5, 6:9)), + ] + @test collect(Map(identity), cartesian) == vec(cartesian) + end + end + @testset "product-of-iterators" begin iterator_prototypes = [ (1, 2), diff --git a/test/test_simd.jl b/test/test_simd.jl index e95db900dc..6bd9ec8e59 100644 --- a/test/test_simd.jl +++ b/test/test_simd.jl @@ -30,6 +30,22 @@ asrf(xfs...) = asrf(opcompose(xfs...)) asrf(Map(sin), Cat(), Map(cos), Cat(), xfsimd, Map(tan)) end +@testset "reducingfunction" begin + @test opcompose(UseSIMD{false}(), Map(sin), Map(cos))'(+) === + Map(sin)'(Map(cos)'(+; simd = true)) + @test opcompose(MapCat(collect), UseSIMD{false}(), Map(cos))'(+) === + MapCat(collect)'(Map(cos)'(+; simd = true)) + @testset for (f, g) in [ + (Map(sin), Map(cos)), + (MapCat(collect), Map(cos)), + (MapCat(collect), opcompose(MapCat(collect), Map(cos))), + (opcompose(MapCat(collect), MapCat(collect)), Map(cos)), + ] + @test f'(g'(+; simd = true)) === f'(g'(+); simd = true) + @test f'(g'(+; simd = true)) === opcompose(f, g)'(+; simd = true) + end +end + @testset "skipcomplete" begin @testset for rf in [ asrf(UseSIMD{false}()), From 71524aa598936ccfe9720bf6ce8fe3e29527b164 Mon Sep 17 00:00:00 2001 From: Takafumi Arakaki Date: Sat, 8 Aug 2020 19:22:31 -0700 Subject: [PATCH 02/14] "Nested loops" for threaded fold; option to convert Cat to TCat (#401) * "Nested loops" for threaded fold; option to convert Cat to TCat * Support eductions to be TCat'ed (call retransform) * Fix docstring --- src/core.jl | 1 + src/library.jl | 7 ++-- src/reduce.jl | 50 +++++++++++++++++++++++- test/test_threading_aux.jl | 58 ++++++++++++++++++++++++++++ 
test/threads/test_parallel_reduce.jl | 15 +++++++ 5 files changed, 127 insertions(+), 4 deletions(-) create mode 100644 test/test_threading_aux.jl diff --git a/src/core.jl b/src/core.jl index a594d38e07..f1c8ed0fd4 100644 --- a/src/core.jl +++ b/src/core.jl @@ -324,6 +324,7 @@ end prependxf(rf::AbstractReduction, xf) = Reduction(xf, rf) setinner(rf::Reduction, inner) = Reduction(xform(rf), inner) +setxform(rf::Reduction, xform) = Reduction(xform, inner(rf)) Transducer(rf::Reduction) = if inner(rf) isa BottomRF diff --git a/src/library.jl b/src/library.jl index f8d9bf5fab..1a5e968dcb 100644 --- a/src/library.jl +++ b/src/library.jl @@ -237,15 +237,16 @@ end next(rf::R_{TCat}, result, input) = wrapping(rf, result) do init, acc + rfi, itr = retransform(inner(rf), input) subresult = _transduce_assoc_nocomplete( - inner(rf), + rfi, init, - input, + itr, xform(rf).basesize, ) subresult isa Reduced && return init, subresult acc isa Unseen && return init, subresult - return init, combine(inner(rf), acc, subresult) + return init, combine(rfi, acc, subresult) end function combine(rf::R_{TCat}, a, b) diff --git a/src/reduce.jl b/src/reduce.jl index 17afa6f154..351cc9f3b7 100644 --- a/src/reduce.jl +++ b/src/reduce.jl @@ -1,5 +1,5 @@ """ - foldxt(step, xf, reducible; [init, simd, basesize, stoppable]) :: T + foldxt(step, xf, reducible; [init, simd, basesize, stoppable, nestlevel]) :: T e**X**tended **t**hreaded fold (reduce). This is a multi-threaded `reduce` based on extended fold protocol defined in Transducers.jl. @@ -32,6 +32,14 @@ See also: [Parallel processing tutorial](@ref tutorial-parallel), by passing `stoppable = false`. It is usually automatically detected and set appropriately. Note that this option is purely for optimization and does not affect the result value. +- `nestlevel::Union{Integer,Val}`: Specify how many inner `Cat` + (flatten) transducers to be multi-threaded (using [`TCat`](@ref)). 
+ It must be a positive integer, `Val` of positive integer, or + `Val(:inf)`. `Val(:inf)` means to use multi-threading for all `Cat` + transducers. Note that `Cat` transducer should be statically known. + That is to say, `foldxt` sees two `Cat`s in `... |> Map(f) |> Cat() + |> Cat()` but only one `Cat` in `... |> Map(x -> f(x) |> Cat()) |> + Cat()` even though they are semantically identical. - For other keyword arguments, see [`foldl`](@ref). !!! compat "Transducers.jl 0.4.23" @@ -152,9 +160,19 @@ function transduce_assoc( simd::SIMDFlag = Val(false), basesize::Union{Integer,Nothing} = nothing, stoppable::Union{Bool,Nothing} = nothing, + nestlevel::Union{Val,Integer,Nothing} = nothing, ) where {F} rf0 = _reducingfunction(xform, step; init = init) rf, coll = retransform(rf0, coll0) + if nestlevel !== nothing + if basesize === nothing + throw(ArgumentError("`nestlevel` requires `basesize`")) + end + if has(rf, Union{Cat,TCat}) + rf = use_threads_for_inner_cats(rf, basesize, nestlevel) + basesize = 1 + end + end if stoppable === nothing stoppable = _might_return_reduced(rf, init, coll) end @@ -487,3 +505,33 @@ julia> tcollect(x^2 for x in 1:2) """ tcollect(xf, reducible; kwargs...) = tcopy(xf, Vector, reducible; kwargs...) tcollect(itr; kwargs...) = tcollect(extract_transducer(itr)...; kwargs...) + +verify_nestlevel(lvl::Val{:inf}) = lvl +verify_nestlevel(lvl::Integer) = verify_nestlevel(Val(Int(lvl))) +function verify_nestlevel(::Val{n}) where {n} + n isa Integer || + throw(ArgumentError("`nestlevel` must be an integer, `Val` of `Int`, or `Val(:inf)`")) + lvl = Int(n) + lvl > 0 || throw(ArgumentError("`nestlevel` must be positive")) + return Val(lvl) +end + +_dec_lvl(lvl::Val{:inf}) = lvl +_dec_lvl(::Val{n}) where {n} = Val(n - 1) + +use_threads_for_inner_cats(rf, basesize, nestlevel) = + cats_to_tcats(rf, TCat(basesize), verify_nestlevel(nestlevel)) + +# TODO: handle `TeeRF` etc? 
+cats_to_tcats(rf::R_, innermost_tcat, lvl::Val) = + Reduction(xform(rf), cats_to_tcats(inner(rf), innermost_tcat, lvl)) +cats_to_tcats(rf::R_{Union{Cat,TCat}}, innermost_tcat, lvl::Val) = + if has(inner(rf), Union{Cat,TCat}) + if lvl isa Val{1} + setxform(rf, innermost_tcat) + else + Reduction(TCat(1), cats_to_tcats(inner(rf), innermost_tcat, _dec_lvl(lvl))) + end + else + setxform(rf, innermost_tcat) + end diff --git a/test/test_threading_aux.jl b/test/test_threading_aux.jl new file mode 100644 index 0000000000..3d2cb0f0f9 --- /dev/null +++ b/test/test_threading_aux.jl @@ -0,0 +1,58 @@ +module TestThreadingAux + +include("preamble.jl") +using Transducers: use_threads_for_inner_cats + +@testset "use_threads_for_inner_cats" begin + fivecats = opcompose(Cat(), Cat(), Cat(), Cat(), Cat()) + @test use_threads_for_inner_cats(fivecats'(+), 3, Val(:inf)) === + opcompose(TCat(1), TCat(1), TCat(1), TCat(1), TCat(3))'(+) + @test use_threads_for_inner_cats(fivecats'(+), 3, Val(10)) === + opcompose(TCat(1), TCat(1), TCat(1), TCat(1), TCat(3))'(+) + @test use_threads_for_inner_cats(fivecats'(+), 3, Val(4)) === + opcompose(TCat(1), TCat(1), TCat(1), TCat(3), Cat())'(+) + @test use_threads_for_inner_cats(fivecats'(+), 3, 1) === + opcompose(TCat(3), Cat(), Cat(), Cat(), Cat())'(+) + + @test use_threads_for_inner_cats( + opcompose(Cat(), Map(sin), TCat(1), Map(cos), Cat(), Map(tan))'(+), + 3, + Val(:inf), + ) === opcompose(TCat(1), Map(sin), TCat(1), Map(cos), TCat(3), Map(tan))'(+) + @test use_threads_for_inner_cats( + opcompose(Cat(), Map(sin), TCat(1), Map(cos), Cat(), Map(tan))'(+), + 3, + 10, + ) === opcompose(TCat(1), Map(sin), TCat(1), Map(cos), TCat(3), Map(tan))'(+) + @test use_threads_for_inner_cats( + opcompose(Cat(), Map(sin), TCat(1), Map(cos), Cat(), Map(tan))'(+), + 3, + Val(2), + ) === opcompose(TCat(1), Map(sin), TCat(3), Map(cos), Cat(), Map(tan))'(+) + + @testset "not int" begin + err = @test_error use_threads_for_inner_cats(fivecats'(+), 3, Val(:non_int)) + 
@test occursin("`nestlevel` must be an integer", sprint(showerror, err)) + end + @testset "not positive" begin + err = @test_error use_threads_for_inner_cats(fivecats'(+), 3, 0) + @test occursin("`nestlevel` must be positive", sprint(showerror, err)) + end +end + +@testset "foldxt" begin + @testset "`nestlevel` requires `basesize`" begin + err = @test_error foldxt(+, 1:0; nestlevel = 3) + @test occursin("`nestlevel` requires `basesize`", sprint(showerror, err)) + end + @testset "not int" begin + err = @test_error foldxt(Cat()'(+), 1:0; nestlevel = Val(:not_int), basesize = 3) + @test occursin("`nestlevel` must be an integer", sprint(showerror, err)) + end + @testset "not positive" begin + err = @test_error foldxt(Cat()'(+), 1:0; nestlevel = -1, basesize = 3) + @test occursin("`nestlevel` must be positive", sprint(showerror, err)) + end +end + +end # module diff --git a/test/threads/test_parallel_reduce.jl b/test/threads/test_parallel_reduce.jl index e93edd7319..dca03a0820 100644 --- a/test/threads/test_parallel_reduce.jl +++ b/test/threads/test_parallel_reduce.jl @@ -170,6 +170,14 @@ end @test collect(xf, 1:3) == desired @test collect(xf, 0:3) == desired end + @testset "Map(x -> 1:x |> Map(x -> 2x)) ⨟ TCat(1)" begin + xf = opcompose(Map(x -> 1:x |> Map(x -> 2x)), TCat(1)) + desired = [2, 2, 4, 2, 4, 6] + @test collect(xf, 1:3) ==ₜ desired + @test collect(xf, 0:3) ==ₜ desired + @test tcollect(xf, 1:3) ==ₜ desired + @test tcollect(xf, 0:3) ==ₜ desired + end end @testset "TakeWhile" begin @@ -225,4 +233,11 @@ end @test foldxt(right, xf3, withprogress(1:1000; interval=0); basesize=1, simd=true) == 100 end +@testset "nestlevel" begin + xs = 1:3 |> MapCat(x -> 1:x) |> MapCat(x -> 1:x) |> MapCat(x -> 1:x) + @test foldxt(+, xs; basesize = 1, nestlevel = 3) == sum(xs) + @test foldxt(+, xs; basesize = 1, nestlevel = Val(2)) == sum(xs) + @test foldxt(+, xs; basesize = 1, nestlevel = Val(:inf)) == sum(xs) +end + end # module From 061a65f641f3480fe1c067b27b63914f1deca45b Mon 
Sep 17 00:00:00 2001 From: Takafumi Arakaki Date: Sat, 8 Aug 2020 20:16:09 -0700 Subject: [PATCH 03/14] Add more specialization hints to the compiler (#402) --- src/processes.jl | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/processes.jl b/src/processes.jl index 018561341d..e25f3ba751 100644 --- a/src/processes.jl +++ b/src/processes.jl @@ -185,7 +185,7 @@ end complete(rf, @return_if_reduced foldlargs(rf, init, coll...)) # TODO: use IndexStyle -@inline function __foldl__(rf, init, arr::Union{AbstractArray, Broadcasted}) +@inline function __foldl__(rf::RF, init, arr::Union{AbstractArray,Broadcasted}) where {RF} isempty(arr) && return complete(rf, init) idxs = eachindex(arr) val = @next(rf, init, @inbounds arr[idxs[firstindex(idxs)]]) @@ -324,7 +324,8 @@ end Call [`__foldl__`](@ref) without calling [`complete`](@ref). """ -@inline foldl_nocomplete(rf, init, coll) = __foldl__(skipcomplete(rf), init, coll) +@inline foldl_nocomplete(rf::RF, init, coll) where {RF} = + __foldl__(skipcomplete(rf), init, coll) """ foldxl(step, xf::Transducer, reducible; init, simd) :: T From 747697da205ca76272c6269b2154de5dc60ab296 Mon Sep 17 00:00:00 2001 From: Takafumi Arakaki Date: Sat, 8 Aug 2020 22:11:25 -0700 Subject: [PATCH 04/14] Add bench_filter_sum.jl (#404) This is a useful set of benchmarks for assessing the effect of tail-call function-barrier for arrays (#403). 
--- benchmark/Project.toml | 1 + benchmark/bench_filter_sum.jl | 39 +++++++++++++++++++++++++++++++++++ 2 files changed, 40 insertions(+) create mode 100644 benchmark/bench_filter_sum.jl diff --git a/benchmark/Project.toml b/benchmark/Project.toml index 317fcc0e2b..99fb3a0566 100644 --- a/benchmark/Project.toml +++ b/benchmark/Project.toml @@ -9,6 +9,7 @@ Markdown = "d6f4376e-aef5-505a-96c1-9c027394607a" MicroCollections = "128add7d-3638-4c79-886c-908ea0c25c34" Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" PkgBenchmark = "32113eaa-f34f-5b0d-bd6c-c81e245fc73d" +Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" Referenceables = "42d2dcc6-99eb-4e98-b66c-637b7d73030e" SplitApplyCombine = "03a91e81-4c3e-53e1-a0a4-9c0c8f19dd66" SplittablesBase = "171d559e-b47b-412a-8079-5efa626c420e" diff --git a/benchmark/bench_filter_sum.jl b/benchmark/bench_filter_sum.jl new file mode 100644 index 0000000000..c9b06e39a7 --- /dev/null +++ b/benchmark/bench_filter_sum.jl @@ -0,0 +1,39 @@ +module BenchFilterSum + +import Random +using BenchmarkTools +using Transducers + +const SUITE = BenchmarkGroup() + +function naive_sum(xs, acc = false) + for x in xs + acc += x + end + return acc +end + +Random.seed!(12345) +for n in [1000, 10000] + s0 = SUITE[:n=>n] = BenchmarkGroup() + + for (xslabel, xs, init) in [ + (:UnitRange, (x for x in 1:n if isodd(x)), 0), + (:RandomFloats, (x for x in randn(n) if x > 0), 0.0), + ] + s1 = s0[:xs=>xslabel] = BenchmarkGroup() + + s2 = s1[:withinit=>false] = BenchmarkGroup() + s2[:impl=>:naive] = @benchmarkable naive_sum($xs) + s2[:impl=>:base] = @benchmarkable sum($xs) + s2[:impl=>:xf] = @benchmarkable sum($(eduction(xs))) + + s2 = s1[:withinit=>true] = BenchmarkGroup() + s2[:impl=>:naive] = @benchmarkable naive_sum($xs, $init) + s2[:impl=>:base] = @benchmarkable foldl(+, $xs; init = $init) + s2[:impl=>:xf] = @benchmarkable foldl(+, $(eduction(xs)); init = $init) + end +end + +end # module +BenchFilterSum.SUITE From 65db24b0e874a4472eb3b81543c11306c2fd8581 Mon 
Sep 17 00:00:00 2001 From: Takafumi Arakaki Date: Sat, 8 Aug 2020 23:31:09 -0700 Subject: [PATCH 05/14] Simplify benchmark group keys (#406) The table is too wide if I use `=>`: https://github.com/JuliaFolds/Transducers-data/blob/benchmark-results/2020/08/09/053256/result.md#results --- benchmark/bench_filter_sum.jl | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/benchmark/bench_filter_sum.jl b/benchmark/bench_filter_sum.jl index c9b06e39a7..e902f259df 100644 --- a/benchmark/bench_filter_sum.jl +++ b/benchmark/bench_filter_sum.jl @@ -15,23 +15,23 @@ end Random.seed!(12345) for n in [1000, 10000] - s0 = SUITE[:n=>n] = BenchmarkGroup() + s0 = SUITE[string(n)] = BenchmarkGroup() for (xslabel, xs, init) in [ - (:UnitRange, (x for x in 1:n if isodd(x)), 0), - (:RandomFloats, (x for x in randn(n) if x > 0), 0.0), + ("UnitRange", (x for x in 1:n if isodd(x)), 0), + ("RandomFloats", (x for x in randn(n) if x > 0), 0.0), ] - s1 = s0[:xs=>xslabel] = BenchmarkGroup() + s1 = s0[xslabel] = BenchmarkGroup() - s2 = s1[:withinit=>false] = BenchmarkGroup() - s2[:impl=>:naive] = @benchmarkable naive_sum($xs) - s2[:impl=>:base] = @benchmarkable sum($xs) - s2[:impl=>:xf] = @benchmarkable sum($(eduction(xs))) + s2 = s1["noinit"] = BenchmarkGroup() + s2["naive"] = @benchmarkable naive_sum($xs) + s2["base"] = @benchmarkable sum($xs) + s2["xf"] = @benchmarkable sum($(eduction(xs))) - s2 = s1[:withinit=>true] = BenchmarkGroup() - s2[:impl=>:naive] = @benchmarkable naive_sum($xs, $init) - s2[:impl=>:base] = @benchmarkable foldl(+, $xs; init = $init) - s2[:impl=>:xf] = @benchmarkable foldl(+, $(eduction(xs)); init = $init) + s2 = s1["withinit"] = BenchmarkGroup() + s2["naive"] = @benchmarkable naive_sum($xs, $init) + s2["base"] = @benchmarkable foldl(+, $xs; init = $init) + s2["xf"] = @benchmarkable foldl(+, $(eduction(xs)); init = $init) end end From eb430cf76a5c25aa909858c20424aead21cfecfd Mon Sep 17 00:00:00 2001 From: Takafumi Arakaki Date: 
Sun, 9 Aug 2020 01:07:09 -0700 Subject: [PATCH 06/14] Tail-call function-barrier for arrays (#403) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This PR implements the "tail-call function-barrier" pattern for arrays with linear style indexing. This gives us better performance for type-changing reduction where the iteration at which the type changes is unknown. A good example is filtered sum with no initial value (#404). Interestingly, this version is a bit worse for sum of filtered floats on "short" arrays. Compare the result of `["filter_sum", "1000", "RandomFloats", "noinit", "xf"]`. On my laptop, baseline is ~1.7 μs while this branch is ~2.1 μs. However, for 10x longer input, this branch is faster (> 2x). The benchmarks on CI also show similar results. --- src/basics.jl | 23 ++++++++++++++++++ src/core.jl | 5 ++++ src/library.jl | 1 + src/processes.jl | 55 ++++++++++++++++++++++++++++++------------ test/test_processes.jl | 9 +++++++ 5 files changed, 77 insertions(+), 16 deletions(-) diff --git a/src/basics.jl b/src/basics.jl index 1f4c547531..b6db999132 100644 --- a/src/basics.jl +++ b/src/basics.jl @@ -95,3 +95,26 @@ abstract type _Function <: Function end Base.show(io::IO, ::MIME"text/plain", f::_Function) = show(io, f) Base.print(io::IO, f::_Function) = show(io, f) @specialize + + +# A macro for "manual Union splitting". It is sometimes useful to let +# the compiler know that it is beneficial to type-specialize `body`. 
+# * https://github.com/JuliaFolds/Transducers.jl/pull/188 +# * https://github.com/JuliaLang/julia/pull/34293#discussion_r363550608 +macro manual_union_split(cond, body) + quote + if $cond + $body + else + $body + end + end |> esc +end + +@inline _firstindex(arr) = firstindex(arr) +@inline _lastindex(arr) = lastindex(arr) + +# Define `firstindex` and `lastindex` for `Broadcasted` with linear +# index style: +@inline _firstindex(bc::Broadcasted) = first((axes(bc)::Tuple{Any})[1]) +@inline _lastindex(bc::Broadcasted) = last((axes(bc)::Tuple{Any})[1]) diff --git a/src/core.jl b/src/core.jl index f1c8ed0fd4..edee313b20 100644 --- a/src/core.jl +++ b/src/core.jl @@ -565,6 +565,9 @@ combine(rf::Reduction, a, b) = combine(inner(rf), a, b) end +is_prelude(_) = false +is_prelude(::InitialValues.InitialValue) = true + privatestate(::T, state, result) where {T <: AbstractReduction} = privatestate(T, state, result) @@ -598,6 +601,8 @@ ownsstate(::R, ::PrivateState{T}) where {R, T} = R === T # took more than 10 min). See also: # https://github.com/JuliaLang/julia/issues/30125 +@inline is_prelude(ps::PrivateState) = is_prelude(psstate(ps)) || is_prelude(psresult(ps)) + """ unwrap(rf, result) diff --git a/src/library.jl b/src/library.jl index 1a5e968dcb..826c8bc352 100644 --- a/src/library.jl +++ b/src/library.jl @@ -927,6 +927,7 @@ struct PartitionBy{F} <: Transducer end struct Unseen end +is_prelude(::Unseen) = true isexpansive(::PartitionBy) = false diff --git a/src/processes.jl b/src/processes.jl index e25f3ba751..c10b1da65e 100644 --- a/src/processes.jl +++ b/src/processes.jl @@ -155,15 +155,7 @@ function __foldl__(rf::RF, init::T, coll) where {RF,T} ret === nothing && return complete(rf, init) x, state = ret val = @next(rf, init, x) - - # Doing "manual Union splitting" (?). This somehow helps the - # compiler to generate faster code even though the code inside the - # `if` branches are identical. 
- # * https://github.com/JuliaFolds/Transducers.jl/pull/188 - # * https://github.com/JuliaLang/julia/pull/34293#discussion_r363550608 - if val isa T - return _foldl_iter(rf, val, coll, state, FOLDL_RECURSION_LIMIT) - else + @manual_union_split val isa T begin return _foldl_iter(rf, val, coll, state, FOLDL_RECURSION_LIMIT) end end @@ -185,15 +177,46 @@ end complete(rf, @return_if_reduced foldlargs(rf, init, coll...)) # TODO: use IndexStyle -@inline function __foldl__(rf::RF, init, arr::Union{AbstractArray,Broadcasted}) where {RF} +@inline function __foldl__( + rf::RF, + init::T, + arr::Union{AbstractArray,Broadcasted}, +) where {RF,T} isempty(arr) && return complete(rf, init) - idxs = eachindex(arr) - val = @next(rf, init, @inbounds arr[idxs[firstindex(idxs)]]) - @simd_if rf for k in firstindex(idxs) + 1:lastindex(idxs) - i = @inbounds idxs[k] - val = @next(rf, val, @inbounds arr[i]) + i = _firstindex(arr) + acc = @next(rf, init, @inbounds arr[i]) + @manual_union_split acc isa T begin + if is_prelude(acc) + return _foldl_linear_rec(rf, acc, arr, i + 1, FOLDL_RECURSION_LIMIT) + else + return _foldl_linear_bulk(rf, acc, arr, i + 1) + end end - return complete(rf, val) +end + +@inline function _foldl_linear_bulk(rf::RF, acc, arr, i0) where {RF} + @simd_if rf for i in i0:_lastindex(arr) + acc = @next(rf, acc, @inbounds arr[i]) + end + return complete(rf, acc) +end + +@inline function _foldl_linear_rec(rf::RF, acc::T, arr, i0, counter) where {RF,T} + for i in i0:_lastindex(arr) + y = @next(rf, acc, @inbounds arr[i]) + if counter !== Val(0) + if y isa T + elseif is_prelude(y) + return _foldl_linear_rec(rf, y, arr, i + 1, _dec(counter)) + else + # Otherwise, maybe it could be something like `Union{Float64,Missing}` + # where Julia's native loop is fast: + return _foldl_linear_bulk(rf, y, arr, i + 1) + end + end + acc = y + end + return complete(rf, acc) end @inline _getvalues(i) = () diff --git a/test/test_processes.jl b/test/test_processes.jl index 206c53133e..e382e03502 
100644 --- a/test/test_processes.jl +++ b/test/test_processes.jl @@ -47,6 +47,15 @@ include("preamble.jl") @test_throws EmptyResultError foldl(+, nested_xf, iter) end + @testset "type-unstable arrays" begin + valof(::Val{x}) where {x} = x + valof(x) = x + @testset for n in 1:valof(Transducers.FOLDL_RECURSION_LIMIT) + 3 + @test collect(Map(valof), [Val(i) for i in 1:n]) == 1:n + end + @test collect(Map(valof), [[Val(i) for i in 1:4]; 5:9;]) == 1:9 + end + @testset "zip-of-arrays" begin @testset for arrays in [ (0:3,), From b8efeed250714361f7bc5683333f75fb71d3ac48 Mon Sep 17 00:00:00 2001 From: Takafumi Arakaki Date: Sun, 9 Aug 2020 01:45:13 -0700 Subject: [PATCH 07/14] Add bench_sum_transpose.jl (#405) This is a benchmark for measuring specialized foldl for cartesian style arrays. --- benchmark/bench_sum_transpose.jl | 42 ++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) create mode 100644 benchmark/bench_sum_transpose.jl diff --git a/benchmark/bench_sum_transpose.jl b/benchmark/bench_sum_transpose.jl new file mode 100644 index 0000000000..bc777b29dc --- /dev/null +++ b/benchmark/bench_sum_transpose.jl @@ -0,0 +1,42 @@ +module BenchSumTranspose + +import Random +using BenchmarkTools +using Transducers + +const SUITE = BenchmarkGroup() + +function iter_sum(xs, acc = false) + for x in xs + acc += x + end + return acc +end + +function man_sum(xs::AbstractMatrix, acc = false) + for i in axes(xs, 2), j in axes(xs, 1) + acc += @inbounds xs[j, i] + end + return acc +end + +Random.seed!(12345) +# for n in [30, 100] +let n = 30 + s1 = SUITE[string(n)] = BenchmarkGroup() + + xs = randn(n, n)' + + s2 = s1["noinit"] = BenchmarkGroup() + s2["iter"] = @benchmarkable iter_sum($xs) + s2["man"] = @benchmarkable man_sum($xs) + s2["xf"] = @benchmarkable foldxl(+, $xs) + + s2 = s1["withinit"] = BenchmarkGroup() + s2["iter"] = @benchmarkable iter_sum($xs, 0.0) + s2["man"] = @benchmarkable man_sum($xs, 0.0) + s2["xf"] = @benchmarkable foldxl(+, $xs; init = 0.0) +end + 
+end # module +BenchSumTranspose.SUITE From edc1c726b3fd0231f5c85ba00499223df56689b0 Mon Sep 17 00:00:00 2001 From: Takafumi Arakaki Date: Sun, 9 Aug 2020 13:37:11 -0700 Subject: [PATCH 08/14] Manual trigger to vanilla-test-push.yml This patch also changes the name of the job to vanilla-test-push to disambiguate it from vanilla-test for PRs. --- .github/workflows/vanilla-test-push.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/vanilla-test-push.yml b/.github/workflows/vanilla-test-push.yml index dac01cb445..541163d340 100644 --- a/.github/workflows/vanilla-test-push.yml +++ b/.github/workflows/vanilla-test-push.yml @@ -7,9 +7,10 @@ on: paths: - Project.toml - test/environments/main/Project.toml + workflow_dispatch: jobs: - vanilla-test: + vanilla-test-push: runs-on: ubuntu-latest steps: - uses: actions/checkout@v2 From 4b2a4a884e40cb5557328450b912068bc39cfc4d Mon Sep 17 00:00:00 2001 From: Takafumi Arakaki Date: Sun, 9 Aug 2020 13:39:43 -0700 Subject: [PATCH 09/14] Change workflow name of vanilla-test-push.yml This is for differentiating it from vanilla-test.yml. 
--- .github/workflows/vanilla-test-push.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/vanilla-test-push.yml b/.github/workflows/vanilla-test-push.yml index 541163d340..39eb5768f5 100644 --- a/.github/workflows/vanilla-test-push.yml +++ b/.github/workflows/vanilla-test-push.yml @@ -1,4 +1,4 @@ -name: Run test via Pkg.test() +name: Run test via Pkg.test() on push on: push: From e01532681d69ffbf45d0dc27050125b2fd9159eb Mon Sep 17 00:00:00 2001 From: Takafumi Arakaki Date: Sun, 9 Aug 2020 14:16:15 -0700 Subject: [PATCH 10/14] Switch to Julia 1.5 as the main version in CIs (#409) --- .github/workflows/benchmark.yml | 2 +- .github/workflows/check-xfail.yml | 2 +- .github/workflows/multi-thread-benchmark.yml | 2 +- .github/workflows/vanilla-test-push.yml | 2 +- .github/workflows/vanilla-test.yml | 2 +- .travis.yml | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index d9144c3333..65fc67d2b9 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -10,7 +10,7 @@ jobs: - uses: actions/checkout@v2 - uses: julia-actions/setup-julia@latest with: - version: 1.4 + version: 1.5 - name: Install dependencies run: julia -e 'using Pkg; pkg"add PkgBenchmark BenchmarkCI@0.1"' - name: Run benchmarks diff --git a/.github/workflows/check-xfail.yml b/.github/workflows/check-xfail.yml index 01b9783617..59cd229da6 100644 --- a/.github/workflows/check-xfail.yml +++ b/.github/workflows/check-xfail.yml @@ -8,7 +8,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - julia-version: ['1.4'] + julia-version: ['^1'] fail-fast: false name: Test xfail Julia ${{ matrix.julia-version }} steps: diff --git a/.github/workflows/multi-thread-benchmark.yml b/.github/workflows/multi-thread-benchmark.yml index 654ab1906f..e68cc4b1b5 100644 --- a/.github/workflows/multi-thread-benchmark.yml +++ b/.github/workflows/multi-thread-benchmark.yml @@ -10,7 +10,7 
@@ jobs: - uses: actions/checkout@v2 - uses: julia-actions/setup-julia@latest with: - version: 1.4 + version: 1.5 - name: Install dependencies run: julia -e 'using Pkg; pkg"add PkgBenchmark BenchmarkCI@0.1"' - name: Run benchmarks diff --git a/.github/workflows/vanilla-test-push.yml b/.github/workflows/vanilla-test-push.yml index 39eb5768f5..3b83c23b56 100644 --- a/.github/workflows/vanilla-test-push.yml +++ b/.github/workflows/vanilla-test-push.yml @@ -16,7 +16,7 @@ jobs: - uses: actions/checkout@v2 - uses: julia-actions/setup-julia@v1 with: - version: 1.4 + version: ^1 - uses: julia-actions/julia-buildpkg@latest - uses: julia-actions/julia-runtest@latest env: diff --git a/.github/workflows/vanilla-test.yml b/.github/workflows/vanilla-test.yml index a9a9356a4e..d867410aeb 100644 --- a/.github/workflows/vanilla-test.yml +++ b/.github/workflows/vanilla-test.yml @@ -36,7 +36,7 @@ jobs: - uses: julia-actions/setup-julia@v1 if: ${{ steps.check-project-toml.outputs.need_test == 'yes' }} with: - version: 1.4 + version: ^1 - uses: julia-actions/julia-buildpkg@latest if: ${{ steps.check-project-toml.outputs.need_test == 'yes' }} - uses: julia-actions/julia-runtest@latest diff --git a/.travis.yml b/.travis.yml index b3aff1f92a..1b6f174b71 100644 --- a/.travis.yml +++ b/.travis.yml @@ -3,8 +3,8 @@ language: julia os: - linux julia: + - 1.5 # to be used in benchmarks as well - 1.4 - - 1.5 - nightly env: global: From 29ec4581ed78721f6eaa01932b48ca4a602cc60b Mon Sep 17 00:00:00 2001 From: Takafumi Arakaki Date: Sat, 8 Aug 2020 23:54:47 -0700 Subject: [PATCH 11/14] Add bench_teerf_filter.jl --- benchmark/bench_teerf_filter.jl | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) create mode 100644 benchmark/bench_teerf_filter.jl diff --git a/benchmark/bench_teerf_filter.jl b/benchmark/bench_teerf_filter.jl new file mode 100644 index 0000000000..4a3326e9f2 --- /dev/null +++ b/benchmark/bench_teerf_filter.jl @@ -0,0 +1,16 @@ +module BenchTeeRFFilter + +using BenchmarkTools +using 
Transducers + +const SUITE = BenchmarkGroup() + +let xs = 1:1000 + rf = TeeRF(Filter(isodd)'(+), Filter(iseven)'(+)) + + SUITE["noinit"] = @benchmarkable foldxl($rf, $xs) + SUITE["withinit"] = @benchmarkable foldxl($rf, $xs; init = (0, 0)) +end + +end # module +BenchTeeRFFilter.SUITE From 735aae6d97e2671562034d78af66eba6b9933e60 Mon Sep 17 00:00:00 2001 From: Takafumi Arakaki Date: Sun, 9 Aug 2020 16:10:45 -0700 Subject: [PATCH 12/14] Specialize foldl for cartesian style arrays (#407) This patch implements a specialization of `foldl` on arrays with `IndexCartesian` index style. This gives us more than 2x speedup (see `sum_transpose` benchmark). The implementation mostly just redirects the call to `foldl` of `CartesianIndices`. Most of the code is for compatibility with Julia < 1.5. This PR also fixes a bug in `foldl` for multi-dimensional `Broadcasted` (probably introduced by #403). --- src/basics.jl | 10 ++++++++++ src/processes.jl | 18 ++++++++++++------ test/test_processes.jl | 9 ++++++++- 3 files changed, 30 insertions(+), 7 deletions(-) diff --git a/src/basics.jl b/src/basics.jl index b6db999132..083af50dd6 100644 --- a/src/basics.jl +++ b/src/basics.jl @@ -118,3 +118,13 @@ end # index style: @inline _firstindex(bc::Broadcasted) = first((axes(bc)::Tuple{Any})[1]) @inline _lastindex(bc::Broadcasted) = last((axes(bc)::Tuple{Any})[1]) + +# Define `CartesianIndices` for `Broadcasted` +@inline _CartesianIndices(arr) = CartesianIndices(arr) +@inline _CartesianIndices(bc::Broadcasted) = CartesianIndices(axes(bc)::Tuple) + +# Define `IndexStyle` for `Broadcasted` +_IndexStyle(arr) = IndexStyle(arr) +_IndexStyle(bc::Broadcasted) = _IndexStyle(typeof(bc)) +_IndexStyle(::Type{<:Broadcasted{<:Any,<:Tuple{Any}}}) = IndexLinear() +_IndexStyle(::Type{<:Broadcasted{<:Any}}) = IndexCartesian() diff --git a/src/processes.jl b/src/processes.jl index c10b1da65e..d906c7ba5e 100644 --- a/src/processes.jl +++ b/src/processes.jl @@ -176,13 +176,13 @@ end @inline __foldl__(rf::RF, 
init, coll::Tuple) where {RF} = complete(rf, @return_if_reduced foldlargs(rf, init, coll...)) -# TODO: use IndexStyle -@inline function __foldl__( - rf::RF, - init::T, - arr::Union{AbstractArray,Broadcasted}, -) where {RF,T} +@inline function __foldl__(rf::RF, init, arr0::Union{AbstractArray,Broadcasted}) where {RF} + arr = Broadcast.instantiate(arr0) isempty(arr) && return complete(rf, init) + return _foldl_array(rf, init, arr, _IndexStyle(arr)) +end + +@inline function _foldl_array(rf::RF, init::T, arr, ::IndexLinear) where {RF,T} i = _firstindex(arr) acc = @next(rf, init, @inbounds arr[i]) @manual_union_split acc isa T begin @@ -219,6 +219,12 @@ end return complete(rf, acc) end +@inline function _foldl_array(rf0::RF, init, arr, ::IndexStyle) where {RF,T} + @inline getvalue(I) = @inbounds arr[I] + rf = Map(getvalue)'(rf0) + return __foldl__(rf, init, _CartesianIndices(arr)) +end + @inline _getvalues(i) = () @inline _getvalues(i, a, rest...) = ((@inbounds a[i]), _getvalues(i, rest...)...) diff --git a/test/test_processes.jl b/test/test_processes.jl index e382e03502..b4c1772d16 100644 --- a/test/test_processes.jl +++ b/test/test_processes.jl @@ -110,7 +110,7 @@ include("preamble.jl") end end - @testset "broadcast" begin + @testset "broadcast (linear)" begin @testset for xs in iterator_variants(1:3) ys = @~ xs.^2 @test collect(Map(identity), ys) == copy(ys) @@ -118,6 +118,13 @@ include("preamble.jl") @test foldl(+, Filter(isodd), ys; init=0) == 10 end end + + @testset "broadcast (cartesian)" begin + xs = @~ (1:3) .+ (4:5)' + @test collect(Map(identity), xs) == vec(copy(xs)) + @test foldl(+, Filter(isodd), xs) == 19 + @test foldl(+, Filter(isodd), xs; init = 0) == 19 + end end From e11604dcc30e97a8a25a93fc7a6e901c93899d1e Mon Sep 17 00:00:00 2001 From: Takafumi Arakaki Date: Sun, 9 Aug 2020 16:51:26 -0700 Subject: [PATCH 13/14] Define is_prelude on Tuple and NamedTuple (#408) This patch handles the sentinel initial values wrapped in `Tuple`s and `NamedTuple`s. 
This is useful, e.g., when `TeeRF` is wrapping filtering transducers. ~5x speedup in `teerf_filter` benchmark. --- src/core.jl | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/core.jl b/src/core.jl index edee313b20..d8997172fa 100644 --- a/src/core.jl +++ b/src/core.jl @@ -567,6 +567,8 @@ combine(rf::Reduction, a, b) = is_prelude(_) = false is_prelude(::InitialValues.InitialValue) = true +is_prelude(xs::Tuple) = any(map(is_prelude, xs)) +is_prelude(xs::NamedTuple) = is_prelude(Tuple(xs)) privatestate(::T, state, result) where {T <: AbstractReduction} = privatestate(T, state, result) From ef2174ebbb511cb5979b20d8c017957874704ee8 Mon Sep 17 00:00:00 2001 From: Takafumi Arakaki Date: Sun, 9 Aug 2020 17:19:17 -0700 Subject: [PATCH 14/14] Add bench_cartesian.jl (#410) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This characterizes the improvements in 6aafdde1f6ae44985e5d83dacaea203f94ddd1ab Example minimum time: Before (Transducers v0.4.47): 8.720 μs After (Transducers v0.4.48-DEV): 2.628 μs --- benchmark/bench_cartesian.jl | 41 ++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) create mode 100644 benchmark/bench_cartesian.jl diff --git a/benchmark/bench_cartesian.jl b/benchmark/bench_cartesian.jl new file mode 100644 index 0000000000..0ecc3588a5 --- /dev/null +++ b/benchmark/bench_cartesian.jl @@ -0,0 +1,41 @@ +module BenchCartesian + +using BenchmarkTools +using Transducers + +function copyto_manual!(ys::AbstractMatrix, xs::AbstractMatrix) + @assert axes(ys) == axes(xs) + for j in 1:size(xs, 2), i in 1:size(xs, 1) + @inbounds ys[i, j] = xs[i, j] + end + return ys +end + +function copyto_iter!(ys, xs) + @assert axes(ys) == axes(xs) + for I in CartesianIndices(xs) + @inbounds ys[I] = xs[I] + end + return ys +end + +function copyto_xf!(ys, xs) + foreach(Map(identity), CartesianIndices(xs)) do I + @inbounds ys[I] = xs[I] + nothing + end + return ys +end + +const SUITE = BenchmarkGroup() + +let xs = 
randn(3, 10^3) + ys = zero(xs) + s1 = SUITE["copyto!"] = BenchmarkGroup() + s1["man"] = @benchmarkable(copyto_manual!($ys, $xs)) + s1["iter"] = @benchmarkable(copyto_iter!($ys, $xs)) + s1["xf"] = @benchmarkable(copyto_xf!($ys, $xs)) +end + +end # module +BenchCartesian.SUITE