From 6aafdde1f6ae44985e5d83dacaea203f94ddd1ab Mon Sep 17 00:00:00 2001
From: Takafumi Arakaki <aka.tkf@gmail.com>
Date: Sat, 8 Aug 2020 17:49:48 -0700
Subject: [PATCH 01/14] Add a specialized fold for CartesianIndices (#400)

The implementation of `foldl` for `CartesianIndices` is simple because
I can just redirect it to the one for `Iterators.product` (i.e.,
`_foldl_product`).  Most of the changes are for making sure that the
re-transformation `rf = Map(CartesianIndex)'(rf0)` preserves the
correct SIMD flag.
---
 src/processes.jl       |  7 +++++++
 src/simd.jl            |  4 ++++
 test/test_processes.jl | 10 ++++++++++
 test/test_simd.jl      | 16 ++++++++++++++++
 4 files changed, 37 insertions(+)

diff --git a/src/processes.jl b/src/processes.jl
index 1e7f062f4d..018561341d 100644
--- a/src/processes.jl
+++ b/src/processes.jl
@@ -263,6 +263,13 @@ end
     )
 end
 
+@inline function __foldl__(rf0, init, cartesian::CartesianIndices)
+    rf = Map(CartesianIndex)'(rf0)
+    val = _foldl_product(rf, init, (), cartesian.indices...)
+    val isa Reduced && return val
+    return complete(rf, val)
+end
+
 @inline function __foldl__(
         rf, init,
         prod::Iterators.ProductIterator{<:Tuple{Any,Any,Vararg{Any}}})
diff --git a/src/simd.jl b/src/simd.jl
index cbf0bac35d..36ee9af64d 100644
--- a/src/simd.jl
+++ b/src/simd.jl
@@ -7,6 +7,10 @@ The reducible can support it using `@simd_if`.
 struct UseSIMD{ivdep} <: Transducer end
 next(rf::R_{UseSIMD}, result, input) = next(inner(rf), result, input)
 
+# Keep `UseSIMD` as the outer-most transducer when appropriate:
+reducingfunction(xf::Transducer, step::R_{UseSIMD}) =
+    usesimd(Reduction(xf, inner(step)), xform(step))
+
 # Make sure UseSIMD is the outer-most transducer when UseSIMD is used
 # via Cat.
 skipcomplete(rf::R_{UseSIMD}) =
diff --git a/test/test_processes.jl b/test/test_processes.jl
index c298a66d26..206c53133e 100644
--- a/test/test_processes.jl
+++ b/test/test_processes.jl
@@ -60,6 +60,16 @@ include("preamble.jl")
         end
     end
 
+    @testset "CartesianIndices" begin
+        @testset for cartesian in [
+            CartesianIndices((1:2,)),
+            CartesianIndices((1:2, 3:5)),
+            CartesianIndices((1:2, 3:5, 6:9)),
+        ]
+            @test collect(Map(identity), cartesian) == vec(cartesian)
+        end
+    end
+
     @testset "product-of-iterators" begin
         iterator_prototypes = [
             (1, 2),
diff --git a/test/test_simd.jl b/test/test_simd.jl
index e95db900dc..6bd9ec8e59 100644
--- a/test/test_simd.jl
+++ b/test/test_simd.jl
@@ -30,6 +30,22 @@ asrf(xfs...) = asrf(opcompose(xfs...))
           asrf(Map(sin), Cat(), Map(cos), Cat(), xfsimd, Map(tan))
 end
 
+@testset "reducingfunction" begin
+    @test opcompose(UseSIMD{false}(), Map(sin), Map(cos))'(+) ===
+        Map(sin)'(Map(cos)'(+; simd = true))
+    @test opcompose(MapCat(collect), UseSIMD{false}(), Map(cos))'(+) ===
+        MapCat(collect)'(Map(cos)'(+; simd = true))
+    @testset for (f, g) in [
+        (Map(sin), Map(cos)),
+        (MapCat(collect), Map(cos)),
+        (MapCat(collect), opcompose(MapCat(collect), Map(cos))),
+        (opcompose(MapCat(collect), MapCat(collect)), Map(cos)),
+    ]
+        @test f'(g'(+; simd = true)) === f'(g'(+); simd = true)
+        @test f'(g'(+; simd = true)) === opcompose(f, g)'(+; simd = true)
+    end
+end
+
 @testset "skipcomplete" begin
     @testset for rf in [
             asrf(UseSIMD{false}()),

From 71524aa598936ccfe9720bf6ce8fe3e29527b164 Mon Sep 17 00:00:00 2001
From: Takafumi Arakaki <aka.tkf@gmail.com>
Date: Sat, 8 Aug 2020 19:22:31 -0700
Subject: [PATCH 02/14] "Nested loops" for threaded fold; option to convert Cat
 to TCat (#401)

* "Nested loops" for threaded fold; option to convert Cat to TCat

* Support eductions to be TCat'ed (call retransform)

* Fix docstring
---
 src/core.jl                          |  1 +
 src/library.jl                       |  7 ++--
 src/reduce.jl                        | 50 +++++++++++++++++++++++-
 test/test_threading_aux.jl           | 58 ++++++++++++++++++++++++++++
 test/threads/test_parallel_reduce.jl | 15 +++++++
 5 files changed, 127 insertions(+), 4 deletions(-)
 create mode 100644 test/test_threading_aux.jl

diff --git a/src/core.jl b/src/core.jl
index a594d38e07..f1c8ed0fd4 100644
--- a/src/core.jl
+++ b/src/core.jl
@@ -324,6 +324,7 @@ end
 
 prependxf(rf::AbstractReduction, xf) = Reduction(xf, rf)
 setinner(rf::Reduction, inner) = Reduction(xform(rf), inner)
+setxform(rf::Reduction, xform) = Reduction(xform, inner(rf))
 
 Transducer(rf::Reduction) =
     if inner(rf) isa BottomRF
diff --git a/src/library.jl b/src/library.jl
index f8d9bf5fab..1a5e968dcb 100644
--- a/src/library.jl
+++ b/src/library.jl
@@ -237,15 +237,16 @@ end
 
 next(rf::R_{TCat}, result, input) =
     wrapping(rf, result) do init, acc
+        rfi, itr = retransform(inner(rf), input)
         subresult = _transduce_assoc_nocomplete(
-            inner(rf),
+            rfi,
             init,
-            input,
+            itr,
             xform(rf).basesize,
         )
         subresult isa Reduced && return init, subresult
         acc isa Unseen && return init, subresult
-        return init, combine(inner(rf), acc, subresult)
+        return init, combine(rfi, acc, subresult)
     end
 
 function combine(rf::R_{TCat}, a, b)
diff --git a/src/reduce.jl b/src/reduce.jl
index 17afa6f154..351cc9f3b7 100644
--- a/src/reduce.jl
+++ b/src/reduce.jl
@@ -1,5 +1,5 @@
 """
-    foldxt(step, xf, reducible; [init, simd, basesize, stoppable]) :: T
+    foldxt(step, xf, reducible; [init, simd, basesize, stoppable, nestlevel]) :: T
 
 e**X**tended **t**hreaded fold (reduce).  This is a multi-threaded
 `reduce` based on extended fold protocol defined in Transducers.jl.
@@ -32,6 +32,14 @@ See also: [Parallel processing tutorial](@ref tutorial-parallel),
   by passing `stoppable = false`.  It is usually automatically
   detected and set appropriately.  Note that this option is purely for
   optimization and does not affect the result value.
+- `nestlevel::Union{Integer,Val}`: Specify how many inner `Cat`
+  (flatten) transducers to be multi-threaded (using [`TCat`](@ref)).
+  It must be a positive integer, `Val` of positive integer, or
+  `Val(:inf)`.  `Val(:inf)` means to use multi-threading for all `Cat`
+  transducers.  Note that `Cat` transducer should be statically known.
+  That is to say, `foldxt` sees two `Cat`s in `... |> Map(f) |> Cat()
+  |> Cat()` but only one `Cat` in `... |> Map(x -> f(x) |> Cat()) |>
+  Cat()` even though they are semantically identical.
 - For other keyword arguments, see [`foldl`](@ref).
 
 !!! compat "Transducers.jl 0.4.23"
@@ -152,9 +160,19 @@ function transduce_assoc(
     simd::SIMDFlag = Val(false),
     basesize::Union{Integer,Nothing} = nothing,
     stoppable::Union{Bool,Nothing} = nothing,
+    nestlevel::Union{Val,Integer,Nothing} = nothing,
 ) where {F}
     rf0 = _reducingfunction(xform, step; init = init)
     rf, coll = retransform(rf0, coll0)
+    if nestlevel !== nothing
+        if basesize === nothing
+            throw(ArgumentError("`nestlevel` requires `basesize`"))
+        end
+        if has(rf, Union{Cat,TCat})
+            rf = use_threads_for_inner_cats(rf, basesize, nestlevel)
+            basesize = 1
+        end
+    end
     if stoppable === nothing
         stoppable = _might_return_reduced(rf, init, coll)
     end
@@ -487,3 +505,33 @@ julia> tcollect(x^2 for x in 1:2)
 """
 tcollect(xf, reducible; kwargs...) = tcopy(xf, Vector, reducible; kwargs...)
 tcollect(itr; kwargs...) = tcollect(extract_transducer(itr)...; kwargs...)
+
+verify_nestlevel(lvl::Val{:inf}) = lvl
+verify_nestlevel(lvl::Integer) = verify_nestlevel(Val(Int(lvl)))
+function verify_nestlevel(::Val{n}) where {n}
+    n isa Integer ||
+        throw(ArgumentError("`nestlevel` must be an integer, `Val` of `Int`, or `Val(:inf)`"))
+    lvl = Int(n)
+    lvl > 0 || throw(ArgumentError("`nestlevel` must be positive"))
+    return Val(lvl)
+end
+
+_dec_lvl(lvl::Val{:inf}) = lvl
+_dec_lvl(::Val{n}) where {n} = Val(n - 1)
+
+use_threads_for_inner_cats(rf, basesize, nestlevel) =
+    cats_to_tcats(rf, TCat(basesize), verify_nestlevel(nestlevel))
+
+# TODO: handle `TeeRF` etc?
+cats_to_tcats(rf::R_, innermost_tcat, lvl::Val) =
+    Reduction(xform(rf), cats_to_tcats(inner(rf), innermost_tcat, lvl))
+cats_to_tcats(rf::R_{Union{Cat,TCat}}, innermost_tcat, lvl::Val) =
+    if has(inner(rf), Union{Cat,TCat})
+        if lvl isa Val{1}
+            setxform(rf, innermost_tcat)
+        else
+            Reduction(TCat(1), cats_to_tcats(inner(rf), innermost_tcat, _dec_lvl(lvl)))
+        end
+    else
+        setxform(rf, innermost_tcat)
+    end
diff --git a/test/test_threading_aux.jl b/test/test_threading_aux.jl
new file mode 100644
index 0000000000..3d2cb0f0f9
--- /dev/null
+++ b/test/test_threading_aux.jl
@@ -0,0 +1,58 @@
+module TestThreadingAux
+
+include("preamble.jl")
+using Transducers: use_threads_for_inner_cats
+
+@testset "use_threads_for_inner_cats" begin
+    fivecats = opcompose(Cat(), Cat(), Cat(), Cat(), Cat())
+    @test use_threads_for_inner_cats(fivecats'(+), 3, Val(:inf)) ===
+          opcompose(TCat(1), TCat(1), TCat(1), TCat(1), TCat(3))'(+)
+    @test use_threads_for_inner_cats(fivecats'(+), 3, Val(10)) ===
+          opcompose(TCat(1), TCat(1), TCat(1), TCat(1), TCat(3))'(+)
+    @test use_threads_for_inner_cats(fivecats'(+), 3, Val(4)) ===
+          opcompose(TCat(1), TCat(1), TCat(1), TCat(3), Cat())'(+)
+    @test use_threads_for_inner_cats(fivecats'(+), 3, 1) ===
+          opcompose(TCat(3), Cat(), Cat(), Cat(), Cat())'(+)
+
+    @test use_threads_for_inner_cats(
+        opcompose(Cat(), Map(sin), TCat(1), Map(cos), Cat(), Map(tan))'(+),
+        3,
+        Val(:inf),
+    ) === opcompose(TCat(1), Map(sin), TCat(1), Map(cos), TCat(3), Map(tan))'(+)
+    @test use_threads_for_inner_cats(
+        opcompose(Cat(), Map(sin), TCat(1), Map(cos), Cat(), Map(tan))'(+),
+        3,
+        10,
+    ) === opcompose(TCat(1), Map(sin), TCat(1), Map(cos), TCat(3), Map(tan))'(+)
+    @test use_threads_for_inner_cats(
+        opcompose(Cat(), Map(sin), TCat(1), Map(cos), Cat(), Map(tan))'(+),
+        3,
+        Val(2),
+    ) === opcompose(TCat(1), Map(sin), TCat(3), Map(cos), Cat(), Map(tan))'(+)
+
+    @testset "not int" begin
+        err = @test_error use_threads_for_inner_cats(fivecats'(+), 3, Val(:non_int))
+        @test occursin("`nestlevel` must be an integer", sprint(showerror, err))
+    end
+    @testset "not positive" begin
+        err = @test_error use_threads_for_inner_cats(fivecats'(+), 3, 0)
+        @test occursin("`nestlevel` must be positive", sprint(showerror, err))
+    end
+end
+
+@testset "foldxt" begin
+    @testset "`nestlevel` requires `basesize`" begin
+        err = @test_error foldxt(+, 1:0; nestlevel = 3)
+        @test occursin("`nestlevel` requires `basesize`", sprint(showerror, err))
+    end
+    @testset "not int" begin
+        err = @test_error foldxt(Cat()'(+), 1:0; nestlevel = Val(:not_int), basesize = 3)
+        @test occursin("`nestlevel` must be an integer", sprint(showerror, err))
+    end
+    @testset "not positive" begin
+        err = @test_error foldxt(Cat()'(+), 1:0; nestlevel = -1, basesize = 3)
+        @test occursin("`nestlevel` must be positive", sprint(showerror, err))
+    end
+end
+
+end  # module
diff --git a/test/threads/test_parallel_reduce.jl b/test/threads/test_parallel_reduce.jl
index e93edd7319..dca03a0820 100644
--- a/test/threads/test_parallel_reduce.jl
+++ b/test/threads/test_parallel_reduce.jl
@@ -170,6 +170,14 @@ end
         @test collect(xf, 1:3) == desired
         @test collect(xf, 0:3) == desired
     end
+    @testset "Map(x -> 1:x |> Map(x -> 2x)) ⨟ TCat(1)" begin
+        xf = opcompose(Map(x -> 1:x |> Map(x -> 2x)), TCat(1))
+        desired = [2, 2, 4, 2, 4, 6]
+        @test collect(xf, 1:3) ==ₜ desired
+        @test collect(xf, 0:3) ==ₜ desired
+        @test tcollect(xf, 1:3) ==ₜ desired
+        @test tcollect(xf, 0:3) ==ₜ desired
+    end
 end
 
 @testset "TakeWhile" begin
@@ -225,4 +233,11 @@ end
     @test foldxt(right, xf3, withprogress(1:1000; interval=0); basesize=1, simd=true) == 100
 end
 
+@testset "nestlevel" begin
+    xs = 1:3 |> MapCat(x -> 1:x) |> MapCat(x -> 1:x) |> MapCat(x -> 1:x)
+    @test foldxt(+, xs; basesize = 1, nestlevel = 3) == sum(xs)
+    @test foldxt(+, xs; basesize = 1, nestlevel = Val(2)) == sum(xs)
+    @test foldxt(+, xs; basesize = 1, nestlevel = Val(:inf)) == sum(xs)
+end
+
 end  # module

From 061a65f641f3480fe1c067b27b63914f1deca45b Mon Sep 17 00:00:00 2001
From: Takafumi Arakaki <aka.tkf@gmail.com>
Date: Sat, 8 Aug 2020 20:16:09 -0700
Subject: [PATCH 03/14] Add more specialization hints to the compiler (#402)

---
 src/processes.jl | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/processes.jl b/src/processes.jl
index 018561341d..e25f3ba751 100644
--- a/src/processes.jl
+++ b/src/processes.jl
@@ -185,7 +185,7 @@ end
     complete(rf, @return_if_reduced foldlargs(rf, init, coll...))
 
 # TODO: use IndexStyle
-@inline function __foldl__(rf, init, arr::Union{AbstractArray, Broadcasted})
+@inline function __foldl__(rf::RF, init, arr::Union{AbstractArray,Broadcasted}) where {RF}
     isempty(arr) && return complete(rf, init)
     idxs = eachindex(arr)
     val = @next(rf, init, @inbounds arr[idxs[firstindex(idxs)]])
@@ -324,7 +324,8 @@ end
 
 Call [`__foldl__`](@ref) without calling [`complete`](@ref).
 """
-@inline foldl_nocomplete(rf, init, coll) = __foldl__(skipcomplete(rf), init, coll)
+@inline foldl_nocomplete(rf::RF, init, coll) where {RF} =
+    __foldl__(skipcomplete(rf), init, coll)
 
 """
     foldxl(step, xf::Transducer, reducible; init, simd) :: T

From 747697da205ca76272c6269b2154de5dc60ab296 Mon Sep 17 00:00:00 2001
From: Takafumi Arakaki <aka.tkf@gmail.com>
Date: Sat, 8 Aug 2020 22:11:25 -0700
Subject: [PATCH 04/14] Add bench_filter_sum.jl (#404)

This is a useful set of benchmarks for assessing the effect of the
tail-call function-barrier for arrays (#403).
---
 benchmark/Project.toml        |  1 +
 benchmark/bench_filter_sum.jl | 39 +++++++++++++++++++++++++++++++++++
 2 files changed, 40 insertions(+)
 create mode 100644 benchmark/bench_filter_sum.jl

diff --git a/benchmark/Project.toml b/benchmark/Project.toml
index 317fcc0e2b..99fb3a0566 100644
--- a/benchmark/Project.toml
+++ b/benchmark/Project.toml
@@ -9,6 +9,7 @@ Markdown = "d6f4376e-aef5-505a-96c1-9c027394607a"
 MicroCollections = "128add7d-3638-4c79-886c-908ea0c25c34"
 Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
 PkgBenchmark = "32113eaa-f34f-5b0d-bd6c-c81e245fc73d"
+Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 Referenceables = "42d2dcc6-99eb-4e98-b66c-637b7d73030e"
 SplitApplyCombine = "03a91e81-4c3e-53e1-a0a4-9c0c8f19dd66"
 SplittablesBase = "171d559e-b47b-412a-8079-5efa626c420e"
diff --git a/benchmark/bench_filter_sum.jl b/benchmark/bench_filter_sum.jl
new file mode 100644
index 0000000000..c9b06e39a7
--- /dev/null
+++ b/benchmark/bench_filter_sum.jl
@@ -0,0 +1,39 @@
+module BenchFilterSum
+
+import Random
+using BenchmarkTools
+using Transducers
+
+const SUITE = BenchmarkGroup()
+
+function naive_sum(xs, acc = false)
+    for x in xs
+        acc += x
+    end
+    return acc
+end
+
+Random.seed!(12345)
+for n in [1000, 10000]
+    s0 = SUITE[:n=>n] = BenchmarkGroup()
+
+    for (xslabel, xs, init) in [
+        (:UnitRange, (x for x in 1:n if isodd(x)), 0),
+        (:RandomFloats, (x for x in randn(n) if x > 0), 0.0),
+    ]
+        s1 = s0[:xs=>xslabel] = BenchmarkGroup()
+
+        s2 = s1[:withinit=>false] = BenchmarkGroup()
+        s2[:impl=>:naive] = @benchmarkable naive_sum($xs)
+        s2[:impl=>:base] = @benchmarkable sum($xs)
+        s2[:impl=>:xf] = @benchmarkable sum($(eduction(xs)))
+
+        s2 = s1[:withinit=>true] = BenchmarkGroup()
+        s2[:impl=>:naive] = @benchmarkable naive_sum($xs, $init)
+        s2[:impl=>:base] = @benchmarkable foldl(+, $xs; init = $init)
+        s2[:impl=>:xf] = @benchmarkable foldl(+, $(eduction(xs)); init = $init)
+    end
+end
+
+end  # module
+BenchFilterSum.SUITE

From 65db24b0e874a4472eb3b81543c11306c2fd8581 Mon Sep 17 00:00:00 2001
From: Takafumi Arakaki <aka.tkf@gmail.com>
Date: Sat, 8 Aug 2020 23:31:09 -0700
Subject: [PATCH 05/14] Simplify benchmark group keys (#406)

The table is too wide if I use `=>`:
https://github.com/JuliaFolds/Transducers-data/blob/benchmark-results/2020/08/09/053256/result.md#results
---
 benchmark/bench_filter_sum.jl | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/benchmark/bench_filter_sum.jl b/benchmark/bench_filter_sum.jl
index c9b06e39a7..e902f259df 100644
--- a/benchmark/bench_filter_sum.jl
+++ b/benchmark/bench_filter_sum.jl
@@ -15,23 +15,23 @@ end
 
 Random.seed!(12345)
 for n in [1000, 10000]
-    s0 = SUITE[:n=>n] = BenchmarkGroup()
+    s0 = SUITE[string(n)] = BenchmarkGroup()
 
     for (xslabel, xs, init) in [
-        (:UnitRange, (x for x in 1:n if isodd(x)), 0),
-        (:RandomFloats, (x for x in randn(n) if x > 0), 0.0),
+        ("UnitRange", (x for x in 1:n if isodd(x)), 0),
+        ("RandomFloats", (x for x in randn(n) if x > 0), 0.0),
     ]
-        s1 = s0[:xs=>xslabel] = BenchmarkGroup()
+        s1 = s0[xslabel] = BenchmarkGroup()
 
-        s2 = s1[:withinit=>false] = BenchmarkGroup()
-        s2[:impl=>:naive] = @benchmarkable naive_sum($xs)
-        s2[:impl=>:base] = @benchmarkable sum($xs)
-        s2[:impl=>:xf] = @benchmarkable sum($(eduction(xs)))
+        s2 = s1["noinit"] = BenchmarkGroup()
+        s2["naive"] = @benchmarkable naive_sum($xs)
+        s2["base"] = @benchmarkable sum($xs)
+        s2["xf"] = @benchmarkable sum($(eduction(xs)))
 
-        s2 = s1[:withinit=>true] = BenchmarkGroup()
-        s2[:impl=>:naive] = @benchmarkable naive_sum($xs, $init)
-        s2[:impl=>:base] = @benchmarkable foldl(+, $xs; init = $init)
-        s2[:impl=>:xf] = @benchmarkable foldl(+, $(eduction(xs)); init = $init)
+        s2 = s1["withinit"] = BenchmarkGroup()
+        s2["naive"] = @benchmarkable naive_sum($xs, $init)
+        s2["base"] = @benchmarkable foldl(+, $xs; init = $init)
+        s2["xf"] = @benchmarkable foldl(+, $(eduction(xs)); init = $init)
     end
 end
 

From eb430cf76a5c25aa909858c20424aead21cfecfd Mon Sep 17 00:00:00 2001
From: Takafumi Arakaki <aka.tkf@gmail.com>
Date: Sun, 9 Aug 2020 01:07:09 -0700
Subject: [PATCH 06/14] Tail-call function-barrier for arrays (#403)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This PR implements the "tail-call function-barrier" pattern for arrays
with linear style indexing.  This gives us better performance for
type-changing reductions where the iteration at which the type changes
is unknown.  A good example is a filtered sum with no initial value
(#404).

Interestingly, this version is a bit worse for the sum of filtered
floats on "short" arrays.  Compare the result of
`["filter_sum", "1000", "RandomFloats", "noinit", "xf"]`.  On my
laptop, the baseline is ~1.7 μs while this branch is ~2.1 μs.  However,
for a 10x longer input, this branch is faster (> 2x).  The benchmarks
on CI also show similar results.
---
 src/basics.jl          | 23 ++++++++++++++++++
 src/core.jl            |  5 ++++
 src/library.jl         |  1 +
 src/processes.jl       | 55 ++++++++++++++++++++++++++++++------------
 test/test_processes.jl |  9 +++++++
 5 files changed, 77 insertions(+), 16 deletions(-)

diff --git a/src/basics.jl b/src/basics.jl
index 1f4c547531..b6db999132 100644
--- a/src/basics.jl
+++ b/src/basics.jl
@@ -95,3 +95,26 @@ abstract type _Function <: Function end
 Base.show(io::IO, ::MIME"text/plain", f::_Function) = show(io, f)
 Base.print(io::IO, f::_Function) = show(io, f)
 @specialize
+
+
+# A macro for "manual Union splitting".  It is sometimes useful to let
+# the compiler know that it is beneficial to type-specialize `body`.
+# * https://github.com/JuliaFolds/Transducers.jl/pull/188
+# * https://github.com/JuliaLang/julia/pull/34293#discussion_r363550608
+macro manual_union_split(cond, body)
+    quote
+        if $cond
+            $body
+        else
+            $body
+        end
+    end |> esc
+end
+
+@inline _firstindex(arr) = firstindex(arr)
+@inline _lastindex(arr) = lastindex(arr)
+
+# Define `firstindex` and `lastindex` for `Broadcasted` with linear
+# index style:
+@inline _firstindex(bc::Broadcasted) = first((axes(bc)::Tuple{Any})[1])
+@inline _lastindex(bc::Broadcasted) = last((axes(bc)::Tuple{Any})[1])
diff --git a/src/core.jl b/src/core.jl
index f1c8ed0fd4..edee313b20 100644
--- a/src/core.jl
+++ b/src/core.jl
@@ -565,6 +565,9 @@ combine(rf::Reduction, a, b) =
         combine(inner(rf), a, b)
     end
 
+is_prelude(_) = false
+is_prelude(::InitialValues.InitialValue) = true
+
 privatestate(::T, state, result) where {T <: AbstractReduction} =
     privatestate(T, state, result)
 
@@ -598,6 +601,8 @@ ownsstate(::R, ::PrivateState{T}) where {R, T} = R === T
 # took more than 10 min).  See also:
 # https://github.com/JuliaLang/julia/issues/30125
 
+@inline is_prelude(ps::PrivateState) = is_prelude(psstate(ps)) || is_prelude(psresult(ps))
+
 """
     unwrap(rf, result)
 
diff --git a/src/library.jl b/src/library.jl
index 1a5e968dcb..826c8bc352 100644
--- a/src/library.jl
+++ b/src/library.jl
@@ -927,6 +927,7 @@ struct PartitionBy{F} <: Transducer
 end
 
 struct Unseen end
+is_prelude(::Unseen) = true
 
 isexpansive(::PartitionBy) = false
 
diff --git a/src/processes.jl b/src/processes.jl
index e25f3ba751..c10b1da65e 100644
--- a/src/processes.jl
+++ b/src/processes.jl
@@ -155,15 +155,7 @@ function __foldl__(rf::RF, init::T, coll) where {RF,T}
     ret === nothing && return complete(rf, init)
     x, state = ret
     val = @next(rf, init, x)
-
-    # Doing "manual Union splitting" (?).  This somehow helps the
-    # compiler to generate faster code even though the code inside the
-    # `if` branches are identical.
-    # * https://github.com/JuliaFolds/Transducers.jl/pull/188
-    # * https://github.com/JuliaLang/julia/pull/34293#discussion_r363550608
-    if val isa T
-        return _foldl_iter(rf, val, coll, state, FOLDL_RECURSION_LIMIT)
-    else
+    @manual_union_split val isa T begin
         return _foldl_iter(rf, val, coll, state, FOLDL_RECURSION_LIMIT)
     end
 end
@@ -185,15 +177,46 @@ end
     complete(rf, @return_if_reduced foldlargs(rf, init, coll...))
 
 # TODO: use IndexStyle
-@inline function __foldl__(rf::RF, init, arr::Union{AbstractArray,Broadcasted}) where {RF}
+@inline function __foldl__(
+    rf::RF,
+    init::T,
+    arr::Union{AbstractArray,Broadcasted},
+) where {RF,T}
     isempty(arr) && return complete(rf, init)
-    idxs = eachindex(arr)
-    val = @next(rf, init, @inbounds arr[idxs[firstindex(idxs)]])
-    @simd_if rf for k in firstindex(idxs) + 1:lastindex(idxs)
-        i = @inbounds idxs[k]
-        val = @next(rf, val, @inbounds arr[i])
+    i = _firstindex(arr)
+    acc = @next(rf, init, @inbounds arr[i])
+    @manual_union_split acc isa T begin
+        if is_prelude(acc)
+            return _foldl_linear_rec(rf, acc, arr, i + 1, FOLDL_RECURSION_LIMIT)
+        else
+            return _foldl_linear_bulk(rf, acc, arr, i + 1)
+        end
     end
-    return complete(rf, val)
+end
+
+@inline function _foldl_linear_bulk(rf::RF, acc, arr, i0) where {RF}
+    @simd_if rf for i in i0:_lastindex(arr)
+        acc = @next(rf, acc, @inbounds arr[i])
+    end
+    return complete(rf, acc)
+end
+
+@inline function _foldl_linear_rec(rf::RF, acc::T, arr, i0, counter) where {RF,T}
+    for i in i0:_lastindex(arr)
+        y = @next(rf, acc, @inbounds arr[i])
+        if counter !== Val(0)
+            if y isa T
+            elseif is_prelude(y)
+                return _foldl_linear_rec(rf, y, arr, i + 1, _dec(counter))
+            else
+                # Otherwise, maybe it could be something like `Union{Float64,Missing}`
+                # where Julia's native loop is fast:
+                return _foldl_linear_bulk(rf, y, arr, i + 1)
+            end
+        end
+        acc = y
+    end
+    return complete(rf, acc)
 end
 
 @inline _getvalues(i) = ()
diff --git a/test/test_processes.jl b/test/test_processes.jl
index 206c53133e..e382e03502 100644
--- a/test/test_processes.jl
+++ b/test/test_processes.jl
@@ -47,6 +47,15 @@ include("preamble.jl")
         @test_throws EmptyResultError foldl(+, nested_xf, iter)
     end
 
+    @testset "type-unstable arrays" begin
+        valof(::Val{x}) where {x} = x
+        valof(x) = x
+        @testset for n in 1:valof(Transducers.FOLDL_RECURSION_LIMIT) + 3
+            @test collect(Map(valof), [Val(i) for i in 1:n]) == 1:n
+        end
+        @test collect(Map(valof), [[Val(i) for i in 1:4]; 5:9;]) == 1:9
+    end
+
     @testset "zip-of-arrays" begin
         @testset for arrays in [
                 (0:3,),

From b8efeed250714361f7bc5683333f75fb71d3ac48 Mon Sep 17 00:00:00 2001
From: Takafumi Arakaki <aka.tkf@gmail.com>
Date: Sun, 9 Aug 2020 01:45:13 -0700
Subject: [PATCH 07/14] Add bench_sum_transpose.jl (#405)

This is a benchmark for measuring the performance of the specialized
`foldl` for cartesian style arrays.
---
 benchmark/bench_sum_transpose.jl | 42 ++++++++++++++++++++++++++++++++
 1 file changed, 42 insertions(+)
 create mode 100644 benchmark/bench_sum_transpose.jl

diff --git a/benchmark/bench_sum_transpose.jl b/benchmark/bench_sum_transpose.jl
new file mode 100644
index 0000000000..bc777b29dc
--- /dev/null
+++ b/benchmark/bench_sum_transpose.jl
@@ -0,0 +1,42 @@
+module BenchSumTranspose
+
+import Random
+using BenchmarkTools
+using Transducers
+
+const SUITE = BenchmarkGroup()
+
+function iter_sum(xs, acc = false)
+    for x in xs
+        acc += x
+    end
+    return acc
+end
+
+function man_sum(xs::AbstractMatrix, acc = false)
+    for i in axes(xs, 2), j in axes(xs, 1)
+        acc += @inbounds xs[j, i]
+    end
+    return acc
+end
+
+Random.seed!(12345)
+# for n in [30, 100]
+let n = 30
+    s1 = SUITE[string(n)] = BenchmarkGroup()
+
+    xs = randn(n, n)'
+
+    s2 = s1["noinit"] = BenchmarkGroup()
+    s2["iter"] = @benchmarkable iter_sum($xs)
+    s2["man"] = @benchmarkable man_sum($xs)
+    s2["xf"] = @benchmarkable foldxl(+, $xs)
+
+    s2 = s1["withinit"] = BenchmarkGroup()
+    s2["iter"] = @benchmarkable iter_sum($xs, 0.0)
+    s2["man"] = @benchmarkable man_sum($xs, 0.0)
+    s2["xf"] = @benchmarkable foldxl(+, $xs; init = 0.0)
+end
+
+end  # module
+BenchSumTranspose.SUITE

From edc1c726b3fd0231f5c85ba00499223df56689b0 Mon Sep 17 00:00:00 2001
From: Takafumi Arakaki <aka.tkf@gmail.com>
Date: Sun, 9 Aug 2020 13:37:11 -0700
Subject: [PATCH 08/14] Manual trigger to vanilla-test-push.yml

This patch also changes the name of the job to vanilla-test-push to
disambiguate it from vanilla-test for PRs.
---
 .github/workflows/vanilla-test-push.yml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/vanilla-test-push.yml b/.github/workflows/vanilla-test-push.yml
index dac01cb445..541163d340 100644
--- a/.github/workflows/vanilla-test-push.yml
+++ b/.github/workflows/vanilla-test-push.yml
@@ -7,9 +7,10 @@ on:
     paths:
       - Project.toml
       - test/environments/main/Project.toml
+  workflow_dispatch:
 
 jobs:
-  vanilla-test:
+  vanilla-test-push:
     runs-on: ubuntu-latest
     steps:
       - uses: actions/checkout@v2

From 4b2a4a884e40cb5557328450b912068bc39cfc4d Mon Sep 17 00:00:00 2001
From: Takafumi Arakaki <aka.tkf@gmail.com>
Date: Sun, 9 Aug 2020 13:39:43 -0700
Subject: [PATCH 09/14] Change workflow name of vanilla-test-push.yml

This is for differentiating it from vanilla-test.yml.
---
 .github/workflows/vanilla-test-push.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/vanilla-test-push.yml b/.github/workflows/vanilla-test-push.yml
index 541163d340..39eb5768f5 100644
--- a/.github/workflows/vanilla-test-push.yml
+++ b/.github/workflows/vanilla-test-push.yml
@@ -1,4 +1,4 @@
-name: Run test via Pkg.test()
+name: Run test via Pkg.test() on push
 
 on:
   push:

From e01532681d69ffbf45d0dc27050125b2fd9159eb Mon Sep 17 00:00:00 2001
From: Takafumi Arakaki <aka.tkf@gmail.com>
Date: Sun, 9 Aug 2020 14:16:15 -0700
Subject: [PATCH 10/14] Switch to Julia 1.5 as the main version in CIs (#409)

---
 .github/workflows/benchmark.yml              | 2 +-
 .github/workflows/check-xfail.yml            | 2 +-
 .github/workflows/multi-thread-benchmark.yml | 2 +-
 .github/workflows/vanilla-test-push.yml      | 2 +-
 .github/workflows/vanilla-test.yml           | 2 +-
 .travis.yml                                  | 2 +-
 6 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
index d9144c3333..65fc67d2b9 100644
--- a/.github/workflows/benchmark.yml
+++ b/.github/workflows/benchmark.yml
@@ -10,7 +10,7 @@ jobs:
       - uses: actions/checkout@v2
       - uses: julia-actions/setup-julia@latest
         with:
-          version: 1.4
+          version: 1.5
       - name: Install dependencies
         run: julia -e 'using Pkg; pkg"add PkgBenchmark BenchmarkCI@0.1"'
       - name: Run benchmarks
diff --git a/.github/workflows/check-xfail.yml b/.github/workflows/check-xfail.yml
index 01b9783617..59cd229da6 100644
--- a/.github/workflows/check-xfail.yml
+++ b/.github/workflows/check-xfail.yml
@@ -8,7 +8,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        julia-version: ['1.4']
+        julia-version: ['^1']
       fail-fast: false
     name: Test xfail Julia ${{ matrix.julia-version }}
     steps:
diff --git a/.github/workflows/multi-thread-benchmark.yml b/.github/workflows/multi-thread-benchmark.yml
index 654ab1906f..e68cc4b1b5 100644
--- a/.github/workflows/multi-thread-benchmark.yml
+++ b/.github/workflows/multi-thread-benchmark.yml
@@ -10,7 +10,7 @@ jobs:
       - uses: actions/checkout@v2
       - uses: julia-actions/setup-julia@latest
         with:
-          version: 1.4
+          version: 1.5
       - name: Install dependencies
         run: julia -e 'using Pkg; pkg"add PkgBenchmark BenchmarkCI@0.1"'
       - name: Run benchmarks
diff --git a/.github/workflows/vanilla-test-push.yml b/.github/workflows/vanilla-test-push.yml
index 39eb5768f5..3b83c23b56 100644
--- a/.github/workflows/vanilla-test-push.yml
+++ b/.github/workflows/vanilla-test-push.yml
@@ -16,7 +16,7 @@ jobs:
       - uses: actions/checkout@v2
       - uses: julia-actions/setup-julia@v1
         with:
-          version: 1.4
+          version: ^1
       - uses: julia-actions/julia-buildpkg@latest
       - uses: julia-actions/julia-runtest@latest
         env:
diff --git a/.github/workflows/vanilla-test.yml b/.github/workflows/vanilla-test.yml
index a9a9356a4e..d867410aeb 100644
--- a/.github/workflows/vanilla-test.yml
+++ b/.github/workflows/vanilla-test.yml
@@ -36,7 +36,7 @@ jobs:
       - uses: julia-actions/setup-julia@v1
         if: ${{ steps.check-project-toml.outputs.need_test == 'yes' }}
         with:
-          version: 1.4
+          version: ^1
       - uses: julia-actions/julia-buildpkg@latest
         if: ${{ steps.check-project-toml.outputs.need_test == 'yes' }}
       - uses: julia-actions/julia-runtest@latest
diff --git a/.travis.yml b/.travis.yml
index b3aff1f92a..1b6f174b71 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -3,8 +3,8 @@ language: julia
 os:
   - linux
 julia:
+  - 1.5  # to be used in benchmarks as well
   - 1.4
-  - 1.5
   - nightly
 env:
    global:

From 29ec4581ed78721f6eaa01932b48ca4a602cc60b Mon Sep 17 00:00:00 2001
From: Takafumi Arakaki <aka.tkf@gmail.com>
Date: Sat, 8 Aug 2020 23:54:47 -0700
Subject: [PATCH 11/14] Add bench_teerf_filter.jl

---
 benchmark/bench_teerf_filter.jl | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)
 create mode 100644 benchmark/bench_teerf_filter.jl

diff --git a/benchmark/bench_teerf_filter.jl b/benchmark/bench_teerf_filter.jl
new file mode 100644
index 0000000000..4a3326e9f2
--- /dev/null
+++ b/benchmark/bench_teerf_filter.jl
@@ -0,0 +1,16 @@
+module BenchTeeRFFilter  # benchmarks folding a range with a `TeeRF` of two filtered `+`
+
+using BenchmarkTools
+using Transducers
+
+const SUITE = BenchmarkGroup()  # group that collects the benchmarks defined below
+
+let xs = 1:1000
+    rf = TeeRF(Filter(isodd)'(+), Filter(iseven)'(+))  # `+` behind an odd and an even filter
+
+    SUITE["noinit"] = @benchmarkable foldxl($rf, $xs)  # no explicit `init`
+    SUITE["withinit"] = @benchmarkable foldxl($rf, $xs; init = (0, 0))  # explicit `init`
+end
+
+end  # module
+BenchTeeRFFilter.SUITE  # last expression of the file: the suite itself

From 735aae6d97e2671562034d78af66eba6b9933e60 Mon Sep 17 00:00:00 2001
From: Takafumi Arakaki <aka.tkf@gmail.com>
Date: Sun, 9 Aug 2020 16:10:45 -0700
Subject: [PATCH 12/14] Specialize foldl for cartesian style arrays (#407)

This patch implements a specialization of `foldl` on arrays with
`IndexCartesian` index style.  This gives us more than 2x speedup (see
`sum_transpose` benchmark).

The implementation mostly just redirects the call to `foldl` of
`CartesianIndices`.  Most of the code is for compatibility with Julia
< 1.5.  This PR also fixes a bug in `foldl` for multi-dimensional
`Broadcasted` (probably introduced by #403).
---
 src/basics.jl          | 10 ++++++++++
 src/processes.jl       | 18 ++++++++++++------
 test/test_processes.jl |  9 ++++++++-
 3 files changed, 30 insertions(+), 7 deletions(-)

diff --git a/src/basics.jl b/src/basics.jl
index b6db999132..083af50dd6 100644
--- a/src/basics.jl
+++ b/src/basics.jl
@@ -118,3 +118,13 @@ end
 # index style:
 @inline _firstindex(bc::Broadcasted) = first((axes(bc)::Tuple{Any})[1])
 @inline _lastindex(bc::Broadcasted) = last((axes(bc)::Tuple{Any})[1])
+
+# `CartesianIndices` wrapper that also accepts `Broadcasted`
+@inline _CartesianIndices(arr) = CartesianIndices(arr)  # default: defer to `CartesianIndices`
+@inline _CartesianIndices(bc::Broadcasted) = CartesianIndices(axes(bc)::Tuple)  # build from the axes
+
+# `IndexStyle` wrapper that also accepts `Broadcasted`
+_IndexStyle(arr) = IndexStyle(arr)  # default: defer to `IndexStyle`
+_IndexStyle(bc::Broadcasted) = _IndexStyle(typeof(bc))  # decide from the type alone
+_IndexStyle(::Type{<:Broadcasted{<:Any,<:Tuple{Any}}}) = IndexLinear()  # axes tuple has one element
+_IndexStyle(::Type{<:Broadcasted{<:Any}}) = IndexCartesian()  # every other `Broadcasted`
diff --git a/src/processes.jl b/src/processes.jl
index c10b1da65e..d906c7ba5e 100644
--- a/src/processes.jl
+++ b/src/processes.jl
@@ -176,13 +176,13 @@ end
 @inline __foldl__(rf::RF, init, coll::Tuple) where {RF} =
     complete(rf, @return_if_reduced foldlargs(rf, init, coll...))
 
-# TODO: use IndexStyle
-@inline function __foldl__(
-    rf::RF,
-    init::T,
-    arr::Union{AbstractArray,Broadcasted},
-) where {RF,T}
+@inline function __foldl__(rf::RF, init, arr0::Union{AbstractArray,Broadcasted}) where {RF}  # entry point for arrays and lazy broadcasts
+    arr = Broadcast.instantiate(arr0)  # per Base docs: sets up/checks axes of a `Broadcasted`; other values pass through
     isempty(arr) && return complete(rf, init)
+    return _foldl_array(rf, init, arr, _IndexStyle(arr))  # pick the loop by index style
+end
+
+@inline function _foldl_array(rf::RF, init::T, arr, ::IndexLinear) where {RF,T}
     i = _firstindex(arr)
     acc = @next(rf, init, @inbounds arr[i])
     @manual_union_split acc isa T begin
@@ -219,6 +219,12 @@ end
     return complete(rf, acc)
 end
 
+@inline function _foldl_array(rf0::RF, init, arr, ::IndexStyle) where {RF}  # fallback for index styles other than `IndexLinear`
+    @inline getvalue(I) = @inbounds arr[I]  # element lookup at index `I`, bounds check elided
+    rf = Map(getvalue)'(rf0)  # prepend the lookup so `rf0` is fed elements, not indices
+    return __foldl__(rf, init, _CartesianIndices(arr))  # fold over the indices of `arr`
+end
+
 @inline _getvalues(i) = ()
 @inline _getvalues(i, a, rest...) = ((@inbounds a[i]), _getvalues(i, rest...)...)
 
diff --git a/test/test_processes.jl b/test/test_processes.jl
index e382e03502..b4c1772d16 100644
--- a/test/test_processes.jl
+++ b/test/test_processes.jl
@@ -110,7 +110,7 @@ include("preamble.jl")
         end
     end
 
-    @testset "broadcast" begin
+    @testset "broadcast (linear)" begin
         @testset for xs in iterator_variants(1:3)
             ys = @~ xs.^2
             @test collect(Map(identity), ys) == copy(ys)
@@ -118,6 +118,13 @@ include("preamble.jl")
             @test foldl(+, Filter(isodd), ys; init=0) == 10
         end
     end
+
+    @testset "broadcast (cartesian)" begin
+        xs = @~ (1:3) .+ (4:5)'  # 3×2 broadcast with values [5 6; 6 7; 7 8] (presumably lazy via `@~`)
+        @test collect(Map(identity), xs) == vec(copy(xs))  # element order matches the materialized array
+        @test foldl(+, Filter(isodd), xs) == 19  # odd entries: 5 + 7 + 7
+        @test foldl(+, Filter(isodd), xs; init = 0) == 19  # same sum with an explicit `init`
+    end
 end
 
 

From e11604dcc30e97a8a25a93fc7a6e901c93899d1e Mon Sep 17 00:00:00 2001
From: Takafumi Arakaki <aka.tkf@gmail.com>
Date: Sun, 9 Aug 2020 16:51:26 -0700
Subject: [PATCH 13/14] Define is_prelude on Tuple and NamedTuple (#408)

This patch handles the sentinel initial values wrapped in `Tuple`s and
`NamedTuple`s. This is useful, e.g., when `TeeRF` is wrapping
filtering transducers.  ~5x speedup in `teerf_filter` benchmark.
---
 src/core.jl | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/core.jl b/src/core.jl
index edee313b20..d8997172fa 100644
--- a/src/core.jl
+++ b/src/core.jl
@@ -567,6 +567,8 @@ combine(rf::Reduction, a, b) =
 
 is_prelude(_) = false
 is_prelude(::InitialValues.InitialValue) = true
+is_prelude(xs::Tuple) = any(map(is_prelude, xs))  # true when any element is itself a prelude
+is_prelude(xs::NamedTuple) = is_prelude(Tuple(xs))  # same check on the named tuple's values
 
 privatestate(::T, state, result) where {T <: AbstractReduction} =
     privatestate(T, state, result)

From ef2174ebbb511cb5979b20d8c017957874704ee8 Mon Sep 17 00:00:00 2001
From: Takafumi Arakaki <aka.tkf@gmail.com>
Date: Sun, 9 Aug 2020 17:19:17 -0700
Subject: [PATCH 14/14] Add bench_cartesian.jl (#410)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This characterizes the improvements in
6aafdde1f6ae44985e5d83dacaea203f94ddd1ab

Example minimum time:
Before (Transducers v0.4.47): 8.720 μs
After (Transducers v0.4.48-DEV): 2.628 μs
---
 benchmark/bench_cartesian.jl | 41 ++++++++++++++++++++++++++++++++++++
 1 file changed, 41 insertions(+)
 create mode 100644 benchmark/bench_cartesian.jl

diff --git a/benchmark/bench_cartesian.jl b/benchmark/bench_cartesian.jl
new file mode 100644
index 0000000000..0ecc3588a5
--- /dev/null
+++ b/benchmark/bench_cartesian.jl
@@ -0,0 +1,41 @@
+module BenchCartesian  # benchmarks three element-by-element matrix copies
+
+using BenchmarkTools
+using Transducers
+
+function copyto_manual!(ys::AbstractMatrix, xs::AbstractMatrix)  # baseline: hand-written nested loops
+    @assert axes(ys) == axes(xs)
+    for j in axes(xs, 2), i in axes(xs, 1)  # real axes keep `@inbounds` valid for any index range
+        @inbounds ys[i, j] = xs[i, j]
+    end
+    return ys
+end
+
+function copyto_iter!(ys, xs)  # plain `for` loop over `CartesianIndices`
+    @assert axes(ys) == axes(xs)
+    for I in CartesianIndices(xs)
+        @inbounds ys[I] = xs[I]
+    end
+    return ys
+end
+
+function copyto_xf!(ys, xs)  # NOTE(review): unlike the variants above, axes are not checked here
+    foreach(Map(identity), CartesianIndices(xs)) do I  # Transducers `foreach` over the indices
+        @inbounds ys[I] = xs[I]
+        nothing
+    end
+    return ys
+end
+
+const SUITE = BenchmarkGroup()  # group that collects the benchmarks defined below
+
+let xs = randn(3, 10^3)
+    ys = zero(xs)  # destination with the same shape as `xs`
+    s1 = SUITE["copyto!"] = BenchmarkGroup()
+    s1["man"] = @benchmarkable(copyto_manual!($ys, $xs))
+    s1["iter"] = @benchmarkable(copyto_iter!($ys, $xs))
+    s1["xf"] = @benchmarkable(copyto_xf!($ys, $xs))
+end
+
+end  # module
+BenchCartesian.SUITE  # last expression of the file: the suite itself