From 06b23b200624943d8bf03970224883367b48879f Mon Sep 17 00:00:00 2001 From: Hendrik Ranocha Date: Fri, 28 Nov 2025 08:53:59 +0100 Subject: [PATCH 1/3] add fix for LoopVectorization issue 543 --- src/vecunroll/memory.jl | 178 ++++++++++++++++++++++++++++++++++++++++ test/runtests.jl | 63 +++++++++++++- 2 files changed, 240 insertions(+), 1 deletion(-) diff --git a/src/vecunroll/memory.jl b/src/vecunroll/memory.jl index 630f6537..41e4f33e 100644 --- a/src/vecunroll/memory.jl +++ b/src/vecunroll/memory.jl @@ -2753,6 +2753,184 @@ end ) end +# W=1 specializations: when W=1, VecUnroll stores T directly instead of Vec{1,T} +@generated function _vstore_unroll!( + sptr::AbstractStridedPointer{T,D,C}, + v::VecUnroll{<:Any,1,T,<:VecUnroll{<:Any,1,T,T}}, + u::UU, + ::A, + ::S, + ::NT, + ::StaticInt{RS}, + ::StaticInt{SVUS} +) where { + T, + A<:StaticBool, + S<:StaticBool, + NT<:StaticBool, + RS, + D, + C, + SVUS, + UU<:NestedUnroll{1} +} + AUO, FO, NO, AV, _W, MO, X, U = unroll_params(UU) + AUI, FI, NI, AV, _W, MI, X, I = unroll_params(U) + vstore_double_unroll_quote( + D, + NO, + NI, + AUO, + FO, + AV, + 1, + MO, + X, + C, + AUI, + FI, + MI, + false, + A === True, + S === True, + NT === True, + RS, + SVUS + ) +end +@generated function _vstore_unroll!( + sptr::AbstractStridedPointer{T,D,C}, + v::VecUnroll{<:Any,1,T,<:VecUnroll{<:Any,1,T,T}}, + u::UU, + ::A, + ::S, + ::NT, + ::StaticInt{RS}, + ::Nothing +) where { + T, + A<:StaticBool, + S<:StaticBool, + NT<:StaticBool, + RS, + D, + C, + UU<:NestedUnroll{1} +} + AUO, FO, NO, AV, _W, MO, X, U = unroll_params(UU) + AUI, FI, NI, AV, _W, MI, X, I = unroll_params(U) + vstore_double_unroll_quote( + D, + NO, + NI, + AUO, + FO, + AV, + 1, + MO, + X, + C, + AUI, + FI, + MI, + false, + A === True, + S === True, + NT === True, + RS, + typemax(Int) + ) +end +@generated function _vstore_unroll!( + sptr::AbstractStridedPointer{T,D,C}, + v::VecUnroll{<:Any,1,T,<:VecUnroll{<:Any,1,T,T}}, + u::UU, + m::AbstractMask{1}, + ::A, + ::S, + ::NT, + ::StaticInt{RS}, + ::StaticInt{SVUS} +) where { + T, + A<:StaticBool, + S<:StaticBool, + NT<:StaticBool, + RS, + D, + C, + SVUS, + UU<:NestedUnroll{1} +} + AUO, FO, NO, AV, _W, MO, X, U = unroll_params(UU) + AUI, FI, NI, AV, _W, MI, X, I = unroll_params(U) + vstore_double_unroll_quote( + D, + NO, + NI, + AUO, + FO, + AV, + 1, + MO, + X, + C, + AUI, + FI, + MI, + true, + A === True, + S === True, + NT === True, + RS, + SVUS + ) +end +@generated function _vstore_unroll!( + sptr::AbstractStridedPointer{T,D,C}, + v::VecUnroll{<:Any,1,T,<:VecUnroll{<:Any,1,T,T}}, + u::UU, + m::AbstractMask{1}, + ::A, + ::S, + ::NT, + ::StaticInt{RS}, + ::Nothing +) where { + T, + A<:StaticBool, + S<:StaticBool, + NT<:StaticBool, + RS, + D, + C, + UU<:NestedUnroll{1} +} + AUO, FO, NO, AV, _W, MO, X, U = unroll_params(UU) + AUI, FI, NI, AV, _W, MI, X, I = unroll_params(U) + vstore_double_unroll_quote( + D, + NO, + NI, + AUO, + FO, + AV, + 1, + MO, + X, + C, + AUI, + FI, + MI, + true, + A === True, + S === True, + NT === True, + RS, + typemax(Int) + ) +end + function vstore_unroll_i_quote(Nm1, Wsplit, W, A, S, NT, rs::Int, mask::Bool) N = Nm1 + 1 N * Wsplit == W || throw( diff --git a/test/runtests.jl b/test/runtests.jl index c9cde993..6c154272 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -12,7 +12,7 @@ include("testsetup.jl") # TODO - Will need a code refactor to properly address these type piracies. # Either: - # 1. Create type wrappers in VectorizationBase + # 1. Create type wrappers in VectorizationBase # 2. Implement overloading upstream # 3. Use package extensions (still buggy in current Julia LTS v1.10.10) @@ -880,6 +880,67 @@ include("testsetup.jl") @test vec(colormat[:, 41:48]) == vec(colormat[:, 1:8]) end + # Test for W=1 nested VecUnroll store operations + # (fix for https://github.com/JuliaSIMD/LoopVectorization.jl/issues/543) + # When W=1, VecUnroll stores T directly instead of Vec{1,T} + println("W=1 Nested VecUnroll Store") + @time @testset "W=1 Nested VecUnroll Store" begin + # Test nested unroll with W=1 (scalar width) + # This tests the _vstore_unroll! methods that were added for W=1 + A = zeros(5, 5) + GC.@preserve A begin + sp = stridedpointer(A) + # Create a nested VecUnroll with W=1 (scalars instead of Vec{1,T}) + inner_vu1 = VectorizationBase.VecUnroll((1.0, 2.0, 3.0, 4.0, 5.0)) + inner_vu2 = VectorizationBase.VecUnroll((6.0, 7.0, 8.0, 9.0, 10.0)) + inner_vu3 = VectorizationBase.VecUnroll((11.0, 12.0, 13.0, 14.0, 15.0)) + inner_vu4 = VectorizationBase.VecUnroll((16.0, 17.0, 18.0, 19.0, 20.0)) + inner_vu5 = VectorizationBase.VecUnroll((21.0, 22.0, 23.0, 24.0, 25.0)) + outer_vu = VectorizationBase.VecUnroll((inner_vu1, inner_vu2, inner_vu3, inner_vu4, inner_vu5)) + + # Verify the type structure: VecUnroll{4, 1, Float64, VecUnroll{4, 1, Float64, Float64}} + @test outer_vu isa VectorizationBase.VecUnroll{4,1,Float64,<:VectorizationBase.VecUnroll{4,1,Float64,Float64}} + + # Create nested Unroll index for _vstore_unroll! + # The Unroll type parameters are: AU (axis of unroll), F (step), N (count), AV (axis of vectorization), W (vector width), M (mask), X (extra) + inner_unroll = VectorizationBase.Unroll{2,1,5,1,1,UInt(0),1}(StaticInt(0)) + outer_unroll = VectorizationBase.Unroll{1,1,5,1,1,UInt(0),1}(inner_unroll) + + # Call _vstore_unroll! directly - this is what was failing in Issue #543 + # We use similar_no_offset to get a pointer suitable for _vstore_unroll! + sptr = VectorizationBase.similar_no_offset(sp, pointer(A)) + VectorizationBase._vstore_unroll!(sptr, outer_vu, outer_unroll, + VectorizationBase.False(), VectorizationBase.False(), VectorizationBase.False(), + VectorizationBase.register_size(), StaticInt(8)) + + # Verify the stored values - the layout is transposed due to unroll ordering + expected = reshape(1.0:25.0, 5, 5)' + @test A == expected + end + + # Test with different array sizes to ensure robustness + for n in [3, 4, 5, 6, 7, 8] + B = zeros(n, n) + GC.@preserve B begin + sp = stridedpointer(B) + # Create nested VecUnroll with n elements + inner_vus = ntuple(i -> VectorizationBase.VecUnroll(ntuple(j -> Float64((i-1)*n + j), n)), n) + outer_vu = VectorizationBase.VecUnroll(inner_vus) + + inner_unroll = VectorizationBase.Unroll{2,1,n,1,1,UInt(0),1}(StaticInt(0)) + outer_unroll = VectorizationBase.Unroll{1,1,n,1,1,UInt(0),1}(inner_unroll) + + sptr = VectorizationBase.similar_no_offset(sp, pointer(B)) + VectorizationBase._vstore_unroll!(sptr, outer_vu, outer_unroll, + VectorizationBase.False(), VectorizationBase.False(), VectorizationBase.False(), + VectorizationBase.register_size(), StaticInt(8)) + + expected = reshape(1.0:Float64(n*n), n, n)' + @test B == expected + end + end + end + println("Grouped Strided Pointers") @time @testset "Grouped Strided Pointers" begin M, K, N = 4, 5, 6 From 2867c8a2cfc4b64077b0e1393aea32e426880eed Mon Sep 17 00:00:00 2001 From: Hendrik Ranocha Date: Fri, 28 Nov 2025 08:54:40 +0100 Subject: [PATCH 2/3] set version to 0.21.73 --- Project.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index 0a4cfd56..53e1f24c 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "VectorizationBase" uuid = "3d5dd08c-fd9d-11e8-17fa-ed2836048c2f" authors = ["Chris Elrod "] -version = "0.21.72" +version = "0.21.73" [deps] ArrayInterface = "4fba245c-0d91-5ea0-9b3e-6abc04ee57a9" From ad7af51f65ade98ca041d3bc8e299088cbd68294 Mon Sep 17 00:00:00 2001 From: Hendrik Ranocha Date: Fri, 28 Nov 2025 09:10:58 +0100 Subject: [PATCH 3/3] additional tests for type stability --- test/runtests.jl | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/test/runtests.jl b/test/runtests.jl index 6c154272..d5bbf50d 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -889,26 +889,26 @@ include("testsetup.jl") # This tests the _vstore_unroll! methods that were added for W=1 A = zeros(5, 5) GC.@preserve A begin - sp = stridedpointer(A) + sp = @inferred stridedpointer(A) # Create a nested VecUnroll with W=1 (scalars instead of Vec{1,T}) - inner_vu1 = VectorizationBase.VecUnroll((1.0, 2.0, 3.0, 4.0, 5.0)) - inner_vu2 = VectorizationBase.VecUnroll((6.0, 7.0, 8.0, 9.0, 10.0)) - inner_vu3 = VectorizationBase.VecUnroll((11.0, 12.0, 13.0, 14.0, 15.0)) - inner_vu4 = VectorizationBase.VecUnroll((16.0, 17.0, 18.0, 19.0, 20.0)) - inner_vu5 = VectorizationBase.VecUnroll((21.0, 22.0, 23.0, 24.0, 25.0)) - outer_vu = VectorizationBase.VecUnroll((inner_vu1, inner_vu2, inner_vu3, inner_vu4, inner_vu5)) + inner_vu1 = @inferred VectorizationBase.VecUnroll((1.0, 2.0, 3.0, 4.0, 5.0)) + inner_vu2 = @inferred VectorizationBase.VecUnroll((6.0, 7.0, 8.0, 9.0, 10.0)) + inner_vu3 = @inferred VectorizationBase.VecUnroll((11.0, 12.0, 13.0, 14.0, 15.0)) + inner_vu4 = @inferred VectorizationBase.VecUnroll((16.0, 17.0, 18.0, 19.0, 20.0)) + inner_vu5 = @inferred VectorizationBase.VecUnroll((21.0, 22.0, 23.0, 24.0, 25.0)) + outer_vu = @inferred VectorizationBase.VecUnroll((inner_vu1, inner_vu2, inner_vu3, inner_vu4, inner_vu5)) # Verify the type structure: VecUnroll{4, 1, Float64, VecUnroll{4, 1, Float64, Float64}} @test outer_vu isa VectorizationBase.VecUnroll{4,1,Float64,<:VectorizationBase.VecUnroll{4,1,Float64,Float64}} # Create nested Unroll index for _vstore_unroll! # The Unroll type parameters are: AU (axis of unroll), F (step), N (count), AV (axis of vectorization), W (vector width), M (mask), X (extra) - inner_unroll = VectorizationBase.Unroll{2,1,5,1,1,UInt(0),1}(StaticInt(0)) - outer_unroll = VectorizationBase.Unroll{1,1,5,1,1,UInt(0),1}(inner_unroll) + inner_unroll = @inferred VectorizationBase.Unroll{2,1,5,1,1,UInt(0),1}(StaticInt(0)) + outer_unroll = @inferred VectorizationBase.Unroll{1,1,5,1,1,UInt(0),1}(inner_unroll) # Call _vstore_unroll! directly - this is what was failing in Issue #543 # We use similar_no_offset to get a pointer suitable for _vstore_unroll! - sptr = VectorizationBase.similar_no_offset(sp, pointer(A)) + sptr = @inferred VectorizationBase.similar_no_offset(sp, pointer(A)) VectorizationBase._vstore_unroll!(sptr, outer_vu, outer_unroll, VectorizationBase.False(), VectorizationBase.False(), VectorizationBase.False(), VectorizationBase.register_size(), StaticInt(8)) @@ -922,13 +922,13 @@ include("testsetup.jl") for n in [3, 4, 5, 6, 7, 8] B = zeros(n, n) GC.@preserve B begin - sp = stridedpointer(B) + sp = @inferred stridedpointer(B) # Create nested VecUnroll with n elements inner_vus = ntuple(i -> VectorizationBase.VecUnroll(ntuple(j -> Float64((i-1)*n + j), n)), n) - outer_vu = VectorizationBase.VecUnroll(inner_vus) + outer_vu = @inferred VectorizationBase.VecUnroll(inner_vus) - inner_unroll = VectorizationBase.Unroll{2,1,n,1,1,UInt(0),1}(StaticInt(0)) - outer_unroll = VectorizationBase.Unroll{1,1,n,1,1,UInt(0),1}(inner_unroll) + inner_unroll = @inferred VectorizationBase.Unroll{2,1,n,1,1,UInt(0),1}(StaticInt(0)) + outer_unroll = @inferred VectorizationBase.Unroll{1,1,n,1,1,UInt(0),1}(inner_unroll) sptr = VectorizationBase.similar_no_offset(sp, pointer(B)) VectorizationBase._vstore_unroll!(sptr, outer_vu, outer_unroll,