diff --git a/Project.toml b/Project.toml
index c8e7fa1..be7c559 100644
--- a/Project.toml
+++ b/Project.toml
@@ -4,11 +4,12 @@ authors = ["Mason Protter", "Chris Elrod", "Dilum Aluthge", "contributors"]
 version = "0.1.0-DEV"
 
 [deps]
+ArrayInterface = "4fba245c-0d91-5ea0-9b3e-6abc04ee57a9"
 LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890"
 VectorizationBase = "3d5dd08c-fd9d-11e8-17fa-ed2836048c2f"
 
 [compat]
-BenchmarkTools = "0.5"
+ArrayInterface = "2.14"
 LoopVectorization = "0.9.14"
 VectorizationBase = "0.14.9"
 julia = "1.5"
@@ -19,8 +20,8 @@ InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
-VectorizationBase = "3d5dd08c-fd9d-11e8-17fa-ed2836048c2f"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
+VectorizationBase = "3d5dd08c-fd9d-11e8-17fa-ed2836048c2f"
 
 [targets]
 test = ["BenchmarkTools", "InteractiveUtils", "LinearAlgebra", "LoopVectorization", "Random", "VectorizationBase", "Test"]
diff --git a/src/Octavian.jl b/src/Octavian.jl
index 7a22cf8..ee420dc 100644
--- a/src/Octavian.jl
+++ b/src/Octavian.jl
@@ -1,5 +1,6 @@
 module Octavian
 
+import ArrayInterface
 import LoopVectorization
 import VectorizationBase
 
diff --git a/src/matmul.jl b/src/matmul.jl
index fc853a7..658024d 100644
--- a/src/matmul.jl
+++ b/src/matmul.jl
@@ -1,6 +1,17 @@
 evenly_divide(x, y) = cld(x, cld(x, y))
 evenly_divide(x, y, z) = cld(evenly_divide(x, y), z) * z
 
+# `dim1contig(A)` is `true` only when `A` is a `StridedArray` whose contiguous axis,
+# as reported by `ArrayInterface.contiguous_axis`, is the first one.
+_dim1contig(::ArrayInterface.Contiguous) = false
+_dim1contig(::ArrayInterface.Contiguous{1}) = true
+# `contiguous_axis` is documented to return `nothing` when the layout is unknown;
+# treat any non-`Contiguous` answer as "not contiguous" rather than a `MethodError`.
+_dim1contig(::Any) = false
+dim1contig(::Type{A}) where {A <: StridedArray} = _dim1contig(ArrayInterface.contiguous_axis(A))
+dim1contig(::Type) = false
+dim1contig(A) = dim1contig(typeof(A))
+
 """
     matmul!(C::AbstractMatrix, A::AbstractMatrix, B::AbstractMatrix, _α = 1, _β = 0)
 """
@@ -10,7 +21,7 @@ function matmul!(C::AbstractMatrix{T}, A::AbstractMatrix{T}, B::AbstractMatrix{T
     M, K, N = matmul_sizes(C, A, B)
 
     # Check if maybe it's better not to pack at all.
-    if M * K ≤ _Mc * _Kc && A isa DenseArray && C isa StridedArray && B isa StridedArray && #
+    if M * K ≤ _Mc * _Kc && dim1contig(A) && LoopVectorization.check_args(C, A, B) &&
         (stride(A,2) ≤ 72 || (iszero(stride(A,2) & (VectorizationBase.pick_vector_width(eltype(A))-1)) && iszero(reinterpret(Int,pointer(A)) & 63)))
         macrokernel!(C, A, B, _α, _β)
         return C
diff --git a/test/matmul_coverage.jl b/test/matmul_coverage.jl
index 482c3a7..2596fe9 100644
--- a/test/matmul_coverage.jl
+++ b/test/matmul_coverage.jl
@@ -57,3 +57,19 @@ end
     @test @time(Octavian.matmul(A, B′)) == A * B′
     @test @time(Octavian.matmul(A′, B′)) == A′ * B′
 end
+
+@time @testset "A not-a-StrideArray" begin
+    m = 20
+    n = 20
+    k = 20
+    A = view(rand(Float64, 2m, 2k), 1:2:2m, 2:2:2k)
+    B = rand(Float64, k, n)
+    A′ = view(permutedims(parent(A))', 1:2:2m, 2:2:2k)
+    B′ = permutedims(B)'
+    @show m, k, n
+    @test @time(Octavian.matmul(A, B)) ≈ A * B
+    @test @time(Octavian.matmul(A′, B)) ≈ A′ * B
+    @test @time(Octavian.matmul(A, B′)) ≈ A * B′
+    @test @time(Octavian.matmul(A′, B′)) ≈ A′ * B′
+end
+