JuliaLinearAlgebra · chriselrod · Jun 5, 2021 · Jun 5, 2021 · Jun 5, 2021
diff --git a/Project.toml b/Project.toml
@@ -1,7 +1,7 @@
 name = "Octavian"
 uuid = "6fd5a793-0b7e-452c-907f-f8bfe9c57db4"
 authors = ["Mason Protter", "Chris Elrod", "Dilum Aluthge", "contributors"]
-version = "0.2.17"
+version = "0.2.18"
 
 [deps]
 ArrayInterface = "4fba245c-0d91-5ea0-9b3e-6abc04ee57a9"
@@ -13,11 +13,11 @@ VectorizationBase = "3d5dd08c-fd9d-11e8-17fa-ed2836048c2f"
 
 [compat]
 ArrayInterface = "3.1.14"
-LoopVectorization = "0.12.26"
+LoopVectorization = "0.12.32"
 Static = "0.2"
 StrideArraysCore = "0.1.11"
 ThreadingUtilities = "0.4"
-VectorizationBase = "0.20.11"
+VectorizationBase = "0.20.15"
 julia = "1.6"
 
 [extras]

diff --git a/src/complex_matmul.jl b/src/complex_matmul.jl
@@ -1,16 +1,16 @@
 real_rep(a::AbstractArray{Complex{T}, N}) where {T, N} = reinterpret(reshape, T, a)
 #PtrArray(Ptr{T}(pointer(a)), (StaticInt(2), size(a)...))
 
-@inline function _matmul!(_C::AbstractMatrix{Complex{T}}, _A::AbstractMatrix{Complex{U}}, _B::AbstractMatrix{Complex{V}},
+@inline function _matmul!(_C::AbstractVecOrMat{Complex{T}}, _A::AbstractMatrix{Complex{U}}, _B::AbstractVecOrMat{Complex{V}},
                          α=One(), β=Zero(), nthread::Nothing=nothing, MKN=nothing, contig_axis=nothing) where {T,U,V}
-    C, A, B =  real_rep.((_C, _A, _B))
+    C, A, B = map(real_rep, (_C, _A, _B))
 
     η = ifelse(ArrayInterface.is_lazy_conjugate(_A), StaticInt(-1), StaticInt(1))
     θ = ifelse(ArrayInterface.is_lazy_conjugate(_B), StaticInt(-1), StaticInt(1))
     (+ᶻ, -ᶻ) = ifelse(ArrayInterface.is_lazy_conjugate(_C), (-, +), (+, -))
     ηθ = η*θ
 
-    @avxt for n ∈ indices((C, B), 3), m ∈ indices((C, A), 2)
+    @tturbo for n ∈ indices((C, B), 3), m ∈ indices((C, A), 2)
         Cmn_re = zero(T)
         Cmn_im = zero(T)
         for k ∈ indices((A, B), (3, 2))
@@ -23,14 +23,14 @@ real_rep(a::AbstractArray{Complex{T}, N}) where {T, N} = reinterpret(reshape, T,
     _C
 end
 
-@inline function _matmul!(_C::AbstractMatrix{Complex{T}}, A::AbstractMatrix{U}, _B::AbstractMatrix{Complex{V}},
+@inline function _matmul!(_C::AbstractVecOrMat{Complex{T}}, A::AbstractMatrix{U}, _B::AbstractVecOrMat{Complex{V}},
                          α=One(), β=Zero(), nthread::Nothing=nothing, MKN=nothing, contig_axis=nothing) where {T,U,V}
-    C, B = real_rep.((_C, _B))
+    C, B = map(real_rep, (_C, _B))
 
     θ = ifelse(ArrayInterface.is_lazy_conjugate(_B), StaticInt(-1), StaticInt(1))
     (+ᶻ, -ᶻ) = ifelse(ArrayInterface.is_lazy_conjugate(_C), (-, +), (+, -))
 
-    @avxt for n ∈ indices((C, B), 3), m ∈ indices((C, A), (2, 1))
+    @tturbo for n ∈ indices((C, B), 3), m ∈ indices((C, A), (2, 1))
         Cmn_re = zero(T)
         Cmn_im = zero(T)
         for k ∈ indices((A, B), (2, 2))
@@ -43,14 +43,14 @@ end
     _C
 end
 
-@inline function _matmul!(_C::AbstractMatrix{Complex{T}}, _A::AbstractMatrix{Complex{U}}, B::AbstractMatrix{V},
+@inline function _matmul!(_C::AbstractVecOrMat{Complex{T}}, _A::AbstractMatrix{Complex{U}}, B::AbstractVecOrMat{V},
                          α=One(), β=Zero(), nthread::Nothing=nothing, MKN=nothing, contig_axis=nothing) where {T,U,V}
-    C, A = real_rep.((_C, _A))
+    C, A = map(real_rep, (_C, _A))
 
     η = ifelse(ArrayInterface.is_lazy_conjugate(_A), StaticInt(-1), StaticInt(1))
     (+ᶻ, -ᶻ) = ifelse(ArrayInterface.is_lazy_conjugate(_C), (-, +), (+, -))
 
-    @avxt for n ∈ indices((C, B), (3, 2)), m ∈ indices((C, A), 2)
+    @tturbo for n ∈ indices((C, B), (3, 2)), m ∈ indices((C, A), 2)
         Cmn_re = zero(T)
         Cmn_im = zero(T)
         for k ∈ indices((A, B), (3, 1))
@@ -67,15 +67,15 @@ end
 
 
 
-@inline function _matmul_serial!(_C::AbstractMatrix{Complex{T}}, _A::AbstractMatrix{Complex{U}}, _B::AbstractMatrix{Complex{V}},
+@inline function _matmul_serial!(_C::AbstractVecOrMat{Complex{T}}, _A::AbstractMatrix{Complex{U}}, _B::AbstractVecOrMat{Complex{V}},
                          α=One(), β=Zero(), MKN=nothing, contig_axis=nothing) where {T,U,V}
-    C, A, B = real_rep.((_C, _A, _B))
+    C, A, B = map(real_rep, (_C, _A, _B))
 
     η = ifelse(ArrayInterface.is_lazy_conjugate(_A), StaticInt(-1), StaticInt(1))
     θ = ifelse(ArrayInterface.is_lazy_conjugate(_B), StaticInt(-1), StaticInt(1))
     (+ᶻ, -ᶻ) = ifelse(ArrayInterface.is_lazy_conjugate(_C), (-, +), (+, -))
     ηθ = η*θ
-    @avx for n ∈ indices((C, B), 3), m ∈ indices((C, A), 2)
+    @turbo for n ∈ indices((C, B), 3), m ∈ indices((C, A), 2)
         Cmn_re = zero(T)
         Cmn_im = zero(T)
         for k ∈ indices((A, B), (3, 2))
@@ -88,14 +88,14 @@ end
     _C
 end
 
-@inline function _matmul_serial!(_C::AbstractMatrix{Complex{T}}, A::AbstractMatrix{U}, _B::AbstractMatrix{Complex{V}},
+@inline function _matmul_serial!(_C::AbstractVecOrMat{Complex{T}}, A::AbstractMatrix{U}, _B::AbstractVecOrMat{Complex{V}},
                          α=One(), β=Zero(), MKN=nothing, contig_axis=nothing) where {T,U,V}
-    C, B = real_rep.((_C, _B))
+    C, B = map(real_rep, (_C, _B))
 
     θ = ifelse(ArrayInterface.is_lazy_conjugate(_B), StaticInt(-1), StaticInt(1))
     (+ᶻ, -ᶻ) = ifelse(ArrayInterface.is_lazy_conjugate(_C), (-, +), (+, -))
 
-    @avx for n ∈ indices((C, B), 3), m ∈ indices((C, A), (2, 1))
+    @turbo for n ∈ indices((C, B), 3), m ∈ indices((C, A), (2, 1))
         Cmn_re = zero(T)
         Cmn_im = zero(T)
         for k ∈ indices((A, B), (2, 2))
@@ -108,14 +108,14 @@ end
     _C
 end
 
-@inline function _matmul_serial!(_C::AbstractMatrix{Complex{T}}, _A::AbstractMatrix{Complex{U}}, B::AbstractMatrix{V},
+@inline function _matmul_serial!(_C::AbstractVecOrMat{Complex{T}}, _A::AbstractMatrix{Complex{U}}, B::AbstractVecOrMat{V},
                          α=One(), β=Zero(), MKN=nothing, contig_axis=nothing) where {T,U,V}
-    C, A = real_rep.((_C, _A))
+    C, A = map(real_rep, (_C, _A))
 
     η = ifelse(ArrayInterface.is_lazy_conjugate(_A), StaticInt(-1), StaticInt(1))
     (+ᶻ, -ᶻ) = ifelse(ArrayInterface.is_lazy_conjugate(_C), (-, +), (+, -))
 
-    @avx for n ∈ indices((C, B), (3, 2)), m ∈ indices((C, A), 2)
+    @turbo for n ∈ indices((C, B), (3, 2)), m ∈ indices((C, A), 2)
         Cmn_re = zero(T)
         Cmn_im = zero(T)
         for k ∈ indices((A, B), (3, 1))

diff --git a/src/macrokernels.jl b/src/macrokernels.jl
@@ -27,7 +27,7 @@ macro kernel(pack::Bool, ex::Expr)
   else
     Ainit = areconstruct = Expr[]
   end
-  lvkern = esc(:(@avx inline=true $ex))
+  lvkern = esc(:(@turbo inline=true $ex))
 
   loopnest = quote
     let ãₚ = ãₚ, c = c, $(esc(:B)) = VectorizationBase.reconstruct_ptr(B, b), m = m
@@ -81,7 +81,7 @@ macro kernel(pack::Bool, ex::Expr)
   Expr(:block, preheader, loopnest)
 end
 @inline function loopmul!(C, A, B, α, β, M, K, N)
-    @avx for n ∈ CloseOpen(N), m ∈ CloseOpen(M)
+    @turbo for n ∈ CloseOpen(N), m ∈ CloseOpen(M)
         Cₘₙ = zero(eltype(C))
         for k ∈ CloseOpen(K)
             Cₘₙ += A[m,k] * B[k,n]
@@ -142,7 +142,7 @@ function packaloopmul!(
 end
 
 @inline function inlineloopmul!(C, A, B, α, β, M, K, N)
-    @avx inline=true for m ∈ CloseOpen(M), n ∈ CloseOpen(N)
+    @turbo inline=true for m ∈ CloseOpen(M), n ∈ CloseOpen(N)
         Cₘₙ = zero(eltype(C))
         for k ∈ CloseOpen(K)
             Cₘₙ += A[m,k] * B[k,n]

diff --git a/src/matmul.jl b/src/matmul.jl
@@ -50,7 +50,7 @@ function matmul_st_pack_A_and_B!(
             for k ∈ CloseOpen(Kiter)
                 ksize = ifelse(k < Krem, Kblock_Krem, Kblock)
                 _B = default_zerobased_stridedpointer(L3ptr, (One(),ksize))
-                unsafe_copyto_avx!(_B, B, ksize, nsize)
+                unsafe_copyto_turbo!(_B, B, ksize, nsize)
                 let A = A, C = C, B = _B
                     for m in CloseOpen(Miter)
                         msize = ifelse((m+1) == Miter, Mremfinal, ifelse(m < Mrem, Mblock_Mrem, Mblock))
@@ -91,14 +91,21 @@ end
 end
 
 
-@inline function alloc_matmul_product(A::AbstractArray{TA}, B::AbstractArray{TB}) where {TA,TB}
+@inline function alloc_matmul_product(A::AbstractArray{TA}, B::AbstractMatrix{TB}) where {TA,TB}
     # TODO: if `M` and `N` are statically sized, shouldn't return a `Matrix`.
     M, KA = size(A)
     KB, N = size(B)
     @assert KA == KB "Size mismatch."
     Matrix{promote_type(TA,TB)}(undef, M, N), (M, KA, N)
 end
-@inline function matmul_serial(A::AbstractMatrix, B::AbstractMatrix)
+@inline function alloc_matmul_product(A::AbstractArray{TA}, B::AbstractVector{TB}) where {TA,TB}
+  # TODO: if `M` and `N` are statically sized, shouldn't return a `Matrix`.
+  M, KA = size(A)
+  KB = length(B)
+  @assert KA == KB "Size mismatch."
+  Vector{promote_type(TA,TB)}(undef, M), (M, KA, One())
+end
+@inline function matmul_serial(A::AbstractMatrix, B::AbstractVecOrMat)
     C, (M,K,N) = alloc_matmul_product(A, B)
     _matmul_serial!(C, A, B, One(), Zero(), (M,K,N))
     return C
@@ -116,20 +123,20 @@ function maybeinline(::StaticInt{M}, ::StaticInt{N}, ::Type{T}, ::Val{false}) wh
 end
 
 
-@inline function matmul_serial!(C::AbstractMatrix{T}, A::AbstractMatrix, B::AbstractMatrix) where {T}
+@inline function matmul_serial!(C::AbstractVecOrMat, A::AbstractMatrix, B::AbstractVecOrMat)
     matmul_serial!(C, A, B, One(), Zero(), nothing, ArrayInterface.contiguous_axis(C))
 end
-@inline function matmul_serial!(C::AbstractMatrix, A::AbstractMatrix, B::AbstractMatrix, α)
+@inline function matmul_serial!(C::AbstractVecOrMat, A::AbstractMatrix, B::AbstractVecOrMat, α)
     matmul_serial!(C, A, B, α, Zero(), nothing, ArrayInterface.contiguous_axis(C))
 end
-@inline function matmul_serial!(C::AbstractMatrix, A::AbstractMatrix, B::AbstractMatrix, α, β)
+@inline function matmul_serial!(C::AbstractVecOrMat, A::AbstractMatrix, B::AbstractVecOrMat, α, β)
     matmul_serial!(C, A, B, α, β, nothing, ArrayInterface.contiguous_axis(C))
 end
-@inline function matmul_serial!(C::AbstractMatrix, A::AbstractMatrix, B::AbstractMatrix, α, β, MKN, ::StaticInt{2})
+@inline function matmul_serial!(C::AbstractVecOrMat, A::AbstractMatrix, B::AbstractVecOrMat, α, β, MKN, ::StaticInt{2})
     _matmul_serial!(C', B', A', α, β, nothing)
     return C
 end
-@inline function matmul_serial!(C::AbstractMatrix, A::AbstractMatrix, B::AbstractMatrix, α, β, MKN, ::StaticInt)
+@inline function matmul_serial!(C::AbstractVecOrMat, A::AbstractMatrix, B::AbstractVecOrMat, α, β, MKN, ::StaticInt)
     _matmul_serial!(C, A, B, α, β, nothing)
     return C
 end
@@ -149,7 +156,7 @@ Otherwise, based on the array's size, whether they are transposed, and whether t
 """
 @inline function _matmul_serial!(
     C::AbstractMatrix{T}, A::AbstractMatrix, B::AbstractMatrix, α, β, MKN
-) where {T}
+) where {T<:Real}
     M, K, N = MKN === nothing ? matmul_sizes(C, A, B) : MKN
     if M * N == 0
         return
@@ -175,13 +182,13 @@ Otherwise, based on the array's size, whether they are transposed, and whether t
 end # function
 
 function matmul_only_β!(C::AbstractMatrix{T}, β::StaticInt{0}) where T
-    @avx for i=1:length(C)
+    @turbo for i=1:length(C)
         C[i] = zero(T)
     end
 end
 
 function matmul_only_β!(C::AbstractMatrix{T}, β) where T
-    @avx for i=1:length(C)
+    @turbo for i=1:length(C)
         C[i] = β * C[i]
     end
 end
@@ -204,7 +211,7 @@ end
 
 Multiply matrices `A` and `B`.
 """
-@inline function matmul(A::AbstractMatrix, B::AbstractMatrix)
+@inline function matmul(A::AbstractMatrix, B::AbstractVecOrMat)
     C, (M,K,N) = alloc_matmul_product(A, B)
     _matmul!(C, A, B, One(), Zero(), nothing, (M,K,N))
     return C
@@ -213,26 +220,26 @@ end
 """
     matmul!(C, A, B[, α, β, max_threads])
 
-Calculates `C = α * A * B + β * C` in place, overwriting the contents of `A`.
+Calculates `C = α * A * B + β * C` in place, overwriting the contents of `C`.
 It may use up to `max_threads` threads. It will not use threads when nested in other threaded code.
 """
-@inline function matmul!(C::AbstractMatrix{T}, A::AbstractMatrix, B::AbstractMatrix) where {T}
+@inline function matmul!(C::AbstractVecOrMat, A::AbstractMatrix, B::AbstractVecOrMat)
     matmul!(C, A, B, One(), Zero(), nothing, nothing, ArrayInterface.contiguous_axis(C))
 end
-@inline function matmul!(C::AbstractMatrix, A::AbstractMatrix, B::AbstractMatrix, α)
+@inline function matmul!(C::AbstractVecOrMat, A::AbstractMatrix, B::AbstractVecOrMat, α)
     matmul!(C, A, B, α, Zero(), nothing, nothing, ArrayInterface.contiguous_axis(C))
 end
-@inline function matmul!(C::AbstractMatrix, A::AbstractMatrix, B::AbstractMatrix, α, β)
+@inline function matmul!(C::AbstractVecOrMat, A::AbstractMatrix, B::AbstractVecOrMat, α, β)
     matmul!(C, A, B, α, β, nothing, nothing, ArrayInterface.contiguous_axis(C))
 end
-@inline function matmul!(C::AbstractMatrix, A::AbstractMatrix, B::AbstractMatrix, α, β, nthread)
+@inline function matmul!(C::AbstractVecOrMat, A::AbstractMatrix, B::AbstractVecOrMat, α, β, nthread)
     matmul!(C, A, B, α, β, nthread, nothing, ArrayInterface.contiguous_axis(C))
 end
-@inline function matmul!(C::AbstractMatrix, A::AbstractMatrix, B::AbstractMatrix, α, β, nthread, MKN, ::StaticInt{2})
+@inline function matmul!(C::AbstractVecOrMat, A::AbstractMatrix, B::AbstractVecOrMat, α, β, nthread, MKN, ::StaticInt{2})
     _matmul!(C', B', A', α, β, nthread, MKN)
     return C
 end
-@inline function matmul!(C::AbstractMatrix, A::AbstractMatrix, B::AbstractMatrix, α, β, nthread, MKN, ::StaticInt)
+@inline function matmul!(C::AbstractVecOrMat, A::AbstractMatrix, B::AbstractVecOrMat, α, β, nthread, MKN, ::StaticInt)
     _matmul!(C, A, B, α, β, nthread, MKN)
     return C
 end
@@ -243,7 +250,7 @@ end
 end
 
 # passing MKN directly would let osmeone skip the size check.
-@inline function _matmul!(C::AbstractMatrix{T}, A, B, α, β, nthread, MKN) where {T}#::Union{Nothing,Tuple{Vararg{Integer,3}}}) where {T}
+@inline function _matmul!(C::AbstractMatrix{T}, A, B, α, β, nthread, MKN) where {T<:Real}
     M, K, N = MKN === nothing ? matmul_sizes(C, A, B) : MKN
     if M * N == 0
         return
@@ -443,7 +450,7 @@ function sync_mul!(
             for k ∈ CloseOpen(Kiter)
                 ksize = ifelse(k < Krem, Kblock_Krem, Kblock)
                 _B = default_zerobased_stridedpointer(bc, (One(), ksize))
-                unsafe_copyto_avx!(gesp(_B, (Zero(), pack_offset)), gesp(B, (Zero(), pack_offset)), ksize, pack_len)
+                unsafe_copyto_turbo!(gesp(_B, (Zero(), pack_offset)), gesp(B, (Zero(), pack_offset)), ksize, pack_len)
                 # synchronize before starting the multiplication, to ensure `B` is packed
                 _mv = _atomic_add!(myp, one(UInt))
                 sync_iters += one(UInt)
@@ -489,3 +496,26 @@ function sync_mul!(
     end
     nothing
 end
+
+function _matmul!(y::AbstractVector{T}, A::AbstractMatrix, x::AbstractVector, α, β, MKN, contig_axis) where {T<:Real}
+  @tturbo for m ∈ indices((A,y),1)
+    yₘ = zero(T)
+    for n ∈ indices((A,x),(2,1))
+      yₘ += A[m,n]*x[n]
+    end
+    y[m] = α * yₘ + β * y[m]
+  end
+  return y
+end
+function _matmul_serial!(y::AbstractVector{T}, A::AbstractMatrix, x::AbstractVector, α, β, MKN) where {T<:Real}
+  @turbo for m ∈ indices((A,y),1)
+    yₘ = zero(T)
+    for n ∈ indices((A,x),(2,1))
+      yₘ += A[m,n]*x[n]
+    end
+    y[m] = α * yₘ + β * y[m]
+  end
+  return y
+end
+
+
diff --git a/src/utils.jl b/src/utils.jl
@@ -16,8 +16,8 @@ given a mix of static and dynamic sizes.
     (_select(MA, MC), _select(KA, KB), _select(NB, NC))
 end
 
-function unsafe_copyto_avx!(pB, pA, M, N)
-    LoopVectorization.@avx for n ∈ CloseOpen(N), m ∈ CloseOpen(M)
+function unsafe_copyto_turbo!(pB, pA, M, N)
+    LoopVectorization.@turbo for n ∈ CloseOpen(N), m ∈ CloseOpen(M)
         pB[m,n] = pA[m,n]
     end
 end