Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions Project.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
name = "Octavian"
uuid = "6fd5a793-0b7e-452c-907f-f8bfe9c57db4"
authors = ["Mason Protter", "Chris Elrod", "Dilum Aluthge", "contributors"]
version = "0.2.17"
version = "0.2.18"

[deps]
ArrayInterface = "4fba245c-0d91-5ea0-9b3e-6abc04ee57a9"
Expand All @@ -13,11 +13,11 @@ VectorizationBase = "3d5dd08c-fd9d-11e8-17fa-ed2836048c2f"

[compat]
ArrayInterface = "3.1.14"
LoopVectorization = "0.12.26"
LoopVectorization = "0.12.32"
Static = "0.2"
StrideArraysCore = "0.1.11"
ThreadingUtilities = "0.4"
VectorizationBase = "0.20.11"
VectorizationBase = "0.20.15"
julia = "1.6"

[extras]
Expand Down
36 changes: 18 additions & 18 deletions src/complex_matmul.jl
Original file line number Diff line number Diff line change
@@ -1,16 +1,16 @@
real_rep(a::AbstractArray{Complex{T}, N}) where {T, N} = reinterpret(reshape, T, a)
#PtrArray(Ptr{T}(pointer(a)), (StaticInt(2), size(a)...))

@inline function _matmul!(_C::AbstractMatrix{Complex{T}}, _A::AbstractMatrix{Complex{U}}, _B::AbstractMatrix{Complex{V}},
@inline function _matmul!(_C::AbstractVecOrMat{Complex{T}}, _A::AbstractMatrix{Complex{U}}, _B::AbstractVecOrMat{Complex{V}},
α=One(), β=Zero(), nthread::Nothing=nothing, MKN=nothing, contig_axis=nothing) where {T,U,V}
C, A, B = real_rep.((_C, _A, _B))
C, A, B = map(real_rep, (_C, _A, _B))

η = ifelse(ArrayInterface.is_lazy_conjugate(_A), StaticInt(-1), StaticInt(1))
θ = ifelse(ArrayInterface.is_lazy_conjugate(_B), StaticInt(-1), StaticInt(1))
(+ᶻ, -ᶻ) = ifelse(ArrayInterface.is_lazy_conjugate(_C), (-, +), (+, -))
ηθ = η*θ

@avxt for n ∈ indices((C, B), 3), m ∈ indices((C, A), 2)
@tturbo for n ∈ indices((C, B), 3), m ∈ indices((C, A), 2)
Cmn_re = zero(T)
Cmn_im = zero(T)
for k ∈ indices((A, B), (3, 2))
Expand All @@ -23,14 +23,14 @@ real_rep(a::AbstractArray{Complex{T}, N}) where {T, N} = reinterpret(reshape, T,
_C
end

@inline function _matmul!(_C::AbstractMatrix{Complex{T}}, A::AbstractMatrix{U}, _B::AbstractMatrix{Complex{V}},
@inline function _matmul!(_C::AbstractVecOrMat{Complex{T}}, A::AbstractMatrix{U}, _B::AbstractVecOrMat{Complex{V}},
α=One(), β=Zero(), nthread::Nothing=nothing, MKN=nothing, contig_axis=nothing) where {T,U,V}
C, B = real_rep.((_C, _B))
C, B = map(real_rep, (_C, _B))

θ = ifelse(ArrayInterface.is_lazy_conjugate(_B), StaticInt(-1), StaticInt(1))
(+ᶻ, -ᶻ) = ifelse(ArrayInterface.is_lazy_conjugate(_C), (-, +), (+, -))

@avxt for n ∈ indices((C, B), 3), m ∈ indices((C, A), (2, 1))
@tturbo for n ∈ indices((C, B), 3), m ∈ indices((C, A), (2, 1))
Cmn_re = zero(T)
Cmn_im = zero(T)
for k ∈ indices((A, B), (2, 2))
Expand All @@ -43,14 +43,14 @@ end
_C
end

@inline function _matmul!(_C::AbstractMatrix{Complex{T}}, _A::AbstractMatrix{Complex{U}}, B::AbstractMatrix{V},
@inline function _matmul!(_C::AbstractVecOrMat{Complex{T}}, _A::AbstractMatrix{Complex{U}}, B::AbstractVecOrMat{V},
α=One(), β=Zero(), nthread::Nothing=nothing, MKN=nothing, contig_axis=nothing) where {T,U,V}
C, A = real_rep.((_C, _A))
C, A = map(real_rep, (_C, _A))

η = ifelse(ArrayInterface.is_lazy_conjugate(_A), StaticInt(-1), StaticInt(1))
(+ᶻ, -ᶻ) = ifelse(ArrayInterface.is_lazy_conjugate(_C), (-, +), (+, -))

@avxt for n ∈ indices((C, B), (3, 2)), m ∈ indices((C, A), 2)
@tturbo for n ∈ indices((C, B), (3, 2)), m ∈ indices((C, A), 2)
Cmn_re = zero(T)
Cmn_im = zero(T)
for k ∈ indices((A, B), (3, 1))
Expand All @@ -67,15 +67,15 @@ end



@inline function _matmul_serial!(_C::AbstractMatrix{Complex{T}}, _A::AbstractMatrix{Complex{U}}, _B::AbstractMatrix{Complex{V}},
@inline function _matmul_serial!(_C::AbstractVecOrMat{Complex{T}}, _A::AbstractMatrix{Complex{U}}, _B::AbstractVecOrMat{Complex{V}},
α=One(), β=Zero(), MKN=nothing, contig_axis=nothing) where {T,U,V}
C, A, B = real_rep.((_C, _A, _B))
C, A, B = map(real_rep, (_C, _A, _B))

η = ifelse(ArrayInterface.is_lazy_conjugate(_A), StaticInt(-1), StaticInt(1))
θ = ifelse(ArrayInterface.is_lazy_conjugate(_B), StaticInt(-1), StaticInt(1))
(+ᶻ, -ᶻ) = ifelse(ArrayInterface.is_lazy_conjugate(_C), (-, +), (+, -))
ηθ = η*θ
@avx for n ∈ indices((C, B), 3), m ∈ indices((C, A), 2)
@turbo for n ∈ indices((C, B), 3), m ∈ indices((C, A), 2)
Cmn_re = zero(T)
Cmn_im = zero(T)
for k ∈ indices((A, B), (3, 2))
Expand All @@ -88,14 +88,14 @@ end
_C
end

@inline function _matmul_serial!(_C::AbstractMatrix{Complex{T}}, A::AbstractMatrix{U}, _B::AbstractMatrix{Complex{V}},
@inline function _matmul_serial!(_C::AbstractVecOrMat{Complex{T}}, A::AbstractMatrix{U}, _B::AbstractVecOrMat{Complex{V}},
α=One(), β=Zero(), MKN=nothing, contig_axis=nothing) where {T,U,V}
C, B = real_rep.((_C, _B))
C, B = map(real_rep, (_C, _B))

θ = ifelse(ArrayInterface.is_lazy_conjugate(_B), StaticInt(-1), StaticInt(1))
(+ᶻ, -ᶻ) = ifelse(ArrayInterface.is_lazy_conjugate(_C), (-, +), (+, -))

@avx for n ∈ indices((C, B), 3), m ∈ indices((C, A), (2, 1))
@turbo for n ∈ indices((C, B), 3), m ∈ indices((C, A), (2, 1))
Cmn_re = zero(T)
Cmn_im = zero(T)
for k ∈ indices((A, B), (2, 2))
Expand All @@ -108,14 +108,14 @@ end
_C
end

@inline function _matmul_serial!(_C::AbstractMatrix{Complex{T}}, _A::AbstractMatrix{Complex{U}}, B::AbstractMatrix{V},
@inline function _matmul_serial!(_C::AbstractVecOrMat{Complex{T}}, _A::AbstractMatrix{Complex{U}}, B::AbstractVecOrMat{V},
α=One(), β=Zero(), MKN=nothing, contig_axis=nothing) where {T,U,V}
C, A = real_rep.((_C, _A))
C, A = map(real_rep, (_C, _A))

η = ifelse(ArrayInterface.is_lazy_conjugate(_A), StaticInt(-1), StaticInt(1))
(+ᶻ, -ᶻ) = ifelse(ArrayInterface.is_lazy_conjugate(_C), (-, +), (+, -))

@avx for n ∈ indices((C, B), (3, 2)), m ∈ indices((C, A), 2)
@turbo for n ∈ indices((C, B), (3, 2)), m ∈ indices((C, A), 2)
Cmn_re = zero(T)
Cmn_im = zero(T)
for k ∈ indices((A, B), (3, 1))
Expand Down
6 changes: 3 additions & 3 deletions src/macrokernels.jl
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ macro kernel(pack::Bool, ex::Expr)
else
Ainit = areconstruct = Expr[]
end
lvkern = esc(:(@avx inline=true $ex))
lvkern = esc(:(@turbo inline=true $ex))

loopnest = quote
let ãₚ = ãₚ, c = c, $(esc(:B)) = VectorizationBase.reconstruct_ptr(B, b), m = m
Expand Down Expand Up @@ -81,7 +81,7 @@ macro kernel(pack::Bool, ex::Expr)
Expr(:block, preheader, loopnest)
end
@inline function loopmul!(C, A, B, α, β, M, K, N)
@avx for n ∈ CloseOpen(N), m ∈ CloseOpen(M)
@turbo for n ∈ CloseOpen(N), m ∈ CloseOpen(M)
Cₘₙ = zero(eltype(C))
for k ∈ CloseOpen(K)
Cₘₙ += A[m,k] * B[k,n]
Expand Down Expand Up @@ -142,7 +142,7 @@ function packaloopmul!(
end

@inline function inlineloopmul!(C, A, B, α, β, M, K, N)
@avx inline=true for m ∈ CloseOpen(M), n ∈ CloseOpen(N)
@turbo inline=true for m ∈ CloseOpen(M), n ∈ CloseOpen(N)
Cₘₙ = zero(eltype(C))
for k ∈ CloseOpen(K)
Cₘₙ += A[m,k] * B[k,n]
Expand Down
72 changes: 51 additions & 21 deletions src/matmul.jl
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ function matmul_st_pack_A_and_B!(
for k ∈ CloseOpen(Kiter)
ksize = ifelse(k < Krem, Kblock_Krem, Kblock)
_B = default_zerobased_stridedpointer(L3ptr, (One(),ksize))
unsafe_copyto_avx!(_B, B, ksize, nsize)
unsafe_copyto_turbo!(_B, B, ksize, nsize)
let A = A, C = C, B = _B
for m in CloseOpen(Miter)
msize = ifelse((m+1) == Miter, Mremfinal, ifelse(m < Mrem, Mblock_Mrem, Mblock))
Expand Down Expand Up @@ -91,14 +91,21 @@ end
end


@inline function alloc_matmul_product(A::AbstractArray{TA}, B::AbstractArray{TB}) where {TA,TB}
@inline function alloc_matmul_product(A::AbstractArray{TA}, B::AbstractMatrix{TB}) where {TA,TB}
# TODO: if `M` and `N` are statically sized, shouldn't return a `Matrix`.
M, KA = size(A)
KB, N = size(B)
@assert KA == KB "Size mismatch."
Matrix{promote_type(TA,TB)}(undef, M, N), (M, KA, N)
end
@inline function matmul_serial(A::AbstractMatrix, B::AbstractMatrix)
@inline function alloc_matmul_product(A::AbstractArray{TA}, B::AbstractVector{TB}) where {TA,TB}
# TODO: if `M` and `N` are statically sized, shouldn't return a `Matrix`.
M, KA = size(A)
KB = length(B)
@assert KA == KB "Size mismatch."
Vector{promote_type(TA,TB)}(undef, M), (M, KA, One())
end
@inline function matmul_serial(A::AbstractMatrix, B::AbstractVecOrMat)
C, (M,K,N) = alloc_matmul_product(A, B)
_matmul_serial!(C, A, B, One(), Zero(), (M,K,N))
return C
Expand All @@ -116,20 +123,20 @@ function maybeinline(::StaticInt{M}, ::StaticInt{N}, ::Type{T}, ::Val{false}) wh
end


@inline function matmul_serial!(C::AbstractMatrix{T}, A::AbstractMatrix, B::AbstractMatrix) where {T}
@inline function matmul_serial!(C::AbstractVecOrMat, A::AbstractMatrix, B::AbstractVecOrMat)
matmul_serial!(C, A, B, One(), Zero(), nothing, ArrayInterface.contiguous_axis(C))
end
@inline function matmul_serial!(C::AbstractMatrix, A::AbstractMatrix, B::AbstractMatrix, α)
@inline function matmul_serial!(C::AbstractVecOrMat, A::AbstractMatrix, B::AbstractVecOrMat, α)
matmul_serial!(C, A, B, α, Zero(), nothing, ArrayInterface.contiguous_axis(C))
end
@inline function matmul_serial!(C::AbstractMatrix, A::AbstractMatrix, B::AbstractMatrix, α, β)
@inline function matmul_serial!(C::AbstractVecOrMat, A::AbstractMatrix, B::AbstractVecOrMat, α, β)
matmul_serial!(C, A, B, α, β, nothing, ArrayInterface.contiguous_axis(C))
end
@inline function matmul_serial!(C::AbstractMatrix, A::AbstractMatrix, B::AbstractMatrix, α, β, MKN, ::StaticInt{2})
@inline function matmul_serial!(C::AbstractVecOrMat, A::AbstractMatrix, B::AbstractVecOrMat, α, β, MKN, ::StaticInt{2})
_matmul_serial!(C', B', A', α, β, nothing)
return C
end
@inline function matmul_serial!(C::AbstractMatrix, A::AbstractMatrix, B::AbstractMatrix, α, β, MKN, ::StaticInt)
@inline function matmul_serial!(C::AbstractVecOrMat, A::AbstractMatrix, B::AbstractVecOrMat, α, β, MKN, ::StaticInt)
_matmul_serial!(C, A, B, α, β, nothing)
return C
end
Expand All @@ -149,7 +156,7 @@ Otherwise, based on the array's size, whether they are transposed, and whether t
"""
@inline function _matmul_serial!(
C::AbstractMatrix{T}, A::AbstractMatrix, B::AbstractMatrix, α, β, MKN
) where {T}
) where {T<:Real}
M, K, N = MKN === nothing ? matmul_sizes(C, A, B) : MKN
if M * N == 0
return
Expand All @@ -175,13 +182,13 @@ Otherwise, based on the array's size, whether they are transposed, and whether t
end # function

function matmul_only_β!(C::AbstractMatrix{T}, β::StaticInt{0}) where T
@avx for i=1:length(C)
@turbo for i=1:length(C)
C[i] = zero(T)
end
end

function matmul_only_β!(C::AbstractMatrix{T}, β) where T
@avx for i=1:length(C)
@turbo for i=1:length(C)
C[i] = β * C[i]
end
end
Expand All @@ -204,7 +211,7 @@ end

Multiply matrices `A` and `B`.
"""
@inline function matmul(A::AbstractMatrix, B::AbstractMatrix)
@inline function matmul(A::AbstractMatrix, B::AbstractVecOrMat)
C, (M,K,N) = alloc_matmul_product(A, B)
_matmul!(C, A, B, One(), Zero(), nothing, (M,K,N))
return C
Expand All @@ -213,26 +220,26 @@ end
"""
matmul!(C, A, B[, α, β, max_threads])

Calculates `C = α * A * B + β * C` in place, overwriting the contents of `A`.
Calculates `C = α * A * B + β * C` in place, overwriting the contents of `C`.
It may use up to `max_threads` threads. It will not use threads when nested in other threaded code.
"""
@inline function matmul!(C::AbstractMatrix{T}, A::AbstractMatrix, B::AbstractMatrix) where {T}
@inline function matmul!(C::AbstractVecOrMat, A::AbstractMatrix, B::AbstractVecOrMat)
matmul!(C, A, B, One(), Zero(), nothing, nothing, ArrayInterface.contiguous_axis(C))
end
@inline function matmul!(C::AbstractMatrix, A::AbstractMatrix, B::AbstractMatrix, α)
@inline function matmul!(C::AbstractVecOrMat, A::AbstractMatrix, B::AbstractVecOrMat, α)
matmul!(C, A, B, α, Zero(), nothing, nothing, ArrayInterface.contiguous_axis(C))
end
@inline function matmul!(C::AbstractMatrix, A::AbstractMatrix, B::AbstractMatrix, α, β)
@inline function matmul!(C::AbstractVecOrMat, A::AbstractMatrix, B::AbstractVecOrMat, α, β)
matmul!(C, A, B, α, β, nothing, nothing, ArrayInterface.contiguous_axis(C))
end
@inline function matmul!(C::AbstractMatrix, A::AbstractMatrix, B::AbstractMatrix, α, β, nthread)
@inline function matmul!(C::AbstractVecOrMat, A::AbstractMatrix, B::AbstractVecOrMat, α, β, nthread)
matmul!(C, A, B, α, β, nthread, nothing, ArrayInterface.contiguous_axis(C))
end
@inline function matmul!(C::AbstractMatrix, A::AbstractMatrix, B::AbstractMatrix, α, β, nthread, MKN, ::StaticInt{2})
@inline function matmul!(C::AbstractVecOrMat, A::AbstractMatrix, B::AbstractVecOrMat, α, β, nthread, MKN, ::StaticInt{2})
_matmul!(C', B', A', α, β, nthread, MKN)
return C
end
@inline function matmul!(C::AbstractMatrix, A::AbstractMatrix, B::AbstractMatrix, α, β, nthread, MKN, ::StaticInt)
@inline function matmul!(C::AbstractVecOrMat, A::AbstractMatrix, B::AbstractVecOrMat, α, β, nthread, MKN, ::StaticInt)
_matmul!(C, A, B, α, β, nthread, MKN)
return C
end
Expand All @@ -243,7 +250,7 @@ end
end

# passing MKN directly would let osmeone skip the size check.
@inline function _matmul!(C::AbstractMatrix{T}, A, B, α, β, nthread, MKN) where {T}#::Union{Nothing,Tuple{Vararg{Integer,3}}}) where {T}
@inline function _matmul!(C::AbstractMatrix{T}, A, B, α, β, nthread, MKN) where {T<:Real}
M, K, N = MKN === nothing ? matmul_sizes(C, A, B) : MKN
if M * N == 0
return
Expand Down Expand Up @@ -443,7 +450,7 @@ function sync_mul!(
for k ∈ CloseOpen(Kiter)
ksize = ifelse(k < Krem, Kblock_Krem, Kblock)
_B = default_zerobased_stridedpointer(bc, (One(), ksize))
unsafe_copyto_avx!(gesp(_B, (Zero(), pack_offset)), gesp(B, (Zero(), pack_offset)), ksize, pack_len)
unsafe_copyto_turbo!(gesp(_B, (Zero(), pack_offset)), gesp(B, (Zero(), pack_offset)), ksize, pack_len)
# synchronize before starting the multiplication, to ensure `B` is packed
_mv = _atomic_add!(myp, one(UInt))
sync_iters += one(UInt)
Expand Down Expand Up @@ -489,3 +496,26 @@ function sync_mul!(
end
nothing
end

function _matmul!(y::AbstractVector{T}, A::AbstractMatrix, x::AbstractVector, α, β, MKN, contig_axis) where {T<:Real}
@tturbo for m ∈ indices((A,y),1)
yₘ = zero(T)
for n ∈ indices((A,x),(2,1))
yₘ += A[m,n]*x[n]
end
y[m] = α * yₘ + β * y[m]
end
return y
end
function _matmul_serial!(y::AbstractVector{T}, A::AbstractMatrix, x::AbstractVector, α, β, MKN) where {T<:Real}
@turbo for m ∈ indices((A,y),1)
yₘ = zero(T)
for n ∈ indices((A,x),(2,1))
yₘ += A[m,n]*x[n]
end
y[m] = α * yₘ + β * y[m]
end
return y
end


4 changes: 2 additions & 2 deletions src/utils.jl
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,8 @@ given a mix of static and dynamic sizes.
(_select(MA, MC), _select(KA, KB), _select(NB, NC))
end

function unsafe_copyto_avx!(pB, pA, M, N)
LoopVectorization.@avx for n ∈ CloseOpen(N), m ∈ CloseOpen(M)
function unsafe_copyto_turbo!(pB, pA, M, N)
LoopVectorization.@turbo for n ∈ CloseOpen(N), m ∈ CloseOpen(M)
pB[m,n] = pA[m,n]
end
end
Expand Down
Loading