2 changes: 2 additions & 0 deletions .gitignore
@@ -3,3 +3,5 @@
*.jl.mem
/docs/build/
Manifest.toml
*~

2 changes: 1 addition & 1 deletion Project.toml
@@ -9,7 +9,7 @@ VectorizationBase = "3d5dd08c-fd9d-11e8-17fa-ed2836048c2f"

[compat]
BenchmarkTools = "0.5"
LoopVectorization = "0.9.12"
LoopVectorization = "0.9.13"
VectorizationBase = "0.14.9"
julia = "1.5"

13 changes: 13 additions & 0 deletions src/Octavian.jl
@@ -2,8 +2,21 @@ module Octavian

import LoopVectorization
import VectorizationBase
using VectorizationBase: StaticInt

export matmul!, matmul

include("types.jl")
include("macros.jl")
include("utils.jl")
include("block_sizes.jl")
include("macrokernel.jl")
include("matmul.jl")

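# Buffer used for packing blocks of `B`; `__init__` sizes it to the machine's total L3 cache (size × count).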
const BCACHE = UInt8[]
function __init__()
resize!(BCACHE, VectorizationBase.CACHE_SIZE[3] * VectorizationBase.CACHE_COUNT[3]);
end


end # module Octavian
74 changes: 74 additions & 0 deletions src/block_sizes.jl
@@ -0,0 +1,74 @@

"""
block_sizes(::Type{T}) -> (Mc, Kc, Nc)

Returns the dimensions of our macrokernel, which repeatedly applies the microkernel. That is, in
calculating `C = A * B`, our macrokernel will be called on `Mc × Kc` blocks of `A`, multiplying
them with `Kc × Nc` blocks of `B`, to update `Mc × Nc` blocks of `C`.

We want these blocks to fit nicely in the cache. There is a lot of room for improvement here, but this
initial implementation should work reasonably well.


The constants `LoopVectorization.mᵣ` and `LoopVectorization.nᵣ` are the factors by which LoopVectorization
wants to unroll the rows and columns of the microkernel, respectively. `LoopVectorization` defines these
constants by running its analysis on a gemm kernel; they're exposed for convenience, to make it easier to
implement matrix multiplication.
It also wants to vectorize the rows by `W = VectorizationBase.pick_vector_width_val(T)`.
Thus, the microkernel's dimensions are `(W * mᵣ) × nᵣ`; that is, the microkernel updates a `(W * mᵣ) × nᵣ`
block of `C`.

Because the macrokernel iterates over tiles and repeatedly applies the microkernel, we would prefer the
macrokernel's dimensions to be an integer multiple of the microkernel's.
That is, we want `Mc` to be an integer multiple of `W * mᵣ` and `Nc` to be an integer multiple of `nᵣ`.

Additionally, we want our blocks of `A` to fit in the core-local L2 cache.
Empirically, I found that when using `Float64` arrays, 72 rows works well on Haswell (where `W * mᵣ = 8`)
and 96 works well for Cascadelake (where `W * mᵣ = 24`).
So, heuristically, I multiply `W * mᵣ` by `4` when there are 32 vector registers (as on Cascadelake), which
would yield `96`, and multiply by `9` otherwise, which would give `72` on Haswell.
Ideally, we'd have a better means of picking.
I suspect relatively small numbers work well because I'm currently using a column-major memory layout for
the internal packing arrays. A column-major memory layout means that if our macro-kernel had a lot of rows,
moving across columns would involve reading memory far apart, moving across memory pages more rapidly,
hitting the TLB harder. This is why libraries like OpenBLAS and BLIS don't use a column-major layout, but
reshape into a 3-d array, e.g. `A` will be reshaped into a `Mᵣ × Kc × (Mc ÷ Mᵣ)` array (also sometimes
referred to as a tile-major matrix), so that all memory reads happen in consecutive memory locations.


Now that we have `Mc`, we use it and the `L2` cache size to calculate `Kc`, but only target about 5/7 of
that space, to leave room in the cache for other things.

We want our blocks of `B` to fit in the `L3` cache, so we can use the `L3` cache size and `Kc` to
similarly calculate `Nc`, with the additional note that we also divide and then multiply by `nᵣ` to ensure
that `Nc` is an integer multiple of `nᵣ`.
"""
function block_sizes(::Type{T}) where {T}
_L1, _L2, __L3, _L4 = VectorizationBase.CACHE_SIZE
L1c, L2c, L3c, L4c = VectorizationBase.CACHE_COUNT
# TODO: something better than treating it as 4 MiB if there is no L3 cache
# one possibility is to focus on the L1 and L2 caches instead of the L2 and L3.
_L3 = something(__L3, 4194304)
@assert L1c == L2c == VectorizationBase.NUM_CORES

st = VectorizationBase.static_sizeof(T)

L2 = (StaticInt{_L2}() - StaticInt{_L1}()) ÷ st
if 2 * _L2 * L2c > _L3 * L3c
L3 = StaticInt{_L3}() ÷ st
else
L3 = (StaticInt{_L3}() - StaticInt{_L2}()) ÷ st
end

W = VectorizationBase.pick_vector_width_val(T)
Mr = StaticInt{LoopVectorization.mᵣ}()
Nr = StaticInt{LoopVectorization.nᵣ}()

Mc = (VectorizationBase.REGISTER_COUNT == 32 ? StaticInt{4}() : StaticInt{9}()) * Mr * W
Kc = (StaticInt{5}() * L2) ÷ (StaticInt{7}() * Mc)
Nc = ((StaticInt{5}() * L3) ÷ (StaticInt{7}() * Kc * Nr)) * Nr

Mc, Kc, Nc
end
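To make the arithmetic above concrete, here is a minimal sketch of the same computation with plain integers and made-up hardware numbers; the real function uses `StaticInt`s and the values reported by `VectorizationBase`, and the `W`, `mᵣ`, `nᵣ` below are assumptions, not actual constants:

```julia
# Illustrative only: assumed W = 8, mᵣ = 3, nᵣ = 9, 32 KiB L1, 1 MiB L2, 32 MiB L3, Float64 elements.
W, mr, nr = 8, 3, 9
st = sizeof(Float64)
L2 = (2^20 - 2^15) ÷ st                 # (L2 - L1) cache space, in elements
L3 = (2^25 - 2^20) ÷ st                 # (L3 - L2) cache space, in elements
Mc = 4 * mr * W                         # 96: a multiple of W * mᵣ (the 32-register branch)
Kc = (5 * L2) ÷ (7 * Mc)                # ≈ 5/7 of the per-core L2 block, divided by Mc
Nc = ((5 * L3) ÷ (7 * Kc * nr)) * nr    # ≈ 5/7 of L3, rounded down to a multiple of nᵣ
(Mc, Kc, Nc)                            # (96, 944, 3069) with these made-up numbers
```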


26 changes: 26 additions & 0 deletions src/macrokernel.jl
@@ -0,0 +1,26 @@

"""
The macrokernel. It iterates over our tiles, and applies the microkernel.
"""
function macrokernel!(C, A, B, α, β)
iszero(β) && return macrokernel!(C, A, B, α, StaticInt{0}())
LoopVectorization.@avx for n ∈ axes(C,2), m ∈ axes(C,1)
Cmn = zero(eltype(C))
for k ∈ axes(B,1)
Cmn += A[m,k] * B[k,n]
end
# if `β == 0`, `C[m,n]` may be `NaN`/uninitialized and `β * C[m,n]` would poison the result,
# hence the `StaticInt{0}` method below, which never reads `C`
C[m,n] = (α * Cmn) + β * C[m,n]
end
end
function macrokernel!(C, A, B, α, ::StaticInt{0})
LoopVectorization.@avx for n ∈ axes(C,2), m ∈ axes(C,1)
Cmn = zero(eltype(C))
for k ∈ axes(B,1)
Cmn += A[m,k] * B[k,n]
end
# `C[m,n]` may be `NaN` or uninitialized, but with `β == 0` we overwrite it without reading it
C[m,n] = (α * Cmn)
end
end
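To illustrate the hazard the `StaticInt{0}` method avoids (my own example, not part of the PR): freshly `undef`-allocated memory can contain `NaN`, and `0 * NaN == NaN`, so the general expression would poison the result even though `β == 0`:

```julia
C = fill(NaN, 2, 2)      # stand-in for uninitialized output memory
α, β, Cmn = 1.0, 0.0, 3.0
α * Cmn + β * C[1, 1]    # NaN — which is why the β == 0 case never reads C
```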

73 changes: 73 additions & 0 deletions src/matmul.jl
@@ -0,0 +1,73 @@

evenly_divide(x, y) = cld(x, cld(x, y))
evenly_divide(x, y, z) = cld(evenly_divide(x, y), z) * z
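A quick worked illustration of what these helpers compute (my own numbers): `evenly_divide(x, y)` splits `x` into `cld(x, y)` nearly equal chunks and returns the chunk size (always ≤ `y`), and the three-argument form rounds that chunk size up to a multiple of `z`:

```julia
evenly_divide(100, 30)      # cld(100, cld(100, 30)) = cld(100, 4) = 25
evenly_divide(100, 30, 8)   # cld(25, 8) * 8 = 32
```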

function matmul!(C::AbstractMatrix{T}, A::AbstractMatrix{T}, B::AbstractMatrix{T}, _α = one(T), _β = zero(T)) where {T}
_Mc, _Kc, _Nc = block_sizes(T)

M, K, N = matmul_sizes(C, A, B)

# Check if maybe it's better not to pack at all.
if M * K ≤ _Mc * _Kc && A isa DenseArray && C isa StridedArray && B isa StridedArray && #
(stride(A,2) ≤ 72 || (iszero(stride(A,2) & (VectorizationBase.pick_vector_width(eltype(A))-1)) && iszero(reinterpret(Int,pointer(A)) & 63)))
macrokernel!(C, A, B, _α, _β)
return C
end

# check if we want to skip packing B
do_not_pack_B = (B isa DenseArray && (K * N ≤ _Kc * _Nc)) || N ≤ LoopVectorization.nᵣ
# Create L2-buffer for `A`; it should be stack-allocated
Amem = L2Buffer(T)
Aptr = Base.unsafe_convert(Ptr{T}, Amem);

Bptr = Base.unsafe_convert(Ptr{T}, BCACHE);

Mc = evenly_divide(M, _Mc, VectorizationBase.pick_vector_width_val(T) * StaticInt{LoopVectorization.mᵣ}())
Kc = evenly_divide(K, _Kc)
Nc = evenly_divide(N, _Nc, StaticInt{LoopVectorization.nᵣ}())

GC.@preserve Amem begin
α = T(_α);
for n ∈ StaticInt{1}():Nc:N # loop 5
nsize = min(Int(n + Nc), Int(N + 1)) - n
β = T(_β)
for k ∈ StaticInt{1}():Kc:K # loop 4
ksize = min(Int(k + Kc), Int(K + 1)) - k
Bview = view(B, k:k+ksize-1, n:n+nsize-1)
# separate out loop 3, because the type of the `B` block differs between the branches below
if do_not_pack_B
# `_Bblock` would have the same type as `Bblock` in the `else` branch; it'd be nice to reduce the amount of compilation
# by homogenizing types across branches, but for now I'm preferring the simplicity of using `Bview`
# _Bblock = PointerMatrix(gesp1(stridedpointer(B), (k,n)), ksize, nsize)
# matmul_loop3!(C, Aptr, Ablock, A, _Bblock, α, β, msize, ksize, nsize, M, k, n, Mc)
matmul_loop3!(C, Aptr, A, Bview, α, β, ksize, nsize, M, k, n, Mc)
else
Bblock = PointerMatrix(Bptr, (ksize,nsize))
unsafe_copyto_avx!(Bblock, Bview)
matmul_loop3!(C, Aptr, A, Bblock, α, β, ksize, nsize, M, k, n, Mc)
end
β = one(T) # re-writing to the same blocks of `C`, so replace original factor with `1`
end
end
end # GC.@preserve
C
end

function matmul_loop3!(C, Aptr, A, Bblock, α, β, ksize, nsize, M, k, n, Mc)
for m ∈ StaticInt{1}():Mc:M
msize = min(Int(m + Mc), Int(M + 1)) - m
Ablock = PointerMatrix(Aptr, (msize, ksize), true)
unsafe_copyto_avx!(Ablock, view(A, m:m+msize-1, k:k+ksize-1))

Cblock = view(C, m:m+msize-1, n:n+nsize-1)
macrokernel!(Cblock, Ablock, Bblock, α, β)
end
end

function matmul(A::AbstractMatrix{Ta}, B::AbstractMatrix{Tb}) where {Ta, Tb}
# TODO: use `similar` / make it more generic; ideally should work with `StaticArrays.MArray`
C = Matrix{promote_type(Ta, Tb)}(undef, size(A,1), size(B,2))
matmul!(C, A, B)
end
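A quick usage sketch of the API added in this file (sizes are arbitrary):

```julia
using Octavian

A = rand(200, 300);  B = rand(300, 150)

C = matmul(A, B)                 # allocating version
C ≈ A * B                        # should agree with Base

matmul!(C, A, B, 2.0, 0.0)       # in place: C = 2 * A * B + 0 * C
C ≈ 2 .* (A * B)
```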


53 changes: 53 additions & 0 deletions src/types.jl
@@ -0,0 +1,53 @@

struct PointerMatrix{T,P<:VectorizationBase.AbstractStridedPointer,S<:Tuple{Vararg{Integer,2}}} <: DenseMatrix{T}
p::P
s::S
end
PointerMatrix(p::P, s::S) where {T,P<:VectorizationBase.AbstractStridedPointer{T},S} = PointerMatrix{T,P,S}(p, s)
Base.size(A::PointerMatrix) = map(Int, A.s)
VectorizationBase.stridedpointer(A::PointerMatrix) = A.p
Base.unsafe_convert(::Type{Ptr{T}}, A::PointerMatrix{T}) where {T} = pointer(A.p)
@inline function Base.getindex(A::PointerMatrix, i::Integer, j::Integer)
@boundscheck checkbounds(A, i, j)
VectorizationBase.vload(VectorizationBase.stridedpointer(A), (i,j))
end
@inline function Base.getindex(A::PointerMatrix, i::Integer)
@boundscheck checkbounds(A, i)
VectorizationBase.vload(VectorizationBase.stridedpointer(A), (i-1,))
end
@inline function Base.setindex!(A::PointerMatrix{T}, v, i::Integer, j::Integer) where {T}
@boundscheck checkbounds(A, i, j)
VectorizationBase.vstore!(VectorizationBase.stridedpointer(A), convert(T, v), (i,j))
v
end
@inline function Base.setindex!(A::PointerMatrix{T}, v, i::Integer) where {T}
@boundscheck checkbounds(A, i)
VectorizationBase.vstore!(VectorizationBase.stridedpointer(A), convert(T, v), (i-1,))
v
end

function PointerMatrix(Bptr::Ptr{T}, (M,N), padcols::Bool = false) where {T}
st = VectorizationBase.static_sizeof(T)
_M = padcols ? VectorizationBase.align(M, T) : M
# Should maybe add a more convenient column major constructor
Bsptr = VectorizationBase.stridedpointer(
Bptr, VectorizationBase.ArrayInterface.Contiguous{1}(), VectorizationBase.ArrayInterface.ContiguousBatch{0}(),
VectorizationBase.ArrayInterface.StrideRank{(1,2)}(), (st, _M*st), (StaticInt{1}(),StaticInt{1}())
)
PointerMatrix(Bsptr, (M,N))
end


mutable struct MemoryBuffer{L,T}
data::NTuple{L,T}
MemoryBuffer{L,T}(::UndefInitializer) where {L,T} = new{L,T}()
end
Base.unsafe_convert(::Type{Ptr{T}}, m::MemoryBuffer) where {T} = Base.unsafe_convert(Ptr{T}, Base.pointer_from_objref(m))
@inline MemoryBuffer(::StaticInt{L}, ::Type{T}) where {L,T} = MemoryBuffer{L,T}(undef)
@inline function L2Buffer(::Type{T}) where {T}
MemoryBuffer(StaticInt{VectorizationBase.CACHE_SIZE[2]}() ÷ VectorizationBase.static_sizeof(T), T)
end
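A minimal sketch of how `MemoryBuffer` and `PointerMatrix` compose (my own example; the sizes are arbitrary, and it only relies on the constructors defined above):

```julia
using VectorizationBase: StaticInt

buf = Octavian.MemoryBuffer(StaticInt{1024}(), Float64)   # fixed-size, uninitialized storage
GC.@preserve buf begin
    p = Base.unsafe_convert(Ptr{Float64}, buf)
    A = Octavian.PointerMatrix(p, (16, 8))                # view the buffer as a 16 × 8 column-major matrix
    A[1, 1] = 3.0
    @assert A[1, 1] == 3.0
end
```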




32 changes: 32 additions & 0 deletions src/utils.jl
@@ -0,0 +1,32 @@

check_sizes(::StaticInt{M}, ::StaticInt{M}) where {M} = StaticInt{M}()
check_sizes(::StaticInt{M}, ::StaticInt{N}) where {M,N} = throw("$M ≠ $N")
check_sizes(::StaticInt{M}, m) where {M} = (@assert M == m; StaticInt{M}())
check_sizes(m, ::StaticInt{M}) where {M} = (@assert M == m; StaticInt{M}())
check_sizes(m, n) = (@assert m == n; m)

"""
Checks sizes for compatibility, and preserves the static size information if
given a mix of static and dynamic sizes.
"""
function matmul_sizes(C, A, B)
MC, NC = VectorizationBase.ArrayInterface.size(C)
MA, KA = VectorizationBase.ArrayInterface.size(A)
KB, NB = VectorizationBase.ArrayInterface.size(B)
M = check_sizes(MC, MA)
K = check_sizes(KA, KB)
N = check_sizes(NC, NB)
M, K, N
end
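For illustration (values are my own), how `check_sizes` and `matmul_sizes` behave on static, dynamic, and mismatched sizes:

```julia
using VectorizationBase: StaticInt

check_sizes(StaticInt{4}(), 4)                 # StaticInt{4}() — static info is preserved
check_sizes(3, 3)                              # 3
check_sizes(StaticInt{4}(), StaticInt{5}())    # throws

matmul_sizes(zeros(2, 3), zeros(2, 4), zeros(4, 3))   # (2, 4, 3)
```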


function unsafe_copyto_avx!(B, A)
LoopVectorization.@avx for i ∈ eachindex(B, A)
B[i] = A[i]
end
end

# @inline function gesp1(sp::P, inds) where {P <: VectorizationBase.AbstractStridedPointer}
# P(VectorizationBase.gep(sp, inds), sp.strd, sp.offsets)
# end

27 changes: 27 additions & 0 deletions test/matmul.jl
@@ -0,0 +1,27 @@

@testset "Matrix Multiply" begin

for T ∈ (Float32, Float64, Int32, Int64)
Mc, Kc, Nc = map(Int, Octavian.block_sizes(T))
for logn ∈ range(log(1), log(1.5Nc+1), length = 5)
n = round(Int, exp(logn))
for logk ∈ range(log(1), log(1.5Kc+1), length = 5)
k = round(Int, exp(logk))
B = rand(T, k, n)
B′ = permutedims(B)'
for logm ∈ range(log(1), log(1.5Mc+1), length = 5)
m = round(Int, exp(logm))
A = rand(T, m, k)
A′ = permutedims(A)'

@show m, k, n
@test @time(Octavian.matmul(A, B)) ≈ A * B
@test @time(Octavian.matmul(A′, B)) ≈ A′ * B
@test @time(Octavian.matmul(A, B′)) ≈ A * B′
@test @time(Octavian.matmul(A′, B′)) ≈ A′ * B′
end
end
end
end
end

6 changes: 5 additions & 1 deletion test/runtests.jl
@@ -11,4 +11,8 @@ versioninfo()

@info("Running tests with $(Threads.nthreads()) threads")

include("macros.jl")
@testset "Octavian" begin
include("macros.jl")
include("matmul.jl")
end