2 changes: 2 additions & 0 deletions .gitignore
@@ -3,3 +3,5 @@
*.jl.mem
/docs/build/
Manifest.toml
*~

2 changes: 1 addition & 1 deletion Project.toml
@@ -9,7 +9,7 @@ VectorizationBase = "3d5dd08c-fd9d-11e8-17fa-ed2836048c2f"

[compat]
BenchmarkTools = "0.5"
LoopVectorization = "0.9.12"
LoopVectorization = "0.9.13"
VectorizationBase = "0.14.9"
julia = "1.5"

13 changes: 13 additions & 0 deletions src/Octavian.jl
@@ -2,8 +2,21 @@ module Octavian

import LoopVectorization
import VectorizationBase
using VectorizationBase: StaticInt

export matmul!, matmul

include("types.jl")
include("macros.jl")
include("utils.jl")
include("block_sizes.jl")
include("macrokernel.jl")
include("matmul.jl")

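# Buffer used for packing blocks of `B`; `__init__` sizes it to the machine's total L3 cache (size × count).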
const BCACHE = UInt8[]
function __init__()
resize!(BCACHE, VectorizationBase.CACHE_SIZE[3] * VectorizationBase.CACHE_COUNT[3]);
end


end # module Octavian
74 changes: 74 additions & 0 deletions src/block_sizes.jl
@@ -0,0 +1,74 @@

"""
block_sizes(::Type{T}) -> (Mc, Kc, Nc)

Returns the dimensions of our macrokernel, which repeatedly applies the microkernel. That is, in
calculating `C = A * B`, our macrokernel will be called on `Mc × Kc` blocks of `A`, multiplying
them with `Kc × Nc` blocks of `B`, to update `Mc × Nc` blocks of `C`.

We want these blocks to fit nicely in the cache. There is a lot of room for improvement here, but this
initial implementation should work reasonably well.


The constants `LoopVectorization.mᵣ` and `LoopVectorization.nᵣ` are the factors by which LoopVectorization
wants to unroll the rows and columns of the microkernel, respectively. `LoopVectorization` defines these
constants by running its analysis on a gemm kernel; they're exposed for convenience, to make it easier to
implement matrix multiplication.
It also wants to vectorize the rows by `W = VectorizationBase.pick_vector_width_val(T)`.
Thus, the microkernel's dimensions are `(W * mᵣ) × nᵣ`; that is, the microkernel updates a `(W * mᵣ) × nᵣ`
block of `C`.

Because the macrokernel iterates over tiles and repeatedly applies the microkernel, we would prefer the
macrokernel's dimensions to be an integer multiple of the microkernel's.
That is, we want `Mc` to be an integer multiple of `W * mᵣ` and `Nc` to be an integer multiple of `nᵣ`.

Additionally, we want our blocks of `A` to fit in the core-local L2 cache.
Empirically, I found that when using `Float64` arrays, 72 rows works well on Haswell (where `W * mᵣ = 8`)
and 96 works well for Cascadelake (where `W * mᵣ = 24`).
So, heuristically, I multiply `W * mᵣ` by `4` when there are 32 vector registers (as on Cascadelake), which
would yield `96`, and multiply by `9` otherwise, which would give `72` on Haswell.
Ideally, we'd have a better means of picking.
I suspect relatively small numbers work well because I'm currently using a column-major memory layout for
the internal packing arrays. A column-major memory layout means that if our macro-kernel had a lot of rows,
moving across columns would involve reading memory far apart, moving across memory pages more rapidly,
hitting the TLB harder. This is why libraries like OpenBLAS and BLIS don't use a column-major layout, but
reshape into a 3-d array, e.g. `A` will be reshaped into a `Mᵣ × Kc × (Mc ÷ Mᵣ)` array (also sometimes
referred to as a tile-major matrix), so that all memory reads happen in consecutive memory locations.


Now that we have `Mc`, we use it and the `L2` cache size to calculate `Kc`, but only target about 5/7 of
that space, to leave room in the cache for other things.

We want our blocks of `B` to fit in the `L3` cache, so we can use the `L3` cache size and `Kc` to
similarly calculate `Nc`, with the additional note that we also divide and then multiply by `nᵣ` to ensure
that `Nc` is an integer multiple of `nᵣ`.
"""
function block_sizes(::Type{T}) where {T}
_L1, _L2, __L3, _L4 = VectorizationBase.CACHE_SIZE
L1c, L2c, L3c, L4c = VectorizationBase.CACHE_COUNT
# TODO: something better than treating it as 4 MiB if there is no L3 cache
# one possibility is to focus on the L1 and L2 caches instead of the L2 and L3.
_L3 = something(__L3, 4194304)
@assert L1c == L2c == VectorizationBase.NUM_CORES

st = VectorizationBase.static_sizeof(T)

L2 = (StaticInt{_L2}() - StaticInt{_L1}()) ÷ st
if 2 * _L2 * L2c > _L3 * L3c
L3 = StaticInt{_L3}() ÷ st
else
L3 = (StaticInt{_L3}() - StaticInt{_L2}()) ÷ st
end

W = VectorizationBase.pick_vector_width_val(T)
Mr = StaticInt{LoopVectorization.mᵣ}()
Nr = StaticInt{LoopVectorization.nᵣ}()

Mc = (VectorizationBase.REGISTER_COUNT == 32 ? StaticInt{4}() : StaticInt{9}()) * Mr * W
Kc = (StaticInt{5}() * L2) ÷ (StaticInt{7}() * Mc)
Nc = ((StaticInt{5}() * L3) ÷ (StaticInt{7}() * Kc * Nr)) * Nr

Mc, Kc, Nc
end
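To make the arithmetic above concrete, here is a minimal sketch of the same computation with plain integers and made-up hardware numbers; the real function uses `StaticInt`s and the values reported by `VectorizationBase`, and the `W`, `mᵣ`, `nᵣ` below are assumptions, not actual constants:

```julia
# Illustrative only: assumed W = 8, mᵣ = 3, nᵣ = 9, 32 KiB L1, 1 MiB L2, 32 MiB L3, Float64 elements.
W, mr, nr = 8, 3, 9
st = sizeof(Float64)
L2 = (2^20 - 2^15) ÷ st                 # (L2 - L1) cache space, in elements
L3 = (2^25 - 2^20) ÷ st                 # (L3 - L2) cache space, in elements
Mc = 4 * mr * W                         # 96: a multiple of W * mᵣ (the 32-register branch)
Kc = (5 * L2) ÷ (7 * Mc)                # ≈ 5/7 of the per-core L2 block, divided by Mc
Nc = ((5 * L3) ÷ (7 * Kc * nr)) * nr    # ≈ 5/7 of L3, rounded down to a multiple of nᵣ
(Mc, Kc, Nc)                            # (96, 944, 3069) with these made-up numbers
```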


26 changes: 26 additions & 0 deletions src/macrokernel.jl
@@ -0,0 +1,26 @@

"""
The macrokernel. It iterates over our tiles, and applies the microkernel.
"""
function macrokernel!(C, A, B, α, β)
iszero(β) && return macrokernel!(C, A, B, α, StaticInt{0}())
LoopVectorization.@avx for n ∈ axes(C,2), m ∈ axes(C,1)
Cmn = zero(eltype(C))
for k ∈ axes(B,1)
Cmn += A[m,k] * B[k,n]
end
# if `β == 0`, `C[m,n]` may be `NaN`/uninitialized and `β * C[m,n]` would poison the result,
# hence the `StaticInt{0}` method below, which never reads `C`
C[m,n] = (α * Cmn) + β * C[m,n]
end
end
function macrokernel!(C, A, B, α, ::StaticInt{0})
LoopVectorization.@avx for n ∈ axes(C,2), m ∈ axes(C,1)
Cmn = zero(eltype(C))
for k ∈ axes(B,1)
Cmn += A[m,k] * B[k,n]
end
# `C[m,n]` may be `NaN` or uninitialized, but with `β == 0` we overwrite it without reading it
C[m,n] = (α * Cmn)
end
end
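To illustrate the hazard the `StaticInt{0}` method avoids (my own example, not part of the PR): freshly `undef`-allocated memory can contain `NaN`, and `0 * NaN == NaN`, so the general expression would poison the result even though `β == 0`:

```julia
C = fill(NaN, 2, 2)      # stand-in for uninitialized output memory
α, β, Cmn = 1.0, 0.0, 3.0
α * Cmn + β * C[1, 1]    # NaN — which is why the β == 0 case never reads C
```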

73 changes: 73 additions & 0 deletions src/matmul.jl
@@ -0,0 +1,73 @@

evenly_divide(x, y) = cld(x, cld(x, y))
evenly_divide(x, y, z) = cld(evenly_divide(x, y), z) * z
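A quick worked illustration of what these helpers compute (my own numbers): `evenly_divide(x, y)` splits `x` into `cld(x, y)` nearly equal chunks and returns the chunk size (always ≤ `y`), and the three-argument form rounds that chunk size up to a multiple of `z`:

```julia
evenly_divide(100, 30)      # cld(100, cld(100, 30)) = cld(100, 4) = 25
evenly_divide(100, 30, 8)   # cld(25, 8) * 8 = 32
```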

function matmul!(C::AbstractMatrix{T}, A::AbstractMatrix{T}, B::AbstractMatrix{T}, _α = one(T), _β = zero(T)) where {T}
_Mc, _Kc, _Nc = block_sizes(T)

M, K, N = matmul_sizes(C, A, B)

# Check if maybe it's better not to pack at all.
if M * K ≤ _Mc * _Kc && A isa DenseArray && C isa StridedArray && B isa StridedArray && #
(stride(A,2) ≤ 72 || (iszero(stride(A,2) & (VectorizationBase.pick_vector_width(eltype(A))-1)) && iszero(reinterpret(Int,pointer(A)) & 63)))
macrokernel!(C, A, B, _α, _β)
return C
end

# check if we want to skip packing B
do_not_pack_B = (B isa DenseArray && (K * N ≤ _Kc * _Nc)) || N ≤ LoopVectorization.nᵣ
# Create L2-buffer for `A`; it should be stack-allocated
Amem = L2Buffer(T)
Aptr = Base.unsafe_convert(Ptr{T}, Amem);

Bptr = Base.unsafe_convert(Ptr{T}, BCACHE);

Mc = evenly_divide(M, _Mc, VectorizationBase.pick_vector_width_val(T) * StaticInt{LoopVectorization.mᵣ}())
Kc = evenly_divide(K, _Kc)
Nc = evenly_divide(N, _Nc, StaticInt{LoopVectorization.nᵣ}())

GC.@preserve Amem begin
α = T(_α);
for n ∈ StaticInt{1}():Nc:N # loop 5
nsize = min(Int(n + Nc), Int(N + 1)) - n
β = T(_β)
for k ∈ StaticInt{1}():Kc:K # loop 4
ksize = min(Int(k + Kc), Int(K + 1)) - k
Bview = view(B, k:k+ksize-1, n:n+nsize-1)
# separate out loop 3, because the type of the `B` block differs between the branches below
if do_not_pack_B
# `_Bblock` would have the same type as `Bblock` in the `else` branch; it'd be nice to reduce the amount of compilation
# by homogenizing types across branches, but for now I'm preferring the simplicity of using `Bview`
# _Bblock = PointerMatrix(gesp1(stridedpointer(B), (k,n)), ksize, nsize)
# matmul_loop3!(C, Aptr, Ablock, A, _Bblock, α, β, msize, ksize, nsize, M, k, n, Mc)
matmul_loop3!(C, Aptr, A, Bview, α, β, ksize, nsize, M, k, n, Mc)
else
Bblock = PointerMatrix(Bptr, (ksize,nsize))
unsafe_copyto_avx!(Bblock, Bview)
matmul_loop3!(C, Aptr, A, Bblock, α, β, ksize, nsize, M, k, n, Mc)
end
β = one(T) # re-writing to the same blocks of `C`, so replace original factor with `1`
end
end
end # GC.@preserve
C
end

function matmul_loop3!(C, Aptr, A, Bblock, α, β, ksize, nsize, M, k, n, Mc)
for m ∈ StaticInt{1}():Mc:M
msize = min(Int(m + Mc), Int(M + 1)) - m
Ablock = PointerMatrix(Aptr, (msize, ksize), true)
unsafe_copyto_avx!(Ablock, view(A, m:m+msize-1, k:k+ksize-1))

Cblock = view(C, m:m+msize-1, n:n+nsize-1)
macrokernel!(Cblock, Ablock, Bblock, α, β)
end
end

function matmul(A::AbstractMatrix{Ta}, B::AbstractMatrix{Tb}) where {Ta, Tb}
# TODO: use `similar` / make it more generic; ideally should work with `StaticArrays.MArray`
C = Matrix{promote_type(Ta, Tb)}(undef, size(A,1), size(B,2))
matmul!(C, A, B)
end
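A quick usage sketch of the API added in this file (sizes are arbitrary):

```julia
using Octavian

A = rand(200, 300);  B = rand(300, 150)

C = matmul(A, B)                 # allocating version
C ≈ A * B                        # should agree with Base

matmul!(C, A, B, 2.0, 0.0)       # in place: C = 2 * A * B + 0 * C
C ≈ 2 .* (A * B)
```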


53 changes: 53 additions & 0 deletions src/types.jl
@@ -0,0 +1,53 @@

struct PointerMatrix{T,P<:VectorizationBase.AbstractStridedPointer,S<:Tuple{Vararg{Integer,2}}} <: DenseMatrix{T}
p::P
s::S
end
PointerMatrix(p::P, s::S) where {T,P<:VectorizationBase.AbstractStridedPointer{T},S} = PointerMatrix{T,P,S}(p, s)
Base.size(A::PointerMatrix) = map(Int, A.s)
VectorizationBase.stridedpointer(A::PointerMatrix) = A.p
Base.unsafe_convert(::Type{Ptr{T}}, A::PointerMatrix{T}) where {T} = pointer(A.p)
@inline function Base.getindex(A::PointerMatrix, i::Integer, j::Integer)
@boundscheck checkbounds(A, i, j)
VectorizationBase.vload(VectorizationBase.stridedpointer(A), (i,j))
end
@inline function Base.getindex(A::PointerMatrix, i::Integer)
@boundscheck checkbounds(A, i)
VectorizationBase.vload(VectorizationBase.stridedpointer(A), (i-1,))
end
@inline function Base.setindex!(A::PointerMatrix{T}, v, i::Integer, j::Integer) where {T}
@boundscheck checkbounds(A, i, j)
VectorizationBase.vstore!(VectorizationBase.stridedpointer(A), convert(T, v), (i,j))
v
end
@inline function Base.setindex!(A::PointerMatrix{T}, v, i::Integer) where {T}
@boundscheck checkbounds(A, i)
VectorizationBase.vstore!(VectorizationBase.stridedpointer(A), convert(T, v), (i-1,))
v
end

function PointerMatrix(Bptr::Ptr{T}, (M,N), padcols::Bool = false) where {T}
st = VectorizationBase.static_sizeof(T)
_M = padcols ? VectorizationBase.align(M, T) : M
# Should maybe add a more convenient column major constructor
Bsptr = VectorizationBase.stridedpointer(
Bptr, VectorizationBase.ArrayInterface.Contiguous{1}(), VectorizationBase.ArrayInterface.ContiguousBatch{0}(),
VectorizationBase.ArrayInterface.StrideRank{(1,2)}(), (st, _M*st), (StaticInt{1}(),StaticInt{1}())
)
PointerMatrix(Bsptr, (M,N))
end


mutable struct MemoryBuffer{L,T}
data::NTuple{L,T}
MemoryBuffer{L,T}(::UndefInitializer) where {L,T} = new{L,T}()
end
Base.unsafe_convert(::Type{Ptr{T}}, m::MemoryBuffer) where {T} = Base.unsafe_convert(Ptr{T}, Base.pointer_from_objref(m))
@inline MemoryBuffer(::StaticInt{L}, ::Type{T}) where {L,T} = MemoryBuffer{L,T}(undef)
@inline function L2Buffer(::Type{T}) where {T}
MemoryBuffer(StaticInt{VectorizationBase.CACHE_SIZE[2]}() ÷ VectorizationBase.static_sizeof(T), T)
end
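A minimal sketch of how `MemoryBuffer` and `PointerMatrix` compose (my own example; the sizes are arbitrary, and it only relies on the constructors defined above):

```julia
using VectorizationBase: StaticInt

buf = Octavian.MemoryBuffer(StaticInt{1024}(), Float64)   # fixed-size, uninitialized storage
GC.@preserve buf begin
    p = Base.unsafe_convert(Ptr{Float64}, buf)
    A = Octavian.PointerMatrix(p, (16, 8))                # view the buffer as a 16 × 8 column-major matrix
    A[1, 1] = 3.0
    @assert A[1, 1] == 3.0
end
```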




32 changes: 32 additions & 0 deletions src/utils.jl
@@ -0,0 +1,32 @@

check_sizes(::StaticInt{M}, ::StaticInt{M}) where {M} = StaticInt{M}()
check_sizes(::StaticInt{M}, ::StaticInt{N}) where {M,N} = throw("$M ≠ $N")
check_sizes(::StaticInt{M}, m) where {M} = (@assert M == m; StaticInt{M}())
check_sizes(m, ::StaticInt{M}) where {M} = (@assert M == m; StaticInt{M}())
check_sizes(m, n) = (@assert m == n; m)

"""
Checks sizes for compatibility, and preserves the static size information if
given a mix of static and dynamic sizes.
"""
function matmul_sizes(C, A, B)
MC, NC = VectorizationBase.ArrayInterface.size(C)
MA, KA = VectorizationBase.ArrayInterface.size(A)
KB, NB = VectorizationBase.ArrayInterface.size(B)
M = check_sizes(MC, MA)
K = check_sizes(KA, KB)
N = check_sizes(NC, NB)
M, K, N
end
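For illustration (values are my own), how `check_sizes` and `matmul_sizes` behave on static, dynamic, and mismatched sizes:

```julia
using VectorizationBase: StaticInt

check_sizes(StaticInt{4}(), 4)                 # StaticInt{4}() — static info is preserved
check_sizes(3, 3)                              # 3
check_sizes(StaticInt{4}(), StaticInt{5}())    # throws

matmul_sizes(zeros(2, 3), zeros(2, 4), zeros(4, 3))   # (2, 4, 3)
```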


function unsafe_copyto_avx!(B, A)
LoopVectorization.@avx for i ∈ eachindex(B, A)
B[i] = A[i]
end
end

# @inline function gesp1(sp::P, inds) where {P <: VectorizationBase.AbstractStridedPointer}
# P(VectorizationBase.gep(sp, inds), sp.strd, sp.offsets)
# end

27 changes: 27 additions & 0 deletions test/matmul.jl
@@ -0,0 +1,27 @@

@testset "Matrix Multiply" begin

for T ∈ (Float32, Float64, Int32, Int64)
Mc, Kc, Nc = map(Int, Octavian.block_sizes(T))
for logn ∈ range(log(1), log(1.5Nc+1), length = 5)
n = round(Int, exp(logn))
for logk ∈ range(log(1), log(1.5Kc+1), length = 5)
k = round(Int, exp(logk))
B = rand(T, k, n)
B′ = permutedims(B)'
for logm ∈ range(log(1), log(1.5Mc+1), length = 5)
m = round(Int, exp(logm))
A = rand(T, m, k)
A′ = permutedims(A)'

@show m, k, n
@test @time(Octavian.matmul(A, B)) ≈ A * B
@test @time(Octavian.matmul(A′, B)) ≈ A′ * B
@test @time(Octavian.matmul(A, B′)) ≈ A * B′
@test @time(Octavian.matmul(A′, B′)) ≈ A′ * B′
end
end
end
end
end

6 changes: 5 additions & 1 deletion test/runtests.jl
@@ -11,4 +11,8 @@ versioninfo()

@info("Running tests with $(Threads.nthreads()) threads")

include("macros.jl")
@testset "Octavian" begin
include("macros.jl")
include("matmul.jl")
end