Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .github/workflows/ci-julia-nightly.yml
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ jobs:
threads:
- '1'
- '2'
# - '64'
version:
- 'nightly'
exclude:
Expand Down
1 change: 1 addition & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ jobs:
threads:
- '1'
- '2'
# - '64'
version:
- '1.5'
- '1' # automatically expands to the latest stable 1.x release of Julia
Expand Down
4 changes: 3 additions & 1 deletion Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,9 @@ julia = "1.5"
BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890"
VectorizationBase = "3d5dd08c-fd9d-11e8-17fa-ed2836048c2f"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"

[targets]
test = ["BenchmarkTools", "InteractiveUtils", "LinearAlgebra", "Test"]
test = ["BenchmarkTools", "InteractiveUtils", "LinearAlgebra", "LoopVectorization", "VectorizationBase", "Test"]
7 changes: 2 additions & 5 deletions src/Octavian.jl
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ using VectorizationBase: StaticInt
export matmul
export matmul!

include("global_constants.jl")
include("macros.jl")
include("types.jl")

Expand All @@ -18,10 +19,6 @@ include("memory_buffer.jl")
include("pointer_matrix.jl")
include("utils.jl")

const BCACHE = UInt8[]

# Module initialization hook. Resizes the module-level `BCACHE` buffer (a
# `UInt8` vector) to `CACHE_SIZE[3] * CACHE_COUNT[3]` elements, using the values
# reported by VectorizationBase.
# NOTE(review): index 3 presumably selects the L3 cache level — confirm against
# VectorizationBase.
function __init__()
resize!(BCACHE, VectorizationBase.CACHE_SIZE[3] * VectorizationBase.CACHE_COUNT[3]);
end
include("init.jl") # `Octavian.__init__()` is defined in this file

end # module Octavian
1 change: 0 additions & 1 deletion src/block_sizes.jl
Original file line number Diff line number Diff line change
Expand Up @@ -68,4 +68,3 @@ end

# `_calculate_L3` divides a size by `st`, truncating like `÷`. The trailing `Val`
# flag chooses whether `_L2` is subtracted from `_L3` before dividing.
# NOTE(review): the flag presumably encodes whether the L3 budget must exclude
# the L2 contents — confirm with the caller.
function _calculate_L3(_L2, _L3, st, ::Val{true})
    remaining = _L3 - _L2
    return div(remaining, st)
end

function _calculate_L3(_L2, _L3, st, ::Val{false})
    return div(_L3, st)
end

3 changes: 3 additions & 0 deletions src/global_constants.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Module-level byte buffer. It starts empty because its final length is only
# known at load time; `init_bcache()` resizes it.
const BCACHE = Vector{UInt8}()

# Task count used when partitioning work. The initial value `1` is a
# placeholder that `init_num_tasks()` overwrites during module initialization.
const OCTAVIAN_NUM_TASKS = Ref{Int}(1)
23 changes: 23 additions & 0 deletions src/init.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
"""
    __init__()

Module initialization hook. Sizes the shared `BCACHE` buffer and then sets
`OCTAVIAN_NUM_TASKS`. Always returns `nothing`.
"""
function __init__()
    # Step 1: size the module-level byte buffer.
    init_bcache()
    # Step 2: pick the task count (environment override or computed default).
    init_num_tasks()
    return nothing
end

"""
    init_bcache()

Resize the module-level `BCACHE` buffer to `CACHE_SIZE[3] * CACHE_COUNT[3]`
elements, as reported by VectorizationBase, and return the resized buffer.
"""
function init_bcache()
    # NOTE(review): index 3 presumably selects the L3 cache level — confirm
    # against VectorizationBase.
    cache_size = VectorizationBase.CACHE_SIZE[3]
    cache_count = VectorizationBase.CACHE_COUNT[3]
    return resize!(BCACHE, cache_size * cache_count)
end

"""
    init_num_tasks()

Store the task count obtained from `_read_environment_num_tasks()` in
`OCTAVIAN_NUM_TASKS` and return that count.
"""
function init_num_tasks()
    requested = _read_environment_num_tasks()::Int
    OCTAVIAN_NUM_TASKS[] = requested
    return requested
end

"""
    _read_environment_num_tasks() -> Int

Determine how many tasks Octavian should use.

If the `OCTAVIAN_NUM_TASKS` environment variable is unset or empty, return
`min(Threads.nthreads(), VectorizationBase.NUM_CORES)`. Otherwise the variable
must hold a positive integer, which is returned.

# Throws
- `ArgumentError`: if `OCTAVIAN_NUM_TASKS` is set but is not an integer, or is
  an integer smaller than 1.
"""
function _read_environment_num_tasks()
    environment_variable = get(ENV, "OCTAVIAN_NUM_TASKS", "")::String
    if isempty(environment_variable)
        return min(Threads.nthreads(), VectorizationBase.NUM_CORES)::Int
    end

    num_tasks = tryparse(Int, environment_variable)
    # Reject non-positive counts here: the value is later used as the chunk
    # length for `Iterators.partition`, which throws for lengths below 1 with a
    # message that does not mention the environment variable.
    if num_tasks === nothing || num_tasks < 1
        throw(ArgumentError(
            "OCTAVIAN_NUM_TASKS must be a positive integer, got " *
            repr(environment_variable),
        ))
    end
    return num_tasks::Int
end
78 changes: 43 additions & 35 deletions src/matmul.jl
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ evenly_divide(x, y) = cld(x, cld(x, y))
evenly_divide(x, y, z) = cld(evenly_divide(x, y), z) * z

"""
matmul!(C::AbstractMatrix{T}, A::AbstractMatrix{T}, B::AbstractMatrix{T}, _α = one(T), _β = zero(T)) where {T}
matmul!(C::AbstractMatrix, A::AbstractMatrix, B::AbstractMatrix, _α = 1, _β = 0)
"""
function matmul!(C::AbstractMatrix{T}, A::AbstractMatrix{T}, B::AbstractMatrix{T}, _α = one(T), _β = zero(T)) where {T}
_Mc, _Kc, _Nc = block_sizes(T)
Expand All @@ -18,56 +18,64 @@ function matmul!(C::AbstractMatrix{T}, A::AbstractMatrix{T}, B::AbstractMatrix{T

# check if we want to skip packing B
do_not_pack_B = (B isa DenseArray && (K * N ≤ _Kc * _Nc)) || N ≤ LoopVectorization.nᵣ
# Create L2-buffer for `A`; it should be stack-allocated
Amem = L2Buffer(T)
Aptr = Base.unsafe_convert(Ptr{T}, Amem);

Bptr = Base.unsafe_convert(Ptr{T}, BCACHE);

Mc = evenly_divide(M, _Mc, VectorizationBase.pick_vector_width_val(T) * StaticInt{LoopVectorization.mᵣ}())
Kc = evenly_divide(M, _Kc)
Nc = evenly_divide(M, _Nc, StaticInt{LoopVectorization.nᵣ}())

GC.@preserve Amem begin
α = T(_α);
for n ∈ StaticInt{1}():Nc:N # loop 5
nsize = min(Int(n + Nc), Int(N + 1)) - n
β = T(_β)
for k ∈ StaticInt{1}():Kc:K # loop 4
ksize = min(Int(k + Kc), Int(K + 1)) - k
Bview = view(B, k:k+ksize-1, n:n+nsize-1)
# seperate out loop 3, because of _Bblock type instability
if do_not_pack_B
# _Bblock is likely to have the same type as _Bblock; it'd be nice to reduce the amount of compilation
# by homogenizing types across branches, but for now I'm prefering the simplicity of using `Bview`
# _Bblock = PointerMatrix(gesp1(stridedpointer(B), (k,n)), ksize, nsize)
# matmul_loop3!(C, Aptr, Ablock, A, _Bblock, α, β, msize, ksize, nsize, M, k, n, Mc)
matmul_loop3!(C, Aptr, A, Bview, α, β, ksize, nsize, M, k, n, Mc)
else
Bblock = PointerMatrix(Bptr, (ksize,nsize))
unsafe_copyto_avx!(Bblock, Bview)
matmul_loop3!(C, Aptr, A, Bblock, α, β, ksize, nsize, M, k, n, Mc)
end
β = one(T) # re-writing to the same blocks of `C`, so replace original factor with `1`
α = T(_α);
for n ∈ StaticInt{1}():Nc:N # loop 5
nsize = min(Int(n + Nc), Int(N + 1)) - n
β = T(_β)
for k ∈ StaticInt{1}():Kc:K # loop 4
ksize = min(Int(k + Kc), Int(K + 1)) - k
Bview = view(B, k:k+ksize-1, n:n+nsize-1)
# seperate out loop 3, because of _Bblock type instability
if do_not_pack_B
# _Bblock is likely to have the same type as _Bblock; it'd be nice to reduce the amount of compilation
# by homogenizing types across branches, but for now I'm prefering the simplicity of using `Bview`
# _Bblock = PointerMatrix(gesp1(stridedpointer(B), (k,n)), ksize, nsize)
# matmul_loop3!(C, T, Ablock, A, _Bblock, α, β, msize, ksize, nsize, M, k, n, Mc)
matmul_loop3!(C, T, A, Bview, α, β, ksize, nsize, M, k, n, Mc)
else
Bblock = PointerMatrix(Bptr, (ksize,nsize))
unsafe_copyto_avx!(Bblock, Bview)
matmul_loop3!(C, T, A, Bblock, α, β, ksize, nsize, M, k, n, Mc)
end
end
end # GC.@preserve
β = one(T) # re-writing to the same blocks of `C`, so replace original factor with `1`
end # loop 4
end # loop 5
C
end

function matmul_loop3!(C, Aptr, A, Bblock, α, β, ksize, nsize, M, k, n, Mc)
for m ∈ StaticInt{1}():Mc:M
msize = min(Int(m + Mc), Int(M + 1)) - m
Ablock = PointerMatrix(Aptr, (msize, ksize), true)
unsafe_copyto_avx!(Ablock, view(A, m:m+msize-1, k:k+ksize-1))
function matmul_loop3!(C, ::Type{T}, A, Bblock, α, β, ksize, nsize, M, k, n, Mc) where {T}
full_range = StaticInt{1}():Mc:M
partitions = Iterators.partition(full_range, OCTAVIAN_NUM_TASKS[])
@_sync for partition ∈ partitions
@_spawn begin
# Create L2-buffer for `A`; it should be stack-allocated
Amem = L2Buffer(T)
Aptr = Base.unsafe_convert(Ptr{T}, Amem);
GC.@preserve Amem begin
for m ∈ partition # loop 3
msize = min(Int(m + Mc), Int(M + 1)) - m
Ablock = PointerMatrix(Aptr, (msize, ksize), true)
unsafe_copyto_avx!(Ablock, view(A, m:m+msize-1, k:k+ksize-1))

Cblock = view(C, m:m+msize-1, n:n+nsize-1)
macrokernel!(Cblock, Ablock, Bblock, α, β)
Cblock = view(C, m:m+msize-1, n:n+nsize-1)
macrokernel!(Cblock, Ablock, Bblock, α, β)
end # loop 3
end # GC.@preserve
end
end
end

"""
matmul(A::AbstractMatrix{Ta}, B::AbstractMatrix{Tb}) where {Ta, Tb}
matmul(A::AbstractMatrix, B::AbstractMatrix)

Return the matrix product A*B.
"""
function matmul(A::AbstractMatrix{Ta}, B::AbstractMatrix{Tb}) where {Ta, Tb}
# TODO: use `similar` / make it more generic; ideally should work with `StaticArrays.MArray`
Expand Down
8 changes: 8 additions & 0 deletions test/init.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
@testset "init" begin
    # With an empty variable the reader falls back to its computed default.
    expected_default = min(Threads.nthreads(), VectorizationBase.NUM_CORES)
    withenv("OCTAVIAN_NUM_TASKS" => "") do
        observed = Octavian._read_environment_num_tasks()
        @test observed == expected_default
    end

    # An explicit integer value is returned as-is.
    withenv("OCTAVIAN_NUM_TASKS" => "99") do
        observed = Octavian._read_environment_num_tasks()
        @test observed == 99
    end
end
8 changes: 8 additions & 0 deletions test/runtests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,21 @@ import Octavian
import BenchmarkTools
import InteractiveUtils
import LinearAlgebra
import LoopVectorization
import Test
import VectorizationBase

using Test: @testset, @test, @test_throws

@info("Sys.CPU_THREADS is $(Sys.CPU_THREADS)")
@info("VectorizationBase.NUM_CORES is $(VectorizationBase.NUM_CORES)")

include("test_suite_preamble.jl")

@info("Running Octavian tests with $(Octavian.OCTAVIAN_NUM_TASKS[]) tasks")

include("block_sizes.jl")
include("init.jl")
include("macrokernels.jl")
include("macros.jl")
include("matmul_coverage.jl")
Expand Down