diff --git a/Project.toml b/Project.toml index 7b2e11e..20eb595 100644 --- a/Project.toml +++ b/Project.toml @@ -1,14 +1,15 @@ name = "Octavian" uuid = "6fd5a793-0b7e-452c-907f-f8bfe9c57db4" authors = ["Mason Protter", "Chris Elrod", "Dilum Aluthge", "contributors"] -version = "0.2.20" +version = "0.3.0" [deps] ArrayInterface = "4fba245c-0d91-5ea0-9b3e-6abc04ee57a9" IfElse = "615f187c-cbe4-4ef1-ba3b-2fcf58d6d173" LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890" +ManualMemory = "d125e4d3-2237-4719-b19c-fa641b8a4667" +Polyester = "f517fe37-dbe3-4b94-8317-1923a5111588" Static = "aedffcd0-7271-4cad-89d0-dc628f76c6d3" -StrideArraysCore = "7792a7ef-975c-4747-a70f-980b88e8d1da" ThreadingUtilities = "8290d209-cae3-49c0-8002-c8c24d57dab5" VectorizationBase = "3d5dd08c-fd9d-11e8-17fa-ed2836048c2f" @@ -16,9 +17,10 @@ VectorizationBase = "3d5dd08c-fd9d-11e8-17fa-ed2836048c2f" ArrayInterface = "3.1.14" IfElse = "0.1" LoopVectorization = "0.12.34" +ManualMemory = "0.1.1" +Polyester = "0.3.5" Static = "0.2" -StrideArraysCore = "0.1.11" -ThreadingUtilities = "0.4" +ThreadingUtilities = "0.4.6" VectorizationBase = "0.20.16" julia = "1.6" diff --git a/src/Octavian.jl b/src/Octavian.jl index 4954b98..933cb1c 100644 --- a/src/Octavian.jl +++ b/src/Octavian.jl @@ -8,14 +8,12 @@ using VectorizationBase: align, AbstractStridedPointer, zstridedpointer, vsub_ns using LoopVectorization: preserve_buffer, CloseOpen, UpperBoundedInteger using ArrayInterface: size, strides, offsets, indices, axes using IfElse: ifelse - +using Polyester using Static: StaticInt, Zero, One, StaticBool, True, False, gt, eq, StaticFloat64, roundtostaticint, floortostaticint -using StrideArraysCore: MemoryBuffer +using ManualMemory: MemoryBuffer, load, store! -using ThreadingUtilities: - _atomic_add!, _atomic_load, _atomic_store!, - launch, wait, load, store! +using ThreadingUtilities: _atomic_add!, _atomic_load, _atomic_store!, launch, wait export StaticInt export matmul! diff --git a/src/block_sizes.jl b/src/block_sizes.jl index 4d7756f..7d0587f 100644 --- a/src/block_sizes.jl +++ b/src/block_sizes.jl @@ -220,15 +220,15 @@ end Splits both `M` and `N` into blocks when trying to spawn a large number of threads relative to the size of the matrices. """ @inline function divide_blocks(::Val{T}, M, Ntotal, _nspawn, W) where {T} - _nspawn == num_cores() && return find_first_acceptable(Val(T), M, W) - mᵣ, nᵣ = matmul_params(Val(T)) - Miter = clamp(div_fast(M, W*mᵣ * MᵣW_mul_factor()), 1, _nspawn) + _nspawn == num_cores() && return find_first_acceptable(Val(T), M, W) + mᵣ, nᵣ = matmul_params(Val(T)) + Miter = clamp(div_fast(M, W*mᵣ * MᵣW_mul_factor()), 1, _nspawn) + nspawn = div_fast(_nspawn, Miter) + if (nspawn ≤ 1) & (Miter < _nspawn) + # rebalance Miter + Miter = cld_fast(_nspawn, cld_fast(_nspawn, Miter)) nspawn = div_fast(_nspawn, Miter) - if (nspawn ≤ 1) & (Miter < _nspawn) - # rebalance Miter - Miter = cld_fast(_nspawn, cld_fast(_nspawn, Miter)) - nspawn = div_fast(_nspawn, Miter) - end - Miter, cld_fast(Ntotal, max(2, cld_fast(Ntotal, nspawn))) + end + Miter, cld_fast(Ntotal, max(2, cld_fast(Ntotal, nspawn))) end diff --git a/src/funcptrs.jl b/src/funcptrs.jl index 9a03f9e..fade6ef 100644 --- a/src/funcptrs.jl +++ b/src/funcptrs.jl @@ -1,103 +1,101 @@ - struct LoopMulFunc{P,TC,TA,TB,Α,Β,Md,Kd,Nd} <: Function end function (::LoopMulFunc{P,TC,TA,TB,Α,Β,Md,Kd,Nd})(p::Ptr{UInt}) where {P,TC,TA,TB,Α,Β,Md,Kd,Nd} - offset, C = load(p, TC, 2*sizeof(UInt)) - offset, A = load(p, TA, offset) - offset, B = load(p, TB, offset) - offset, α = load(p, Α, offset) - offset, β = load(p, Β, offset) - offset, M = load(p, Md, offset) - offset, K = load(p, Kd, offset) - offset, N = load(p, Nd, offset) - _call_loopmul!(C, A, B, α, β, M, K, N, Val{P}()) - nothing + offset, C = load(p, TC, 2*sizeof(UInt)) + offset, A = load(p, TA, offset) + offset, B = load(p, TB, offset) + offset, α = load(p, Α, offset) + offset, β = load(p, Β, offset) + offset, M = load(p, Md, offset) + offset, K = load(p, Kd, offset) + offset, N = load(p, Nd, offset) + _call_loopmul!(C, A, B, α, β, M, K, N, Val{P}()) + nothing end @inline _call_loopmul!(C, A, B, α, β, M, K, N, ::Val{false}) = loopmul!(C, A, B, α, β, M, K, N) @inline function _call_loopmul!(C::StridedPointer{T}, A, B, α, β, M, K, N, ::Val{true}) where {T} - if M*K < first_cache_size(Val(T)) * R₂Default() - packaloopmul!(C, A, B, α, β, M, K, N) - return - else - matmul_st_only_pack_A!(C, A, B, α, β, M, K, N, W₁Default(), W₂Default(), R₁Default(), R₂Default()) - return - end + if M*K < first_cache_size(Val(T)) * R₂Default() + packaloopmul!(C, A, B, α, β, M, K, N) + return + else + matmul_st_only_pack_A!(C, A, B, α, β, M, K, N, W₁Default(), W₂Default(), R₁Default(), R₂Default()) + return + end end call_loopmul!(C, A, B, α, β, M, K, N, ::Val{P}) where {P} = _call_loopmul!(C, A, B, α, β, M, K, N, Val{P}()) struct SyncMulFunc{TC,TA,TB,Α,Β,Md,Kd,Nd,BCP,ID,TT,W₁,W₂,R₁,R₂} <: Function end function (::SyncMulFunc{TC,TA,TB,Α,Β,Md,Kd,Nd,BCP,ID,TT,W₁,W₂,R₁,R₂})(p::Ptr{UInt}) where {TC,TA,TB,Α,Β,Md,Kd,Nd,BCP,ID,TT,W₁,W₂,R₁,R₂} - offset, C = load(p, TC, 2*sizeof(UInt)) - offset, A = load(p, TA, offset) - offset, B = load(p, TB, offset) - offset, α = load(p, Α, offset) - offset, β = load(p, Β, offset) - offset, M = load(p, Md, offset) - offset, K = load(p, Kd, offset) - offset, N = load(p, Nd, offset) - offset, atomicp = load(p, Ptr{UInt32}, offset) - offset, bcachep = load(p, BCP, offset) - offset, id = load(p, ID, offset) - offset, total_ids = load(p, TT, offset) - sync_mul!(C, A, B, α, β, M, K, N, atomicp, bcachep, id, total_ids, StaticFloat64{W₁}(), StaticFloat64{W₂}(), StaticFloat64{R₁}(), StaticFloat64{R₂}()) - nothing + offset, C = load(p, TC, 2*sizeof(UInt)) + offset, A = load(p, TA, offset) + offset, B = load(p, TB, offset) + offset, α = load(p, Α, offset) + offset, β = load(p, Β, offset) + offset, M = load(p, Md, offset) + offset, K = load(p, Kd, offset) + offset, N = load(p, Nd, offset) + offset, atomicp = load(p, Ptr{UInt32}, offset) + offset, bcachep = load(p, BCP, offset) + offset, id = load(p, ID, offset) + offset, total_ids = load(p, TT, offset) + sync_mul!(C, A, B, α, β, M, K, N, atomicp, bcachep, id, total_ids, StaticFloat64{W₁}(), StaticFloat64{W₂}(), StaticFloat64{R₁}(), StaticFloat64{R₂}()) + nothing end @generated function cfuncpointer(::T) where {T} - precompile(T(), (Ptr{UInt},)) - quote - $(Expr(:meta,:inline)) - @cfunction($(T()), Cvoid, (Ptr{UInt},)) - end + precompile(T(), (Ptr{UInt},)) + quote + $(Expr(:meta,:inline)) + @cfunction($(T()), Cvoid, (Ptr{UInt},)) + end end @inline function setup_matmul!(p::Ptr{UInt}, C::TC, A::TA, B::TB, α::Α, β::Β, M::Md, K::Kd, N::Nd, ::Val{P}) where {P,TC,TA,TB,Α,Β,Md,Kd,Nd} - offset = store!(p, cfuncpointer(LoopMulFunc{P,TC,TA,TB,Α,Β,Md,Kd,Nd}()), sizeof(UInt)) - offset = store!(p, C, offset) - offset = store!(p, A, offset) - offset = store!(p, B, offset) - offset = store!(p, α, offset) - offset = store!(p, β, offset) - offset = store!(p, M, offset) - offset = store!(p, K, offset) - offset = store!(p, N, offset) - nothing + offset = store!(p, cfuncpointer(LoopMulFunc{P,TC,TA,TB,Α,Β,Md,Kd,Nd}()), sizeof(UInt)) + offset = store!(p, C, offset) + offset = store!(p, A, offset) + offset = store!(p, B, offset) + offset = store!(p, α, offset) + offset = store!(p, β, offset) + offset = store!(p, M, offset) + offset = store!(p, K, offset) + offset = store!(p, N, offset) + nothing end @inline function setup_syncmul!( - p::Ptr{UInt}, C::TC, A::TA, B::TB, α::Α, β::Β, M::Md, K::Kd, N::Nd, - ap::Ptr{UInt32},bcp::BCP,id::ID,tt::TT,::StaticFloat64{W₁},::StaticFloat64{W₂},::StaticFloat64{R₁},::StaticFloat64{R₂} + p::Ptr{UInt}, C::TC, A::TA, B::TB, α::Α, β::Β, M::Md, K::Kd, N::Nd, + ap::Ptr{UInt32},bcp::BCP,id::ID,tt::TT,::StaticFloat64{W₁},::StaticFloat64{W₂},::StaticFloat64{R₁},::StaticFloat64{R₂} ) where {TC,TA,TB,Α,Β,Md,Kd,Nd,BCP,ID,TT,W₁,W₂,R₁,R₂} - offset = store!(p, cfuncpointer(SyncMulFunc{TC,TA,TB,Α,Β,Md,Kd,Nd,BCP,ID,TT,W₁,W₂,R₁,R₂}()), sizeof(UInt)) - offset = store!(p, C, offset) - offset = store!(p, A, offset) - offset = store!(p, B, offset) - offset = store!(p, α, offset) - offset = store!(p, β, offset) - offset = store!(p, M, offset) - offset = store!(p, K, offset) - offset = store!(p, N, offset) - offset = store!(p, ap, offset) - offset = store!(p, bcp, offset) - offset = store!(p, id, offset) - offset = store!(p, tt, offset) - nothing + offset = store!(p, cfuncpointer(SyncMulFunc{TC,TA,TB,Α,Β,Md,Kd,Nd,BCP,ID,TT,W₁,W₂,R₁,R₂}()), sizeof(UInt)) + offset = store!(p, C, offset) + offset = store!(p, A, offset) + offset = store!(p, B, offset) + offset = store!(p, α, offset) + offset = store!(p, β, offset) + offset = store!(p, M, offset) + offset = store!(p, K, offset) + offset = store!(p, N, offset) + offset = store!(p, ap, offset) + offset = store!(p, bcp, offset) + offset = store!(p, id, offset) + offset = store!(p, tt, offset) + nothing end -function launch_thread_mul!(C, A, B, α, β, M, K, N, tid::Int, ::Val{P}) where {P} - launch(tid, C, A, B, α, β, M, K, N, Val{P}()) do p, C, A, B, α, β, M, K, N, VP - setup_matmul!(p, C, A, B, α, β, M, K, N, VP) - end +@inline function launch_thread_mul!(C, A, B, α, β, M, K, N, tid::UInt32, ::Val{P}) where {P} + launch(setup_matmul!, tid, C, A, B, α, β, M, K, N, Val{P}()) end -function launch_thread_mul!( - C, A, B, α, β, M, K, N, ap, bcp, tid, tt,::StaticFloat64{W₁},::StaticFloat64{W₂},::StaticFloat64{R₁},::StaticFloat64{R₂} +@inline function launch_thread_mul!( + C, A, B, α, β, M, K, N, ap, bcp, tid, id, tt, ::StaticFloat64{W₁},::StaticFloat64{W₂},::StaticFloat64{R₁},::StaticFloat64{R₂} ) where {W₁,W₂,R₁,R₂} - launch(tid+one(tid), C, A, B, α, β, M, K, N, ap, bcp, tid, tt) do p, C, A, B, α, β, M, K, N, ap, bcp, tid, tt - setup_syncmul!( - p, C, A, B, α, β, M, K, N, ap, bcp, tid, tt, - StaticFloat64{W₁}(),StaticFloat64{W₂}(),StaticFloat64{R₁}(),StaticFloat64{R₂}() - ) - end + launch(tid, C, A, B, α, β, M, K, N, ap, bcp, id, tt) do p, C, A, B, α, β, M, K, N, ap, bcp, id, tt + Base.@_inline_meta + setup_syncmul!( + p, C, A, B, α, β, M, K, N, ap, bcp, id, tt, + StaticFloat64{W₁}(),StaticFloat64{W₂}(),StaticFloat64{R₁}(),StaticFloat64{R₂}() + ) + end end diff --git a/src/global_constants.jl b/src/global_constants.jl index 2f65b54..c9e5445 100644 --- a/src/global_constants.jl +++ b/src/global_constants.jl @@ -73,7 +73,7 @@ bcache_count() = VectorizationBase.num_cache(second_cache()) const BCACHEPTR = Ref{Ptr{Cvoid}}(C_NULL) const BCACHE_LOCK = Threads.Atomic{UInt}(zero(UInt)) -if Sys.WORD_SIZE ≤ 32 - const ACACHEPTR = Ref{Ptr{Cvoid}}(C_NULL) +@static if Sys.WORD_SIZE ≤ 32 + const ACACHEPTR = Ref{Ptr{Cvoid}}(C_NULL) end diff --git a/src/init.jl b/src/init.jl index 8673d3d..dce3d26 100644 --- a/src/init.jl +++ b/src/init.jl @@ -1,45 +1,45 @@ function __init__() - init_acache() - init_bcache() - nt = init_num_tasks() - if nt < num_cores() && ("OCTAVIAN_WARNING" ∈ keys(ENV)) - msg = string( - "Your system has $(num_cores()) physical cores, but `Octavian.jl` only has ", - "$(nt > 1 ? "$(nt) threads" : "$(nt) thread") available. ", - "For the best performance, you should start Julia with at least $(num_cores()) threads.", - ) - @warn msg - end - reseet_bcache_lock!() + init_acache() + init_bcache() + nt = init_num_tasks() + if nt < num_cores() && ("OCTAVIAN_WARNING" ∈ keys(ENV)) + msg = string( + "Your system has $(num_cores()) physical cores, but `Octavian.jl` only has ", + "$(nt > 1 ? "$(nt) threads" : "$(nt) thread") available. ", + "For the best performance, you should start Julia with at least $(num_cores()) threads.", + ) + @warn msg + end + reseet_bcache_lock!() end function init_bcache() - if bcache_count() ≢ Zero() - BCACHEPTR[] = VectorizationBase.valloc(second_cache_size() * bcache_count(), Cvoid, ccall(:jl_getpagesize, Int, ())) - end - nothing + if bcache_count() ≢ Zero() + BCACHEPTR[] = VectorizationBase.valloc(second_cache_size() * bcache_count(), Cvoid, ccall(:jl_getpagesize, Int, ())) + end + nothing end -if Sys.WORD_SIZE ≤ 32 - function init_acache() - ACACHEPTR[] = VectorizationBase.valloc(first_cache_size() * init_num_tasks(), Cvoid, ccall(:jl_getpagesize, Int, ())) - nothing - end +@static if Sys.WORD_SIZE ≤ 32 + function init_acache() + ACACHEPTR[] = VectorizationBase.valloc(first_cache_size() * init_num_tasks(), Cvoid, ccall(:jl_getpagesize, Int, ())) + nothing + end else - init_acache() = nothing + init_acache() = nothing end function init_num_tasks() - num_tasks = _read_environment_num_tasks()::Int - OCTAVIAN_NUM_TASKS[] = num_tasks + num_tasks = _read_environment_num_tasks()::Int + OCTAVIAN_NUM_TASKS[] = num_tasks end function _read_environment_num_tasks() - environment_variable = get(ENV, "OCTAVIAN_NUM_TASKS", "")::String - nt = min(Threads.nthreads(), VectorizationBase.num_cores())::Int - if isempty(environment_variable) - return nt - else - return min(parse(Int, environment_variable)::Int, nt) - end + environment_variable = get(ENV, "OCTAVIAN_NUM_TASKS", "")::String + nt = min(Threads.nthreads(), VectorizationBase.num_cores())::Int + if isempty(environment_variable) + return nt + else + return min(parse(Int, environment_variable)::Int, nt) + end end diff --git a/src/matmul.jl b/src/matmul.jl index f413dcc..52604b0 100644 --- a/src/matmul.jl +++ b/src/matmul.jl @@ -296,134 +296,147 @@ end end # This funciton is sort of a `pun`. It splits aggressively (it does a lot of "splitin'"), which often means it will split-N. -function matmulsplitn!(C::AbstractStridedPointer{T}, A, B, α, β, ::StaticInt{Mc}, M, K, N, nspawn, ::Val{PACK}) where {T, Mc, PACK} - Mᵣ, Nᵣ = matmul_params(Val(T)) - W = pick_vector_width(T) - MᵣW = Mᵣ*W - _Mblocks, Nblocks = divide_blocks(Val(T), M, cld_fast(N, Nᵣ), nspawn, W) - Mbsize, Mrem, Mremfinal, Mblocks = split_m(M, _Mblocks, W) - # Nblocks = min(N, _Nblocks) - Nbsize, Nrem = divrem_fast(N, Nblocks) - - _nspawn = Mblocks * Nblocks - Mbsize_Mrem, Mbsize_ = promote(Mbsize + W, Mbsize) - Nbsize_Nrem, Nbsize_ = promote(Nbsize + One(), Nbsize) - - let _A = A, _B = B, _C = C, n = 0, tnum = 0, Nrc = Nblocks - Nrem, Mrc = Mblocks - Mrem, __Mblocks = Mblocks - One() - while true - nsize = ifelse(Nblocks > Nrc, Nbsize_Nrem, Nbsize_); Nblocks -= 1 - let _A = _A, _C = _C, __Mblocks = __Mblocks - while __Mblocks != 0 - msize = ifelse(__Mblocks ≥ Mrc, Mbsize_Mrem, Mbsize_); __Mblocks -= 1 - launch_thread_mul!(_C, _A, _B, α, β, msize, K, nsize, (tnum += 1), Val{PACK}()) - _A = gesp(_A, (msize, Zero())) - _C = gesp(_C, (msize, Zero())) - end - if Nblocks != 0 - launch_thread_mul!(_C, _A, _B, α, β, Mremfinal, K, nsize, (tnum += 1), Val{PACK}()) - else - call_loopmul!(_C, _A, _B, α, β, Mremfinal, K, nsize, Val{PACK}()) - waitonmultasks(CloseOpen(One(), _nspawn)) - return - end - end - _B = gesp(_B, (Zero(), nsize)) - _C = gesp(_C, (Zero(), nsize)) +function matmulsplitn!(C::AbstractStridedPointer{T}, A, B, α, β, ::StaticInt{Mc}, M, K, N, threads, ::Val{PACK}) where {T, Mc, PACK} + Mᵣ, Nᵣ = matmul_params(Val(T)) + W = pick_vector_width(T) + MᵣW = Mᵣ*W + _Mblocks, Nblocks = divide_blocks(Val(T), M, cld_fast(N, Nᵣ), threads.i%Int + 1, W) + Mbsize, Mrem, Mremfinal, Mblocks = split_m(M, _Mblocks, W) + # Nblocks = min(N, _Nblocks) + Nbsize, Nrem = divrem_fast(N, Nblocks) + + _nspawn = Mblocks * Nblocks + Mbsize_Mrem, Mbsize_ = promote(Mbsize + W, Mbsize) + Nbsize_Nrem, Nbsize_ = promote(Nbsize + One(), Nbsize) + (tnum, tuu) = Polyester.initial_state(threads) + let _A = A, _B = B, _C = C, n = 0, Nrc = Nblocks - Nrem, Mrc = Mblocks - Mrem, __Mblocks = Mblocks - One() + while true + nsize = ifelse(Nblocks > Nrc, Nbsize_Nrem, Nbsize_); Nblocks -= 1 + let _A = _A, _C = _C, __Mblocks = __Mblocks + while __Mblocks != 0 + msize = ifelse(__Mblocks ≥ Mrc, Mbsize_Mrem, Mbsize_); __Mblocks -= 1 + (tnum, tuu) = Polyester.iter(tnum, tuu) + launch_thread_mul!(_C, _A, _B, α, β, msize, K, nsize, tnum, Val{PACK}()) + _A = gesp(_A, (msize, Zero())) + _C = gesp(_C, (msize, Zero())) end + if Nblocks != 0 + (tnum, tuu) = Polyester.iter(tnum, tuu) + launch_thread_mul!(_C, _A, _B, α, β, Mremfinal, K, nsize, tnum, Val{PACK}()) + else + call_loopmul!(_C, _A, _B, α, β, Mremfinal, K, nsize, Val{PACK}()) + waitonmultasks(threads, _nspawn) + return + end + end + _B = gesp(_B, (Zero(), nsize)) + _C = gesp(_C, (Zero(), nsize)) end + end end function __matmul!( C::AbstractStridedPointer{T}, A::AbstractStridedPointer, B::AbstractStridedPointer, α, β, M, K, N, nthread ) where {T} - Mᵣ, Nᵣ = matmul_params(Val(T)) - W = pick_vector_width(T) - Mc, Kc, Nc = block_sizes(Val(T)) - MᵣW = Mᵣ*W - - # Not taking the fast path - # But maybe we don't want to thread anyway - # Maybe this is nested, or we have ≤ 1 threads - nt = _nthreads() - _nthread = nthread === nothing ? nt : min(nt, nthread) - if _nthread < 2 - matmul_st_pack_dispatcher!(C, A, B, α, β, M, K, N) - return - end - # We are threading, but how many threads? - nspawn = if (Sys.ARCH === :x86_64) || (Sys.ARCH === :i686) - clamp(div_fast(M * N, StaticInt{128}() * W), 1, _nthread) - else - clamp(div_fast(M * N, StaticInt{256}() * W), 1, _nthread) - end - # nkern = cld_fast(M * N, MᵣW * Nᵣ) - - # Approach: - # Check if we don't want to pack A, - # if not, aggressively subdivide - # if so, check if we don't want to pack B - # if not, check if we want to thread `N` loop anyway - # if so, divide `M` first, then use ratio of desired divisions / divisions along `M` to calc divisions along `N` - # if not, only thread along `M`. These don't need syncing, as we're not packing `B` - # if so, `matmul_pack_A_and_B!` - # - # MᵣW * (MᵣW_mul_factor - One()) # gives a smaller Mc, then - # if 2M/nspawn is less than it, we don't don't `A` - # First check is: do we just want to split aggressively? - mᵣ, nᵣ = matmul_params(Val(T)) - if dontpack(A, M, K, Mc, Kc, T, nspawn) || (W ≥ M) || (nᵣ*((num_cores() ≥ StaticInt(8)) ? max(nspawn,8) : 8) ≥ N) - # `nᵣ*nspawn ≥ N` is needed at the moment to avoid accidentally splitting `N` to be `< nᵣ` while packing - # Should probably handle that with a smarter splitting function... - matmulsplitn!(C, A, B, α, β, Mc, M, K, N, nspawn, Val{false}()) - elseif (bcache_count() === Zero()) || ((nspawn*(W+W) > M) || (contiguousstride1(B) ? (roundtostaticint(Kc * Nc * R₂Default()) ≥ K * N) : (firstbytestride(B) ≤ 1600))) - matmulsplitn!(C, A, B, α, β, Mc, M, K, N, nspawn, Val{true}()) - else # TODO: Allow splitting along `N` for `matmul_pack_A_and_B!` - matmul_pack_A_and_B!(C, A, B, α, β, M, K, N, nspawn, W₁Default(), W₂Default(), R₁Default(), R₂Default()) - end - nothing + Mᵣ, Nᵣ = matmul_params(Val(T)) + W = pick_vector_width(T) + Mc, Kc, Nc = block_sizes(Val(T)) + MᵣW = Mᵣ*W + + # Not taking the fast path + # But maybe we don't want to thread anyway + # Maybe this is nested, or we have ≤ 1 threads + nt = _nthreads() + _nthread = nthread === nothing ? nt : min(nt, nthread) + if _nthread < 2 + @label SINGLETHREAD + matmul_st_pack_dispatcher!(C, A, B, α, β, M, K, N) + return + end + # We are threading, but how many threads? + _nrequest = if (Sys.ARCH === :x86_64) || (Sys.ARCH === :i686) + clamp(div_fast(M * N, StaticInt{128}() * W), 0, _nthread-1) + else + clamp(div_fast(M * N, StaticInt{256}() * W), 0, _nthread-1) + end + # nkern = cld_fast(M * N, MᵣW * Nᵣ) + threads, torelease = Polyester.request_threads(Threads.threadid()%UInt32, _nrequest) + nrequest = threads.i + iszero(nrequest) && @goto SINGLETHREAD + nspawn = nrequest + 1 + # Approach: + # Check if we don't want to pack A, + # if not, aggressively subdivide + # if so, check if we don't want to pack B + # if not, check if we want to thread `N` loop anyway + # if so, divide `M` first, then use ratio of desired divisions / divisions along `M` to calc divisions along `N` + # if not, only thread along `M`. These don't need syncing, as we're not packing `B` + # if so, `matmul_pack_A_and_B!` + # + # MᵣW * (MᵣW_mul_factor - One()) # gives a smaller Mc, then + # if 2M/nspawn is less than it, we don't don't `A` + # First check is: do we just want to split aggressively? + mᵣ, nᵣ = matmul_params(Val(T)) + if dontpack(A, M, K, Mc, Kc, T, nspawn) || (W ≥ M) || (nᵣ*((num_cores() ≥ StaticInt(8)) ? max(nspawn,8) : 8) ≥ N) + # `nᵣ*nspawn ≥ N` is needed at the moment to avoid accidentally splitting `N` to be `< nᵣ` while packing + # Should probably handle that with a smarter splitting function... + matmulsplitn!(C, A, B, α, β, Mc, M, K, N, threads, Val{false}()) + elseif (bcache_count() === Zero()) || ((nspawn*(W+W) > M) || (contiguousstride1(B) ? (roundtostaticint(Kc * Nc * R₂Default()) ≥ K * N) : (firstbytestride(B) ≤ 1600))) + matmulsplitn!(C, A, B, α, β, Mc, M, K, N, threads, Val{true}()) + else # TODO: Allow splitting along `N` for `matmul_pack_A_and_B!` + matmul_pack_A_and_B!(C, A, B, α, β, M, K, N, threads, W₁Default(), W₂Default(), R₁Default(), R₂Default()) + end + Polyester.free_threads!(torelease) + nothing end # If tasks is [0,1,2,3] (e.g., `CloseOpen(0,4)`), it will wait on `MULTASKS[i]` for `i = [1,2,3]`. -function waitonmultasks(tasks) - for tid ∈ tasks - wait(tid) - end +function waitonmultasks(threads, nthread) + # for (_,tid) ∈ threads + # wait(tid) + # end + (tnum, tuu) = Polyester.initial_state(threads) + for _ ∈ CloseOpen(One(), nthread) + (tnum, tuu) = Polyester.iter(tnum, tuu) + wait(tnum) + end end @inline allocref(::StaticInt{N}) where {N} = Ref{NTuple{N,UInt8}}() function matmul_pack_A_and_B!( - C::AbstractStridedPointer{T}, A::AbstractStridedPointer, B::AbstractStridedPointer, α, β, M, K, N, - tospawn::Int, ::StaticFloat64{W₁}, ::StaticFloat64{W₂}, ::StaticFloat64{R₁}, ::StaticFloat64{R₂}#, ::Val{1} -) where {T,W₁,W₂,R₁,R₂} - W = pick_vector_width(T) - mᵣ, nᵣ = matmul_params(Val(T)) - mᵣW = mᵣ * W - # atomicsync = Ref{NTuple{16,UInt}}() - Mbsize, Mrem, Mremfinal, _to_spawn = split_m(M, tospawn, W) # M is guaranteed to be > W because of `W ≥ M` condition for `jmultsplitn!`... - atomicsync = allocref((StaticInt{1}()+num_cores())*cache_linesize()) - p = align(reinterpret(Ptr{UInt32}, Base.unsafe_convert(Ptr{UInt8}, atomicsync))) - GC.@preserve atomicsync begin - for i ∈ CloseOpen(_to_spawn) - _atomic_store!(reinterpret(Ptr{UInt64}, p) + i*cache_linesize(), 0x0000000000000000) - end - Mblock_Mrem, Mblock_ = promote(Mbsize + W, Mbsize) - u_to_spawn = _to_spawn % UInt - tid = 0 - bc = _use_bcache() - bc_ptr = Base.unsafe_convert(typeof(pointer(C)), pointer(bc)) - last_id = _to_spawn - One() - for m ∈ CloseOpen(last_id) # ...thus the fact that `CloseOpen()` iterates at least once is okay. - Mblock = ifelse(m < Mrem, Mblock_Mrem, Mblock_) - launch_thread_mul!(C, A, B, α, β, Mblock, K, N, p, bc_ptr, m % UInt, u_to_spawn, StaticFloat64{W₁}(),StaticFloat64{W₂}(),StaticFloat64{R₁}(),StaticFloat64{R₂}()) - A = gesp(A, (Mblock, Zero())) - C = gesp(C, (Mblock, Zero())) - end - sync_mul!(C, A, B, α, β, Mremfinal, K, N, p, bc_ptr, last_id % UInt, u_to_spawn, StaticFloat64{W₁}(), StaticFloat64{W₂}(), StaticFloat64{R₁}(), StaticFloat64{R₂}()) - waitonmultasks(CloseOpen(One(), _to_spawn)) + C::AbstractStridedPointer{T}, A::AbstractStridedPointer, B::AbstractStridedPointer, α, β, M, K, N, + threads, ::StaticFloat64{W₁}, ::StaticFloat64{W₂}, ::StaticFloat64{R₁}, ::StaticFloat64{R₂}#, ::Val{1} + ) where {T,W₁,W₂,R₁,R₂} + W = pick_vector_width(T) + mᵣ, nᵣ = matmul_params(Val(T)) + mᵣW = mᵣ * W + # atomicsync = Ref{NTuple{16,UInt}}() + Mbsize, Mrem, Mremfinal, _to_spawn = split_m(M, threads.i%Int + 1, W) # M is guaranteed to be > W because of `W ≥ M` condition for `jmultsplitn!`... + atomicsync = allocref((StaticInt{1}()+num_cores())*cache_linesize()) + p = align(reinterpret(Ptr{UInt32}, Base.unsafe_convert(Ptr{UInt8}, atomicsync))) + GC.@preserve atomicsync begin + for i ∈ CloseOpen(_to_spawn) + store!(reinterpret(Ptr{UInt64}, p) + i*cache_linesize(), 0x0000000000000000) end - _free_bcache!(bc) - return + Mblock_Mrem, Mblock_ = promote(Mbsize + W, Mbsize) + u_to_spawn = _to_spawn % UInt + (tnum, tuu) = Polyester.initial_state(threads) + bc = _use_bcache() + bc_ptr = Base.unsafe_convert(typeof(pointer(C)), pointer(bc)) + last_id = _to_spawn - One() + for m ∈ CloseOpen(last_id) # ...thus the fact that `CloseOpen()` iterates at least once is okay. + Mblock = ifelse(m < Mrem, Mblock_Mrem, Mblock_) + (tnum, tuu) = Polyester.iter(tnum, tuu) + launch_thread_mul!(C, A, B, α, β, Mblock, K, N, p, bc_ptr, tnum, m % UInt, u_to_spawn, StaticFloat64{W₁}(),StaticFloat64{W₂}(),StaticFloat64{R₁}(),StaticFloat64{R₂}()) + A = gesp(A, (Mblock, Zero())) + C = gesp(C, (Mblock, Zero())) + end + sync_mul!(C, A, B, α, β, Mremfinal, K, N, p, bc_ptr, last_id % UInt, u_to_spawn, StaticFloat64{W₁}(), StaticFloat64{W₂}(), StaticFloat64{R₁}(), StaticFloat64{R₂}()) + waitonmultasks(threads, _to_spawn) + end + _free_bcache!(bc) + return end function sync_mul!( @@ -504,7 +517,7 @@ function sync_mul!( nothing end -function _matmul!(y::AbstractVector{T}, A::AbstractMatrix, x::AbstractVector, α, β, MKN, contig_axis) where {T} +function _matmul!(y::AbstractVector{T}, A::AbstractMatrix, x::AbstractVector, α, β, _, __) where {T} @tturbo for m ∈ indices((A,y),1) yₘ = zero(T) for n ∈ indices((A,x),(2,1)) @@ -514,7 +527,7 @@ function _matmul!(y::AbstractVector{T}, A::AbstractMatrix, x::AbstractVector, α end return y end -function _matmul_serial!(y::AbstractVector{T}, A::AbstractMatrix, x::AbstractVector, α, β, MKN) where {T} +function _matmul_serial!(y::AbstractVector{T}, A::AbstractMatrix, x::AbstractVector, α, β, _) where {T} @turbo for m ∈ indices((A,y),1) yₘ = zero(T) for n ∈ indices((A,x),(2,1)) diff --git a/src/memory_buffer.jl b/src/memory_buffer.jl index 569ce7d..f4dc1aa 100644 --- a/src/memory_buffer.jl +++ b/src/memory_buffer.jl @@ -1,12 +1,15 @@ -if Sys.WORD_SIZE == 32 - @inline function first_cache_buffer(::Val{T}) where {T} - reinterpret(Ptr{T}, ACACHEPTR[] + Threads.threadid() * first_cache_size() - first_cache_size()) - end +@inline function first_cache_buffer(::Val{T}) where {T} + first_cache_buffer(Val{T}(), first_cache_size(Val(T))) +end +@static if Sys.WORD_SIZE == 32 + @inline function first_cache_buffer(::Val{T}, N) where {T} + reinterpret(Ptr{T}, ACACHEPTR[] + ((Threads.threadid()-1) * N) * static_sizeof(T)) + end else - @inline function first_cache_buffer(::Val{T}) where {T} - MemoryBuffer{T}(undef, first_cache_size(Val(T))) - end + @inline function first_cache_buffer(::Val{T}, ::StaticInt{N}) where {T,N} + MemoryBuffer{N,T}(undef) + end end BCache(i::Integer) = BCache(BCACHEPTR[]+cld_fast(second_cache_size()*i, Threads.nthreads()), i % UInt) @@ -16,20 +19,20 @@ BCache(::Nothing) = BCache(BCACHEPTR[], nothing) @inline Base.unsafe_convert(::Type{Ptr{T}}, b::BCache) where {T} = Base.unsafe_convert(Ptr{T}, b.p) function _use_bcache() - while Threads.atomic_cas!(BCACHE_LOCK, zero(UInt), typemax(UInt)) != zero(UInt) - pause() - end - return BCache(nothing) + while Threads.atomic_cas!(BCACHE_LOCK, zero(UInt), typemax(UInt)) != zero(UInt) + pause() + end + return BCache(nothing) end @inline _free_bcache!(b::BCache{Nothing}) = reseet_bcache_lock!() _use_bcache(::Nothing) = _use_bcache() function _use_bcache(i) - f = one(UInt) << i - while (Threads.atomic_or!(BCACHE_LOCK, f) & f) != zero(UInt) - pause() - end - BCache(i) + f = one(UInt) << i + while (Threads.atomic_or!(BCACHE_LOCK, f) & f) != zero(UInt) + pause() + end + BCache(i) end _free_bcache!(b::BCache{UInt}) = (Threads.atomic_xor!(BCACHE_LOCK, one(UInt) << b.i); nothing) diff --git a/test/matmul_coverage.jl b/test/matmul_coverage.jl index 4093a50..47f9595 100644 --- a/test/matmul_coverage.jl +++ b/test/matmul_coverage.jl @@ -3,23 +3,26 @@ k_values = [10, 20, 50, 100, 150, 200] m_values = [10, 20, 50, 100, 150, 200] function matmul_pack_ab!(C, A, B) - M, N = size(C); K = size(B,1) - zc, za, zb = Octavian.zstridedpointer.((C,A,B)) - nspawn = min(Threads.nthreads(), Octavian.num_cores()) - GC.@preserve C A B begin - if nspawn > 1 - Octavian.matmul_pack_A_and_B!( - zc, za, zb, Octavian.StaticInt{1}(), Octavian.StaticInt{0}(), M, K, N, nspawn, - Octavian.W₁Default(), Octavian.W₂Default(), Octavian.R₁Default(), Octavian.R₂Default() - ) - else - Octavian.matmul_st_pack_A_and_B!( - zc, za, zb, Octavian.StaticInt{1}(), Octavian.StaticInt{0}(), M, K, N, - Octavian.W₁Default(), Octavian.W₂Default(), Octavian.R₁Default(), Octavian.R₂Default(), 0 - ) - end + M, N = size(C); K = size(B,1) + zc, za, zb = Octavian.zstridedpointer.((C,A,B)) + nspawn = min(Threads.nthreads(), Octavian.num_cores()) + GC.@preserve C A B begin + if nspawn > 1 + threads, torelease = Octavian.Polyester.request_threads(Threads.threadid(), nspawn-1) + @assert threads.i < Threads.nthreads() + Octavian.matmul_pack_A_and_B!( + zc, za, zb, Octavian.StaticInt{1}(), Octavian.StaticInt{0}(), M, K, N, threads, + Octavian.W₁Default(), Octavian.W₂Default(), Octavian.R₁Default(), Octavian.R₂Default() + ) + Octavian.Polyester.free_threads!(torelease) + else + Octavian.matmul_st_pack_A_and_B!( + zc, za, zb, Octavian.StaticInt{1}(), Octavian.StaticInt{0}(), M, K, N, + Octavian.W₁Default(), Octavian.W₂Default(), Octavian.R₁Default(), Octavian.R₂Default(), 0 + ) end - C + end + C end testset_name_suffix = "(coverage)"