Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 6 additions & 4 deletions Project.toml
Original file line number Diff line number Diff line change
@@ -1,24 +1,26 @@
name = "Octavian"
uuid = "6fd5a793-0b7e-452c-907f-f8bfe9c57db4"
authors = ["Mason Protter", "Chris Elrod", "Dilum Aluthge", "contributors"]
version = "0.2.20"
version = "0.3.0"

[deps]
ArrayInterface = "4fba245c-0d91-5ea0-9b3e-6abc04ee57a9"
IfElse = "615f187c-cbe4-4ef1-ba3b-2fcf58d6d173"
LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890"
ManualMemory = "d125e4d3-2237-4719-b19c-fa641b8a4667"
Polyester = "f517fe37-dbe3-4b94-8317-1923a5111588"
Static = "aedffcd0-7271-4cad-89d0-dc628f76c6d3"
StrideArraysCore = "7792a7ef-975c-4747-a70f-980b88e8d1da"
ThreadingUtilities = "8290d209-cae3-49c0-8002-c8c24d57dab5"
VectorizationBase = "3d5dd08c-fd9d-11e8-17fa-ed2836048c2f"

[compat]
ArrayInterface = "3.1.14"
IfElse = "0.1"
LoopVectorization = "0.12.34"
ManualMemory = "0.1.1"
Polyester = "0.3.5"
Static = "0.2"
StrideArraysCore = "0.1.11"
ThreadingUtilities = "0.4"
ThreadingUtilities = "0.4.6"
VectorizationBase = "0.20.16"
julia = "1.6"

Expand Down
8 changes: 3 additions & 5 deletions src/Octavian.jl
Original file line number Diff line number Diff line change
Expand Up @@ -8,14 +8,12 @@ using VectorizationBase: align, AbstractStridedPointer, zstridedpointer, vsub_ns
using LoopVectorization: preserve_buffer, CloseOpen, UpperBoundedInteger
using ArrayInterface: size, strides, offsets, indices, axes
using IfElse: ifelse

using Polyester
using Static: StaticInt, Zero, One, StaticBool, True, False, gt, eq, StaticFloat64,
roundtostaticint, floortostaticint
using StrideArraysCore: MemoryBuffer
using ManualMemory: MemoryBuffer, load, store!

using ThreadingUtilities:
_atomic_add!, _atomic_load, _atomic_store!,
launch, wait, load, store!
using ThreadingUtilities: _atomic_add!, _atomic_load, _atomic_store!, launch, wait

export StaticInt
export matmul!
Expand Down
18 changes: 9 additions & 9 deletions src/block_sizes.jl
Original file line number Diff line number Diff line change
Expand Up @@ -220,15 +220,15 @@ end
Splits both `M` and `N` into blocks when trying to spawn a large number of threads relative to the size of the matrices.
"""
@inline function divide_blocks(::Val{T}, M, Ntotal, _nspawn, W) where {T}
# NOTE(review): this listing is a rendered diff without +/- markers, so removed and
# added copies of the same statements are interleaved below.
# When the requested spawn count equals `num_cores()`, defer to `find_first_acceptable`.
_nspawn == num_cores() && return find_first_acceptable(Val(T), M, W)
mᵣ, nᵣ = matmul_params(Val(T))
# `Miter` is `M` divided by `W * mᵣ * MᵣW_mul_factor()`, clamped to lie in `1:_nspawn`.
Miter = clamp(div_fast(M, W*mᵣ * MᵣW_mul_factor()), 1, _nspawn)
_nspawn == num_cores() && return find_first_acceptable(Val(T), M, W)
mᵣ, nᵣ = matmul_params(Val(T))
Miter = clamp(div_fast(M, W*mᵣ * MᵣW_mul_factor()), 1, _nspawn)
# `nspawn` is what remains of `_nspawn` after dividing by `Miter`.
nspawn = div_fast(_nspawn, Miter)
if (nspawn ≤ 1) & (Miter < _nspawn)
# rebalance Miter
Miter = cld_fast(_nspawn, cld_fast(_nspawn, Miter))
nspawn = div_fast(_nspawn, Miter)
if (nspawn ≤ 1) & (Miter < _nspawn)
# rebalance Miter
Miter = cld_fast(_nspawn, cld_fast(_nspawn, Miter))
nspawn = div_fast(_nspawn, Miter)
end
# Result is a 2-tuple: `Miter`, and `Ntotal` divided (via `cld_fast`) by a per-block
# width of `max(2, cld_fast(Ntotal, nspawn))`.
Miter, cld_fast(Ntotal, max(2, cld_fast(Ntotal, nspawn)))
end
Miter, cld_fast(Ntotal, max(2, cld_fast(Ntotal, nspawn)))
end

148 changes: 73 additions & 75 deletions src/funcptrs.jl
Original file line number Diff line number Diff line change
@@ -1,103 +1,101 @@


# Field-less callable type (subtype of `Function`); everything it needs is carried in
# its type parameters.
struct LoopMulFunc{P,TC,TA,TB,Α,Β,Md,Kd,Nd} <: Function end
# Calling an instance with a raw pointer `p` reads `C, A, B, α, β, M, K, N` back out of
# the memory behind `p`, starting at offset `2*sizeof(UInt)`; each `load` returns the
# next offset together with the value. The values are forwarded to `_call_loopmul!`
# with `Val{P}()`, and the call returns `nothing`.
# NOTE(review): this listing is a rendered diff without +/- markers, so the body
# statements appear twice (removed and added copies).
function (::LoopMulFunc{P,TC,TA,TB,Α,Β,Md,Kd,Nd})(p::Ptr{UInt}) where {P,TC,TA,TB,Α,Β,Md,Kd,Nd}
offset, C = load(p, TC, 2*sizeof(UInt))
offset, A = load(p, TA, offset)
offset, B = load(p, TB, offset)
offset, α = load(p, Α, offset)
offset, β = load(p, Β, offset)
offset, M = load(p, Md, offset)
offset, K = load(p, Kd, offset)
offset, N = load(p, Nd, offset)
_call_loopmul!(C, A, B, α, β, M, K, N, Val{P}())
nothing
offset, C = load(p, TC, 2*sizeof(UInt))
offset, A = load(p, TA, offset)
offset, B = load(p, TB, offset)
offset, α = load(p, Α, offset)
offset, β = load(p, Β, offset)
offset, M = load(p, Md, offset)
offset, K = load(p, Kd, offset)
offset, N = load(p, Nd, offset)
_call_loopmul!(C, A, B, α, β, M, K, N, Val{P}())
nothing
end
# Method selected by `Val{false}`: forwards every argument unchanged to `loopmul!`.
@inline function _call_loopmul!(C, A, B, α, β, M, K, N, ::Val{false})
    return loopmul!(C, A, B, α, β, M, K, N)
end
# Method selected by `Val{true}`. When `M*K` is below
# `first_cache_size(Val(T)) * R₂Default()` it calls `packaloopmul!`; otherwise it calls
# `matmul_st_only_pack_A!` with the `W₁Default()/W₂Default()/R₁Default()/R₂Default()`
# parameters. Both branches end in a bare `return`, so the result is `nothing`.
# NOTE(review): this listing is a rendered diff without +/- markers, so the `if`/`else`
# body appears twice (removed and added copies).
@inline function _call_loopmul!(C::StridedPointer{T}, A, B, α, β, M, K, N, ::Val{true}) where {T}
if M*K < first_cache_size(Val(T)) * R₂Default()
packaloopmul!(C, A, B, α, β, M, K, N)
return
else
matmul_st_only_pack_A!(C, A, B, α, β, M, K, N, W₁Default(), W₂Default(), R₁Default(), R₂Default())
return
end
if M*K < first_cache_size(Val(T)) * R₂Default()
packaloopmul!(C, A, B, α, β, M, K, N)
return
else
matmul_st_only_pack_A!(C, A, B, α, β, M, K, N, W₁Default(), W₂Default(), R₁Default(), R₂Default())
return
end
end
# Non-inlined entry point: rebuilds `Val{P}()` and delegates to `_call_loopmul!`.
function call_loopmul!(C, A, B, α, β, M, K, N, ::Val{P}) where {P}
    return _call_loopmul!(C, A, B, α, β, M, K, N, Val{P}())
end

# Field-less callable type (subtype of `Function`); the argument types and the
# `W₁, W₂, R₁, R₂` values are carried as type parameters.
struct SyncMulFunc{TC,TA,TB,Α,Β,Md,Kd,Nd,BCP,ID,TT,W₁,W₂,R₁,R₂} <: Function end
# Calling an instance with a raw pointer `p` reads, in order, `C, A, B, α, β, M, K, N`,
# a `Ptr{UInt32}` (`atomicp`), `bcachep`, `id` and `total_ids` from the memory behind
# `p`, starting at offset `2*sizeof(UInt)`. It then calls `sync_mul!` with those values
# plus `StaticFloat64` instances rebuilt from `W₁, W₂, R₁, R₂`, and returns `nothing`.
# NOTE(review): this listing is a rendered diff without +/- markers, so the body
# statements appear twice (removed and added copies).
function (::SyncMulFunc{TC,TA,TB,Α,Β,Md,Kd,Nd,BCP,ID,TT,W₁,W₂,R₁,R₂})(p::Ptr{UInt}) where {TC,TA,TB,Α,Β,Md,Kd,Nd,BCP,ID,TT,W₁,W₂,R₁,R₂}
offset, C = load(p, TC, 2*sizeof(UInt))
offset, A = load(p, TA, offset)
offset, B = load(p, TB, offset)
offset, α = load(p, Α, offset)
offset, β = load(p, Β, offset)
offset, M = load(p, Md, offset)
offset, K = load(p, Kd, offset)
offset, N = load(p, Nd, offset)
offset, atomicp = load(p, Ptr{UInt32}, offset)
offset, bcachep = load(p, BCP, offset)
offset, id = load(p, ID, offset)
offset, total_ids = load(p, TT, offset)
sync_mul!(C, A, B, α, β, M, K, N, atomicp, bcachep, id, total_ids, StaticFloat64{W₁}(), StaticFloat64{W₂}(), StaticFloat64{R₁}(), StaticFloat64{R₂}())
nothing
offset, C = load(p, TC, 2*sizeof(UInt))
offset, A = load(p, TA, offset)
offset, B = load(p, TB, offset)
offset, α = load(p, Α, offset)
offset, β = load(p, Β, offset)
offset, M = load(p, Md, offset)
offset, K = load(p, Kd, offset)
offset, N = load(p, Nd, offset)
offset, atomicp = load(p, Ptr{UInt32}, offset)
offset, bcachep = load(p, BCP, offset)
offset, id = load(p, ID, offset)
offset, total_ids = load(p, TT, offset)
sync_mul!(C, A, B, α, β, M, K, N, atomicp, bcachep, id, total_ids, StaticFloat64{W₁}(), StaticFloat64{W₂}(), StaticFloat64{R₁}(), StaticFloat64{R₂}())
nothing
end

# Generated function keyed on the argument's type `T`. While generating, it
# instantiates `T()` and requests precompilation of its `(Ptr{UInt},)` method; the
# returned expression is marked inline and builds a `@cfunction` for that instance with
# a `Cvoid` return and a single `Ptr{UInt}` argument.
# NOTE(review): this listing is a rendered diff without +/- markers, so the
# `precompile` call and the `quote` block appear twice (removed and added copies).
@generated function cfuncpointer(::T) where {T}
precompile(T(), (Ptr{UInt},))
quote
$(Expr(:meta,:inline))
@cfunction($(T()), Cvoid, (Ptr{UInt},))
end
precompile(T(), (Ptr{UInt},))
quote
$(Expr(:meta,:inline))
@cfunction($(T()), Cvoid, (Ptr{UInt},))
end
end

# Writes a task description into the memory behind `p`: first the C function pointer
# for the matching `LoopMulFunc` (stored at offset `sizeof(UInt)`), then
# `C, A, B, α, β, M, K, N`, each `store!` returning the offset for the next one.
# This is the same order in which `LoopMulFunc`'s call method loads them.
# Returns `nothing`.
# NOTE(review): the loader starts at `2*sizeof(UInt)`; presumably that is where `C`
# lands after the function pointer — confirm against `store!`.
# NOTE(review): this listing is a rendered diff without +/- markers, so the body
# statements appear twice (removed and added copies).
@inline function setup_matmul!(p::Ptr{UInt}, C::TC, A::TA, B::TB, α::Α, β::Β, M::Md, K::Kd, N::Nd, ::Val{P}) where {P,TC,TA,TB,Α,Β,Md,Kd,Nd}
offset = store!(p, cfuncpointer(LoopMulFunc{P,TC,TA,TB,Α,Β,Md,Kd,Nd}()), sizeof(UInt))
offset = store!(p, C, offset)
offset = store!(p, A, offset)
offset = store!(p, B, offset)
offset = store!(p, α, offset)
offset = store!(p, β, offset)
offset = store!(p, M, offset)
offset = store!(p, K, offset)
offset = store!(p, N, offset)
nothing
offset = store!(p, cfuncpointer(LoopMulFunc{P,TC,TA,TB,Α,Β,Md,Kd,Nd}()), sizeof(UInt))
offset = store!(p, C, offset)
offset = store!(p, A, offset)
offset = store!(p, B, offset)
offset = store!(p, α, offset)
offset = store!(p, β, offset)
offset = store!(p, M, offset)
offset = store!(p, K, offset)
offset = store!(p, N, offset)
nothing
end

# Writes a task description into the memory behind `p`: first the C function pointer
# for the matching `SyncMulFunc` (stored at offset `sizeof(UInt)`), then
# `C, A, B, α, β, M, K, N, ap, bcp, id, tt`, each `store!` returning the offset for the
# next one. This is the same order in which `SyncMulFunc`'s call method loads them.
# The four `StaticFloat64` arguments are not stored; their values only become type
# parameters of the `SyncMulFunc`. Returns `nothing`.
# NOTE(review): this listing is a rendered diff without +/- markers, so the argument
# list and the body statements appear twice (removed and added copies).
@inline function setup_syncmul!(
p::Ptr{UInt}, C::TC, A::TA, B::TB, α::Α, β::Β, M::Md, K::Kd, N::Nd,
ap::Ptr{UInt32},bcp::BCP,id::ID,tt::TT,::StaticFloat64{W₁},::StaticFloat64{W₂},::StaticFloat64{R₁},::StaticFloat64{R₂}
p::Ptr{UInt}, C::TC, A::TA, B::TB, α::Α, β::Β, M::Md, K::Kd, N::Nd,
ap::Ptr{UInt32},bcp::BCP,id::ID,tt::TT,::StaticFloat64{W₁},::StaticFloat64{W₂},::StaticFloat64{R₁},::StaticFloat64{R₂}
) where {TC,TA,TB,Α,Β,Md,Kd,Nd,BCP,ID,TT,W₁,W₂,R₁,R₂}
offset = store!(p, cfuncpointer(SyncMulFunc{TC,TA,TB,Α,Β,Md,Kd,Nd,BCP,ID,TT,W₁,W₂,R₁,R₂}()), sizeof(UInt))
offset = store!(p, C, offset)
offset = store!(p, A, offset)
offset = store!(p, B, offset)
offset = store!(p, α, offset)
offset = store!(p, β, offset)
offset = store!(p, M, offset)
offset = store!(p, K, offset)
offset = store!(p, N, offset)
offset = store!(p, ap, offset)
offset = store!(p, bcp, offset)
offset = store!(p, id, offset)
offset = store!(p, tt, offset)
nothing
offset = store!(p, cfuncpointer(SyncMulFunc{TC,TA,TB,Α,Β,Md,Kd,Nd,BCP,ID,TT,W₁,W₂,R₁,R₂}()), sizeof(UInt))
offset = store!(p, C, offset)
offset = store!(p, A, offset)
offset = store!(p, B, offset)
offset = store!(p, α, offset)
offset = store!(p, β, offset)
offset = store!(p, M, offset)
offset = store!(p, K, offset)
offset = store!(p, N, offset)
offset = store!(p, ap, offset)
offset = store!(p, bcp, offset)
offset = store!(p, id, offset)
offset = store!(p, tt, offset)
nothing
end

# Two `launch_thread_mul!` methods, each handing work to `launch` for thread `tid`.
# NOTE(review): this listing is a rendered diff without +/- markers, so two variants of
# each method are interleaved; which variant is the removed side and which the added
# side is presumed from position only — confirm against the real file.
#
# First method (plain multiply). One variant takes `tid::Int` and passes `launch` a
# `do`-block that calls `setup_matmul!`; the other is `@inline`, takes `tid::UInt32`,
# and passes `setup_matmul!` to `launch` directly as its first argument.
function launch_thread_mul!(C, A, B, α, β, M, K, N, tid::Int, ::Val{P}) where {P}
launch(tid, C, A, B, α, β, M, K, N, Val{P}()) do p, C, A, B, α, β, M, K, N, VP
setup_matmul!(p, C, A, B, α, β, M, K, N, VP)
end
@inline function launch_thread_mul!(C, A, B, α, β, M, K, N, tid::UInt32, ::Val{P}) where {P}
launch(setup_matmul!, tid, C, A, B, α, β, M, K, N, Val{P}())
end
# Second method (synchronized multiply). One variant has no separate `id` argument: it
# launches on `tid+one(tid)` and stores `tid` as the id. The other is `@inline`, takes
# `tid` and `id` separately, launches on `tid`, stores `id`, and marks the `do`-block
# with `Base.@_inline_meta`. Both call `setup_syncmul!` with `StaticFloat64` instances
# rebuilt from the `W₁, W₂, R₁, R₂` type parameters.
function launch_thread_mul!(
C, A, B, α, β, M, K, N, ap, bcp, tid, tt,::StaticFloat64{W₁},::StaticFloat64{W₂},::StaticFloat64{R₁},::StaticFloat64{R₂}
@inline function launch_thread_mul!(
C, A, B, α, β, M, K, N, ap, bcp, tid, id, tt, ::StaticFloat64{W₁},::StaticFloat64{W₂},::StaticFloat64{R₁},::StaticFloat64{R₂}
) where {W₁,W₂,R₁,R₂}
launch(tid+one(tid), C, A, B, α, β, M, K, N, ap, bcp, tid, tt) do p, C, A, B, α, β, M, K, N, ap, bcp, tid, tt
setup_syncmul!(
p, C, A, B, α, β, M, K, N, ap, bcp, tid, tt,
StaticFloat64{W₁}(),StaticFloat64{W₂}(),StaticFloat64{R₁}(),StaticFloat64{R₂}()
)
end
launch(tid, C, A, B, α, β, M, K, N, ap, bcp, id, tt) do p, C, A, B, α, β, M, K, N, ap, bcp, id, tt
Base.@_inline_meta
setup_syncmul!(
p, C, A, B, α, β, M, K, N, ap, bcp, id, tt,
StaticFloat64{W₁}(),StaticFloat64{W₂}(),StaticFloat64{R₁}(),StaticFloat64{R₂}()
)
end
end


4 changes: 2 additions & 2 deletions src/global_constants.jl
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ bcache_count() = VectorizationBase.num_cache(second_cache())
# Module-level pointer slot, initially `C_NULL`; `init_bcache` assigns it.
const BCACHEPTR = Ref{Ptr{Cvoid}}(C_NULL)
# Atomic `UInt` counter starting at zero.
const BCACHE_LOCK = Threads.Atomic{UInt}(zero(UInt))

# `ACACHEPTR` is only defined when `Sys.WORD_SIZE ≤ 32`.
# NOTE(review): this listing is a rendered diff without +/- markers; the plain `if` and
# the `@static if` lines are two variants of the same guard, as are the two `const`
# lines.
if Sys.WORD_SIZE ≤ 32
const ACACHEPTR = Ref{Ptr{Cvoid}}(C_NULL)
@static if Sys.WORD_SIZE ≤ 32
const ACACHEPTR = Ref{Ptr{Cvoid}}(C_NULL)
end

62 changes: 31 additions & 31 deletions src/init.jl
Original file line number Diff line number Diff line change
@@ -1,45 +1,45 @@
# Module initializer: sets up both caches, computes the task count, optionally warns,
# and finally calls `reseet_bcache_lock!()`.
# The warning is emitted only when the task count is below `num_cores()` AND the
# `OCTAVIAN_WARNING` key is present in `ENV` (its value is not inspected).
# NOTE(review): `reseet_bcache_lock!` looks like a misspelling of "reset", but it is
# defined outside this view, so it must be renamed at its definition first.
# NOTE(review): this listing is a rendered diff without +/- markers, so the body
# statements appear twice (removed and added copies).
function __init__()
init_acache()
init_bcache()
nt = init_num_tasks()
if nt < num_cores() && ("OCTAVIAN_WARNING" ∈ keys(ENV))
msg = string(
"Your system has $(num_cores()) physical cores, but `Octavian.jl` only has ",
"$(nt > 1 ? "$(nt) threads" : "$(nt) thread") available. ",
"For the best performance, you should start Julia with at least $(num_cores()) threads.",
)
@warn msg
end
reseet_bcache_lock!()
init_acache()
init_bcache()
nt = init_num_tasks()
if nt < num_cores() && ("OCTAVIAN_WARNING" ∈ keys(ENV))
msg = string(
"Your system has $(num_cores()) physical cores, but `Octavian.jl` only has ",
"$(nt > 1 ? "$(nt) threads" : "$(nt) thread") available. ",
"For the best performance, you should start Julia with at least $(num_cores()) threads.",
)
@warn msg
end
reseet_bcache_lock!()
end

# Unless `bcache_count()` is `Zero()`, allocates `second_cache_size() * bcache_count()`
# through `VectorizationBase.valloc`, passing the page size reported by
# `jl_getpagesize` as the last argument, and stores the pointer in `BCACHEPTR`.
# Returns `nothing` either way.
# NOTE(review): this listing is a rendered diff without +/- markers, so the body
# statements appear twice (removed and added copies).
function init_bcache()
if bcache_count() ≢ Zero()
BCACHEPTR[] = VectorizationBase.valloc(second_cache_size() * bcache_count(), Cvoid, ccall(:jl_getpagesize, Int, ()))
end
nothing
if bcache_count() ≢ Zero()
BCACHEPTR[] = VectorizationBase.valloc(second_cache_size() * bcache_count(), Cvoid, ccall(:jl_getpagesize, Int, ()))
end
nothing
end

# `init_acache` is defined conditionally on the word size. When `Sys.WORD_SIZE ≤ 32` it
# allocates `first_cache_size() * init_num_tasks()` through `VectorizationBase.valloc`
# (page size from `jl_getpagesize` as the last argument) and stores the pointer in
# `ACACHEPTR`; otherwise it is a no-op. It returns `nothing` in both cases.
# NOTE(review): this listing is a rendered diff without +/- markers; the plain `if` and
# the `@static if` lines are two variants of the same guard, and each definition
# appears twice.
if Sys.WORD_SIZE ≤ 32
function init_acache()
ACACHEPTR[] = VectorizationBase.valloc(first_cache_size() * init_num_tasks(), Cvoid, ccall(:jl_getpagesize, Int, ()))
nothing
end
@static if Sys.WORD_SIZE ≤ 32
function init_acache()
ACACHEPTR[] = VectorizationBase.valloc(first_cache_size() * init_num_tasks(), Cvoid, ccall(:jl_getpagesize, Int, ()))
nothing
end
else
init_acache() = nothing
init_acache() = nothing
end

# Reads the task count via `_read_environment_num_tasks()` (asserted to be `Int`),
# stores it in `OCTAVIAN_NUM_TASKS[]`, and returns it (the assignment is the last
# expression).
# NOTE(review): this listing is a rendered diff without +/- markers, so the body
# statements appear twice (removed and added copies).
function init_num_tasks()
num_tasks = _read_environment_num_tasks()::Int
OCTAVIAN_NUM_TASKS[] = num_tasks
num_tasks = _read_environment_num_tasks()::Int
OCTAVIAN_NUM_TASKS[] = num_tasks
end

# Determines the task count. The upper bound `nt` is the smaller of
# `Threads.nthreads()` and `VectorizationBase.num_cores()`. If the `OCTAVIAN_NUM_TASKS`
# environment variable is unset or empty, `nt` is returned; otherwise the variable is
# parsed as an `Int` and the smaller of that value and `nt` is returned.
# `parse(Int, ...)` throws if the variable is not a valid integer; there is no fallback
# here.
# NOTE(review): this listing is a rendered diff without +/- markers, so the body
# statements appear twice (removed and added copies).
function _read_environment_num_tasks()
environment_variable = get(ENV, "OCTAVIAN_NUM_TASKS", "")::String
nt = min(Threads.nthreads(), VectorizationBase.num_cores())::Int
if isempty(environment_variable)
return nt
else
return min(parse(Int, environment_variable)::Int, nt)
end
environment_variable = get(ENV, "OCTAVIAN_NUM_TASKS", "")::String
nt = min(Threads.nthreads(), VectorizationBase.num_cores())::Int
if isempty(environment_variable)
return nt
else
return min(parse(Int, environment_variable)::Int, nt)
end
end
Loading