Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions Project.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
name = "Octavian"
uuid = "6fd5a793-0b7e-452c-907f-f8bfe9c57db4"
authors = ["Mason Protter", "Chris Elrod", "Dilum Aluthge", "contributors"]
version = "0.2.8"
version = "0.2.9"

[deps]
ArrayInterface = "4fba245c-0d91-5ea0-9b3e-6abc04ee57a9"
Expand All @@ -11,9 +11,9 @@ VectorizationBase = "3d5dd08c-fd9d-11e8-17fa-ed2836048c2f"

[compat]
ArrayInterface = "3"
LoopVectorization = "0.11"
LoopVectorization = "0.11.2"
ThreadingUtilities = "0.2"
VectorizationBase = "0.17"
VectorizationBase = "0.18.1"
julia = "1.5"

[extras]
Expand Down
6 changes: 3 additions & 3 deletions benchmark/staticarraybench.jl
Original file line number Diff line number Diff line change
Expand Up @@ -63,11 +63,11 @@ rename!(df, matmulmethodnames);
df.Size = sizerange

function pick_suffix(desc = "")
suffix = if Octavian.VectorizationBase.has_feature("x86_64_avx512f")
suffix = if Bool(Octavian.VectorizationBase.has_feature(Val(:x86_64_avx512f)))
"AVX512"
elseif Octavian.VectorizationBase.has_feature("x86_64_avx2")
elseif Bool(Octavian.VectorizationBase.has_feature(Val(:x86_64_avx2)))
"AVX2"
elseif Octavian.VectorizationBase.has_feature("x86_64_avx")
elseif Bool(Octavian.VectorizationBase.has_feature(Val(:x86_64_avx)))
"AVX"
else
"REGSIZE$(Octavian.VectorizationBase.register_size())"
Expand Down
7 changes: 4 additions & 3 deletions src/Octavian.jl
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,12 @@ module Octavian
using VectorizationBase, ArrayInterface, LoopVectorization

using VectorizationBase: align, AbstractStridedPointer, zstridedpointer,
static_sizeof, lazymul, StridedPointer, gesp, pause, pick_vector_width_val,
snum_cache_levels, scache_size, snum_cores, num_cores, cache_inclusivity, scacheline_size
static_sizeof, lazymul, StridedPointer, gesp, pause, pick_vector_width, has_feature,
num_cache_levels, cache_size, num_cores, num_cores, cache_inclusive, cache_linesize, ifelse
using LoopVectorization: maybestaticsize, matmul_params, preserve_buffer, CloseOpen
using ArrayInterface: StaticInt, Zero, One, OptionallyStaticUnitRange, size, strides, offsets, indices,
static_length, static_first, static_last, axes, dense_dims, stride_rank
static_length, static_first, static_last, axes, dense_dims, stride_rank,
StaticBool, True, False, gt, eq

using ThreadingUtilities:
_atomic_add!, _atomic_umax!, _atomic_umin!,
Expand Down
6 changes: 3 additions & 3 deletions src/block_sizes.jl
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@


function block_sizes(::Type{T}, _α, _β, R₁, R₂) where {T}
W = pick_vector_width_val(T)
W = pick_vector_width(T)
α = _α * W
β = _β * W
L₁ₑ = first_cache_size(T) * R₁
Expand Down Expand Up @@ -158,7 +158,7 @@ Note that for synchronization on `B`, all threads must have the same values for
independently of `M`, this algorithm guarantees all threads are on the same page.
"""
@inline function solve_block_sizes(::Type{T}, M, K, N, _α, _β, R₂, R₃, Wfactor) where {T}
W = pick_vector_width_val(T)
W = pick_vector_width(T)
α = _α * W
β = _β * W
L₁ₑ = first_cache_size(T) * R₂
Expand All @@ -177,7 +177,7 @@ independently of `M`, this algorithm guarantees all threads are on the same page
end
# Takes Nc, calcs Mc and Kc
@inline function solve_McKc(::Type{T}, M, K, Nc, _α, _β, R₂, R₃, Wfactor) where {T}
W = pick_vector_width_val(T)
W = pick_vector_width(T)
α = _α * W
β = _β * W
L₁ₑ = first_cache_size(T) * R₂
Expand Down
125 changes: 48 additions & 77 deletions src/global_constants.jl
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
# Mutable holder for the task count; initialised to 1.
const OCTAVIAN_NUM_TASKS = Ref(1)

# Return the value currently stored in `OCTAVIAN_NUM_TASKS`.
function _nthreads()
    return OCTAVIAN_NUM_TASKS[]
end

@generated function calc_factors(::Union{Val{nc},StaticInt{nc}} = snum_cores()) where {nc}
@generated function calc_factors(::Union{Val{nc},StaticInt{nc}} = num_cores()) where {nc}
t = Expr(:tuple)
for i ∈ nc:-1:1
d, r = divrem(nc, i)
Expand All @@ -12,87 +12,58 @@ _nthreads() = OCTAVIAN_NUM_TASKS[]
end
# const CORE_FACTORS = calc_factors()

# Generated at compile time: yields `StaticInt{4}()` when
# `VectorizationBase.has_feature("x86_64_avx512f")` is true and `StaticInt{9}()` otherwise.
@generated function MᵣW_mul_factor()
    if VectorizationBase.has_feature("x86_64_avx512f")
        return :(StaticInt{4}())
    end
    return :(StaticInt{9}())
end
# Factor used when the feature flag is `True`.
function MᵣW_mul_factor(::True)
    return StaticInt{4}()
end

# Factor used when the feature flag is `False`.
function MᵣW_mul_factor(::False)
    return StaticInt{9}()
end

# Entry point: dispatch on the static result of the AVX512F feature query.
function MᵣW_mul_factor()
    has_avx512f = has_feature(Val(:x86_64_avx512f))
    return MᵣW_mul_factor(has_avx512f)
end

# Generated at compile time: picks the default `W₁` tuning value from the AVX512F
# feature flag and, failing that, from `Sys.CPU_NAME`, and returns it as a `StaticFloat`.
@generated function W₁Default()
    if VectorizationBase.has_feature("x86_64_avx512f")
        return :(StaticFloat{0.006089395198610773}())
    end
    cpu = Sys.CPU_NAME
    # these are znver2 values, assumed to be better for znver3 than the generic ones
    if cpu === "znver2" || cpu === "znver3"
        return :(StaticFloat{0.1}())
    elseif cpu === "znver1"
        return :(StaticFloat{0.053918949422353986}())
    end
    # generic fallback
    return :(StaticFloat{0.1}())
end
# Generated at compile time: picks the default `W₂` tuning value from the AVX512F
# feature flag and, failing that, from `Sys.CPU_NAME`, and returns it as a `StaticFloat`.
@generated function W₂Default()
    if VectorizationBase.has_feature("x86_64_avx512f")
        return :(StaticFloat{0.7979822724696168}())
    end
    cpu = Sys.CPU_NAME
    # these are znver2 values, assumed to be better for znver3 than the generic ones
    if cpu === "znver2" || cpu === "znver3"
        return :(StaticFloat{0.993489411720157}())
    elseif cpu === "znver1"
        return :(StaticFloat{0.3013238122374886}())
    end
    # generic fallback
    return :(StaticFloat{0.15989396641218157}())
end
# Generated at compile time: picks the default `R₁` tuning value from the AVX512F
# feature flag and, failing that, from `Sys.CPU_NAME`, and returns it as a `StaticFloat`.
@generated function R₁Default()
    if VectorizationBase.has_feature("x86_64_avx512f")
        return :(StaticFloat{0.5900561503730485}())
    end
    cpu = Sys.CPU_NAME
    # these are znver2 values, assumed to be better for znver3 than the generic ones
    if cpu === "znver2" || cpu === "znver3"
        return :(StaticFloat{0.6052218809954467}())
    elseif cpu === "znver1"
        return :(StaticFloat{0.6077103834481342}())
    end
    # generic fallback
    return :(StaticFloat{0.4203583148344484}())
end
# Generated at compile time: picks the default `R₂` tuning value from the AVX512F
# feature flag and, failing that, from `Sys.CPU_NAME`, and returns it as a `StaticFloat`.
@generated function R₂Default()
    if VectorizationBase.has_feature("x86_64_avx512f")
        return :(StaticFloat{0.762152930709678}())
    end
    cpu = Sys.CPU_NAME
    # these are znver2 values, assumed to be better for znver3 than the generic ones
    if cpu === "znver2" || cpu === "znver3"
        return :(StaticFloat{0.7594052633561165}())
    elseif cpu === "znver1"
        return :(StaticFloat{0.8775382433240162}())
    end
    # generic fallback
    return :(StaticFloat{0.6344856142604789}())
end
# Defaults selected when the static flag argument is `True`
# (the zero-argument wrappers pass the AVX512F feature query result).
function W₁Default(::True)
    return StaticFloat{0.006089395198610773}()
end
function W₂Default(::True)
    return StaticFloat{0.7979822724696168}()
end
function R₁Default(::True)
    return StaticFloat{0.5900561503730485}()
end
function R₂Default(::True)
    return StaticFloat{0.762152930709678}()
end

# Per-architecture defaults for `Val{:znver1}`.
function W₁Default_arch(::Val{:znver1})
    return StaticFloat{0.053918949422353986}()
end
function W₂Default_arch(::Val{:znver1})
    return StaticFloat{0.3013238122374886}()
end
function R₁Default_arch(::Val{:znver1})
    return StaticFloat{0.6077103834481342}()
end
function R₂Default_arch(::Val{:znver1})
    return StaticFloat{0.8775382433240162}()
end

# Per-architecture defaults shared by `Val{:znver2}` and `Val{:znver3}`.
function W₁Default_arch(::Union{Val{:znver2},Val{:znver3}})
    return StaticFloat{0.1}()
end
function W₂Default_arch(::Union{Val{:znver2},Val{:znver3}})
    return StaticFloat{0.993489411720157}()
end
function R₁Default_arch(::Union{Val{:znver2},Val{:znver3}})
    return StaticFloat{0.6052218809954467}()
end
function R₂Default_arch(::Union{Val{:znver2},Val{:znver3}})
    return StaticFloat{0.7594052633561165}()
end

# Generic fallback defaults, used for any architecture tag without a more specific
# `*Default_arch` method. The four values match the final `else` branches of the
# `@generated` `W₁Default`/`W₂Default`/`R₁Default`/`R₂Default` definitions.
# `R₂` is the generic 0.6344856142604789, not the znver1 value 0.8775382433240162.
W₁Default_arch(_) = StaticFloat{0.1}()
W₂Default_arch(_) = StaticFloat{0.15989396641218157}()
R₁Default_arch(_) = StaticFloat{0.4203583148344484}()
R₂Default_arch(_) = StaticFloat{0.6344856142604789}()

# Defaults selected when the static flag argument is `False`: defer to the
# per-architecture table, keyed by whatever `VectorizationBase.cpu_name()` returns.
function W₁Default(::False)
    arch = VectorizationBase.cpu_name()
    return W₁Default_arch(arch)
end
function W₂Default(::False)
    arch = VectorizationBase.cpu_name()
    return W₂Default_arch(arch)
end
function R₁Default(::False)
    arch = VectorizationBase.cpu_name()
    return R₁Default_arch(arch)
end
function R₂Default(::False)
    arch = VectorizationBase.cpu_name()
    return R₂Default_arch(arch)
end

# Zero-argument entry points: query the AVX512F feature flag and dispatch on the result.
function W₁Default()
    has_avx512f = has_feature(Val(:x86_64_avx512f))
    return W₁Default(has_avx512f)
end
function W₂Default()
    has_avx512f = has_feature(Val(:x86_64_avx512f))
    return W₂Default(has_avx512f)
end
function R₁Default()
    has_avx512f = has_feature(Val(:x86_64_avx512f))
    return R₁Default(has_avx512f)
end
function R₂Default()
    has_avx512f = has_feature(Val(:x86_64_avx512f))
    return R₂Default(has_avx512f)
end




# Cache level treated as the "first" one: level 2 when more than two cache levels
# are reported by `num_cache_levels()`, otherwise level 1.
function first_cache()
    more_than_two_levels = gt(num_cache_levels(), StaticInt{2}())
    return ifelse(more_than_two_levels, StaticInt{2}(), StaticInt{1}())
end

# The "second" cache is the level directly after `first_cache()`.
function second_cache()
    return first_cache() + One()
end

# Known size: when the first cache is level 2 and `cache_inclusive(StaticInt(2))` holds,
# subtract the level-1 size from it; otherwise return the size unchanged.
function _first_cache_size(fcs::StaticInt)
    first_is_l2 = eq(first_cache(), StaticInt(2))
    l2_inclusive = cache_inclusive(StaticInt(2))
    without_l1 = fcs - cache_size(One())
    return ifelse(first_is_l2 & l2_inclusive, without_l1, fcs)
end

# Size reported as `nothing`: fall back to 262144 (256 * 1024; presumably bytes — confirm).
function _first_cache_size(::Nothing)
    return StaticInt(262144)
end

# Effective size of the first cache level.
function first_cache_size()
    reported = cache_size(first_cache())
    return _first_cache_size(reported)
end

# Known size: when `cache_inclusive(second_cache())` holds, subtract the first
# cache level's size from it; otherwise return the size unchanged.
function _second_cache_size(scs::StaticInt)
    inclusive = cache_inclusive(second_cache())
    without_first = scs - cache_size(first_cache())
    return ifelse(inclusive, without_first, scs)
end

# Size reported as `nothing`: fall back to 3145728 (3 * 1024 * 1024; presumably bytes — confirm).
function _second_cache_size(::Nothing)
    return StaticInt(3145728)
end

# Effective size of the second cache level.
function second_cache_size()
    reported = cache_size(second_cache())
    return _second_cache_size(reported)
end

# Level 1, bumped by one when `snum_cache_levels()` reports more than two levels.
function first_cache()
    bump = snum_cache_levels() > StaticInt{2}() ? One() : Zero()
    return StaticInt{1}() + bump
end

# Level 2, bumped by one when `snum_cache_levels()` reports more than two levels.
function second_cache()
    bump = snum_cache_levels() > StaticInt{2}() ? One() : Zero()
    return StaticInt{2}() + bump
end

# Effective size of the first cache level.
# - a reported size of `Zero()` falls back to 262144 (256 * 1024);
# - when the first cache is level 2 and `cache_inclusivity()[2]` is true, the level-1
#   size is subtracted;
# - otherwise the reported size is returned unchanged.
function first_cache_size()
    reported = scache_size(first_cache())
    reported === Zero() && return StaticInt(262144)
    if (first_cache() === StaticInt(2)) && cache_inclusivity()[2]
        return reported - scache_size(One())
    end
    return reported
end
# Effective size of the second cache level.
# - a reported size of `Zero()` falls back to 3145728 (3 * 1024 * 1024);
# - when `cache_inclusivity()[second_cache()]` is true, the first cache level's size
#   is subtracted;
# - otherwise the reported size is returned unchanged.
function second_cache_size()
    reported = scache_size(second_cache())
    reported === Zero() && return StaticInt(3145728)
    if cache_inclusivity()[second_cache()]
        return reported - scache_size(first_cache())
    end
    return reported
end
# First-cache size divided (integer division) by `static_sizeof(T)`.
function first_cache_size(::Type{T}) where {T}
    return div(first_cache_size(), static_sizeof(T))
end

# Second-cache size divided (integer division) by `static_sizeof(T)`.
function second_cache_size(::Type{T}) where {T}
    return div(second_cache_size(), static_sizeof(T))
end

# Both definitions query VectorizationBase for the `second_cache()` level; the later
# definition replaces the earlier one when both are loaded.
function bcache_count()
    return VectorizationBase.scache_count(second_cache())
end
function bcache_count()
    return VectorizationBase.num_cache(second_cache())
end

# Holder for a raw pointer, initialised to `C_NULL`.
# NOTE(review): presumably set to an allocated buffer elsewhere — confirm at the use sites.
const BCACHEPTR = Ref{Ptr{Cvoid}}(C_NULL)
# Atomic `UInt` initialised to zero.
# NOTE(review): the name suggests it guards access to the buffer behind `BCACHEPTR` — confirm.
const BCACHE_LOCK = Threads.Atomic{UInt}(zero(UInt))
Expand Down
Loading