# Patch summary (three files; bumps the package version from 0.2.13 to 0.2.14).
#
# NOTE(review): in this copy the patch's line breaks are collapsed, so every
# hunk sits on one of two long physical lines. It presumably will not apply
# as-is -- restore the original line structure from the commit before using it.
#
# Project.toml
#   - version "0.2.13" -> "0.2.14".
#
# src/global_constants.jl
#   - first_cache(): the old definition chose StaticInt{2}() or StaticInt{1}()
#     via ifelse(gt(num_cache_levels(), StaticInt{2}()), ...). The new one
#     dispatches on VectorizationBase.num_l2cache() through _first_cache:
#     a StaticInt{1} argument gives StaticInt{1}(), any other StaticInt gives
#     StaticInt{2}().
#   - _second_cache_size: the one-argument methods (StaticInt / Nothing) are
#     replaced by two-argument methods that dispatch on the second argument:
#       ::True    -> scs - cache_size(first_cache())
#       ::False   -> scs
#       ::Nothing -> StaticInt(3145728), with first argument ::StaticInt{0}
#   - second_cache_size() becomes a multi-line function that evaluates
#     second_cache() once and passes both cache_size(sc) and
#     cache_inclusive(sc) to _second_cache_size.
#
# src/matmul.jl, hunk at -260
#   - The single `elseif (nᵣ ≥ N) || (M*K*N < (StaticInt{4096}() * W))` branch
#     that called loopmul! is replaced by `@goto LOOPMUL` jumps inside the
#     `else` branch.
#   - `nᵣ ≥ N` still selects loopmul!.
#   - The M*K*N size threshold now depends on Sys.ARCH:
#     StaticInt{4_096}() * W on :x86_64 / :i686, StaticInt{32_000}() * W on
#     anything else.
#   - If neither jump is taken, __matmul! is called and the code returns
#     before reaching the LOOPMUL label.
#
# src/matmul.jl, hunk at -326 (inside __matmul!)
#   - nspawn was clamp(div_fast(M * N, L), 1, _nthread) with
#     L = StaticInt{128}() * W.
#   - The divisor is now chosen by Sys.ARCH: StaticInt{128}() * W on
#     :x86_64 / :i686, StaticInt{256}() * W otherwise.
#   - The local L and the commented-out `StaticInt{64}()` alternative are
#     removed.
diff --git a/Project.toml b/Project.toml index 5c64aac..7f03179 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "Octavian" uuid = "6fd5a793-0b7e-452c-907f-f8bfe9c57db4" authors = ["Mason Protter", "Chris Elrod", "Dilum Aluthge", "contributors"] -version = "0.2.13" +version = "0.2.14" [deps] ArrayInterface = "4fba245c-0d91-5ea0-9b3e-6abc04ee57a9" diff --git a/src/global_constants.jl b/src/global_constants.jl index 1fbd16d..4ee4eb5 100644 --- a/src/global_constants.jl +++ b/src/global_constants.jl @@ -48,17 +48,22 @@ R₂Default() = R₂Default(has_feature(Val(:x86_64_avx512f))) - -first_cache() = ifelse(gt(num_cache_levels(), StaticInt{2}()), StaticInt{2}(), StaticInt{1}()) +_first_cache(::StaticInt{1}) = StaticInt{1}() +_first_cache(::StaticInt) = StaticInt{2}() +first_cache() = _first_cache(VectorizationBase.num_l2cache()) second_cache() = first_cache() + One() _first_cache_size(fcs::StaticInt) = ifelse(eq(first_cache(), StaticInt(2)) & cache_inclusive(StaticInt(2)), fcs - cache_size(One()), fcs) _first_cache_size(::Nothing) = StaticInt(262144) first_cache_size() = _first_cache_size(cache_size(first_cache())) -_second_cache_size(scs::StaticInt) = ifelse(cache_inclusive(second_cache()), scs - cache_size(first_cache()), scs) -_second_cache_size(::Nothing) = StaticInt(3145728) -second_cache_size() = _second_cache_size(cache_size(second_cache())) +_second_cache_size(scs::StaticInt, ::True) = scs - cache_size(first_cache()) +_second_cache_size(scs::StaticInt, ::False) = scs +_second_cache_size(::StaticInt{0}, ::Nothing) = StaticInt(3145728) +function second_cache_size() + sc = second_cache() + _second_cache_size(cache_size(sc), cache_inclusive(sc)) +end first_cache_size(::Type{T}) where {T} = first_cache_size() ÷ static_sizeof(T) second_cache_size(::Type{T}) where {T} = second_cache_size() ÷ static_sizeof(T) diff --git a/src/matmul.jl b/src/matmul.jl index beef0ff..9d6b237 100644 --- a/src/matmul.jl +++ b/src/matmul.jl @@ -260,12 +260,18 @@ end if 
maybeinline(M, N, T, ArrayInterface.is_column_major(A)) # check MUST be compile-time resolvable inlineloopmul!(pC, pA, pB, One(), Zero(), M, K, N) return - elseif (nᵣ ≥ N) || (M*K*N < (StaticInt{4096}() * W)) - loopmul!(pC, pA, pB, α, β, M, K, N) - return else + (nᵣ ≥ N) && @goto LOOPMUL + if (Sys.ARCH === :x86_64) || (Sys.ARCH === :i686) + (M*K*N < (StaticInt{4_096}() * W)) && @goto LOOPMUL + else + (M*K*N < (StaticInt{32_000}() * W)) && @goto LOOPMUL + end __matmul!(pC, pA, pB, α, β, M, K, N, nthread) return + @label LOOPMUL + loopmul!(pC, pA, pB, α, β, M, K, N) + return end end end @@ -326,11 +332,13 @@ function __matmul!( return end # We are threading, but how many threads? - L = StaticInt{128}() * W - # L = StaticInt{64}() * W - nspawn = clamp(div_fast(M * N, L), 1, _nthread) - + nspawn = if (Sys.ARCH === :x86_64) || (Sys.ARCH === :i686) + clamp(div_fast(M * N, StaticInt{128}() * W), 1, _nthread) + else + clamp(div_fast(M * N, StaticInt{256}() * W), 1, _nthread) + end # nkern = cld_fast(M * N, MᵣW * Nᵣ) + # Approach: # Check if we don't want to pack A, # if not, aggressively subdivide