diff --git a/Project.toml b/Project.toml index ede20dd..552947b 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "Octavian" uuid = "6fd5a793-0b7e-452c-907f-f8bfe9c57db4" authors = ["Mason Protter", "Chris Elrod", "Dilum Aluthge", "contributors"] -version = "0.3.9" +version = "0.3.10" [deps] ArrayInterface = "4fba245c-0d91-5ea0-9b3e-6abc04ee57a9" diff --git a/benchmark/tilesearch.jl b/benchmark/tilesearch.jl index b0a83ec..d54995b 100644 --- a/benchmark/tilesearch.jl +++ b/benchmark/tilesearch.jl @@ -5,12 +5,14 @@ function matmul_pack_ab!(C, A, B, ::Val{W₁}, ::Val{W₂}, ::Val{R₁}, ::Val{R M, N = size(C); K = size(B,1) zc, za, zb = Octavian.zstridedpointer.((C,A,B)) nspawn = VectorizationBase.num_cores() + threads, torelease = Octavian.PolyesterWeave.__request_threads((nspawn-1)%UInt32, Octavian.PolyesterWeave.worker_pointer(), nothing) t = Inf GC.@preserve C A B begin for _ ∈ 1:2 - t = min(t, @elapsed(Octavian.matmul_pack_A_and_B!(zc, za, zb, Octavian.One(), Octavian.Zero(), M, K, N, Int(nspawn), F64(W₁), F64(W₂), F64(R₁), F64(R₂)))) + t = min(t, @elapsed(Octavian.matmul_pack_A_and_B!(zc, za, zb, Octavian.One(), Octavian.Zero(), M, K, N, threads, F64(W₁), F64(W₂), F64(R₁), F64(R₂)))) end end + Octavian.PolyesterWeave.free_threads!(torelease) return t end @@ -119,15 +121,16 @@ using Optim hours = 60.0*60.0; days = 24hours; init = Float64[Octavian.W₁Default(), Octavian.W₂Default(), Octavian.R₁Default(), Octavian.R₂Default()] lower = 0.75 .* init; -upper = [1.25init[1], 1.25init[2], 0.75*init[3] + 0.25, 0.75*init[4] + 0.25]; +# upper = [1.25init[1], 1.25init[2], 0.75*init[3] + 0.25, 0.75*init[4] + 0.25]; +upper = [0.9, 1.25init[2], 0.999, 0.999]; # init = [0.001, 0.9754033943603924, 0.5711159869399494, 0.7547361860432168]; +#= opt = Optim.optimize( matmul_objective, init, ParticleSwarm(lower = lower, upper = upper), Optim.Options(iterations = 10^6, time_limit = 8hours) ); - - +=# diff --git a/src/block_sizes.jl b/src/block_sizes.jl index 7d0587f..ed62721 100644 --- a/src/block_sizes.jl +++ b/src/block_sizes.jl @@ -178,25 +178,47 @@ independently of `M`, this algorithm guarantees all threads are on the same page end # Takes Nc, calcs Mc and Kc @inline function solve_McKc(::Val{T}, M, K, Nc, _α, _β, R₂, R₃, Wfactor) where {T} - W = pick_vector_width(T) - α = _α * W - β = _β * W - L₁ₑ = first_cache_size(Val(T)) * R₂ - L₂ₑ = second_cache_size(Val(T)) * R₃ + W = pick_vector_width(T) + α = _α * W + β = _β * W + L₁ₑ = first_cache_size(Val(T)) * R₂ + L₂ₑ = second_cache_size(Val(T)) * R₃ - Kc_init⁻¹ = Base.FastMath.max_fast(√(α/L₁ₑ), Nc*inv(L₂ₑ)) - Kiter = cldapproxi(K, Kc_init⁻¹) # approximate `ceil` - Kblock, Krem = divrem_fast(K, Kiter) - Kblock_Krem = Kblock + One() + Kc_init⁻¹ = Base.FastMath.max_fast(√(α/L₁ₑ), Nc*inv(L₂ₑ)) + Kiter = cldapproxi(K, Kc_init⁻¹) # approximate `ceil` + Kblock, Krem = divrem_fast(K, Kiter) + Kblock_Krem = Kblock + One() - Miter_init = cldapproxi(M * inv(L₁ₑ), Kblock_Krem) # Miter = M * Kc / L₁ₑ - Mbsize, Mrem, Mremfinal, Miter = split_m(M, Miter_init, W * Wfactor) - Mblock_Mrem = Mbsize + W * Wfactor - - promote(Mbsize, Mblock_Mrem, Mremfinal, Mrem, Miter), promote(Kblock, Kblock_Krem, Krem, Kiter) + Mᵣ = Wfactor * W + Mc_init = floor(Int, Base.FastMath.div_fast(L₁ₑ / Mᵣ, Float64(Kblock_Krem))) + Mc_init_base = max(0, Mc_init - 1) + Kblock_summary = promote(Kblock, Kblock_Krem, Krem, Kiter) + if (Mc_init_base ≠ 0) # Mc_init > 1 + Mbsize = Mc_init_base * Mᵣ + Mblocks, Mblocks_rem = divrem_fast(M, Mᵣ) + Miter, Mrem = divrem_fast(Mblocks, Mc_init_base) + if Miter == 0 + return (0, 0, Int(M)::Int, 0, 1), Kblock_summary + elseif Miter > Mrem + Mblock_Mrem = Mbsize + Mᵣ + Mremfinal = Mbsize + Mblocks_rem + # @show Mbsize * (Miter - 1 - Mrem) + Mrem * Mblock_Mrem + Mremfinal + map(Int, (Mbsize, Mblock_Mrem, Mremfinal, Mrem, Miter)), Kblock_summary + else + _Mbsize, _Mrem, _Mremfinal, _Miter = split_m(M, Miter + (Mrem ≠ 0), Mᵣ) + _Mblock_Mrem = _Mbsize + Mᵣ + return map(Int, (_Mbsize, _Mblock_Mrem, _Mremfinal, _Mrem, _Miter)), Kblock_summary + end + else + Mbsize0 = Int(Mᵣ) + Mblock_Mrem0 = Int(Mᵣ) + Miter0, Mremfinal0 = divrem_fast(M, Mᵣ) + map(Int, (Mbsize0, Mblock_Mrem0, Mremfinal0, 0, Miter0)), Kblock_summary + end end @inline cldapproxi(n, d⁻¹) = Base.fptosi(Int, Base.FastMath.add_fast(Base.FastMath.mul_fast(n, d⁻¹), 0.9999999999999432)) # approximate `ceil` +# @inline divapproxi(n, d⁻¹) = Base.fptosi(Int, Base.FastMath.mul_fast(n, d⁻¹)) # approximate `div` """ find_first_acceptable(M, W) diff --git a/src/global_constants.jl b/src/global_constants.jl index af02665..6c115f2 100644 --- a/src/global_constants.jl +++ b/src/global_constants.jl @@ -16,11 +16,10 @@ MᵣW_mul_factor(::True) = StaticInt{4}() MᵣW_mul_factor(::False) = StaticInt{9}() MᵣW_mul_factor() = MᵣW_mul_factor(has_feature(Val(:x86_64_avx512f))) -W₁Default(::True) = StaticFloat64{0.009898277594117685}() -# W₁Default(::True) = StaticFloat64{0.0009898277594117685}() -W₂Default(::True) = StaticFloat64{0.9865020832559304}() -R₁Default(::True) = StaticFloat64{0.5820044063603483}() -R₂Default(::True) = StaticFloat64{0.7580885846640107}() +W₁Default(::True) = StaticFloat64{0.0007423708195588264}() +W₂Default(::True) = StaticFloat64{0.7398765624419478}() +R₁Default(::True) = StaticFloat64{0.4697043382682602}() +R₂Default(::True) = StaticFloat64{0.6342912896800855}() W₁Default_arch(::Val{:znver1}) = StaticFloat64{0.053918949422353986}() W₂Default_arch(::Val{:znver1}) = StaticFloat64{0.3013238122374886}() diff --git a/test/matmul_coverage.jl b/test/matmul_coverage.jl index 80fb63c..fcfbc24 100644 --- a/test/matmul_coverage.jl +++ b/test/matmul_coverage.jl @@ -4,7 +4,6 @@ m_values = [10, 20, 50, 100, 150, 200] -include("_matmul.jl") typ = get(ENV, "JULIA_TEST_ELTYPE", "ALL") types = if typ == "Float64" DataType[Float64] diff --git a/test/matmul_main.jl b/test/matmul_main.jl index f5152b3..1fd9e4b 100644 --- a/test/matmul_main.jl +++ b/test/matmul_main.jl @@ -5,8 +5,8 @@ m_values = [200, 300, 400] testset_name_suffix = "(main)" for T ∈ (Float64,Float32,Int64,Int32) - @time test_complex(T, m_values, k_values, n_values, testset_name_suffix) @time test_real(T, m_values, k_values, n_values, testset_name_suffix) + @time test_complex(T, m_values, k_values, n_values, testset_name_suffix) end A = rand(2,2); B = rand(2,2); AB = A*B; C = fill(NaN, 2, 2); diff --git a/test/runtests.jl b/test/runtests.jl index e01e2b3..d2f2ec7 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -23,12 +23,10 @@ include("block_sizes.jl") include("init.jl") include("integer_division.jl") include("macrokernels.jl") +include("_matmul.jl") +coverage || include("matmul_main.jl") include("matmul_coverage.jl") include("utils.jl") include("forward_diff.jl") -if !coverage - include("matmul_main.jl") -end - include("aqua.jl") # run the Aqua.jl tests last