Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Project.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
name = "Octavian"
uuid = "6fd5a793-0b7e-452c-907f-f8bfe9c57db4"
authors = ["Mason Protter", "Chris Elrod", "Dilum Aluthge", "contributors"]
version = "0.3.9"
version = "0.3.10"

[deps]
ArrayInterface = "4fba245c-0d91-5ea0-9b3e-6abc04ee57a9"
Expand Down
11 changes: 7 additions & 4 deletions benchmark/tilesearch.jl
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,14 @@ function matmul_pack_ab!(C, A, B, ::Val{W₁}, ::Val{W₂}, ::Val{R₁}, ::Val{R
M, N = size(C); K = size(B,1)
zc, za, zb = Octavian.zstridedpointer.((C,A,B))
nspawn = VectorizationBase.num_cores()
threads, torelease = Octavian.PolyesterWeave.__request_threads((nspawn-1)%UInt32, Octavian.PolyesterWeave.worker_pointer(), nothing)
t = Inf
GC.@preserve C A B begin
for _ ∈ 1:2
t = min(t, @elapsed(Octavian.matmul_pack_A_and_B!(zc, za, zb, Octavian.One(), Octavian.Zero(), M, K, N, Int(nspawn), F64(W₁), F64(W₂), F64(R₁), F64(R₂))))
t = min(t, @elapsed(Octavian.matmul_pack_A_and_B!(zc, za, zb, Octavian.One(), Octavian.Zero(), M, K, N, threads, F64(W₁), F64(W₂), F64(R₁), F64(R₂))))
end
end
Octavian.PolyesterWeave.free_threads!(torelease)
return t
end

Expand Down Expand Up @@ -119,15 +121,16 @@ using Optim
# Durations expressed in seconds.
hours = 60.0 * 60.0; days = 24hours;

# Starting point of the search: the package's current default (W₁, W₂, R₁, R₂).
init = Float64[
    Octavian.W₁Default(),
    Octavian.W₂Default(),
    Octavian.R₁Default(),
    Octavian.R₂Default(),
]

# Box constraints of the search. `upper` is assigned exactly once; the earlier
# formula is kept only as a comment for reference.
lower = 0.75 .* init;
# upper = [1.25init[1], 1.25init[2], 0.75*init[3] + 0.25, 0.75*init[4] + 0.25];
upper = [0.9, 1.25init[2], 0.999, 0.999];

# Alternative starting point (disabled):
# init = [0.001, 0.9754033943603924, 0.5711159869399494, 0.7547361860432168];

#=
opt = Optim.optimize(
matmul_objective, init, ParticleSwarm(lower = lower, upper = upper),
Optim.Options(iterations = 10^6, time_limit = 8hours)
);


=#



50 changes: 36 additions & 14 deletions src/block_sizes.jl
Original file line number Diff line number Diff line change
Expand Up @@ -178,25 +178,47 @@ independently of `M`, this algorithm guarantees all threads are on the same page
end
# Takes Nc, calcs Mc and Kc
"""
    solve_McKc(::Val{T}, M, K, Nc, _α, _β, R₂, R₃, Wfactor)

Given the block size `Nc`, choose how the `K` and `M` dimensions are split into blocks
for element type `T`.

# Arguments
- `M`, `K`: the dimensions to split.
- `Nc`: block size already chosen for the remaining dimension.
- `_α`: tuning factor; it is scaled by the vector width `W = pick_vector_width(T)`.
- `_β`: accepted for interface compatibility; not used by this function.
- `R₂`, `R₃`: scale factors applied to `first_cache_size(Val(T))` and
  `second_cache_size(Val(T))` respectively.
- `Wfactor`: the `M` direction is split in multiples of `Mᵣ = Wfactor * W`.

# Returns
A pair `(msplit, ksplit)`:
- `msplit = (Mbsize, Mblock_Mrem, Mremfinal, Mrem, Miter)`, all `Int`.
- `ksplit = promote(Kblock, Kblock_Krem, Krem, Kiter)`, where
  `Kblock, Krem = divrem_fast(K, Kiter)` and `Kblock_Krem = Kblock + One()`.
"""
@inline function solve_McKc(::Val{T}, M, K, Nc, _α, _β, R₂, R₃, Wfactor) where {T}
    W = pick_vector_width(T)
    α = _α * W
    L₁ₑ = first_cache_size(Val(T)) * R₂
    L₂ₑ = second_cache_size(Val(T)) * R₃

    # K split: the initial inverse block length is the larger of the two bounds.
    Kc_init⁻¹ = Base.FastMath.max_fast(√(α / L₁ₑ), Nc * inv(L₂ₑ))
    Kiter = cldapproxi(K, Kc_init⁻¹) # approximate `ceil`
    Kblock, Krem = divrem_fast(K, Kiter)
    Kblock_Krem = Kblock + One()

    # M split: work in panels of `Mᵣ` rows; `Mc_init` is the panel count derived
    # from `L₁ₑ` and the (larger) K block length.
    Mᵣ = Wfactor * W
    Mc_init = floor(Int, Base.FastMath.div_fast(L₁ₑ / Mᵣ, Float64(Kblock_Krem)))
    Mc_init_base = max(0, Mc_init - 1)
    Kblock_summary = promote(Kblock, Kblock_Krem, Krem, Kiter)

    if Mc_init_base ≠ 0 # Mc_init > 1
        Mbsize = Mc_init_base * Mᵣ
        Mblocks, Mblocks_rem = divrem_fast(M, Mᵣ)
        Miter, Mrem = divrem_fast(Mblocks, Mc_init_base)
        if Miter == 0
            # Fewer than `Mc_init_base` full panels: one iteration covering all of `M`.
            return (0, 0, Int(M)::Int, 0, 1), Kblock_summary
        elseif Miter > Mrem
            # More iterations than leftover panels: `Mrem` blocks are one panel larger.
            Mblock_Mrem = Mbsize + Mᵣ
            Mremfinal = Mbsize + Mblocks_rem
            # Assuming `divrem_fast` behaves like `divrem`, the sizes add up to `M`:
            #   Mbsize * (Miter - 1 - Mrem) + Mrem * Mblock_Mrem + Mremfinal == M
            return map(Int, (Mbsize, Mblock_Mrem, Mremfinal, Mrem, Miter)), Kblock_summary
        else
            # At least as many leftover panels as iterations: re-split with `split_m`,
            # adding one iteration when there is a remainder.
            _Mbsize, _Mrem, _Mremfinal, _Miter = split_m(M, Miter + (Mrem ≠ 0), Mᵣ)
            _Mblock_Mrem = _Mbsize + Mᵣ
            return map(Int, (_Mbsize, _Mblock_Mrem, _Mremfinal, _Mrem, _Miter)),
            Kblock_summary
        end
    else
        # Only a single panel per block: blocks of exactly `Mᵣ` rows.
        # NOTE(review): here the final-block entry is the plain remainder of `M / Mᵣ`,
        # whereas the `Miter > Mrem` branch reports a full block plus remainder;
        # confirm the consumer expects this.
        Mbsize0 = Int(Mᵣ)
        Mblock_Mrem0 = Int(Mᵣ)
        Miter0, Mremfinal0 = divrem_fast(M, Mᵣ)
        return map(Int, (Mbsize0, Mblock_Mrem0, Mremfinal0, 0, Miter0)), Kblock_summary
    end
end

# Approximate `ceil(n * d⁻¹)` as an `Int`: form the product with fast-math
# multiplication, add a constant just below one, then convert to `Int` with `fptosi`.
@inline function cldapproxi(n, d⁻¹)
    quotient = Base.FastMath.mul_fast(n, d⁻¹)
    nudged = Base.FastMath.add_fast(quotient, 0.9999999999999432)
    return Base.fptosi(Int, nudged)
end
# @inline divapproxi(n, d⁻¹) = Base.fptosi(Int, Base.FastMath.mul_fast(n, d⁻¹)) # approximate `div`

"""
find_first_acceptable(M, W)
Expand Down
9 changes: 4 additions & 5 deletions src/global_constants.jl
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,10 @@ MᵣW_mul_factor(::True) = StaticInt{4}()
# Value selected when the feature flag passed in is `False`.
MᵣW_mul_factor(::False) = StaticInt{9}()
# Zero-argument form: dispatches on whether `has_feature` reports `:x86_64_avx512f`.
MᵣW_mul_factor() = MᵣW_mul_factor(has_feature(Val(:x86_64_avx512f)))

# Default W₁, W₂, R₁, R₂ values for the `::True` dispatch case.
# Each signature is defined exactly once, so no definition is overwritten by a later one.
W₁Default(::True) = StaticFloat64{0.0007423708195588264}()
W₂Default(::True) = StaticFloat64{0.7398765624419478}()
R₁Default(::True) = StaticFloat64{0.4697043382682602}()
R₂Default(::True) = StaticFloat64{0.6342912896800855}()

# W₁/W₂ defaults selected by dispatch on the `:znver1` architecture tag.
W₁Default_arch(::Val{:znver1}) = StaticFloat64{0.053918949422353986}()
W₂Default_arch(::Val{:znver1}) = StaticFloat64{0.3013238122374886}()
Expand Down
1 change: 0 additions & 1 deletion test/matmul_coverage.jl
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@ m_values = [10, 20, 50, 100, 150, 200]



include("_matmul.jl")
typ = get(ENV, "JULIA_TEST_ELTYPE", "ALL")
types = if typ == "Float64"
DataType[Float64]
Expand Down
2 changes: 1 addition & 1 deletion test/matmul_main.jl
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@ m_values = [200, 300, 400]
testset_name_suffix = "(main)"

# Run the real and then the complex test suite once per element type, timing each call.
for T ∈ (Float64, Float32, Int64, Int32)
    @time test_real(T, m_values, k_values, n_values, testset_name_suffix)
    @time test_complex(T, m_values, k_values, n_values, testset_name_suffix)
end

A = rand(2,2); B = rand(2,2); AB = A*B; C = fill(NaN, 2, 2);
Expand Down
6 changes: 2 additions & 4 deletions test/runtests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -23,12 +23,10 @@ include("block_sizes.jl")
include("init.jl")
include("integer_division.jl")
include("macrokernels.jl")
# Included once here, ahead of the matmul test files
# (presumably the shared definitions they rely on; confirm).
include("_matmul.jl")
# The main matmul tests are skipped in coverage runs and included exactly once otherwise.
coverage || include("matmul_main.jl")
include("matmul_coverage.jl")
include("utils.jl")
include("forward_diff.jl")

include("aqua.jl") # run the Aqua.jl tests last