6 changes: 3 additions & 3 deletions Project.toml
@@ -13,11 +13,11 @@ VectorizationBase = "3d5dd08c-fd9d-11e8-17fa-ed2836048c2f"

 [compat]
 ArrayInterface = "3.1.14"
-LoopVectorization = "0.12"
+LoopVectorization = "0.12.26"
 Static = "0.2"
-StrideArraysCore = "0.1.5"
+StrideArraysCore = "0.1.11"
 ThreadingUtilities = "0.4"
-VectorizationBase = "0.20.9"
+VectorizationBase = "0.20.11"
 julia = "1.6"

 [extras]
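Note on the compat bumps: Julia Pkg compat entries use caret semantics by default, so `LoopVectorization = "0.12.26"` permits `[0.12.26, 0.13.0)` where `"0.12"` permitted `[0.12.0, 0.13.0)`. The raised lower bounds presumably guarantee that the newly imported symbols used below (`vsub_nsw`, `assume`, `UpperBoundedInteger`) are available.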
2 changes: 1 addition & 1 deletion docs/src/index.md
@@ -28,6 +28,6 @@ In general:

 !!! note

-    Octavian's tasks can interfere with tasks spawned by Base.Threads, resulting in much slower execution times when used together. This can be avoided by using threading utilities from [CheapThreads](https://github.com/JuliaSIMD/CheapThreads.jl) or [LoopVectorization](https://github.com/JuliaSIMD/LoopVectorization.jl/) instead. See this [Discourse post](https://discourse.julialang.org/t/odd-benchmarktools-timings-using-threads-and-octavian/59838) for more information.
+    Octavian's tasks can interfere with tasks spawned by Base.Threads, resulting in much slower execution times when used together. This can be avoided by using threading utilities from [Polyester](https://github.com/JuliaSIMD/Polyester.jl) or [LoopVectorization](https://github.com/JuliaSIMD/LoopVectorization.jl/) instead. See this [Discourse post](https://discourse.julialang.org/t/odd-benchmarktools-timings-using-threads-and-octavian/59838) for more information.
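For context on the recommendation above, here is a minimal sketch of moving a loop from `Threads.@threads` to Polyester; it assumes only Polyester's exported `@batch` macro, and `axpy_batched!` is an illustrative name, not a function from Octavian or Polyester:

using Polyester: @batch

# With Threads.@threads this loop could contend with Octavian's own tasks;
# @batch instead runs it on Polyester's lightweight, cheaply-spawned tasks.
function axpy_batched!(y, a, x)
    @batch for i in eachindex(y, x)
        y[i] = muladd(a, x[i], y[i])
    end
    return y
end

y = zeros(100); x = rand(100);
axpy_batched!(y, 2.0, x)  # afterwards y == 2 .* x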


4 changes: 2 additions & 2 deletions src/Octavian.jl
@@ -2,10 +2,10 @@ module Octavian

 using VectorizationBase, ArrayInterface, LoopVectorization

-using VectorizationBase: align, AbstractStridedPointer, zstridedpointer,
+using VectorizationBase: align, AbstractStridedPointer, zstridedpointer, vsub_nsw, assume,
     static_sizeof, lazymul, StridedPointer, gesp, pause, pick_vector_width, has_feature,
     num_cache_levels, cache_size, num_cores, num_cores, cache_inclusive, cache_linesize, ifelse
-using LoopVectorization: maybestaticsize, matmul_params, preserve_buffer, CloseOpen
+using LoopVectorization: maybestaticsize, preserve_buffer, CloseOpen, UpperBoundedInteger
 using ArrayInterface: OptionallyStaticUnitRange, size, strides, offsets, indices,
     static_length, static_first, static_last, axes, dense_dims, stride_rank
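Note that `matmul_params` is dropped from the LoopVectorization import list: as `src/block_sizes.jl` below shows, Octavian now defines its own `matmul_params(::Val{T})` wrapper around `LoopVectorization.matmul_params()`.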
43 changes: 22 additions & 21 deletions src/block_sizes.jl
@@ -1,15 +1,16 @@
 
+matmul_params(::Val{T}) where {T <: Base.HWReal} = LoopVectorization.matmul_params()
 
-function block_sizes(::Type{T}, _α, _β, R₁, R₂) where {T}
+function block_sizes(::Val{T}, _α, _β, R₁, R₂) where {T}
Review thread on the `block_sizes` signature change:

  Member: Why do we need to wrap the type in a Val?

  Member: To throw an error if not specialized?

  Collaborator (author): I don't think we need to, but I think I prefer it as a style. It should force specialization, just like Array{T} always specializes on T.
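As a sketch of the specialization behavior discussed above (the helper names here are hypothetical, not Octavian's): Julia's compiler heuristically avoids specializing on a `::Type{T}` argument that is merely passed through to another function, while `Val(T)` is a value of a concrete singleton type, so each `T` gets its own compiled method:

inner(x) = x  # hypothetical callee

# T is only forwarded, so Julia may compile a single unspecialized method:
pass_type(::Type{T}) where {T} = inner(T)

# Val(Float64) and Val(Int) have distinct concrete types,
# forcing a separate specialization per T:
pass_val(::Val{T}) where {T} = inner(T)

pass_type(Float64) === pass_val(Val(Float64))  # true; both return Float64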
     W = pick_vector_width(T)
     α = _α * W
     β = _β * W
-    L₁ₑ = first_cache_size(T) * R₁
-    L₂ₑ = second_cache_size(T) * R₂
-    block_sizes(W, α, β, L₁ₑ, L₂ₑ)
+    L₁ₑ = first_cache_size(Val(T)) * R₁
+    L₂ₑ = second_cache_size(Val(T)) * R₂
+    block_sizes(Val(T), W, α, β, L₁ₑ, L₂ₑ)
 end
-function block_sizes(W, α, β, L₁ₑ, L₂ₑ)
-    mᵣ, nᵣ = matmul_params()
+function block_sizes(::Val{T}, W, α, β, L₁ₑ, L₂ₑ) where {T}
+    mᵣ, nᵣ = matmul_params(Val(T))
     MᵣW = mᵣ * W

     Mc = floortostaticint(√(L₁ₑ)*√(L₁ₑ*β + L₂ₑ*α)/√(L₂ₑ) / MᵣW) * MᵣW
@@ -18,8 +19,8 @@ function block_sizes(W, α, β, L₁ₑ, L₂ₑ)

     Mc, Kc, Nc
 end
-function block_sizes(::Type{T}) where {T}
-    block_sizes(T, W₁Default(), W₂Default(), R₁Default(), R₂Default())
+function block_sizes(::Val{T}) where {T}
+    block_sizes(Val(T), W₁Default(), W₂Default(), R₁Default(), R₂Default())
 end

 """
@@ -53,7 +54,7 @@ This method is used fairly generally.
 end

 """
-    solve_block_sizes(::Type{T}, M, K, N, α, β, R₂, R₃)
+    solve_block_sizes(::Val{T}, M, K, N, α, β, R₂, R₃)

 This function returns iteration/blocking descriptions `Mc`, `Kc`, and `Nc` for use when packing both `A` and `B`.

@@ -157,12 +158,12 @@ Note that for synchronization on `B`, all threads must have the same values for
 `K` and `N` will be equal between threads, but `M` may differ. By calculating `Kc` and `Nc`
 independently of `M`, this algorithm guarantees all threads are on the same page.
 """
-@inline function solve_block_sizes(::Type{T}, M, K, N, _α, _β, R₂, R₃, Wfactor) where {T}
+@inline function solve_block_sizes(::Val{T}, M, K, N, _α, _β, R₂, R₃, Wfactor) where {T}
     W = pick_vector_width(T)
     α = _α * W
     β = _β * W
-    L₁ₑ = first_cache_size(T) * R₂
-    L₂ₑ = second_cache_size(T) * R₃
+    L₁ₑ = first_cache_size(Val(T)) * R₂
+    L₂ₑ = second_cache_size(Val(T)) * R₃

     # Nc_init = round(Int, √(L₂ₑ)*√(α * L₂ₑ + β * L₁ₑ)/√(L₁ₑ))
     Nc_init⁻¹ = √(L₁ₑ) / (√(L₂ₑ)*√(α * L₂ₑ + β * L₁ₑ))
@@ -171,17 +172,17 @@ independently of `M`, this algorithm guarantees all threads are on the same page
     Nblock, Nrem = divrem_fast(N, Niter)
     Nblock_Nrem = Nblock + One()#(Nrem > 0)

-    ((Mblock, Mblock_Mrem, Mremfinal, Mrem, Miter), (Kblock, Kblock_Krem, Krem, Kiter)) = solve_McKc(T, M, K, Nblock_Nrem, _α, _β, R₂, R₃, Wfactor)
+    ((Mblock, Mblock_Mrem, Mremfinal, Mrem, Miter), (Kblock, Kblock_Krem, Krem, Kiter)) = solve_McKc(Val(T), M, K, Nblock_Nrem, _α, _β, R₂, R₃, Wfactor)

     (Mblock, Mblock_Mrem, Mremfinal, Mrem, Miter), (Kblock, Kblock_Krem, Krem, Kiter), promote(Nblock, Nblock_Nrem, Nrem, Niter)
 end
 # Takes Nc, calcs Mc and Kc
-@inline function solve_McKc(::Type{T}, M, K, Nc, _α, _β, R₂, R₃, Wfactor) where {T}
+@inline function solve_McKc(::Val{T}, M, K, Nc, _α, _β, R₂, R₃, Wfactor) where {T}
     W = pick_vector_width(T)
     α = _α * W
     β = _β * W
-    L₁ₑ = first_cache_size(T) * R₂
-    L₂ₑ = second_cache_size(T) * R₃
+    L₁ₑ = first_cache_size(Val(T)) * R₂
+    L₂ₑ = second_cache_size(Val(T)) * R₃

     Kc_init⁻¹ = Base.FastMath.max_fast(√(α/L₁ₑ), Nc*inv(L₂ₑ))
     Kiter = cldapproxi(K, Kc_init⁻¹) # approximate `ceil`
@@ -203,8 +204,8 @@ end
 Finds the first combination of `Miter` and `Niter` that doesn't make `M` too small while producing `Miter * Niter = num_cores()`.
 This would be awkward if there are computers with prime numbers of cores. I should probably consider that possibility at some point.
 """
-@inline function find_first_acceptable(M, W)
-    Mᵣ, Nᵣ = matmul_params()
+@inline function find_first_acceptable(::Val{T}, M, W) where {T}
+    Mᵣ, Nᵣ = matmul_params(Val(T))
     factors = calc_factors()
     for (miter, niter) ∈ factors
         if miter * (StaticInt(2) * Mᵣ * W) ≤ M + (W + W)
@@ -218,9 +219,9 @@ end

 Splits both `M` and `N` into blocks when trying to spawn a large number of threads relative to the size of the matrices.
 """
-@inline function divide_blocks(M, Ntotal, _nspawn, W)
-    _nspawn == num_cores() && return find_first_acceptable(M, W)
-    mᵣ, nᵣ = matmul_params()
+@inline function divide_blocks(::Val{T}, M, Ntotal, _nspawn, W) where {T}
+    _nspawn == num_cores() && return find_first_acceptable(Val(T), M, W)
+    mᵣ, nᵣ = matmul_params(Val(T))
     Miter = clamp(div_fast(M, W*mᵣ * MᵣW_mul_factor()), 1, _nspawn)
     nspawn = div_fast(_nspawn, Miter)
     if (nspawn ≤ 1) & (Miter < _nspawn)
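To make the `Mc` formula in the first hunk concrete, here is an illustrative calculation with assumed (not measured) parameters; `floortostaticint` is replaced by a plain `floor(Int, ...)` for the sketch:

# All values below are assumptions for illustration only.
L1e = 32768 ÷ sizeof(Float64)     # 32 KiB first cache → 4096 Float64 elements
L2e = 1048576 ÷ sizeof(Float64)   # 1 MiB second cache → 131072 elements
α, β = 0.4 * 8, 0.1 * 8           # _α * W and _β * W with made-up _α, _β, W = 8
MrW = 3 * 8                       # mᵣ * W with a hypothetical mᵣ = 3

Mc = floor(Int, √L1e * √(L1e * β + L2e * α) / √L2e / MrW) * MrW  # 96

Rounding `Mc` down to a multiple of `MᵣW` keeps the M-blocking aligned with the microkernel's register tile.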
2 changes: 1 addition & 1 deletion src/funcptrs.jl
@@ -15,7 +15,7 @@ function (::LoopMulFunc{P,TC,TA,TB,Α,Β,Md,Kd,Nd})(p::Ptr{UInt}) where {P,TC,TA
 end
 @inline _call_loopmul!(C, A, B, α, β, M, K, N, ::Val{false}) = loopmul!(C, A, B, α, β, M, K, N)
 @inline function _call_loopmul!(C::StridedPointer{T}, A, B, α, β, M, K, N, ::Val{true}) where {T}
-    if M*K < first_cache_size(T) * R₂Default()
+    if M*K < first_cache_size(Val(T)) * R₂Default()
         packaloopmul!(C, A, B, α, β, M, K, N)
         return
     else
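The changed branch is a packing heuristic: when `A`'s `M*K` element footprint appears to fit within the scaled first cache, the `packaloopmul!` path (which, judging by its name, packs only `A`) is taken; the larger fallback path is elided above. The only functional change here is that the cache-size query now goes through `Val(T)`.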
4 changes: 2 additions & 2 deletions src/global_constants.jl
@@ -65,8 +65,8 @@ function second_cache_size()
     _second_cache_size(cache_size(sc), cache_inclusive(sc))
 end

-first_cache_size(::Type{T}) where {T} = first_cache_size() ÷ static_sizeof(T)
-second_cache_size(::Type{T}) where {T} = second_cache_size() ÷ static_sizeof(T)
+first_cache_size(::Val{T}) where {T} = first_cache_size() ÷ static_sizeof(T)
+second_cache_size(::Val{T}) where {T} = second_cache_size() ÷ static_sizeof(T)

 bcache_count() = VectorizationBase.num_cache(second_cache())
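These methods convert the byte-sized cache constants into per-element capacities. A minimal sketch of the arithmetic, assuming a 32768-byte first cache:

32768 ÷ sizeof(Float64)  # 4096 elements fit
32768 ÷ sizeof(Float32)  # 8192 elements fit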