From 2d11866a0a013d35e6217bedce6a1b050628c028 Mon Sep 17 00:00:00 2001 From: chriselrod Date: Wed, 2 Jun 2021 03:39:05 -0400 Subject: [PATCH 1/5] Remove all dependencies. --- Project.toml | 4 - src/ThreadingUtilities.jl | 27 ++++-- src/atomics.jl | 4 - src/threadtasks.jl | 2 - src/utils.jl | 169 +++++++++++++++++--------------------- test/internals.jl | 2 +- test/threadpool.jl | 2 +- 7 files changed, 100 insertions(+), 110 deletions(-) diff --git a/Project.toml b/Project.toml index 5b9e0c7..bcdee29 100644 --- a/Project.toml +++ b/Project.toml @@ -3,12 +3,8 @@ uuid = "8290d209-cae3-49c0-8002-c8c24d57dab5" authors = ["Chris Elrod and contributors"] version = "0.4.4" -[deps] -VectorizationBase = "3d5dd08c-fd9d-11e8-17fa-ed2836048c2f" - [compat] Aqua = "0.5" -VectorizationBase = "0.19.2, 0.20" julia = "1.5" [extras] diff --git a/src/ThreadingUtilities.jl b/src/ThreadingUtilities.jl index 7bfa76d..aa13b16 100644 --- a/src/ThreadingUtilities.jl +++ b/src/ThreadingUtilities.jl @@ -1,7 +1,21 @@ module ThreadingUtilities -using VectorizationBase: - pause, StaticInt, StridedPointer, stridedpointer, offsets, cache_linesize, align, __vload, __vstore!, num_threads, assume, False, register_size, NativeTypes +""" + pause() + +For use in spin-and-wait loops, like spinlocks. +""" +@inline pause() = ccall(:jl_cpu_pause, Cvoid, ()) + +if VERSION ≥ v"1.6.0-DEV.674" + @inline function assume(b::Bool)::Cvoid + Base.llvmcall((" declare void @llvm.assume(i1)\n\n define void @entry(i8) alwaysinline {\n top:\n %b = trunc i8 %0 to i1\ncall void @llvm.assume(i1 %b)\nret void\n }\n", "entry"), Cvoid, Tuple{Bool}, b) + end +else + @inline function assume(b::Bool)::Cvoid + Base.llvmcall(("declare void @llvm.assume(i1)", "%b = trunc i8 %0 to i1\ncall void @llvm.assume(i1 %b)\nret void"), Cvoid, Tuple{Bool}, b) + end +end @enum ThreadState::UInt32 begin TASK = 0 # 0: task available @@ -9,6 +23,7 @@ using VectorizationBase: SPIN = 2 # 2: spinning end const TASKS = Task[] +const LINESPACING = 256 # maximum cache-line size among contemporary CPUs. const THREADBUFFERSIZE = 512 const THREADPOOL = UInt[] const THREADPOOLPTR = Ref{Ptr{UInt}}(C_NULL); @@ -37,11 +52,11 @@ end function __init__() _print_exclusivity_warning() - nt = min(Threads.nthreads(),(Sys.CPU_THREADS)::Int) - 1 - resize!(THREADPOOL, (THREADBUFFERSIZE ÷ sizeof(UInt)) * nt + (cache_linesize() ÷ sizeof(UInt)) - 1) + nt = min(Threads.nthreads(), (Sys.CPU_THREADS)::Int) - 1 + resize!(THREADPOOL, (THREADBUFFERSIZE ÷ sizeof(UInt)) * nt + (LINESPACING ÷ sizeof(UInt)) - 1) copyto!(THREADPOOL, zero(UInt)) - THREADPOOLPTR[] = align(pointer(THREADPOOL)) - THREADBUFFERSIZE - Threads.atomic_fence() # ensure 0-initialization + # align to LINESPACING boundary, and then subtract THREADBUFFERSIZE to make the pointer 1-indexed + THREADPOOLPTR[] = reinterpret(Ptr{UInt}, (reinterpret(UInt, (pointer(THREADPOOL)))+LINESPACING-1) & (-LINESPACING)) - THREADBUFFERSIZE resize!(TASKS, nt) foreach(initialize_task, 1:nt) end diff --git a/src/atomics.jl b/src/atomics.jl index ca82146..462366c 100644 --- a/src/atomics.jl +++ b/src/atomics.jl @@ -1,7 +1,3 @@ -# TODO: Is atomic volatile really necessary? -# Early on my attempts weren't syncing / atomics -# weren't behaving atomically between threads so -# I got a bit defensive. for (ityp,jtyp) ∈ [("i8", UInt8), ("i16", UInt16), ("i32", UInt32), ("i64", UInt64), ("i128", UInt128)] @eval begin @inline function _atomic_load(ptr::Ptr{$jtyp}) diff --git a/src/threadtasks.jl b/src/threadtasks.jl index 47f37e8..f990a5f 100644 --- a/src/threadtasks.jl +++ b/src/threadtasks.jl @@ -42,10 +42,8 @@ function (tt::ThreadTask)() end # 1-based tid, pushes into task 2-nthreads() -# function wake_thread!(tid::T) where {T <: Unsigned} @noinline function wake_thread!(_tid::T) where {T <: Integer} tid = _tid % Int - # store!(taskpointer(_tid), TASK) tidp1 = tid + one(tid) assume(unsigned(length(Base.Workqueues)) > unsigned(tid)) assume(unsigned(length(TASKS)) > unsigned(tidp1)) diff --git a/src/utils.jl b/src/utils.jl index 7704a5f..d22fe69 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -1,103 +1,88 @@ -# To add support for loading/storing... -@inline function load(p::Ptr{UInt}, ::Type{T}) where {T<:NativeTypes} - __vload(Base.unsafe_convert(Ptr{T}, p), False(), register_size()) +@generated function load(p::Ptr{T}) where {T} + if Base.allocatedinline(T) + Expr(:block, Expr(:meta,:inline), :(unsafe_load(p))) + else + Expr(:block, Expr(:meta,:inline), :(ccall(:jl_value_ptr, Ref{$T}, (Ptr{Cvoid},), unsafe_load(Base.unsafe_convert(Ptr{Ptr{Cvoid}}, p))))) + end end -@inline function load(p::Ptr{UInt}, ::Type{T}) where {T<:Union{Ptr,Core.LLVMPtr}} - reinterpret(T, __vload(p, False(), register_size())) +@inline load(p::Ptr{UInt}, ::Type{T}) where {T} = load(reinterpret(Ptr{T}, p)) +@generated function store!(p::Ptr{T}, v::T) where {T} + if Base.allocatedinline(T) + Expr(:block, Expr(:meta,:inline), :(unsafe_store!(p, v); return nothing)) + else + Expr(:block, Expr(:meta,:inline), :(unsafe_store!(Base.unsafe_convert(Ptr{Ptr{Cvoid}}, p), Base.pointer_from_objref(v)); return nothing)) + end end -# @inline function load(p::Ptr{UInt}, ::Type{T}) where {T<:NativeTypes} -# __vload(reinterpret(Core.LLVMPtr{T,0}, p), False(), register_size()) -# end -# @inline function load(p::Ptr{UInt}, ::Type{T}) where {T<:Union{Ptr,Core.LLVMPtr}} -# reinterpret(T, __vload(reinterpret(Core.LLVMPtr{UInt,0}, p), False(), register_size())) -# end -@inline load(p::Ptr{UInt}, ::Type{T}) where {T} = unsafe_load(Base.unsafe_convert(Ptr{T}, p)) -@inline function store!(p::Ptr{UInt}, x::T) where {T <: Union{Ptr,Core.LLVMPtr}} - __vstore!(p, reinterpret(UInt, x), False(), False(), False(), register_size()) -end -@inline function store!(p::Ptr{UInt}, x::T) where {T <: NativeTypes} - __vstore!(Base.unsafe_convert(Ptr{T}, p), x, False(), False(), False(), register_size()) -end -# @inline function store!(p::Ptr{UInt}, x::T) where {T <: Union{Ptr,Core.LLVMPtr}} -# __vstore!(reinterpret(Core.LLVMPtr{UInt,0}, p), reinterpret(UInt, x), False(), False(), False(), register_size()) -# end -# @inline function store!(p::Ptr{UInt}, x::T) where {T <: NativeTypes} -# __vstore!(reinterpret(Core.LLVMPtr{T,0}, p), x, False(), False(), False(), register_size()) -# end -@inline store!(p::Ptr{UInt}, x::T) where {T} = (unsafe_store!(Base.unsafe_convert(Ptr{T}, p), x); nothing) - -@inline load(p::Ptr{UInt}, ::Type{StaticInt{N}}, i) where {N} = i, StaticInt{N}() -@inline store!(p::Ptr{UInt}, ::StaticInt, i) = i - +offsetsize(::Type{T}) where {T} = Base.allocatedinline(T) ? sizeof(Int) : sizeof(T) - - - -@generated function load(p::Ptr{UInt}, ::Type{StridedPointer{T,N,C,B,R,X,O}}, i) where {T,N,C,B,R,X,O} - q = quote - $(Expr(:meta,:inline)) - i, ptr = load(p, Ptr{$T}, i) - end - xt = Expr(:tuple) - Xp = X.parameters - for n ∈ 1:N - x = Symbol(:x_,n) - push!(xt.args, x) - push!(q.args, :((i, $x) = load(p, $(Xp[n]), i))) +function load_aggregate(::Type{T}, offset::Int) where {T} + numfields = fieldcount(T) + call = (T <: Tuple) ? Expr(:tuple) : Expr(:new, T) + for f ∈ 1:numfields + TF = fieldtype(T, f) + if Base.issingletontype(TF) + push!(call.args, TF.instance) + elseif fieldcount(TF) ≡ 0 + if TF === UInt + push!(call.args, :(load(p + (offset + $offset)))) + else + push!(call.args, :(load(reinterpret(Ptr{$TF}, p) + (offset + $offset)))) + end + offset += offsetsize(TF) + else + arg, offset = load_aggregate(TF, offset) + push!(call.args, arg) end - ot = Expr(:tuple) - Op = O.parameters - for n ∈ 1:N - o = Symbol(:o_,n) - push!(ot.args, o) - push!(q.args, :((i, $o) = load(p, $(Op[n]), i))) - end - push!(q.args, :((i, StridedPointer{$T,$N,$C,$B,$R}(ptr, $xt, $ot)))) - q -end -@generated function store!(p::Ptr{UInt}, ptr::StridedPointer{T,N,C,B,R,X,O}, i) where {T,N,C,B,R,X,O} - q = quote - $(Expr(:meta,:inline)) - i = store!(p, pointer(ptr), i) - strd = strides(ptr) - offs = offsets(ptr) - end - for n ∈ 1:N - push!(q.args, :(i = store!(p, strd[$n], i))) - end - for n ∈ 1:N - push!(q.args, :(i = store!(p, offs[$n], i))) - end - push!(q.args, :i) - q -end - -@inline function load(p::Ptr{UInt}, ::Type{T}, i) where {T} - i + sizeof(T), load(p + i, T) + end + return call, offset end -@inline function store!(p::Ptr{UInt}, x, i) - store!(p + i, x) - i + sizeof(x) +@generated function load(p::Ptr{UInt}, ::Type{T}, offset::Int) where {T} + if Base.issingletontype(T) + call = Expr(:tuple, :offset, T.instance) + elseif fieldcount(T) ≡ 0 + ptr = :(p + offset) + ptr = T === UInt ? ptr : :(reinterpret(Ptr{$T}, $ptr)) + call = :(((offset + $(offsetsize(T)), load($ptr)))) + else + call, off = load_aggregate(T, 0) + call = Expr(:tuple, :(offset + $off), call) + end + Expr(:block, Expr(:meta,:inline), call) end -@generated function load(p::Ptr{UInt}, ::Type{T}, i) where {T<:Tuple} - q = Expr(:block, Expr(:meta,:inline)) - tup = Expr(:tuple) - for (i,t) ∈ enumerate(T.parameters) - ln = Symbol(:l_,i) - push!(tup.args, ln) - push!(q.args, :((i,$ln) = load(p, $t, i))) +function store_aggregate!(q::Expr, sym, ::Type{T}, offset::Int) where {T} + gf = GlobalRef(Core,:getfield) + for f ∈ 1:fieldcount(T) + TF = fieldtype(T, f) + Base.issingletontype(TF) && continue + gfcall = Expr(:call, gf, sym, f) + if fieldcount(TF) ≡ 0 + if TF === UInt + push!(q.args, :(store!(p + (offset + $offset), $gfcall))) + else + push!(q.args, :(store!(reinterpret(Ptr{$TF}, p) + (offset + $offset), $gfcall))) + end + offset += offsetsize(TF) + else + newsym = gensym(sym) + push!(q.args, Expr(:(=), newsym, gfcall)) + offset = store_aggregate!(q, newsym, TF, offset) end - push!(q.args, :(i, $tup)) - q -end -@inline function store!(p::Ptr{UInt}, tup::Tuple{A,B,Vararg{Any,N}}, i) where {A,B,N} - i = store!(p, first(tup), i) - store!(p, Base.tail(tup), i) + end + return offset end -@inline function store!(p::Ptr{UInt}, tup::Tuple{A}, i) where {A} - store!(p, first(tup), i) +@generated function store!(p::Ptr{UInt}, x::T, offset::Int) where {T} + Base.issingletontype(T) && return :offset + body = Expr(:block, Expr(:meta,:inline)) + if fieldcount(T) ≡ 0 + ptr = :(p + offset) + ptr = T === UInt ? ptr : :(reinterpret(Ptr{$T}, $ptr)) + push!(body.args, :(store!($ptr, x))) + off = offsetsize(T) + else + off = store_aggregate!(body, :x, T, 0) + end + push!(body.args, Expr(:call, +, :offset, off)) + return body end -@inline store!(p::Ptr{UInt}, tup::Tuple{}, i) = i -@inline store!(p::Ptr{UInt}, tup::Nothing, i) = i diff --git a/test/internals.jl b/test/internals.jl index 821baf3..1cda91b 100644 --- a/test/internals.jl +++ b/test/internals.jl @@ -3,7 +3,7 @@ @test ThreadingUtilities.store!(pointer(UInt[]), nothing, 1) == 1 x = zeros(UInt, 100); GC.@preserve x begin - t1 = (1.0, C_NULL, 3, ThreadingUtilities.stridedpointer(x)) + t1 = (1.0, C_NULL, 3, VectorizationBase.stridedpointer(x)) @test ThreadingUtilities.store!(pointer(x), t1, 0) === mapreduce(sizeof, +, t1) @test ThreadingUtilities.load(pointer(x), typeof(t1), 0) === (mapreduce(sizeof, +, t1), t1) diff --git a/test/threadpool.jl b/test/threadpool.jl index 85f61a2..47ab8a1 100644 --- a/test/threadpool.jl +++ b/test/threadpool.jl @@ -2,5 +2,5 @@ @test isconst(ThreadingUtilities, :THREADPOOL) # test that ThreadingUtilities.THREADPOOL is a constant @test ThreadingUtilities.THREADPOOL isa Vector{UInt} @test eltype(ThreadingUtilities.THREADPOOL) === UInt - @test length(ThreadingUtilities.THREADPOOL) == (ThreadingUtilities.THREADBUFFERSIZE÷sizeof(UInt)) * (min(Threads.nthreads(),(Sys.CPU_THREADS)::Int) - 1) + (VectorizationBase.cache_linesize() ÷ sizeof(UInt)) - 1 + @test length(ThreadingUtilities.THREADPOOL) == (ThreadingUtilities.THREADBUFFERSIZE÷sizeof(UInt)) * (min(Threads.nthreads(),(Sys.CPU_THREADS)::Int) - 1) + (256 ÷ sizeof(UInt)) - 1 end From 95459ebdeecb2fcd1db731bbdd0219581a0221c6 Mon Sep 17 00:00:00 2001 From: chriselrod Date: Wed, 2 Jun 2021 03:50:21 -0400 Subject: [PATCH 2/5] Fix offsetsize --- src/utils.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/utils.jl b/src/utils.jl index d22fe69..da6fdc9 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -13,7 +13,7 @@ end Expr(:block, Expr(:meta,:inline), :(unsafe_store!(Base.unsafe_convert(Ptr{Ptr{Cvoid}}, p), Base.pointer_from_objref(v)); return nothing)) end end -offsetsize(::Type{T}) where {T} = Base.allocatedinline(T) ? sizeof(Int) : sizeof(T) +offsetsize(::Type{T}) where {T} = Base.allocatedinline(T) ? sizeof(T) : sizeof(Int) function load_aggregate(::Type{T}, offset::Int) where {T} numfields = fieldcount(T) From 061a1bbd44bea527e06066fa1af7fd9a15524f37 Mon Sep 17 00:00:00 2001 From: chriselrod Date: Wed, 2 Jun 2021 03:57:39 -0400 Subject: [PATCH 3/5] Test coverage --- src/ThreadingUtilities.jl | 8 ++------ src/utils.jl | 16 ++++++---------- test/internals.jl | 5 +++-- 3 files changed, 11 insertions(+), 18 deletions(-) diff --git a/src/ThreadingUtilities.jl b/src/ThreadingUtilities.jl index aa13b16..37dd7be 100644 --- a/src/ThreadingUtilities.jl +++ b/src/ThreadingUtilities.jl @@ -8,13 +8,9 @@ For use in spin-and-wait loops, like spinlocks. @inline pause() = ccall(:jl_cpu_pause, Cvoid, ()) if VERSION ≥ v"1.6.0-DEV.674" - @inline function assume(b::Bool)::Cvoid - Base.llvmcall((" declare void @llvm.assume(i1)\n\n define void @entry(i8) alwaysinline {\n top:\n %b = trunc i8 %0 to i1\ncall void @llvm.assume(i1 %b)\nret void\n }\n", "entry"), Cvoid, Tuple{Bool}, b) - end + @inline assume(b::Bool)::Cvoid = Base.llvmcall((" declare void @llvm.assume(i1)\n\n define void @entry(i8) alwaysinline {\n top:\n %b = trunc i8 %0 to i1\ncall void @llvm.assume(i1 %b)\nret void\n }\n", "entry"), Cvoid, Tuple{Bool}, b) else - @inline function assume(b::Bool)::Cvoid - Base.llvmcall(("declare void @llvm.assume(i1)", "%b = trunc i8 %0 to i1\ncall void @llvm.assume(i1 %b)\nret void"), Cvoid, Tuple{Bool}, b) - end + @inline assume(b::Bool)::Cvoid = Base.llvmcall(("declare void @llvm.assume(i1)", "%b = trunc i8 %0 to i1\ncall void @llvm.assume(i1 %b)\nret void"), Cvoid, Tuple{Bool}, b) end @enum ThreadState::UInt32 begin diff --git a/src/utils.jl b/src/utils.jl index da6fdc9..05bee63 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -23,11 +23,9 @@ function load_aggregate(::Type{T}, offset::Int) where {T} if Base.issingletontype(TF) push!(call.args, TF.instance) elseif fieldcount(TF) ≡ 0 - if TF === UInt - push!(call.args, :(load(p + (offset + $offset)))) - else - push!(call.args, :(load(reinterpret(Ptr{$TF}, p) + (offset + $offset)))) - end + ptr = :(p + (offset + $offset)) + ptr = TF === UInt ? ptr : :(reinterpret(Ptr{$TF}, $ptr)) + push!(call.args, :(load($ptr))) offset += offsetsize(TF) else arg, offset = load_aggregate(TF, offset) @@ -57,11 +55,9 @@ function store_aggregate!(q::Expr, sym, ::Type{T}, offset::Int) where {T} Base.issingletontype(TF) && continue gfcall = Expr(:call, gf, sym, f) if fieldcount(TF) ≡ 0 - if TF === UInt - push!(q.args, :(store!(p + (offset + $offset), $gfcall))) - else - push!(q.args, :(store!(reinterpret(Ptr{$TF}, p) + (offset + $offset), $gfcall))) - end + ptr = :(p + (offset + $offset)) + ptr = TF === UInt ? ptr : :(reinterpret(Ptr{$TF}, $ptr)) + push!(q.args, :(store!($ptr, $gfcall))) offset += offsetsize(TF) else newsym = gensym(sym) diff --git a/test/internals.jl b/test/internals.jl index 1cda91b..7105c09 100644 --- a/test/internals.jl +++ b/test/internals.jl @@ -3,10 +3,11 @@ @test ThreadingUtilities.store!(pointer(UInt[]), nothing, 1) == 1 x = zeros(UInt, 100); GC.@preserve x begin - t1 = (1.0, C_NULL, 3, VectorizationBase.stridedpointer(x)) + t1 = (1.0, C_NULL, (3, UInt(17)), VectorizationBase.stridedpointer(x)) @test ThreadingUtilities.store!(pointer(x), t1, 0) === mapreduce(sizeof, +, t1) @test ThreadingUtilities.load(pointer(x), typeof(t1), 0) === (mapreduce(sizeof, +, t1), t1) - + @test ThreadingUtilities.store!(pointer(x), 0xb502916f%UInt, 76) == 80 + @test ThreadingUtilities.load(pointer(x), UInt, 76) == (80,0xb502916f%UInt) nt1 = (;a = 1.0) @test ThreadingUtilities.store!(pointer(x), nt1, 0) === sizeof(nt1) @test ThreadingUtilities.load(pointer(x), typeof(nt1), 0) === (sizeof(nt1), nt1) From 7eb0487635f77d99c68ff938ba3ae646bcbf3d06 Mon Sep 17 00:00:00 2001 From: chriselrod Date: Wed, 2 Jun 2021 04:02:14 -0400 Subject: [PATCH 4/5] Fix tests --- test/internals.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/internals.jl b/test/internals.jl index 7105c09..71ad2f1 100644 --- a/test/internals.jl +++ b/test/internals.jl @@ -6,8 +6,8 @@ t1 = (1.0, C_NULL, (3, UInt(17)), VectorizationBase.stridedpointer(x)) @test ThreadingUtilities.store!(pointer(x), t1, 0) === mapreduce(sizeof, +, t1) @test ThreadingUtilities.load(pointer(x), typeof(t1), 0) === (mapreduce(sizeof, +, t1), t1) - @test ThreadingUtilities.store!(pointer(x), 0xb502916f%UInt, 76) == 80 - @test ThreadingUtilities.load(pointer(x), UInt, 76) == (80,0xb502916f%UInt) + @test ThreadingUtilities.store!(pointer(x), 0xb502916f%UInt, 72) == 72 + sizeof(Int) + @test ThreadingUtilities.load(pointer(x), UInt, 72) == (72 + sizeof(Int),0xb502916f%UInt) nt1 = (;a = 1.0) @test ThreadingUtilities.store!(pointer(x), nt1, 0) === sizeof(nt1) @test ThreadingUtilities.load(pointer(x), typeof(nt1), 0) === (sizeof(nt1), nt1) From 0509594f5dec37a1c57225e3b5ec73ceec6357b1 Mon Sep 17 00:00:00 2001 From: chriselrod Date: Wed, 2 Jun 2021 04:07:40 -0400 Subject: [PATCH 5/5] More coverage. --- test/internals.jl | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/test/internals.jl b/test/internals.jl index 71ad2f1..58daa73 100644 --- a/test/internals.jl +++ b/test/internals.jl @@ -3,9 +3,11 @@ @test ThreadingUtilities.store!(pointer(UInt[]), nothing, 1) == 1 x = zeros(UInt, 100); GC.@preserve x begin - t1 = (1.0, C_NULL, (3, UInt(17)), VectorizationBase.stridedpointer(x)) + t1 = (1.0, C_NULL, Val(7), (3, UInt(17)), VectorizationBase.stridedpointer(x)) @test ThreadingUtilities.store!(pointer(x), t1, 0) === mapreduce(sizeof, +, t1) + @test ThreadingUtilities.store!(pointer(x), Val(0), 0) == 0 @test ThreadingUtilities.load(pointer(x), typeof(t1), 0) === (mapreduce(sizeof, +, t1), t1) + @test ThreadingUtilities.load(pointer(x), Val{0}, 0) === (0, Val(0)) @test ThreadingUtilities.store!(pointer(x), 0xb502916f%UInt, 72) == 72 + sizeof(Int) @test ThreadingUtilities.load(pointer(x), UInt, 72) == (72 + sizeof(Int),0xb502916f%UInt) nt1 = (;a = 1.0)