From ead079ac615e6f6b24eadeae36ede21fe99c268f Mon Sep 17 00:00:00 2001 From: Pete Vilter Date: Thu, 17 Nov 2022 14:03:21 -0500 Subject: [PATCH 1/2] Allow threadsafe access to buffer ot type inference profiling trees Co-authored-by: Nathan Daly --- base/compiler/typeinfer.jl | 118 +++++++++++++++++++++---------------- doc/src/devdocs/locks.md | 1 + src/Makefile | 3 +- src/inference-profiling.c | 44 ++++++++++++++ src/jl_exported_funcs.inc | 2 + test/compiler/inference.jl | 29 +++++---- 6 files changed, 131 insertions(+), 66 deletions(-) create mode 100644 src/inference-profiling.c diff --git a/base/compiler/typeinfer.jl b/base/compiler/typeinfer.jl index d9443045e9e89..f9d5f578c5587 100644 --- a/base/compiler/typeinfer.jl +++ b/base/compiler/typeinfer.jl @@ -22,7 +22,7 @@ being used for this purpose alone. """ module Timings -using Core.Compiler: -, +, :, Vector, length, first, empty!, push!, pop!, @inline, +using Core.Compiler: -, +, :, >, Vector, length, first, empty!, push!, pop!, @inline, @inbounds, copy, backtrace # What we record for any given frame we infer during type inference. @@ -47,13 +47,16 @@ end _typeinf_identifier(frame::InferenceFrameInfo) = frame +_typeinf_frame_linfo(frame::Core.Compiler.InferenceState) = frame.linfo +_typeinf_frame_linfo(frame::InferenceFrameInfo) = frame.mi + """ - Core.Compiler.Timing(mi_info, start_time, ...) + Core.Compiler.Timings.Timing(mi_info, start_time, ...) Internal type containing the timing result for running type inference on a single MethodInstance. """ -struct Timing +mutable struct Timing mi_info::InferenceFrameInfo start_time::UInt64 cur_start_time::UInt64 @@ -66,6 +69,31 @@ Timing(mi_info, start_time) = Timing(mi_info, start_time, start_time, UInt64(0), _time_ns() = ccall(:jl_hrtime, UInt64, ()) # Re-implemented here because Base not yet available. +""" + Core.Compiler.Timings.clear_and_fetch_timings() + +Return, then clear, the previously recorded type inference timings. + +This fetches a vector of all of the type inference timings that have _finished_ as of this call. Note +that there may be concurrent invocations of inference that are still running in another thread, but +which haven't yet been added to this buffer. Those can be fetched in a future call. +""" +function clear_and_fetch_timings() + # Pass in the type, since the C code doesn't know about our Timing struct. + ccall(:jl_typeinf_profiling_clear_and_fetch, Any, (Any, Any,), + _finished_timings, Vector{Timing})::Vector{Timing} +end + +function finish_timing_profile(timing::Timing) + ccall(:jl_typeinf_profiling_push_timing, Cvoid, (Any, Any,), _finished_timings, timing) +end + +# DO NOT ACCESS DIRECTLY. This vector should only be accessed through the +# functions above. It is a buffer that lives in the Julia module only to be *rooted* +# for GC, but all accesses to the vector must go through C code, in order to be +# thread safe. +const _finished_timings = Timing[] + # We keep a stack of the Timings for each of the MethodInstances currently being timed. # Since type inference currently operates via a depth-first search (during abstract # evaluation), this vector operates like a call stack. The last node in _timings is the @@ -74,16 +102,21 @@ _time_ns() = ccall(:jl_hrtime, UInt64, ()) # Re-implemented here because Base n # call structure through type inference is recorded. (It's recorded as a tree, not a graph, # because we create a new node for duplicates.) const _timings = Timing[] + # ROOT() is an empty function used as the top-level Timing node to measure all time spent # *not* in type inference during a given recording trace. It is used as a "dummy" node. function ROOT() end const ROOTmi = Core.Compiler.specialize_method( first(Core.Compiler.methods(ROOT)), Tuple{typeof(ROOT)}, Core.svec()) + """ Core.Compiler.reset_timings() Empty out the previously recorded type inference timings (`Core.Compiler._timings`), and start the ROOT() timer again. `ROOT()` measures all time spent _outside_ inference. + +!!! info + This function is deprecated as of Julia 1.9; use [`clear_and_fetch_timings`](@ref) instead. """ function reset_timings() empty!(_timings) @@ -93,7 +126,6 @@ function reset_timings() _time_ns())) return nothing end -reset_timings() # (This is split into a function so that it can be called both in this module, at the top # of `enter_new_timer()`, and once at the Very End of the operation, by whoever started @@ -105,16 +137,9 @@ reset_timings() parent_timer = _timings[end] accum_time = stop_time - parent_timer.cur_start_time - # Add in accum_time ("modify" the immutable struct) + # Add in accum_time @inbounds begin - _timings[end] = Timing( - parent_timer.mi_info, - parent_timer.start_time, - parent_timer.cur_start_time, - parent_timer.time + accum_time, - parent_timer.children, - parent_timer.bt, - ) + _timings[end].time += accum_time end return nothing end @@ -122,27 +147,21 @@ end @inline function enter_new_timer(frame) # Very first thing, stop the active timer: get the current time and add in the # time since it was last started to its aggregate exclusive time. - close_current_timer() - - mi_info = _typeinf_identifier(frame) + if length(_timings) > 0 + close_current_timer() + end # Start the new timer right before returning + mi_info = _typeinf_identifier(frame) push!(_timings, Timing(mi_info, UInt64(0))) len = length(_timings) new_timer = @inbounds _timings[len] + # Set the current time _after_ appending the node, to try to exclude the # overhead from measurement. start = _time_ns() - - @inbounds begin - _timings[len] = Timing( - new_timer.mi_info, - start, - start, - new_timer.time, - new_timer.children, - ) - end + new_timer.start_time = start + new_timer.cur_start_time = start return nothing end @@ -154,43 +173,38 @@ end # Finish the new timer stop_time = _time_ns() - expected_mi_info = _typeinf_identifier(_expected_frame_) + expected_linfo = _typeinf_frame_linfo(_expected_frame_) # Grab the new timer again because it might have been modified in _timings # (since it's an immutable struct) # And remove it from the current timings stack new_timer = pop!(_timings) - Core.Compiler.@assert new_timer.mi_info.mi === expected_mi_info.mi + Core.Compiler.@assert new_timer.mi_info.mi === expected_linfo - # Prepare to unwind one level of the stack and record in the parent - parent_timer = _timings[end] + # check for two cases: normal case & backcompat case + is_profile_root_normal = length(_timings) === 0 + is_profile_root_backcompat = length(_timings) === 1 && _timings[1] === ROOTmi + is_profile_root = is_profile_root_normal || is_profile_root_backcompat accum_time = stop_time - new_timer.cur_start_time # Add in accum_time ("modify" the immutable struct) - new_timer = Timing( - new_timer.mi_info, - new_timer.start_time, - new_timer.cur_start_time, - new_timer.time + accum_time, - new_timer.children, - parent_timer.mi_info.mi === ROOTmi ? backtrace() : nothing, - ) - # Record the final timing with the original parent timer - push!(parent_timer.children, new_timer) - - # And finally restart the parent timer: - len = length(_timings) - @inbounds begin - _timings[len] = Timing( - parent_timer.mi_info, - parent_timer.start_time, - _time_ns(), - parent_timer.time, - parent_timer.children, - parent_timer.bt, - ) + new_timer.time += accum_time + if is_profile_root + new_timer.bt = backtrace() end + # Prepare to unwind one level of the stack and record in the parent + if is_profile_root + finish_timing_profile(new_timer) + else + parent_timer = _timings[end] + + # Record the final timing with the original parent timer + push!(parent_timer.children, new_timer) + + # And finally restart the parent timer: + parent_timer.cur_start_time = _time_ns() + end return nothing end diff --git a/doc/src/devdocs/locks.md b/doc/src/devdocs/locks.md index bef1419b1c8f8..ed0b02f2bcc07 100644 --- a/doc/src/devdocs/locks.md +++ b/doc/src/devdocs/locks.md @@ -44,6 +44,7 @@ The following is a leaf lock (level 2), and only acquires level 1 locks (safepoi > * Module->lock > * JLDebuginfoPlugin::PluginMutex > * newly_inferred_mutex +> * typeinf_profiling_lock The following is a level 3 lock, which can only acquire level 1 or level 2 locks internally: diff --git a/src/Makefile b/src/Makefile index 0baa34fedf877..d996c7f43d1a9 100644 --- a/src/Makefile +++ b/src/Makefile @@ -43,7 +43,7 @@ endif SRCS := \ jltypes gf typemap smallintset ast builtins module interpreter symbol \ dlload sys init task array staticdata toplevel jl_uv datatype \ - simplevector runtime_intrinsics precompile jloptions \ + simplevector runtime_intrinsics precompile jloptions inference-profiling \ threading partr stackwalk gc gc-debug gc-pages gc-stacks gc-alloc-profiler method \ jlapi signal-handling safepoint timing subtype rtutils gc-heap-snapshot \ crc32c APInt-C processor ircode opaque_closure codegen-stubs coverage runtime_ccall @@ -300,6 +300,7 @@ $(BUILDDIR)/gc-pages.o $(BUILDDIR)/gc-pages.dbg.obj: $(SRCDIR)/gc.h $(BUILDDIR)/gc.o $(BUILDDIR)/gc.dbg.obj: $(SRCDIR)/gc.h $(SRCDIR)/gc-heap-snapshot.h $(SRCDIR)/gc-alloc-profiler.h $(BUILDDIR)/gc-heap-snapshot.o $(BUILDDIR)/gc-heap-snapshot.dbg.obj: $(SRCDIR)/gc.h $(SRCDIR)/gc-heap-snapshot.h $(BUILDDIR)/gc-alloc-profiler.o $(BUILDDIR)/gc-alloc-profiler.dbg.obj: $(SRCDIR)/gc.h $(SRCDIR)/gc-alloc-profiler.h +$(BUILDDIR)/inference-profiling.o $(BUILDDIR)/inference-profiling.dbg.obj: $(SRCDIR)/gc.h $(BUILDDIR)/init.o $(BUILDDIR)/init.dbg.obj: $(SRCDIR)/builtin_proto.h $(BUILDDIR)/interpreter.o $(BUILDDIR)/interpreter.dbg.obj: $(SRCDIR)/builtin_proto.h $(BUILDDIR)/jitlayers.o $(BUILDDIR)/jitlayers.dbg.obj: $(SRCDIR)/jitlayers.h $(SRCDIR)/llvm-codegen-shared.h diff --git a/src/inference-profiling.c b/src/inference-profiling.c new file mode 100644 index 0000000000000..8b284ae1a803f --- /dev/null +++ b/src/inference-profiling.c @@ -0,0 +1,44 @@ +// Implementation for type inference profiling + +#include "julia.h" +#include "julia_internal.h" + +jl_mutex_t typeinf_profiling_lock; + +// == exported interface == + +JL_DLLEXPORT jl_array_t* jl_typeinf_profiling_clear_and_fetch( + jl_array_t *inference_profiling_results_array, + jl_value_t *array_timing_type +) +{ + JL_LOCK(&typeinf_profiling_lock); + + size_t len = jl_array_len(inference_profiling_results_array); + + jl_array_t *out = jl_alloc_array_1d(array_timing_type, len); + JL_GC_PUSH1(&out); + + memcpy(out->data, inference_profiling_results_array->data, len * sizeof(void*)); + + jl_array_del_end(inference_profiling_results_array, len); + + JL_UNLOCK(&typeinf_profiling_lock); + + JL_GC_POP(); + return out; +} + +JL_DLLEXPORT void jl_typeinf_profiling_push_timing( + jl_array_t *inference_profiling_results_array, + jl_value_t *timing +) +{ + JL_LOCK(&typeinf_profiling_lock); + + jl_array_grow_end(inference_profiling_results_array, 1); + size_t len = jl_array_len(inference_profiling_results_array); + jl_array_ptr_set(inference_profiling_results_array, len - 1, timing); + + JL_UNLOCK(&typeinf_profiling_lock); +} diff --git a/src/jl_exported_funcs.inc b/src/jl_exported_funcs.inc index c475184573faa..80392a4dc9887 100644 --- a/src/jl_exported_funcs.inc +++ b/src/jl_exported_funcs.inc @@ -483,6 +483,8 @@ XX(jl_typeinf_lock_end) \ XX(jl_typeinf_timing_begin) \ XX(jl_typeinf_timing_end) \ + XX(jl_typeinf_profiling_clear_and_fetch) \ + XX(jl_typeinf_profiling_push_timing) \ XX(jl_typename_str) \ XX(jl_typeof_str) \ XX(jl_types_equal) \ diff --git a/test/compiler/inference.jl b/test/compiler/inference.jl index 71f9903f85324..51bfcb8ab2eec 100644 --- a/test/compiler/inference.jl +++ b/test/compiler/inference.jl @@ -3804,15 +3804,19 @@ f37532(T, x) = (Core.bitcast(Ptr{T}, x); x) # Helper functions for Core.Compiler.Timings. These are normally accessed via a package - # usually (SnoopCompileCore). function time_inference(f) - Core.Compiler.Timings.reset_timings() Core.Compiler.__set_measure_typeinf(true) f() Core.Compiler.__set_measure_typeinf(false) - Core.Compiler.Timings.close_current_timer() - return Core.Compiler.Timings._timings[1] + return Core.Compiler.Timings.clear_and_fetch_timings() end -function depth(t::Core.Compiler.Timings.Timing) - maximum(depth.(t.children), init=0) + 1 +function max_depth(t::Vector{Core.Compiler.Timings.Timing}) + maximum(max_depth.(t), init=0) + 1 +end +function max_depth(t::Core.Compiler.Timings.Timing) + maximum(max_depth.(t.children), init=0) + 1 +end +function flatten_times(t::Vector{Core.Compiler.Timings.Timing}) + collect(Iterators.flatten(flatten_times(el) for el in t)) end function flatten_times(t::Core.Compiler.Timings.Timing) collect(Iterators.flatten([(t.time => t.mi_info,), flatten_times.(t.children)...])) @@ -3829,14 +3833,14 @@ end timing1 = time_inference() do @eval M1.g(2, 3.0) end - @test occursin(r"Core.Compiler.Timings.Timing\(InferenceFrameInfo for Core.Compiler.Timings.ROOT\(\)\) with \d+ children", sprint(show, timing1)) + @test timing1 isa Vector{Core.Compiler.Timings.Timing} # The last two functions to be inferred should be `i` and `i2`, inferred at runtime with # their concrete types. @test sort([mi_info.mi.def.name for (time,mi_info) in flatten_times(timing1)[end-1:end]]) == [:i, :i2] - @test all(child->isa(child.bt, Vector), timing1.children) - @test all(child->child.bt===nothing, timing1.children[1].children) + @test all(child->isa(child.bt, Vector), timing1) + @test all(child->child.bt===nothing, timing1[1].children) # Test the stacktrace - @test isa(stacktrace(timing1.children[1].bt), Vector{Base.StackTraces.StackFrame}) + @test isa(stacktrace(timing1[1].bt), Vector{Base.StackTraces.StackFrame}) # Test that inference has cached some of the Method Instances timing2 = time_inference() do @eval M1.g(2, 3.0) @@ -3857,7 +3861,7 @@ end end end end - @test occursin("thunk from $(@__MODULE__) starting at $(@__FILE__):$((@__LINE__) - 5)", string(timingmod.children)) + @test occursin("thunk from $(@__MODULE__) starting at $(@__FILE__):$((@__LINE__) - 5)", string(timingmod)) # END LINE NUMBER SENSITIVITY # Recursive function @@ -3865,8 +3869,8 @@ end timing = time_inference() do @eval _Recursive.f(Base.inferencebarrier(5)) end - @test 2 <= depth(timing) <= 3 # root -> f (-> +) - @test 2 <= length(flatten_times(timing)) <= 3 # root, f, + + @test 1 <= max_depth(timing) <= 2 # f (-> +) + @test 1 <= length(flatten_times(timing)) <= 2 # f, + # Functions inferred with multiple constants @eval module C @@ -3894,7 +3898,6 @@ end @test !isempty(ft) str = sprint(show, ft) @test occursin("InferenceFrameInfo for /(1::$Int, ::$Int)", str) # inference constants - @test occursin("InferenceFrameInfo for Core.Compiler.Timings.ROOT()", str) # qualified # loopc has internal slots, check constant printing in this case sel = filter(ti -> ti.second.mi.def.name === :loopc, ft) ifi = sel[end].second From 10092107c1f93aefaba3ffd3ca77f132fd3cfd8c Mon Sep 17 00:00:00 2001 From: Nathan Daly Date: Fri, 30 Dec 2022 12:48:09 -0700 Subject: [PATCH 2/2] Make inference profiling thread safe for julia 1.9+ Starting in 1.9, julia does not hold the codegen lock for all of typeinference! This means that we cannot use a single shared scratch-space for profiling type inference. This PR changes the scratch space to use Task Local Storage, so that we build a separate inference profile tree per Task doing inference, and then report them to the global results buffer at the end. --- base/compiler/typeinfer.jl | 29 ++++++++++++++++++++++++----- 1 file changed, 24 insertions(+), 5 deletions(-) diff --git a/base/compiler/typeinfer.jl b/base/compiler/typeinfer.jl index f9d5f578c5587..bd08dc00cfa7b 100644 --- a/base/compiler/typeinfer.jl +++ b/base/compiler/typeinfer.jl @@ -23,7 +23,7 @@ being used for this purpose alone. module Timings using Core.Compiler: -, +, :, >, Vector, length, first, empty!, push!, pop!, @inline, - @inbounds, copy, backtrace + @inbounds, copy, backtrace, IdDict, Task, Ref, get! # What we record for any given frame we infer during type inference. struct InferenceFrameInfo @@ -94,14 +94,30 @@ end # thread safe. const _finished_timings = Timing[] -# We keep a stack of the Timings for each of the MethodInstances currently being timed. +# We store a profiling stack for *each Task* as a task-local-storage variable, _timings. +# This is a stack of the Timings for each of the MethodInstances currently being timed. # Since type inference currently operates via a depth-first search (during abstract # evaluation), this vector operates like a call stack. The last node in _timings is the # node currently being inferred, and its parent is directly before it, etc. # Each Timing also contains its own vector for all of its children, so that the tree # call structure through type inference is recorded. (It's recorded as a tree, not a graph, # because we create a new node for duplicates.) -const _timings = Timing[] +# You will see this accessed below as `task_local_storage(:_timings)` + +# ------- Task Local Storage ------- +# Reimplementation of Task Local Storage, since these functions aren't available yet +# at this stage of bootstrapping. +current_task() = ccall(:jl_get_current_task, Ref{Task}, ()) +task_local_storage() = get_task_tls(current_task()) +function get_task_tls(t::Task) + if t.storage === nothing + t.storage = IdDict() + end + return (t.storage)::IdDict{Any,Any} +end +# ------- + +tls_timings() = get!(task_local_storage(), :_timings, Vector{Timing}()) # ROOT() is an empty function used as the top-level Timing node to measure all time spent # *not* in type inference during a given recording trace. It is used as a "dummy" node. @@ -145,6 +161,8 @@ end end @inline function enter_new_timer(frame) + _timings = tls_timings() + # Very first thing, stop the active timer: get the current time and add in the # time since it was last started to its aggregate exclusive time. if length(_timings) > 0 @@ -154,8 +172,7 @@ end # Start the new timer right before returning mi_info = _typeinf_identifier(frame) push!(_timings, Timing(mi_info, UInt64(0))) - len = length(_timings) - new_timer = @inbounds _timings[len] + new_timer = @inbounds _timings[end] # Set the current time _after_ appending the node, to try to exclude the # overhead from measurement. @@ -170,6 +187,8 @@ end # assert that indeed we are always returning to a parent after finishing all of its # children (that is, asserting that inference proceeds via depth-first-search). @inline function exit_current_timer(_expected_frame_) + _timings = tls_timings() + # Finish the new timer stop_time = _time_ns()