Replace DataFrames in the profiler with NamedTuples #3029
Merged
maleadt merged 2 commits into JuliaGPU:master on Feb 11, 2026
Merged
Replace DataFrames in the profiler with NamedTuples #3029 · maleadt merged 2 commits into JuliaGPU:master
maleadt merged 2 commits into JuliaGPU:master
Conversation
Contributor
|
Your PR requires formatting changes to meet the project's style guidelines. Click here to view the suggested changes.
diff --git a/src/profile.jl b/src/profile.jl
index 4cba87ff7..18f992125 100644
--- a/src/profile.jl
+++ b/src/profile.jl
@@ -109,37 +109,37 @@ end
module Profile
-using ..CUDA
+ using ..CUDA
using ..CUPTI
-using Crayons: @crayon_str, Crayon
-using NVTX: NVTX
-using PrettyTables: PrettyTables, TextHighlighter, pretty_table
-using Printf: Printf, @sprintf
-using Statistics: mean, quantile, std
-using demumble_jll: demumble
+ using Crayons: @crayon_str, Crayon
+ using NVTX: NVTX
+ using PrettyTables: PrettyTables, TextHighlighter, pretty_table
+ using Printf: Printf, @sprintf
+ using Statistics: mean, quantile, std
+ using demumble_jll: demumble
-#
-# helpers for NamedTuple-of-vector tables
-#
+ #
+ # helpers for NamedTuple-of-vector tables
+ #
-# Push a row to a NamedTuple of vectors, filling `missing` for absent keys
-function push_row!(t::NamedTuple, row::NamedTuple)
- for k in keys(t)
- if haskey(row, k)
- push!(t[k], row[k])
- else
- push!(t[k], missing)
+ # Push a row to a NamedTuple of vectors, filling `missing` for absent keys
+ function push_row!(t::NamedTuple, row::NamedTuple)
+ for k in keys(t)
+ if haskey(row, k)
+ push!(t[k], row[k])
+ else
+ push!(t[k], missing)
+ end
end
+ return t
end
- return t
-end
-# Filter rows by a boolean mask, returns new NamedTuple
-function filtermask(t::NamedTuple, mask::AbstractVector{Bool})
- NamedTuple{keys(t)}(Tuple(col[mask] for col in t))
-end
+ # Filter rows by a boolean mask, returns new NamedTuple
+ function filtermask(t::NamedTuple, mask::AbstractVector{Bool})
+ return NamedTuple{keys(t)}(Tuple(col[mask] for col in t))
+ end
#
# external profiler
@@ -333,7 +333,7 @@ interpret these results is to visualize them using the I/O stack (e.g. by callin
For programmatic access, it is possible to access the fields of this struct. However, the
exact format is not guaranteed to be stable, and may change between CUDA.jl releases.
-Currently, it contains three tables (NamedTuples of vectors):
+ Currently, it contains three tables (NamedTuples of vectors):
- `host`, containing host-side activity;
- `device`, containing device-side activity;
- `nvtx`, with information on captured NVTX ranges and events.
@@ -341,10 +341,10 @@ Currently, it contains three tables (NamedTuples of vectors):
See also: [`@profile`](@ref)
"""
Base.@kwdef struct ProfileResults
- # captured data (NamedTuples of vectors)
- host::NamedTuple
- device::NamedTuple
- nvtx::NamedTuple
+ # captured data (NamedTuples of vectors)
+ host::NamedTuple
+ device::NamedTuple
+ nvtx::NamedTuple
# display properties set by `@profile` kwargs
trace::Bool=false
@@ -393,15 +393,15 @@ end
# convert CUPTI activity records to host and device traces
function capture(cfg)
- host_trace = (
+ host_trace = (
id = Int[],
start = Float64[],
stop = Float64[],
name = String[],
- tid = Int[],
+ tid = Int[],
)
- device_trace = (
+ device_trace = (
id = Int[],
start = Float64[],
stop = Float64[],
@@ -421,9 +421,9 @@ function capture(cfg)
# memory operations
size = Union{Missing,Int64}[],
)
- # lookup tables (replaces leftjoin at end)
- details = Dict{Int,String}()
- nvtx_trace = (
+ # lookup tables (replaces leftjoin at end)
+ details = Dict{Int, String}()
+ nvtx_trace = (
id = Int[],
start = Float64[],
type = Symbol[],
@@ -431,7 +431,7 @@ function capture(cfg)
name = Union{Missing,String}[],
domain = Union{Missing,String}[],
)
- nvtx_data_lookup = Dict{Int,@NamedTuple{payload::Any, color::Union{Nothing,UInt32}, category::UInt32}}()
+ nvtx_data_lookup = Dict{Int, @NamedTuple{payload::Any, color::Union{Nothing, UInt32}, category::UInt32}}()
# memory_kind fields are sometimes typed CUpti_ActivityMemoryKind, sometimes UInt
as_memory_kind(x) = isa(x, CUPTI.CUpti_ActivityMemoryKind) ? x : CUPTI.CUpti_ActivityMemoryKind(x)
@@ -473,8 +473,12 @@ function capture(cfg)
"<unknown activity kind>"
end
- push_row!(host_trace, (; id, start=t0, stop=t1, name,
- tid=record.threadId))
+ push_row!(
+ host_trace, (;
+ id, start = t0, stop = t1, name,
+ tid = record.threadId,
+ )
+ )
# memory operations
elseif record.kind == CUPTI.CUPTI_ACTIVITY_KIND_MEMCPY
@@ -485,11 +489,15 @@ function capture(cfg)
dst_kind = as_memory_kind(record.dstKind)
name = "[copy $(string(src_kind)) to $(string(dst_kind)) memory]"
- push_row!(device_trace, (; id, start=t0, stop=t1, name,
- device=record.deviceId,
- context=record.contextId,
- stream=record.streamId,
- size=record.bytes))
+ push_row!(
+ device_trace, (;
+ id, start = t0, stop = t1, name,
+ device = record.deviceId,
+ context = record.contextId,
+ stream = record.streamId,
+ size = record.bytes,
+ )
+ )
elseif record.kind == CUPTI.CUPTI_ACTIVITY_KIND_MEMSET
id = record.correlationId
t0, t1 = record.start/1e9, record._end/1e9
@@ -497,11 +505,15 @@ function capture(cfg)
memory_kind = as_memory_kind(record.memoryKind)
name = "[set $(string(memory_kind)) memory]"
- push_row!(device_trace, (; id, start=t0, stop=t1, name,
- device=record.deviceId,
- context=record.contextId,
- stream=record.streamId,
- size=record.bytes))
+ push_row!(
+ device_trace, (;
+ id, start = t0, stop = t1, name,
+ device = record.deviceId,
+ context = record.contextId,
+ stream = record.streamId,
+ size = record.bytes,
+ )
+ )
# memory allocations
elseif record.kind == CUPTI.CUPTI_ACTIVITY_KIND_MEMORY2
@@ -513,7 +525,7 @@ function capture(cfg)
memory_kind = as_memory_kind(record.memoryKind)
str = "$(Base.format_bytes(record.bytes)), $(string(memory_kind)) memory"
- details[id] = str
+ details[id] = str
# kernel execution
# TODO: CUPTI_ACTIVITY_KIND_CDP_KERNEL (CUpti_ActivityCdpKernel)
@@ -531,12 +543,16 @@ function capture(cfg)
local_mem = (thread=Int64(record.localMemoryPerThread),
total=Int64(record.localMemoryTotal))
- push_row!(device_trace, (; id, start=t0, stop=t1, name,
- device=record.deviceId,
- context=record.contextId,
- stream=record.streamId,
- grid, block, registers,
- shared_mem, local_mem))
+ push_row!(
+ device_trace, (;
+ id, start = t0, stop = t1, name,
+ device = record.deviceId,
+ context = record.contextId,
+ stream = record.streamId,
+ grid, block, registers,
+ shared_mem, local_mem,
+ )
+ )
# NVTX markers
elseif record.kind == CUPTI.CUPTI_ACTIVITY_KIND_MARKER
@@ -547,15 +563,15 @@ function capture(cfg)
if record.flags == CUPTI.CUPTI_ACTIVITY_FLAG_MARKER_INSTANTANEOUS
@assert record.objectKind == CUDA.CUPTI.CUPTI_ACTIVITY_OBJECT_THREAD
tid = record.objectId.pt.threadId
- push_row!(nvtx_trace, (; id=record.id, start, tid, type=:instant, name, domain))
+ push_row!(nvtx_trace, (; id = record.id, start, tid, type = :instant, name, domain))
elseif record.flags == CUPTI.CUPTI_ACTIVITY_FLAG_MARKER_START
@assert record.objectKind == CUDA.CUPTI.CUPTI_ACTIVITY_OBJECT_THREAD
tid = record.objectId.pt.threadId
- push_row!(nvtx_trace, (; id=record.id, start, tid, type=:start, name, domain))
+ push_row!(nvtx_trace, (; id = record.id, start, tid, type = :start, name, domain))
elseif record.flags == CUPTI.CUPTI_ACTIVITY_FLAG_MARKER_END
@assert record.objectKind == CUDA.CUPTI.CUPTI_ACTIVITY_OBJECT_THREAD
tid = record.objectId.pt.threadId
- push_row!(nvtx_trace, (; id=record.id, start, tid, type=:end, name, domain))
+ push_row!(nvtx_trace, (; id = record.id, start, tid, type = :end, name, domain))
else
@error "Unexpected NVTX marker kind $(Int(record.flags)). Please file an issue."
end
@@ -584,65 +600,65 @@ function capture(cfg)
nothing
end
- nvtx_data_lookup[record.id] = (; payload, color, category=record.category)
+ nvtx_data_lookup[record.id] = (; payload, color, category = record.category)
else
@error "Unexpected CUPTI activity kind $(Int(record.kind)). Please file an issue."
end
end
- # Batch-demangle all kernel names in a single demumble invocation. This is
- # much faster than demangling them one-by-one.
- if !isempty(device_trace.name)
- input = join(device_trace.name, '\n')
- demangled = split(readchomp(pipeline(IOBuffer(input), `$(demumble())`)), '\n')
- copy!(device_trace.name, demangled)
- end
+ # Batch-demangle all kernel names in a single demumble invocation. This is
+ # much faster than demangling them one-by-one.
+ if !isempty(device_trace.name)
+ input = join(device_trace.name, '\n')
+ demangled = split(readchomp(pipeline(IOBuffer(input), `$(demumble())`)), '\n')
+ copy!(device_trace.name, demangled)
+ end
- # add details column via Dict lookup (replaces leftjoin)
- host_details = Union{Missing,String}[get(details, id, missing) for id in host_trace.id]
- host = merge(host_trace, (; details=host_details))
-
- dev_details = Union{Missing,String}[get(details, id, missing) for id in device_trace.id]
- device = merge(device_trace, (; details=dev_details))
-
- # add NVTX data columns via Dict lookup (replaces leftjoin)
- n_nvtx = length(nvtx_trace.id)
- nvtx_payload = Vector{Any}(missing, n_nvtx)
- nvtx_color = Vector{Union{Nothing,UInt32}}(nothing, n_nvtx)
- nvtx_category = Vector{Union{Missing, UInt32}}(missing, n_nvtx)
- for i in 1:n_nvtx
- data = get(nvtx_data_lookup, nvtx_trace.id[i], nothing)
- if !isnothing(data)
- nvtx_payload[i] = data.payload
- nvtx_color[i] = data.color
- nvtx_category[i] = data.category
+ # add details column via Dict lookup (replaces leftjoin)
+ host_details = Union{Missing, String}[get(details, id, missing) for id in host_trace.id]
+ host = merge(host_trace, (; details = host_details))
+
+ dev_details = Union{Missing, String}[get(details, id, missing) for id in device_trace.id]
+ device = merge(device_trace, (; details = dev_details))
+
+ # add NVTX data columns via Dict lookup (replaces leftjoin)
+ n_nvtx = length(nvtx_trace.id)
+ nvtx_payload = Vector{Any}(missing, n_nvtx)
+ nvtx_color = Vector{Union{Nothing, UInt32}}(nothing, n_nvtx)
+ nvtx_category = Vector{Union{Missing, UInt32}}(missing, n_nvtx)
+ for i in 1:n_nvtx
+ data = get(nvtx_data_lookup, nvtx_trace.id[i], nothing)
+ if !isnothing(data)
+ nvtx_payload[i] = data.payload
+ nvtx_color[i] = data.color
+ nvtx_category[i] = data.category
+ end
end
- end
- nvtx = merge(nvtx_trace, (; payload=nvtx_payload, color=nvtx_color, category=nvtx_category))
+ nvtx = merge(nvtx_trace, (; payload = nvtx_payload, color = nvtx_color, category = nvtx_category))
- return (; host, device, nvtx)
+ return (; host, device, nvtx)
end
function Base.show(io::IO, results::ProfileResults)
results = deepcopy(results)
- host = results.host
- device = results.device
- nvtx = results.nvtx
+ host = results.host
+ device = results.device
+ nvtx = results.nvtx
# find the relevant part of the trace (marked by calls to 'cuCtxSynchronize')
- trace_first_sync = findfirst(host.name .== "cuCtxSynchronize")
+ trace_first_sync = findfirst(host.name .== "cuCtxSynchronize")
trace_first_sync === nothing && error("Could not find the start of the profiling data.")
- trace_last_sync = findlast(host.name .== "cuCtxSynchronize")
+ trace_last_sync = findlast(host.name .== "cuCtxSynchronize")
trace_first_sync == trace_last_sync && error("Could not find the end of the profiling data.")
## truncate the trace
if !results.raw || !results.trace
- trace_begin = host.stop[trace_first_sync]
- trace_end = host.stop[trace_last_sync]
+ trace_begin = host.stop[trace_first_sync]
+ trace_end = host.stop[trace_last_sync]
- first_id = host.id[trace_first_sync + 1]
- last_id = host.id[trace_last_sync - 1]
- host = filtermask(host, first_id .<= host.id .<= last_id)
- device = filtermask(device, first_id .<= device.id .<= last_id)
+ first_id = host.id[trace_first_sync + 1]
+ last_id = host.id[trace_last_sync - 1]
+ host = filtermask(host, first_id .<= host.id .<= last_id)
+ device = filtermask(device, first_id .<= device.id .<= last_id)
trace_divisions = Int[]
else
# in raw mode, we display the entire trace, but highlight the relevant part.
@@ -651,132 +667,135 @@ function Base.show(io::IO, results::ProfileResults)
trace_divisions = [trace_first_sync, trace_last_sync-1]
# inclusive bounds
- trace_begin = host.start[begin]
- trace_end = host.stop[end]
+ trace_begin = host.start[begin]
+ trace_end = host.stop[end]
end
trace_time = trace_end - trace_begin
# compute event and trace duration
- host = merge(host, (; time=host.stop .- host.start))
- device = merge(device, (; time=device.stop .- device.start))
- events = length(host.id) + length(device.id)
+ host = merge(host, (; time = host.stop .- host.start))
+ device = merge(device, (; time = device.stop .- device.start))
+ events = length(host.id) + length(device.id)
println(io, "Profiler ran for $(format_time(trace_time)), capturing $(events) events.")
# make some numbers more friendly to read
## make timestamps relative to the start
- host.start .-= trace_begin
- host.stop .-= trace_begin
- device.start .-= trace_begin
- device.stop .-= trace_begin
- nvtx.start .-= trace_begin
+ host.start .-= trace_begin
+ host.stop .-= trace_begin
+ device.start .-= trace_begin
+ device.stop .-= trace_begin
+ nvtx.start .-= trace_begin
if !results.raw
# renumber event IDs from 1
- first_id = minimum([host.id; device.id])
- for df in (host, device)
+ first_id = minimum([host.id; device.id])
+ for df in (host, device)
df.id .-= first_id - 1
end
# renumber thread IDs from 1
- threads = unique([host.tid; nvtx.tid])
- for df in (host, nvtx)
+ threads = unique([host.tid; nvtx.tid])
+ for df in (host, nvtx)
broadcast!(df.tid, df.tid) do tid
findfirst(isequal(tid), threads)
end
- end
+ end
end
# helper function to visualize slow trace entries
function time_highlighters(df)
## filter out entries that execute _very_ quickly (like calls to cuCtxGetCurrent)
- relevant_times = df.time[df.time .>= 1e-8]
+ relevant_times = df.time[df.time .>= 1.0e-8]
isempty(relevant_times) && return ()
p75 = quantile(relevant_times, 0.75)
p95 = quantile(relevant_times, 0.95)
- highlight_p95 = TextHighlighter((data, i, j) -> (keys(data)[j] == :time) &&
- (data[j][i] >= p95),
+ highlight_p95 = TextHighlighter(
+ (data, i, j) -> (keys(data)[j] == :time) &&
+ (data[j][i] >= p95),
crayon"red")
- highlight_p75 = TextHighlighter((data, i, j) -> (keys(data)[j] == :time) &&
- (data[j][i] >= p75),
+ highlight_p75 = TextHighlighter(
+ (data, i, j) -> (keys(data)[j] == :time) &&
+ (data[j][i] >= p75),
crayon"yellow")
- highlight_bold = TextHighlighter((data, i, j) -> (keys(data)[j] == :name) &&
- (data.time[i] >= p75),
+ highlight_bold = TextHighlighter(
+ (data, i, j) -> (keys(data)[j] == :name) &&
+ (data.time[i] >= p75),
crayon"bold")
(highlight_p95, highlight_p75, highlight_bold)
end
function summarize_trace(df)
- # group times by name
- groups = Dict{String,Vector{Float64}}()
- for (name, t) in zip(df.name, df.time)
- push!(get!(Vector{Float64}, groups, name), t)
- end
+ # group times by name
+ groups = Dict{String, Vector{Float64}}()
+ for (name, t) in zip(df.name, df.time)
+ push!(get!(Vector{Float64}, groups, name), t)
+ end
- n = length(groups)
- out_name = Vector{String}(undef, n)
- out_time = Vector{Float64}(undef, n)
- out_calls = Vector{Int}(undef, n)
- out_dist = Vector{Union{Missing,@NamedTuple{std::Float64, mean::Float64, min::Float64, max::Float64}}}(undef, n)
- for (i, (name, times)) in enumerate(groups)
- out_name[i] = name
- out_time[i] = sum(times)
- out_calls[i] = length(times)
- out_dist[i] = if length(times) == 1
+ n = length(groups)
+ out_name = Vector{String}(undef, n)
+ out_time = Vector{Float64}(undef, n)
+ out_calls = Vector{Int}(undef, n)
+ out_dist = Vector{Union{Missing, @NamedTuple{std::Float64, mean::Float64, min::Float64, max::Float64}}}(undef, n)
+ for (i, (name, times)) in enumerate(groups)
+ out_name[i] = name
+ out_time[i] = sum(times)
+ out_calls[i] = length(times)
+ out_dist[i] = if length(times) == 1
missing
else
- (; std=std(times), mean=mean(times), min=minimum(times), max=maximum(times))
+ (; std = std(times), mean = mean(times), min = minimum(times), max = maximum(times))
end
end
- out_ratio = out_time ./ trace_time
-
- # sort by time ratio (descending)
- perm = sortperm(out_ratio; rev=true)
- return (
- name = out_name[perm],
- time = out_time[perm],
- calls = out_calls[perm],
- time_dist = out_dist[perm],
- time_ratio = out_ratio[perm],
- )
+ out_ratio = out_time ./ trace_time
+
+ # sort by time ratio (descending)
+ perm = sortperm(out_ratio; rev = true)
+ return (
+ name = out_name[perm],
+ time = out_time[perm],
+ calls = out_calls[perm],
+ time_dist = out_dist[perm],
+ time_ratio = out_ratio[perm],
+ )
end
trace_column_names = Dict(
- :id => "ID",
- :start => "Start",
- :time => "Time",
- :grid => "Blocks",
- :tid => "Thread",
- :block => "Threads",
- :registers => "Regs",
- :shared_mem => "Shared Mem",
- :local_mem => "Local Mem",
- :size => "Size",
- :throughput => "Throughput",
- :device => "Device",
- :stream => "Stream",
- :name => "Name",
- :domain => "Domain",
- :details => "Details",
- :payload => "Payload",
+ :id => "ID",
+ :start => "Start",
+ :time => "Time",
+ :grid => "Blocks",
+ :tid => "Thread",
+ :block => "Threads",
+ :registers => "Regs",
+ :shared_mem => "Shared Mem",
+ :local_mem => "Local Mem",
+ :size => "Size",
+ :throughput => "Throughput",
+ :device => "Device",
+ :stream => "Stream",
+ :name => "Name",
+ :domain => "Domain",
+ :details => "Details",
+ :payload => "Payload",
)
summary_column_names = Dict(
- :time => "Total time",
- :time_ratio => "Time (%)",
- :calls => "Calls",
- :time_dist => "Time distribution",
- :name => "Name",
+ :time => "Total time",
+ :time_ratio => "Time (%)",
+ :calls => "Calls",
+ :time_dist => "Time distribution",
+ :name => "Name",
)
summary_formatter(df) = function(v, i, j)
- col = keys(df)[j]
- if col == :time_ratio
+ col = keys(df)[j]
+ return if col == :time_ratio
format_percentage(v)
- elseif col == :time
+ elseif col == :time
format_time(v)
- elseif col == :time_dist
+ elseif col == :time_dist
if v === missing
""
else
@@ -802,79 +821,82 @@ function Base.show(io::IO, results::ProfileResults)
# host-side activity
let
# to determine the time the host was active, we should look at threads separately
- thread_times = Dict{Int,Float64}()
- for (tid, t) in zip(host.tid, host.time)
- thread_times[tid] = get(thread_times, tid, 0.0) + t
- end
- host_time = maximum(values(thread_times))
+ thread_times = Dict{Int, Float64}()
+ for (tid, t) in zip(host.tid, host.time)
+ thread_times[tid] = get(thread_times, tid, 0.0) + t
+ end
+ host_time = maximum(values(thread_times))
host_ratio = host_time / trace_time
# get rid of API call version suffixes
- host.name .= replace.(host.name, r"_v\d+$" => "")
+ host.name .= replace.(host.name, r"_v\d+$" => "")
df = if results.raw
- host
+ host
else
# filter spammy API calls
- spammy = Set([# context and stream queries we use for nonblocking sync
- "cuCtxGetCurrent", "cuCtxGetId", "cuCtxGetApiVersion",
- "cuStreamQuery", "cuStreamGetId",
- # occupancy API, done before every kernel launch
- "cuOccupancyMaxPotentialBlockSize",
- # driver pointer set-up
- "cuGetProcAddress",
- # called a lot during compilation
- "cuDeviceGetAttribute",
- # done before every memory operation
- "cuPointerGetAttribute", "cuDeviceGetMemPool",
- "cuStreamGetCaptureInfo"])
- filtermask(host, [name ∉ spammy for name in host.name])
+ spammy = Set(
+ [# context and stream queries we use for nonblocking sync
+ "cuCtxGetCurrent", "cuCtxGetId", "cuCtxGetApiVersion",
+ "cuStreamQuery", "cuStreamGetId",
+ # occupancy API, done before every kernel launch
+ "cuOccupancyMaxPotentialBlockSize",
+ # driver pointer set-up
+ "cuGetProcAddress",
+ # called a lot during compilation
+ "cuDeviceGetAttribute",
+ # done before every memory operation
+ "cuPointerGetAttribute", "cuDeviceGetMemPool",
+ "cuStreamGetCaptureInfo",
+ ]
+ )
+ filtermask(host, [name ∉ spammy for name in host.name])
end
# instantaneous NVTX markers can be added to the API trace
if results.trace
- instant_mask = nvtx.type .== :instant
- n_markers = count(instant_mask)
- if n_markers > 0
- marker_names = nvtx.name[instant_mask]
- marker_domains = nvtx.domain[instant_mask]
- marker_details = map(marker_names, marker_domains) do name, domain
- if !ismissing(name) && !ismissing(domain)
- "$(domain).$(name)"
- elseif !ismissing(name)
- "$name"
- else
- missing
- end
+ instant_mask = nvtx.type .== :instant
+ n_markers = count(instant_mask)
+ if n_markers > 0
+ marker_names = nvtx.name[instant_mask]
+ marker_domains = nvtx.domain[instant_mask]
+ marker_details = map(marker_names, marker_domains) do name, domain
+ if !ismissing(name) && !ismissing(domain)
+ "$(domain).$(name)"
+ elseif !ismissing(name)
+ "$name"
+ else
+ missing
+ end
end
- # append markers to host trace (with type widening for id/stop)
- df = (
- id = vcat(Vector{Union{Missing,Int}}(df.id), fill(missing, n_markers)),
- start = vcat(df.start, nvtx.start[instant_mask]),
- stop = vcat(Vector{Union{Missing,Float64}}(df.stop), fill(missing, n_markers)),
- name = vcat(df.name, fill("NVTX marker", n_markers)),
- tid = vcat(df.tid, nvtx.tid[instant_mask]),
- details = vcat(df.details, marker_details),
- time = vcat(df.time, zeros(Float64, n_markers)),
- )
-
- # sort by start time
- perm = sortperm(df.start)
- df = NamedTuple{keys(df)}(Tuple(col[perm] for col in df))
- end
+ # append markers to host trace (with type widening for id/stop)
+ df = (
+ id = vcat(Vector{Union{Missing, Int}}(df.id), fill(missing, n_markers)),
+ start = vcat(df.start, nvtx.start[instant_mask]),
+ stop = vcat(Vector{Union{Missing, Float64}}(df.stop), fill(missing, n_markers)),
+ name = vcat(df.name, fill("NVTX marker", n_markers)),
+ tid = vcat(df.tid, nvtx.tid[instant_mask]),
+ details = vcat(df.details, marker_details),
+ time = vcat(df.time, zeros(Float64, n_markers)),
+ )
+
+ # sort by start time
+ perm = sortperm(df.start)
+ df = NamedTuple{keys(df)}(Tuple(col[perm] for col in df))
+ end
end
- if !isempty(df.name)
+ if !isempty(df.name)
println(io, "\nHost-side activity: calling CUDA APIs took $(format_time(host_time)) ($(format_percentage(host_ratio)) of the trace)")
end
- if isempty(df.name)
+ if isempty(df.name)
println(io, "\nNo host-side activity was recorded.")
elseif results.trace
# determine columns to show, based on whether they contain useful information
columns = [:id, :start, :time]
for col in [:tid]
- if results.raw || length(unique(df[col])) > 1
+ if results.raw || length(unique(df[col])) > 1
push!(columns, col)
end
end
@@ -883,14 +905,14 @@ function Base.show(io::IO, results::ProfileResults)
push!(columns, :details)
end
- df = NamedTuple{Tuple(columns)}(Tuple(df[c] for c in columns))
+ df = NamedTuple{Tuple(columns)}(Tuple(df[c] for c in columns))
- header = [trace_column_names[k] for k in keys(df)]
- alignment = [k == :name ? :l : :r for k in keys(df)]
+ header = [trace_column_names[k] for k in keys(df)]
+ alignment = [k == :name ? :l : :r for k in keys(df)]
formatters = function(v, i, j)
if v === missing
return "-"
- elseif keys(df)[j] in (:start, :time)
+ elseif keys(df)[j] in (:start, :time)
format_time(v)
else
v
@@ -908,10 +930,10 @@ function Base.show(io::IO, results::ProfileResults)
push!(columns, :time_dist)
end
push!(columns, :name)
- df = NamedTuple{Tuple(columns)}(Tuple(df[c] for c in columns))
+ df = NamedTuple{Tuple(columns)}(Tuple(df[c] for c in columns))
- header = [summary_column_names[k] for k in keys(df)]
- alignment = [k in (:name, :time_dist) ? :l : :r for k in keys(df)]
+ header = [summary_column_names[k] for k in keys(df)]
+ alignment = [k in (:name, :time_dist) ? :l : :r for k in keys(df)]
highlighters = time_highlighters(df)
pretty_table(io, df; column_labels=header, alignment, formatters=[summary_formatter(df)], highlighters=collect(highlighters), fit_table_in_display_horizontally=(crop==:horizontal), fit_table_in_display_vertically=false)
end
@@ -919,58 +941,58 @@ function Base.show(io::IO, results::ProfileResults)
# device-side activity
let
- device_time = sum(device.time)
+ device_time = sum(device.time)
device_ratio = device_time / trace_time
- if !isempty(device.id)
+ if !isempty(device.id)
println(io, "\nDevice-side activity: GPU was busy for $(format_time(device_time)) ($(format_percentage(device_ratio)) of the trace)")
end
# add memory throughput information
- device = merge(device, (; throughput=device.size ./ device.time))
+ device = merge(device, (; throughput = device.size ./ device.time))
- if isempty(device.id)
+ if isempty(device.id)
println(io, "\nNo device-side activity was recorded.")
elseif results.trace
# determine columns to show, based on whether they contain useful information
columns = [:id, :start, :time]
## device/stream identification
for col in [:device, :stream]
- if results.raw || length(unique(device[col])) > 1
+ if results.raw || length(unique(device[col])) > 1
push!(columns, col)
end
end
## kernel details (can be missing)
for col in [:block, :grid, :registers]
- if results.raw || any(!ismissing, device[col])
+ if results.raw || any(!ismissing, device[col])
push!(columns, col)
end
end
- if results.raw || any(val->!ismissing(val) && (val.static > 0 || val.dynamic > 0), device.shared_mem)
+ if results.raw || any(val -> !ismissing(val) && (val.static > 0 || val.dynamic > 0), device.shared_mem)
push!(columns, :shared_mem)
end
- if results.raw || any(val->!ismissing(val) && val.thread > 0, device.local_mem)
+ if results.raw || any(val -> !ismissing(val) && val.thread > 0, device.local_mem)
push!(columns, :local_mem)
end
## memory details (can be missing)
- if results.raw || any(!ismissing, device.size)
+ if results.raw || any(!ismissing, device.size)
push!(columns, :size)
push!(columns, :throughput)
end
push!(columns, :name)
- df = NamedTuple{Tuple(columns)}(Tuple(device[c] for c in columns))
+ df = NamedTuple{Tuple(columns)}(Tuple(device[c] for c in columns))
- header = [trace_column_names[k] for k in keys(df)]
- alignment = [k == :name ? :l : :r for k in keys(df)]
+ header = [trace_column_names[k] for k in keys(df)]
+ alignment = [k == :name ? :l : :r for k in keys(df)]
formatters = function(v, i, j)
- col = keys(df)[j]
+ col = keys(df)[j]
if v === missing
return "-"
- elseif col in (:start, :time)
+ elseif col in (:start, :time)
format_time(v)
- elseif col == :size
+ elseif col == :size
Base.format_bytes(v)
- elseif col == :shared_mem
+ elseif col == :shared_mem
if results.raw || v.static > 0 && v.dynamic > 0
"$(Base.format_bytes(v.static)) static, $(Base.format_bytes(v.dynamic)) dynamic"
elseif v.static > 0
@@ -980,11 +1002,11 @@ function Base.show(io::IO, results::ProfileResults)
else
"-"
end
- elseif col == :local_mem
+ elseif col == :local_mem
"$(Base.format_bytes(v.thread)) / $(Base.format_bytes(v.total))"
- elseif col == :throughput
+ elseif col == :throughput
Base.format_bytes(v) * "/s"
- elseif col == :device
+ elseif col == :device
CUDA.name(CuDevice(v))
elseif v isa CUDA.CuDim3
if v.z != 1
@@ -1002,17 +1024,17 @@ function Base.show(io::IO, results::ProfileResults)
pretty_table(io, df; column_labels=header, alignment, formatters=[formatters], highlighters=collect(highlighters), fit_table_in_display_horizontally=(crop==:horizontal), fit_table_in_display_vertically=false)
#body_hlines=trace_divisions)
else
- df = summarize_trace(device)
+ df = summarize_trace(device)
columns = [:time_ratio, :time, :calls]
if any(!ismissing, df.time_dist)
push!(columns, :time_dist)
end
push!(columns, :name)
- df = NamedTuple{Tuple(columns)}(Tuple(df[c] for c in columns))
+ df = NamedTuple{Tuple(columns)}(Tuple(df[c] for c in columns))
- header = [summary_column_names[k] for k in keys(df)]
- alignment = [k in (:name, :time_dist) ? :l : :r for k in keys(df)]
+ header = [summary_column_names[k] for k in keys(df)]
+ alignment = [k in (:name, :time_dist) ? :l : :r for k in keys(df)]
highlighters = time_highlighters(df)
pretty_table(io, df; column_labels=header, alignment, formatters=[summary_formatter(df)], highlighters=collect(highlighters), fit_table_in_display_horizontally=(crop==:horizontal), fit_table_in_display_vertically=false)
end
@@ -1020,35 +1042,35 @@ function Base.show(io::IO, results::ProfileResults)
# show NVTX ranges
# TODO: do we also want to repeat the host/device summary for each NVTX range?
- # that's what nvprof used to do, but it's a little verbose...
+ # that's what nvprof used to do, but it's a little verbose...
- # build lookup from end event id → stop time
- end_times = Dict{Int,Float64}()
- for i in eachindex(nvtx.id)
- if nvtx.type[i] == :end
- end_times[nvtx.id[i]] = nvtx.start[i]
+ # build lookup from end event id → stop time
+ end_times = Dict{Int, Float64}()
+ for i in eachindex(nvtx.id)
+ if nvtx.type[i] == :end
+ end_times[nvtx.id[i]] = nvtx.start[i]
+ end
end
- end
- nvtx_ranges = filtermask(nvtx, nvtx.type .== :start)
- if length(nvtx_ranges.id) > 0
- # add stop time from matching end events
- stop_times = Float64[get(end_times, id, NaN) for id in nvtx_ranges.id]
- nvtx_ranges = merge(nvtx_ranges, (; stop=stop_times, time=stop_times .- nvtx_ranges.start))
+ nvtx_ranges = filtermask(nvtx, nvtx.type .== :start)
+ if length(nvtx_ranges.id) > 0
+ # add stop time from matching end events
+ stop_times = Float64[get(end_times, id, NaN) for id in nvtx_ranges.id]
+ nvtx_ranges = merge(nvtx_ranges, (; stop = stop_times, time = stop_times .- nvtx_ranges.start))
- println(io, "\nNVTX ranges:")
+ println(io, "\nNVTX ranges:")
df = nvtx_ranges
if results.trace
# determine columns to show, based on whether they contain useful information
columns = [:id, :start, :time]
for col in [:tid]
- if results.raw || length(unique(df[col])) > 1
+ if results.raw || length(unique(df[col])) > 1
push!(columns, col)
end
end
for col in [:domain, :name, :payload]
- if results.raw || any(!ismissing, df[col])
+ if results.raw || any(!ismissing, df[col])
push!(columns, col)
end
end
@@ -1057,22 +1079,22 @@ function Base.show(io::IO, results::ProfileResults)
color_highlighters = []
for color in unique(df.color)
if color !== nothing
- ids = Set(df.id[isequal.(df.color, color)])
+ ids = Set(df.id[isequal.(df.color, color)])
highlighter = TextHighlighter(Crayon(; foreground=color)) do data, i, j
- keys(data)[j] in (:name, :domain) && data.id[i] in ids
+ keys(data)[j] in (:name, :domain) && data.id[i] in ids
end
push!(color_highlighters, highlighter)
end
end
- df = NamedTuple{Tuple(columns)}(Tuple(df[c] for c in columns))
+ df = NamedTuple{Tuple(columns)}(Tuple(df[c] for c in columns))
- header = [trace_column_names[k] for k in keys(df)]
- alignment = [k == :name ? :l : :r for k in keys(df)]
+ header = [trace_column_names[k] for k in keys(df)]
+ alignment = [k == :name ? :l : :r for k in keys(df)]
formatters = function(v, i, j)
if v === missing
return "-"
- elseif keys(df)[j] in (:start, :time)
+ elseif keys(df)[j] in (:start, :time)
format_time(v)
else
v
@@ -1082,16 +1104,16 @@ function Base.show(io::IO, results::ProfileResults)
pretty_table(io, df; column_labels=header, alignment, formatters=[formatters], highlighters=collect(highlighters), fit_table_in_display_horizontally=(crop==:horizontal), fit_table_in_display_vertically=false)
else
# merge the domain and name into a single column
- merged_names = map(nvtx_ranges.name, nvtx_ranges.domain) do name, domain
+ merged_names = map(nvtx_ranges.name, nvtx_ranges.domain) do name, domain
if name !== missing && domain !== missing
"$(domain).$(name)"
elseif name !== missing
"$name"
- else
- missing
+ else
+ missing
end
end
- nvtx_ranges.name .= merged_names
+ nvtx_ranges.name .= merged_names
df = summarize_trace(nvtx_ranges)
@@ -1100,10 +1122,10 @@ function Base.show(io::IO, results::ProfileResults)
push!(columns, :time_dist)
end
push!(columns, :name)
- df = NamedTuple{Tuple(columns)}(Tuple(df[c] for c in columns))
+ df = NamedTuple{Tuple(columns)}(Tuple(df[c] for c in columns))
- header = [summary_column_names[k] for k in keys(df)]
- alignment = [k in (:name, :time_dist) ? :l : :r for k in keys(df)]
+ header = [summary_column_names[k] for k in keys(df)]
+ alignment = [k in (:name, :time_dist) ? :l : :r for k in keys(df)]
highlighters = time_highlighters(df)
pretty_table(io, df; column_labels=header, alignment, formatters=[summary_formatter(df)], highlighters=collect(highlighters), fit_table_in_display_horizontally=(crop==:horizontal), fit_table_in_display_vertically=false)
end
diff --git a/test/core/profile.jl b/test/core/profile.jl
index b0cbf9af7..3364f1264 100644
--- a/test/core/profile.jl
+++ b/test/core/profile.jl
@@ -3,32 +3,32 @@ using CUDA.Profile: push_row!, filtermask
@testset "profiler" begin
-############################################################################################
+ ############################################################################################
-@testset "helpers" begin
+ @testset "helpers" begin
-@testset "push_row!" begin
- nt = (id = Int[], name = String[], value = Union{Missing,Int}[])
- push_row!(nt, (id = 1, name = "a"))
- push_row!(nt, (id = 2, name = "b", value = 10))
+ @testset "push_row!" begin
+ nt = (id = Int[], name = String[], value = Union{Missing, Int}[])
+ push_row!(nt, (id = 1, name = "a"))
+ push_row!(nt, (id = 2, name = "b", value = 10))
- @test nt.id == [1, 2]
- @test nt.name == ["a", "b"]
- @test ismissing(nt.value[1])
- @test nt.value[2] == 10
-end
+ @test nt.id == [1, 2]
+ @test nt.name == ["a", "b"]
+ @test ismissing(nt.value[1])
+ @test nt.value[2] == 10
+ end
-@testset "filtermask" begin
- nt = (id = [1, 2, 3, 4, 5], val = [10, 25, 15, 30, 5])
- filtered = filtermask(nt, nt.val .> 15)
+ @testset "filtermask" begin
+ nt = (id = [1, 2, 3, 4, 5], val = [10, 25, 15, 30, 5])
+ filtered = filtermask(nt, nt.val .> 15)
- @test filtered.id == [2, 4]
- @test filtered.val == [25, 30]
-end
+ @test filtered.id == [2, 4]
+ @test filtered.val == [25, 30]
+ end
-end
+ end
-############################################################################################
+ ############################################################################################
@testset "external" begin
|
Contributor
Author
|
Not sure what's up with the 1.12 failures 🤔 It doesn't look related AFAICT. |
Contributor
|
Same error happened also in aa310ac on |
Codecov Report❌ Patch coverage is
Additional details and impacted files@@ Coverage Diff @@
## master #3029 +/- ##
==========================================
- Coverage 89.31% 89.27% -0.04%
==========================================
Files 148 148
Lines 12995 13047 +52
==========================================
+ Hits 11606 11648 +42
- Misses 1389 1399 +10 ☔ View full report in Codecov by Sentry. 🚀 New features to boost your workflow:
|
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Add this suggestion to a batch that can be applied as a single commit. This suggestion is invalid because no changes were made to the code. Suggestions cannot be applied while the pull request is closed. Suggestions cannot be applied while viewing a subset of changes. Only one suggestion per line can be applied in a batch. Add this suggestion to a batch that can be applied as a single commit. Applying suggestions on deleted lines is not supported. You must change the existing code in this line in order to create a valid suggestion. Outdated suggestions cannot be applied. This suggestion has been applied or marked resolved. Suggestions cannot be applied from pending reviews. Suggestions cannot be applied on multi-line comments. Suggestions cannot be applied while the pull request is queued to merge. Suggestion cannot be applied right now. Please check back later.
As suggested in #2859 and #2238.
I initially tried to implement this using a Tables-esque API (`leftjoin()`, etc.), but it ended up being much simpler to implement that logic manually. One question I have is about the way that the `details` and `nvtx_data_lookup` dicts are used: they currently assume that `record.correlationId` and `record.id` are unique. Is that a safe assumption? I'm not too familiar with the profiler.

Load times:
BTW, when testing it I noticed that the `@bprofile` tests took ~10 minutes on my node, which turned out to be caused by the repeated calls to demumble. I modified the profiler in 7af4e5c to batch-call demumble, and that removed most of the overhead; now `core/profile.jl` takes ~1 minute in total for me.

Mostly written by Claude, with various tweaks by me 🤖