Skip to content

Commit

Permalink
feat(tracing): sampling rate scope by plugin
Browse files Browse the repository at this point in the history
The sampling rate can now be set via the OpenTelemetry plugin instead of
being only a global setting for the gateway.

This commit also fixes a small bug: in the edge case where OpenTelemetry is
used for propagation only (instrumentations disabled), the `sampled`
flag was incorrectly set to `true` even though no span was sampled for that
request.

Includes tests to cover more configuration scenarios (esp. different
sampling rates) and verify propagation is done correctly.
  • Loading branch information
samugi committed Nov 28, 2023
1 parent 3b53039 commit 32f754c
Show file tree
Hide file tree
Showing 9 changed files with 253 additions and 69 deletions.
5 changes: 5 additions & 0 deletions changelog/unreleased/kong/tracing-sampling-rate-scope.yml
@@ -0,0 +1,5 @@
message: >
Tracing Sampling Rate can now be set via the `config.sampling_rate` property
of the OpenTelemetry plugin instead of it just being a global setting for the gateway.
type: feature
scope: Plugin
7 changes: 7 additions & 0 deletions kong/clustering/compat/removed_fields.lua
Expand Up @@ -109,4 +109,11 @@ return {
"read_body_for_logout",
},
},

-- Any dataplane older than 3.6.0
[3006000000] = {
opentelemetry = {
"sampling_rate",
},
},
}
98 changes: 70 additions & 28 deletions kong/pdk/tracing.lua
Expand Up @@ -11,6 +11,7 @@ local tablepool = require "tablepool"
local new_tab = require "table.new"
local utils = require "kong.tools.utils"
local phase_checker = require "kong.pdk.private.phases"
local tracing_context = require "kong.tracing.tracing_context"

local ngx = ngx
local type = type
Expand Down Expand Up @@ -63,34 +64,29 @@ local function generate_span_id()
return rand_bytes(8)
end

--- Built-in sampler
local function always_on_sampler()
return true
end

local function always_off_sampler()
return false
end

-- Fractions >= 1 will always sample. Fractions < 0 are treated as zero.
-- spec: https://github.com/c24t/opentelemetry-specification/blob/3b3d321865cf46364bdfb292c179b6444dc96bf9/specification/sdk-tracing.md#probability-sampler-algorithm
local function get_trace_id_based_sampler(rate)
if type(rate) ~= "number" then
error("invalid fraction", 2)
end
local function get_trace_id_based_sampler(options_sampling_rate)
return function(trace_id, sampling_rate)
sampling_rate = sampling_rate or options_sampling_rate

if rate >= 1 then
return always_on_sampler
end
if type(sampling_rate) ~= "number" then
error("invalid fraction", 2)
end

if rate <= 0 then
return always_off_sampler
end
-- always on sampler
if sampling_rate >= 1 then
return true
end

-- always off sampler
if sampling_rate <= 0 then
return false
end

local bound = rate * BOUND_MAX
-- probability sampler
local bound = sampling_rate * BOUND_MAX

-- TODO: is this a sound method to sample?
return function(trace_id)
if #trace_id < SAMPLING_BYTE then
error(TOO_SHORT_MESSAGE, 2)
end
Expand Down Expand Up @@ -200,17 +196,17 @@ local function create_span(tracer, options)
span.span_id = generate_span_id()
span.trace_id = trace_id
span.kind = options.span_kind or SPAN_KIND.INTERNAL
-- get_sampling_decision() can be used to dynamically run the sampler's logic
-- and obtain the sampling decision for the span. This way plugins can apply
-- their configured sampling rate dynamically. The sampled flag can then be
-- overwritten by set_should_sample.
span.should_sample = sampled

setmetatable(span, span_mt)
return span
end

local function link_span(tracer, span, name, options)
if not span.should_sample then
kong.log.debug("skipping non-sampled span")
return
end
if tracer and type(tracer) ~= "table" then
error("invalid tracer", 2)
end
Expand Down Expand Up @@ -270,8 +266,8 @@ end
-- local time = ngx.now()
-- span:finish(time * 100000000)
function span_mt:finish(end_time_ns)
if self.end_time_ns ~= nil or not self.should_sample then
-- span is finished, and already processed or not sampled
if self.end_time_ns ~= nil then
-- span is finished, and already processed
return
end

Expand Down Expand Up @@ -419,6 +415,7 @@ noop_tracer.active_span = NOOP
noop_tracer.set_active_span = NOOP
noop_tracer.process_span = NOOP
noop_tracer.set_should_sample = NOOP
noop_tracer.get_sampling_decision = NOOP

local VALID_TRACING_PHASES = {
rewrite = true,
Expand Down Expand Up @@ -547,6 +544,51 @@ local function new_tracer(name, options)
end
end

--- Get the sampling decision result
--
-- The decision is taken in this order (first match wins):
--
-- 1. There is no root span in `ngx.ctx.KONG_SPANS`, or the root span
--    carries the `kong.propagation_only` attribute (a dummy span created
--    only to propagate headers): do not sample.
-- 2. The parent has sampled flag == false, or no trace_id is available:
--    act as a parent-based sampler and inherit the parent's decision
--    (a missing parent flag results in `false`).
-- 3. No `sampling_rate` was passed: reuse the sampling result already
--    stored on the root span.
-- 4. Else, apply the probability-based should_sample decision using
--    `sampling_rate`.
--
-- @function kong.tracing:get_sampling_decision
-- @tparam bool parent_should_sample value of the parent span sampled flag
-- extracted from the incoming tracing headers
-- @tparam number sampling_rate the sampling rate to apply for the
-- probability sampler
-- @treturn bool sampled value of sampled for this trace
function self:get_sampling_decision(parent_should_sample, sampling_rate)
  local ctx = ngx.ctx

  local sampled
  local root_span = ctx.KONG_SPANS and ctx.KONG_SPANS[1]
  local trace_id = tracing_context.get_raw_trace_id(ctx)

  -- guard the lookup in case the root span has no attributes table, so
  -- that a missing table is treated as "not propagation-only" instead of
  -- raising an indexing error
  local attributes = root_span and root_span.attributes
  local propagation_only = attributes and attributes["kong.propagation_only"]

  if not root_span or propagation_only then
    -- should not sample if there is no root span or if the root span is
    -- a dummy created only to propagate headers
    sampled = false

  elseif parent_should_sample == false or not trace_id then
    -- trace_id can be nil when tracing instrumentations are disabled
    -- and Kong is configured to only do headers propagation
    sampled = parent_should_sample

  elseif not sampling_rate then
    -- no custom sampling_rate was passed:
    -- reuse the sampling result of the root_span
    sampled = root_span.should_sample == true

  else
    -- use probability-based sampler
    sampled = self.sampler(trace_id, sampling_rate)
  end

  -- enforce boolean (nil from a missing parent flag becomes false)
  return not not sampled
end

tracer_memo[name] = setmetatable(self, tracer_mt)
return tracer_memo[name]
end
Expand Down
41 changes: 29 additions & 12 deletions kong/plugins/opentelemetry/handler.lua
Expand Up @@ -94,34 +94,32 @@ end
function OpenTelemetryHandler:access(conf)
local headers = ngx_get_headers()
local root_span = ngx.ctx.KONG_SPANS and ngx.ctx.KONG_SPANS[1]
local tracer = kong.tracing.new("otel")

-- make propagation work when tracing instrumentation is not enabled
-- get the global tracer when available, or instantiate a new one
local tracer = kong.tracing.name == "noop" and kong.tracing.new("otel")
or kong.tracing

-- make propagation work with tracing disabled
if not root_span then
root_span = tracer.start_span("root")
root_span:set_attribute("kong.propagation_only", true)

-- this span is created only for propagation and will bypass the exporter
-- since tracing is disabled, turn off sampling entirely for this trace
kong.ctx.plugin.should_sample = false
end

local injected_parent_span = tracing_context.get_unlinked_span("balancer") or root_span
local header_type, trace_id, span_id, parent_id, parent_sampled, _ = propagation_parse(headers, conf.header_type)

local header_type, trace_id, span_id, parent_id, should_sample, _ = propagation_parse(headers, conf.header_type)
if should_sample == false then
tracer:set_should_sample(should_sample)
injected_parent_span.should_sample = should_sample
end

-- overwrite trace id
-- as we are in a chain of existing trace
-- Overwrite trace ids
-- with the value extracted from incoming tracing headers
if trace_id then
-- to propagate the correct trace ID we have to set it here
-- before passing this span to propagation.set()
injected_parent_span.trace_id = trace_id
-- update the Tracing Context with the trace ID extracted from headers
tracing_context.set_raw_trace_id(trace_id)
end

-- overwrite root span's parent_id
if span_id then
root_span.parent_id = span_id
Expand All @@ -130,6 +128,25 @@ function OpenTelemetryHandler:access(conf)
root_span.parent_id = parent_id
end

-- Configure the sampled flags
local sampled
if kong.ctx.plugin.should_sample == false then
sampled = false

else
-- Sampling decision for the current trace.
local err
-- get_sampling_decision() depends on the value of the trace id: call it
-- after the trace_id is updated
sampled, err = tracer:get_sampling_decision(parent_sampled, conf.sampling_rate)
if err then
ngx_log(ngx_ERR, _log_prefix, "sampler failure: ", err)
end
end
tracer:set_should_sample(sampled)
-- Set the sampled flag for the outgoing header's span
injected_parent_span.should_sample = sampled

propagation_set(conf.header_type, header_type, injected_parent_span, "w3c")
end

Expand Down
7 changes: 7 additions & 0 deletions kong/plugins/opentelemetry/schema.lua
Expand Up @@ -59,6 +59,13 @@ return {
required = false,
default = "preserve",
one_of = { "preserve", "ignore", "b3", "b3-single", "w3c", "jaeger", "ot", "aws", "gcp" } } },
{ sampling_rate = {
description = "Tracing sampling rate for configuring the probability-based sampler. When set, this value supersedes the global `tracing_sampling_rate` setting from kong.conf.",
type = "number",
between = {0, 1},
required = false,
default = nil,
} },
},
entity_checks = {
{ custom_entity_check = {
Expand Down
2 changes: 2 additions & 0 deletions spec/02-integration/09-hybrid_mode/09-config-compat_spec.lua
Expand Up @@ -212,6 +212,7 @@ describe("CP/DP config compat transformations #" .. strategy, function()
local expected_otel_prior_35 = utils.cycle_aware_deep_copy(opentelemetry)
expected_otel_prior_35.config.header_type = "preserve"
expected_otel_prior_35.config.sampling_rate = nil
do_assert(utils.uuid(), "3.4.0", expected_otel_prior_35)
-- cleanup
Expand All @@ -231,6 +232,7 @@ describe("CP/DP config compat transformations #" .. strategy, function()
local expected_otel_prior_34 = utils.cycle_aware_deep_copy(opentelemetry)
expected_otel_prior_34.config.header_type = "preserve"
expected_otel_prior_34.config.sampling_rate = nil
do_assert(utils.uuid(), "3.3.0", expected_otel_prior_34)
-- cleanup
Expand Down

0 comments on commit 32f754c

Please sign in to comment.