/
tracing.lua
585 lines (489 loc) · 14.9 KB
/
tracing.lua
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
---
-- Tracer module
--
-- Application-level tracing for Kong.
--
-- @module kong.tracing
local require = require
local ffi = require "ffi"
local tablepool = require "tablepool"
local new_tab = require "table.new"
local utils = require "kong.tools.utils"
local phase_checker = require "kong.pdk.private.phases"
local ngx = ngx
local type = type
local error = error
local ipairs = ipairs
local tostring = tostring
local setmetatable = setmetatable
local getmetatable = getmetatable
local rand_bytes = utils.get_rand_bytes
local check_phase = phase_checker.check
local PHASES = phase_checker.phases
local ffi_cast = ffi.cast
local ffi_str = ffi.string
local ffi_time_unix_nano = utils.time_ns
local tablepool_fetch = tablepool.fetch
local tablepool_release = tablepool.release
local ngx_log = ngx.log
local ngx_ERR = ngx.ERR
local NOOP = function() end
local POOL_SPAN = "KONG_SPAN"
local POOL_SPAN_STORAGE = "KONG_SPAN_STORAGE"
local POOL_ATTRIBUTES = "KONG_SPAN_ATTRIBUTES"
local POOL_EVENTS = "KONG_SPAN_EVENTS"
-- must be power of 2
local SAMPLING_BYTE = 8
local SAMPLING_BITS = 8 * SAMPLING_BYTE
local BOUND_MAX = math.pow(2, SAMPLING_BITS)
local SAMPLING_UINT_PTR_TYPE = "uint" .. SAMPLING_BITS .. "_t*"
local TOO_SHORT_MESSAGE = "sampling needs trace ID to be longer than " .. SAMPLING_BYTE .. " bytes to work"
local SPAN_KIND = {
UNSPECIFIED = 0,
INTERNAL = 1,
SERVER = 2,
CLIENT = 3,
PRODUCER = 4,
CONSUMER = 5,
}
--- Generate trace ID
local function generate_trace_id()
return rand_bytes(16)
end
--- Generate span ID
local function generate_span_id()
return rand_bytes(8)
end
--- Build-in sampler
local function always_on_sampler()
return true
end
local function always_off_sampler()
return false
end
-- Fractions >= 1 will always sample. Fractions < 0 are treated as zero.
-- spec: https://github.com/c24t/opentelemetry-specification/blob/3b3d321865cf46364bdfb292c179b6444dc96bf9/specification/sdk-tracing.md#probability-sampler-algorithm
local function get_trace_id_based_sampler(rate)
if type(rate) ~= "number" then
error("invalid fraction", 2)
end
if rate >= 1 then
return always_on_sampler
end
if rate <= 0 then
return always_off_sampler
end
local bound = rate * BOUND_MAX
-- TODO: is this a sound method to sample?
return function(trace_id)
if #trace_id < SAMPLING_BYTE then
error(TOO_SHORT_MESSAGE, 2)
end
local truncated = ffi_cast(SAMPLING_UINT_PTR_TYPE, ffi_str(trace_id, SAMPLING_BYTE))[0]
return truncated < bound
end
end
-- @class span : table
--
--- Trace Context. Those IDs are all represented as bytes, and the length may vary.
-- We try best to preserve as much information as possible from the tracing context.
-- @field trace_id bytes auto generated 16 bytes ID if not designated
-- @field span_id bytes
-- @field parent_span_id bytes
--
--- Timing. All times are in nanoseconds.
-- @field start_time_ns number
-- @field end_time_ns number
--
--- Scopes and names. Defines what the span is about.
-- TODO: service should be retrieved from kong service instead of from plugin instances. It should be the same for spans from a single request.
-- service name/top level scope is defined by plugin instances.
-- @field name string type of the span. Should be of low cardinality. Good examples are "proxy", "DNS query", "database query". Approximately operation name of DataDog.
-- resource_name of Datadog is built from attirbutes.
--
--- Other fields
-- @field should_sample boolean whether the span should be sampled
-- @field kind number TODO: Should we remove this field? It's used by OTEL and zipkin. Maybe move this to impl_specific.
-- @field attributes table extra information about the span. Attribute of OTEL or meta of Datadog.
-- TODO: @field impl_specific table implementation specific fields. For example, impl_specific.datadog is used by Datadog tracer.
-- TODO: @field events table list of events.
--
--- Internal fields
-- @field tracer table
-- @field parent table
local span_mt = {}
span_mt.__index = span_mt
-- Noop Span
local noop_span = {}
-- Using static function instead of metatable for better performance
noop_span.is_recording = false
noop_span.finish = NOOP
noop_span.set_attribute = NOOP
noop_span.add_event = NOOP
noop_span.record_error = NOOP
noop_span.set_status = NOOP
noop_span.each_baggage_item = function() return NOOP end
setmetatable(noop_span, {
-- Avoid noop span table being modifed
__newindex = NOOP,
})
local function validate_span_options(options)
if options ~= nil then
if type(options) ~= "table" then
error("invalid options type", 2)
end
if options.start_time_ns ~= nil and type(options.start_time_ns) ~= "number" then
error("invalid start time", 2)
end
if options.span_kind ~= nil and type(options.span_kind) ~= "number" then
error("invalid start kind", 2)
end
if options.should_sample ~= nil and type(options.should_sample) ~= "boolean" then
error("invalid sampled", 2)
end
if options.attributes ~= nil and type(options.attributes) ~= "table" then
error("invalid attributes", 2)
end
end
end
local function create_span(tracer, options)
validate_span_options(options)
options = options or {}
local span = tablepool_fetch(POOL_SPAN, 0, 12)
span.parent = options.parent or tracer and tracer.active_span()
local trace_id = span.parent and span.parent.trace_id
or options.trace_id
or generate_trace_id()
local sampled
if span.parent and span.parent.should_sample ~= nil then
sampled = span.parent.should_sample
elseif options.should_sample ~= nil then
sampled = options.should_sample
else
sampled = tracer and tracer.sampler(trace_id)
end
span.parent_id = span.parent and span.parent.span_id
or options.parent_id
span.tracer = span.tracer or tracer
span.span_id = generate_span_id()
span.trace_id = trace_id
span.kind = options.span_kind or SPAN_KIND.INTERNAL
span.should_sample = sampled
setmetatable(span, span_mt)
return span
end
local function link_span(tracer, span, name, options)
if not span.should_sample then
kong.log.debug("skipping non-sampled span")
return
end
if tracer and type(tracer) ~= "table" then
error("invalid tracer", 2)
end
validate_span_options(options)
options = options or {}
-- cache tracer ref, to get hooks / span processer
-- tracer ref will not be cleared when the span table released
span.tracer = span.tracer or tracer
-- specify span start time
span.start_time_ns = options.start_time_ns or ffi_time_unix_nano()
span.attributes = options.attributes
span.name = name
span.linked = true
-- insert the span to ctx
local ctx = ngx.ctx
local spans = ctx.KONG_SPANS
if not spans then
spans = tablepool_fetch(POOL_SPAN_STORAGE, 10, 0)
spans[0] = 0 -- span counter
ctx.KONG_SPANS = spans
end
local len = spans[0] + 1
spans[len] = span
spans[0] = len
return span
end
local function new_span(tracer, name, options)
if type(tracer) ~= "table" then
error("invalid tracer", 2)
end
if type(name) ~= "string" or #name == 0 then
error("invalid span name", 2)
end
local span = create_span(tracer, options)
link_span(tracer, span, name, options)
return span
end
--- Ends a Span
-- Set the end time and release the span,
-- the span table MUST not being used after ended.
--
-- @function span:finish
-- @tparam number|nil end_time_ns
-- @usage
-- span:finish()
--
-- local time = ngx.now()
-- span:finish(time * 100000000)
function span_mt:finish(end_time_ns)
if self.end_time_ns ~= nil or not self.should_sample then
-- span is finished, and already processed or not sampled
return
end
if end_time_ns ~= nil and type(end_time_ns) ~= "number" then
error("invalid span end time", 2)
end
if end_time_ns and end_time_ns < self.start_time_ns then
error("invalid span duration", 2)
end
self.end_time_ns = end_time_ns or ffi_time_unix_nano()
if self.active and self.tracer.active_span() == self then
self.tracer.set_active_span(self.parent)
self.active = nil
end
end
--- Set an attribute to a Span
--
-- @function span:set_attribute
-- @tparam string key
-- @tparam string|number|boolean value
-- @usage
-- span:set_attribute("net.transport", "ip_tcp")
-- span:set_attribute("net.peer.port", 443)
-- span:set_attribute("exception.escaped", true)
function span_mt:set_attribute(key, value)
-- key is decided by the programmer, so if it is not a string, we should
-- error out.
if type(key) ~= "string" then
error("invalid key", 2)
end
local vtyp = type(value)
if vtyp ~= "string" and vtyp ~= "number" and vtyp ~= "boolean" then
-- we should not error out here, as most of the caller does not catch
-- errors, and they are hooking to core facilities, which may cause
-- unexpected behavior.
ngx_log(ngx_ERR, debug.traceback("invalid span attribute value type: " .. vtyp, 2))
end
if self.attributes == nil then
self.attributes = tablepool_fetch(POOL_ATTRIBUTES, 0, 4)
end
self.attributes[key] = value
end
--- Adds an event to a Span
--
-- @function span:add_event
-- @tparam string name Event name
-- @tparam table|nil attributes Event attributes
-- @tparam number|nil time_ns Event timestamp
function span_mt:add_event(name, attributes, time_ns)
if type(name) ~= "string" then
error("invalid name", 2)
end
if attributes ~= nil and type(attributes) ~= "table" then
error("invalid attribute", 2)
end
if self.events == nil then
self.events = tablepool_fetch(POOL_EVENTS, 4, 0)
self.events[0] = 0
end
local obj = new_tab(0, 3)
obj.name = name
obj.time_ns = time_ns or ffi_time_unix_nano()
if attributes then
obj.attributes = attributes
end
local len = self.events[0] + 1
self.events[len] = obj
self.events[0] = len
end
--- Adds an error event to a Span
--
-- @function span:record_error
-- @tparam string err error string
function span_mt:record_error(err)
if type(err) ~= "string" then
err = tostring(err)
end
self:add_event("exception", {
["exception.message"] = err,
})
end
--- Adds an error event to a Span
-- Status codes:
-- - `0` unset
-- - `1` ok
-- - `2` error
--
-- @function span:set_status
-- @tparam number status status code
function span_mt:set_status(status)
if type(status) ~= "number" then
error("invalid status", 2)
end
self.status = status
end
-- (internal) Release a span
-- The lifecycle of span is controlled by Kong
function span_mt:release()
if type(self.attributes) == "table" then
tablepool_release(POOL_ATTRIBUTES, self.attributes)
end
if type(self.events) == "table" then
tablepool_release(POOL_EVENTS, self.events)
end
-- metabale will be cleared
tablepool_release(POOL_SPAN, self)
end
-- (internal) compatible with Zipkin tracing headers
-- TODO: implement baggage API
function span_mt:each_baggage_item() return NOOP end
local tracer_mt = {}
tracer_mt.__index = tracer_mt
-- avoid creating multiple tracer with same name
local tracer_memo = setmetatable({}, { __mode = "k" })
local noop_tracer = {}
noop_tracer.name = "noop"
noop_tracer.start_span = function() return noop_span end
noop_tracer.create_span = function() return noop_span end
noop_tracer.link_span = NOOP
noop_tracer.active_span = NOOP
noop_tracer.set_active_span = NOOP
noop_tracer.process_span = NOOP
noop_tracer.set_should_sample = NOOP
local VALID_TRACING_PHASES = {
rewrite = true,
access = true,
header_filter = true,
body_filter = true,
log = true,
content = true,
}
--- New Tracer
local function new_tracer(name, options)
name = name or "default"
if tracer_memo[name] then
return tracer_memo[name]
end
local self = {
-- instrumentation library name
name = name,
}
options = options or {}
if options.noop then
return noop_tracer
end
options.sampling_rate = options.sampling_rate or 1.0
self.sampler = get_trace_id_based_sampler(options.sampling_rate)
self.active_span_key = name .. "_" .. "active_span"
--- Get the active span
-- Returns the root span by default
--
-- @function kong.tracing.active_span
-- @phases rewrite, access, header_filter, response, body_filter, log, admin_api
-- @treturn table span
function self.active_span()
if not VALID_TRACING_PHASES[ngx.get_phase()] then
return
end
return ngx.ctx[self.active_span_key]
end
--- Set the active span
--
-- @function kong.tracing.set_active_span
-- @phases rewrite, access, header_filter, response, body_filter, log, admin_api
-- @tparam table span
function self.set_active_span(span)
if not VALID_TRACING_PHASES[ngx.get_phase()] then
return
end
if span then
span.active = true
end
ngx.ctx[self.active_span_key] = span
end
--- Create a new Span
--
-- @function kong.tracing.start_span
-- @phases rewrite, access, header_filter, response, body_filter, log, admin_api
-- @tparam string name span name
-- @tparam table options TODO(mayo)
-- @treturn table span
function self.start_span(...)
if not VALID_TRACING_PHASES[ngx.get_phase()] then
return noop_span
end
return new_span(self, ...)
end
function self.create_span(...)
return create_span(...)
end
function self.link_span(...)
return link_span(...)
end
--- Batch process spans
-- Please note that socket is not available in the log phase, use `ngx.timer.at` instead
--
-- @function kong.tracing.process_span
-- @phases log
-- @tparam function processor a function that accecpt a span as the parameter
function self.process_span(processor, ...)
check_phase(PHASES.log)
if type(processor) ~= "function" then
error("processor must be a function", 2)
end
local ctx = ngx.ctx
if not ctx.KONG_SPANS then
return
end
for _, span in ipairs(ctx.KONG_SPANS) do
if span.tracer and span.tracer.name == self.name then
processor(span, ...)
end
end
end
--- Update the value of should_sample for all spans
--
-- @function kong.tracing:set_should_sample
-- @tparam bool should_sample value for the sample parameter
function self:set_should_sample(should_sample)
local ctx = ngx.ctx
if not ctx.KONG_SPANS then
return
end
for _, span in ipairs(ctx.KONG_SPANS) do
if span.is_recording ~= false then
span.should_sample = should_sample
end
end
end
tracer_memo[name] = setmetatable(self, tracer_mt)
return tracer_memo[name]
end
tracer_mt.new = new_tracer
noop_tracer.new = new_tracer
local global_tracer
tracer_mt.set_global_tracer = function(tracer)
if type(tracer) ~= "table" or getmetatable(tracer) ~= tracer_mt then
error("invalid tracer", 2)
end
tracer.active_span_key = "active_span"
global_tracer = tracer
-- replace kong.pdk.tracer
if kong then
kong.tracing = tracer
end
end
noop_tracer.set_global_tracer = tracer_mt.set_global_tracer
global_tracer = new_tracer("core", { noop = true })
tracer_mt.__call = function(_, ...)
return new_tracer(...)
end
setmetatable(noop_tracer, {
__call = tracer_mt.__call,
__newindex = NOOP,
})
return {
new = function()
return global_tracer
end,
}