-
Notifications
You must be signed in to change notification settings - Fork 1.6k
/
pfs.cc
7179 lines (6180 loc) · 216 KB
/
pfs.cc
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
/* Copyright (c) 2008, 2023, Oracle and/or its affiliates.
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License, version 2.0,
as published by the Free Software Foundation.
This program is also distributed with certain software (including
but not limited to OpenSSL) that is licensed under separate terms,
as designated in a particular file or component or in included license
documentation. The authors of MySQL hereby grant you an additional
permission to link the program and your derivative works with the
separately licensed software that they have included with MySQL.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License, version 2.0, for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software Foundation,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */
/**
@file storage/perfschema/pfs.cc
The performance schema implementation of all instruments.
*/
#include "my_global.h"
#include "thr_lock.h"
/* Make sure exported prototypes match the implementation. */
#include "pfs_file_provider.h"
#include "pfs_idle_provider.h"
#include "pfs_memory_provider.h"
#include "pfs_metadata_provider.h"
#include "pfs_socket_provider.h"
#include "pfs_stage_provider.h"
#include "pfs_statement_provider.h"
#include "pfs_table_provider.h"
#include "pfs_thread_provider.h"
#include "pfs_transaction_provider.h"
#include "mysql/psi/psi.h"
#include "mysql/psi/mysql_thread.h"
#include "my_pthread.h"
#include "sql_const.h"
#include "pfs.h"
#include "pfs_instr_class.h"
#include "pfs_instr.h"
#include "pfs_host.h"
#include "pfs_user.h"
#include "pfs_account.h"
#include "pfs_global.h"
#include "pfs_column_values.h"
#include "pfs_timer.h"
#include "pfs_events_waits.h"
#include "pfs_events_stages.h"
#include "pfs_events_statements.h"
#include "pfs_events_transactions.h"
#include "pfs_setup_actor.h"
#include "pfs_setup_object.h"
#include "sql_error.h"
#include "sp_head.h"
#include "mdl.h" /* mdl_key_init */
#include "pfs_digest.h"
#include "pfs_program.h"
#include "pfs_prepared_stmt.h"
using std::min;
/*
This is a development tool to investigate memory statistics,
do not use in production.
*/
#undef PFS_PARANOID
#ifdef PFS_PARANOID
/**
  Report a memory accounting mismatch.
  This helper is only compiled when PFS_PARANOID is defined (development
  builds investigating memory statistics).
  It prints a diagnostic through pfs_print_error() naming the thread
  performing the operation and the thread the memory is attributed to,
  then asserts that the operating thread does not belong to one of the
  "thread/sql/event_worker", "thread/sql/event_scheduler" or
  "thread/sql/one_connection" classes.
  @param api_name name printed first in the diagnostic
  @param new_thread thread reported as "not owner" of the memory
  @param size number of bytes reported
  @param klass memory class the bytes are reported under
  @param old_thread thread reported as the allocator
*/
static void report_memory_accounting_error(
  const char *api_name,
  PFS_thread *new_thread,
  size_t size,
  PFS_memory_class *klass,
  PFS_thread *old_thread)
{
  /*
    Arguments of a variadic call must match their conversion specifier.
    size is a size_t, so printing it with "%d" is undefined behavior and
    truncates on 64 bit platforms: widen it explicitly and use "%llu".
    The thread ids are widened the same way, so the conversion is correct
    whatever integer width m_thread_internal_id is declared with.
    NOTE(review): the declaration of m_thread_internal_id is not visible
    from this file, confirm it is an unsigned integer type.
  */
  pfs_print_error("%s "
                  "thread <%llu> of class <%s> "
                  "not owner of <%llu> bytes in class <%s> "
                  "allocated by thread <%llu> of class <%s>\n",
                  api_name,
                  static_cast<unsigned long long>(new_thread->m_thread_internal_id),
                  new_thread->m_class->m_name,
                  static_cast<unsigned long long>(size),
                  klass->m_name,
                  static_cast<unsigned long long>(old_thread->m_thread_internal_id),
                  old_thread->m_class->m_name);

  /* Stop in debug builds when the mismatch involves these thread classes. */
  assert(strcmp(new_thread->m_class->m_name, "thread/sql/event_worker") != 0);
  assert(strcmp(new_thread->m_class->m_name, "thread/sql/event_scheduler") != 0);
  assert(strcmp(new_thread->m_class->m_name, "thread/sql/one_connection") != 0);
}
#endif /* PFS_PARANOID */
/**
@page PAGE_PERFORMANCE_SCHEMA The Performance Schema main page
MySQL PERFORMANCE_SCHEMA implementation.
@section INTRO Introduction
The PERFORMANCE_SCHEMA is a way to introspect the internal execution of
the server at runtime.
The performance schema focuses primarily on performance data,
as opposed to the INFORMATION_SCHEMA whose purpose is to inspect metadata.
From a user point of view, the performance schema consists of:
- a dedicated database schema, named PERFORMANCE_SCHEMA,
- SQL tables, used to query the server internal state or change
configuration settings.
From an implementation point of view, the performance schema is a dedicated
Storage Engine which exposes data collected by 'Instrumentation Points'
placed in the server code.
@section INTERFACES Multiple interfaces
The performance schema exposes many different interfaces,
for different components, and for different purposes.
@subsection INT_INSTRUMENTING Instrumenting interface
All the data representing the server internal state exposed
in the performance schema must be first collected:
this is the role of the instrumenting interface.
The instrumenting interface is a coding interface provided
by implementors (of the performance schema) to implementors
(of the server or server components).
This interface is available to:
- C implementations
- C++ implementations
- the core SQL layer (/sql)
- the mysys library (/mysys)
- MySQL plugins, including storage engines,
- third party plugins, including third party storage engines.
For details, see the @ref PAGE_INSTRUMENTATION_INTERFACE
"instrumentation interface page".
@subsection INT_COMPILING Compiling interface
The implementation of the performance schema can be enabled or disabled at
build time, when building MySQL from the source code.
When building with the performance schema code, some compilation flags
are available to change the default values used in the code, if required.
For more details, see:
@verbatim ./configure --help @endverbatim
To compile with the performance schema:
@verbatim ./configure --with-perfschema @endverbatim
The implementation of all the compiling options is located in
@verbatim ./storage/perfschema/plug.in @endverbatim
@subsection INT_STARTUP Server startup interface
The server startup interface consists of the "./mysqld ..."
command line used to start the server.
When the performance schema is compiled in the server binary,
extra command line options are available.
These extra start options allow the DBA to:
- enable or disable the performance schema
- specify some sizing parameters.
To see help for the performance schema startup options, see:
@verbatim ./sql/mysqld --verbose --help @endverbatim
The implementation of all the startup options is located in
@verbatim ./sql/mysqld.cc, my_long_options[] @endverbatim
@subsection INT_BOOTSTRAP Server bootstrap interface
The bootstrap interface is a private interface exposed by
the performance schema, and used by the SQL layer.
Its role is to advertise all the SQL tables natively
supported by the performance schema to the SQL server.
The code consists of creating MySQL tables for the
performance schema itself, and is used in './mysql --bootstrap'
mode when a server is installed.
The implementation of the database creation script is located in
@verbatim ./scripts/mysql_performance_tables.sql @endverbatim
@subsection INT_CONFIG Runtime configuration interface
When the performance schema is used at runtime, various configuration
parameters can be used to specify what kind of data is collected,
what kind of aggregations are computed, what kind of timers are used,
what events are timed, etc.
For all these capabilities, not a single statement or special syntax
was introduced in the parser.
Instead of new SQL statements, the interface consists of DML
(SELECT, INSERT, UPDATE, DELETE) against special "SETUP" tables.
For example:
@verbatim mysql> update performance_schema.SETUP_INSTRUMENTS
set ENABLED='YES', TIMED='YES';
Query OK, 234 rows affected (0.00 sec)
Rows matched: 234 Changed: 234 Warnings: 0 @endverbatim
@subsection INT_STATUS Internal audit interface
The internal audit interface is provided to the DBA to inspect if the
performance schema code itself is functioning properly.
This interface is necessary because a failure caused while
instrumenting code in the server should not cause failures in the
MySQL server itself, so that the performance schema implementation
never raises errors during runtime execution.
This auditing interface consists of:
@verbatim SHOW ENGINE PERFORMANCE_SCHEMA STATUS; @endverbatim
It displays data related to the memory usage of the performance schema,
as well as statistics about lost events, if any.
The SHOW STATUS command is implemented in
@verbatim ./storage/perfschema/pfs_engine_table.cc @endverbatim
@subsection INT_QUERY Query interface
The query interface is used to query the internal state of a running server.
It is provided as SQL tables.
For example:
@verbatim mysql> select * from performance_schema.EVENTS_WAITS_CURRENT;
@endverbatim
@section DESIGN_PRINCIPLES Design principles
@subsection PRINCIPLE_BEHAVIOR No behavior changes
The primary goal of the performance schema is to measure (instrument) the
execution of the server. A good measure should not cause any change
in behavior.
To achieve this, the overall design of the performance schema complies
with the following very severe design constraints:
The parser is unchanged. There are no new keywords, no new statements.
This guarantees that existing applications will run the same way with or
without the performance schema.
All the instrumentation points return "void", there are no error codes.
Even if the performance schema internally fails, execution of the server
code will proceed.
None of the instrumentation points allocate memory.
All the memory used by the performance schema is pre-allocated at startup,
and is considered "static" during the server life time.
None of the instrumentation points use any pthread_mutex, pthread_rwlock,
or pthread_cond (or platform equivalents).
Executing the instrumentation point should not cause thread scheduling to
change in the server.
In other words, the implementation of the instrumentation points,
including all the code called by the instrumentation points, is:
- malloc free
- mutex free
- rwlock free
TODO: All the code located in storage/perfschema is malloc free,
but unfortunately the usage of LF_HASH introduces some memory allocation.
This should be revised if possible, to use a lock-free,
malloc-free hash code table.
@subsection PRINCIPLE_PERFORMANCE No performance hit
The instrumentation of the server should be as fast as possible.
In cases when there are choices between:
- doing some processing when recording the performance data
in the instrumentation,
- doing some processing when retrieving the performance data,
priority is given in the design to make the instrumentation faster,
pushing some complexity to data retrieval.
As a result, some parts of the design, related to:
- the setup code path,
- the query code path,
might appear to be sub-optimal.
The criterion used here is to optimize primarily the critical path (data
collection), possibly at the expense of non-critical code paths.
@subsection PRINCIPLE_NOT_INTRUSIVE Unintrusive instrumentation
For the performance schema in general to be successful, the barrier
of entry for a developer should be low, so it's easy to instrument code.
In particular, the instrumentation interface:
- is available for C and C++ code (so it's a C interface),
- does not require parameters that the calling code can't easily provide,
- supports partial instrumentation (for example, instrumenting mutexes does
not require that every mutex is instrumented)
@subsection PRINCIPLE_EXTENDABLE Extendable instrumentation
As the content of the performance schema improves,
with more tables exposed and more data collected,
the instrumentation interface will also be augmented
to support instrumenting new concepts.
Existing instrumentations should not be affected when additional
instrumentation is made available, and making a new instrumentation
available should not require existing instrumented code to support it.
@subsection PRINCIPLE_VERSIONED Versioned instrumentation
Given that the instrumentation offered by the performance schema will
be augmented with time, when more features are implemented,
the interface itself should be versioned, to keep compatibility
with previous instrumented code.
For example, after both plugin-A and plugin-B have been instrumented for
mutexes, read write locks and conditions, using the instrumentation
interface, we can anticipate that the instrumentation interface
is expanded to support file based operations.
Plugin-A, a file based storage engine, will most likely use the expanded
interface and instrument its file usage, using the version 2
interface, while Plugin-B, a network based storage engine, will not change
its code and not release a new binary.
When later the instrumentation interface is expanded to support network
based operations (which will define interface version 3), the Plugin-B code
can then be changed to make use of it.
Note, this is just an example to illustrate the design concept here.
Both mutexes and file instrumentation are already available
since version 1 of the instrumentation interface.
@subsection PRINCIPLE_DEPLOYMENT Easy deployment
Internally, we might want every plugin implementation to upgrade the
instrumented code to the latest available, but this will cause additional
work and this is not practical if the code change is monolithic.
Externally, for third party plugin implementors, asking implementors to
always stay aligned to the latest instrumentation and make new releases,
even when the change does not provide new functionality for them,
is a bad idea.
For example, requiring a network based engine to re-release because the
instrumentation interface changed for file based operations, will create
too many deployment issues.
So, the performance schema implementation must support concurrently,
in the same deployment, multiple versions of the instrumentation
interface, and ensure binary compatibility with each version.
In addition to this, the performance schema can be included or excluded
from the server binary, using build time configuration options.
Regardless, the following types of deployment are valid:
- a server supporting the performance schema + a storage engine
that is not instrumented
- a server not supporting the performance schema + a storage engine
that is instrumented
*/
/**
@page PAGE_INSTRUMENTATION_INTERFACE Performance schema: instrumentation interface page.
MySQL performance schema instrumentation interface.
@section INTRO Introduction
The instrumentation interface consists of two layers:
- a raw ABI (Application Binary Interface) layer, that exposes the primitive
instrumentation functions exported by the performance schema instrumentation
- an API (Application Programming Interface) layer,
that provides many helpers for a developer instrumenting some code,
to make the instrumentation as easy as possible.
The ABI layer consists of:
@code
#include "mysql/psi/psi.h"
@endcode
The API layer consists of:
@code
#include "mysql/psi/mysql_thread.h"
#include "mysql/psi/mysql_file.h"
@endcode
The first helper is for mutexes, rwlocks and conditions,
the second for file io.
The API layer exposes C macros and typedefs which will expand:
- either to non-instrumented code, when compiled without the performance
schema instrumentation
- or to instrumented code, that will issue the raw calls to the ABI layer
so that the implementation can collect data.
Note that all the names introduced (for example, @c mysql_mutex_lock) do not
collide with any other namespace.
In particular, the macro @c mysql_mutex_lock is on purpose not named
@c pthread_mutex_lock.
This is to:
- avoid overloading @c pthread_mutex_lock with yet another macro,
which is dangerous as it can affect user code and pollute
the end-user namespace.
- allow the developer instrumenting code to selectively instrument
some code but not all.
@section PRINCIPLES Design principles
The ABI part is designed as a facade, that exposes basic primitives.
The expectation is that each primitive will be very stable over time,
but the list will constantly grow when more instruments are supported.
To support binary compatibility with plugins compiled with a different
version of the instrumentation, the ABI itself is versioned
(see @c PSI_v1, @c PSI_v2).
For a given instrumentation point in the API, the basic coding pattern
used is:
- (a) notify the performance schema of the operation
about to be performed.
- (b) execute the instrumented code.
- (c) notify the performance schema that the operation
is completed.
An opaque "locker" pointer is returned by (a), that is given to (c).
This pointer helps the implementation to keep context, for performance.
The following code fragment is annotated to show in detail how this pattern
is implemented, when the instrumentation is compiled in:
@verbatim
static inline int mysql_mutex_lock(
mysql_mutex_t *that, myf flags, const char *src_file, uint src_line)
{
int result;
struct PSI_mutex_locker_state state;
struct PSI_mutex_locker *locker= NULL;
............... (a)
locker= PSI_MUTEX_CALL(start_mutex_wait)(&state, that->p_psi, PSI_MUTEX_LOCK,
locker, src_file, src_line);
............... (b)
result= pthread_mutex_lock(&that->m_mutex);
............... (c)
PSI_MUTEX_CALL(end_mutex_wait)(locker, result);
return result;
}
@endverbatim
When the performance schema instrumentation is not compiled in,
the code becomes simply a wrapper, expanded in line by the compiler:
@verbatim
static inline int mysql_mutex_lock(...)
{
int result;
............... (b)
result= pthread_mutex_lock(&that->m_mutex);
return result;
}
@endverbatim
When the performance schema instrumentation is compiled in,
and when the code compiled is internal to the server implementation,
PSI_MUTEX_CALL expands directly to function calls in the performance schema,
to make (a) and (c) calls as efficient as possible.
@verbatim
static inline int mysql_mutex_lock(...)
{
int result;
struct PSI_mutex_locker_state state;
struct PSI_mutex_locker *locker= NULL;
............... (a)
locker= pfs_start_mutex_wait_v1(&state, that->p_psi, PSI_MUTEX_LOCK,
locker, src_file, src_line);
............... (b)
result= pthread_mutex_lock(&that->m_mutex);
............... (c)
pfs_end_mutex_wait_v1(locker, result);
return result;
}
@endverbatim
When the performance schema instrumentation is compiled in,
and when the code compiled is external to the server implementation
(typically, a dynamic plugin),
PSI_MUTEX_CALL expands to dynamic calls to the underlying implementation,
using the PSI_server entry point.
This makes (a) and (c) slower, as a function pointer is used instead of a static call,
but also independent of the implementation, for binary compatibility.
@verbatim
static inline int mysql_mutex_lock(...)
{
int result;
struct PSI_mutex_locker_state state;
struct PSI_mutex_locker *locker= NULL;
............... (a)
locker= PSI_server->start_mutex_wait(&state, that->p_psi, PSI_MUTEX_LOCK,
locker, src_file, src_line);
............... (b)
result= pthread_mutex_lock(&that->m_mutex);
............... (c)
PSI_server->end_mutex_wait(locker, result);
return result;
}
@endverbatim
*/
/**
@page PAGE_AGGREGATES Performance schema: the aggregates page.
Performance schema aggregates.
@section INTRO Introduction
Aggregates tables are tables that can be formally defined as
SELECT ... from EVENTS_WAITS_HISTORY_INFINITE ... group by 'group clause'.
Each group clause defines a different kind of aggregate, and corresponds to
a different table exposed by the performance schema.
Aggregates can be either:
- computed on the fly,
- computed on demand, based on other available data.
'EVENTS_WAITS_HISTORY_INFINITE' is a table that does not exist,
the best approximation is EVENTS_WAITS_HISTORY_LONG.
Aggregates computed on the fly in fact are based on EVENTS_WAITS_CURRENT,
while aggregates computed on demand are based on other
EVENTS_WAITS_SUMMARY_BY_xxx tables.
To better understand the implementation itself, a bit of math is
required first, to understand the model behind the code:
the code is deceptively simple, the real complexity resides
in the flyweight of pointers between various performance schema buffers.
@section DIMENSION Concept of dimension
An event measured by the instrumentation has many attributes.
An event is represented as a data point P(x1, x2, ..., xN),
where each x_i coordinate represents a given attribute value.
Examples of attributes are:
- the time waited
- the object waited on
- the instrument waited on
- the thread that waited
- the operation performed
- per object or per operation additional attributes, such as spins,
number of bytes, etc.
Computing an aggregate per thread is fundamentally different from
computing an aggregate by instrument, so the "_BY_THREAD" and
"_BY_EVENT_NAME" aggregates are different dimensions,
operating on different x_i and x_j coordinates.
These aggregates are "orthogonal".
@section PROJECTION Concept of projection
A given x_i attribute value can convey either just one basic information,
such as a number of bytes, or can convey implied information,
such as an object fully qualified name.
For example, from the value "test.t1", the name of the object schema
"test" can be separated from the object name "t1", so that now aggregates
by object schema can be implemented.
In math terms, that corresponds to defining a function:
F_i (x): x --> y
Applying this function to our point P gives another point P':
F_i (P):
P(x1, x2, ..., x{i-1}, x_i, x{i+1}, ..., x_N)
--> P' (x1, x2, ..., x{i-1}, f_i(x_i), x{i+1}, ..., x_N)
That function defines in fact an aggregate !
In SQL terms, this aggregate would look like the following table:
@verbatim
CREATE VIEW EVENTS_WAITS_SUMMARY_BY_Func_i AS
SELECT col_1, col_2, ..., col_{i-1},
Func_i(col_i),
COUNT(col_i),
MIN(col_i), AVG(col_i), MAX(col_i), -- if col_i is a numeric value
col_{i+1}, ..., col_N
FROM EVENTS_WAITS_HISTORY_INFINITE
group by col_1, col_2, ..., col_{i-1}, col{i+1}, ..., col_N.
@endverbatim
Note that not all columns have to be included,
in particular some columns that are dependent on the x_i column should
be removed, so that in practice, MySQL's aggregation method tends to
remove many attributes at each aggregation step.
For example, when aggregating wait events by object instances,
- the wait_time and number_of_bytes can be summed,
and sum(wait_time) now becomes an object instance attribute.
- the source, timer_start, timer_end columns are not in the
_BY_INSTANCE table, because these attributes are only
meaningful for a wait.
@section COMPOSITION Concept of composition
Now, the "test.t1" --> "test" example was purely theory,
just to explain the concept, and does not lead very far.
Let's look at a more interesting example of data that can be derived
from the row event.
An event creates a transient object, PFS_wait_locker, per operation.
This object's life cycle is extremely short: it's created just
before the start_wait() instrumentation call, and is destroyed in
the end_wait() call.
The wait locker itself contains a pointer to the object instance
waited on.
That allows to implement a wait_locker --> object instance projection,
with m_target.
The object instance life cycle depends on _init and _destroy calls
from the code, such as mysql_mutex_init()
and mysql_mutex_destroy() for a mutex.
The object instance waited on contains a pointer to the object class,
which is represented by the instrument name.
That allows to implement an object instance --> object class projection.
The object class life cycle is permanent, as instruments are loaded in
the server and never removed.
The object class is named in such a way
(for example, "wait/sync/mutex/sql/LOCK_open",
"wait/io/file/maria/data_file") that the component ("sql", "maria")
that it belongs to can be inferred.
That allows to implement an object class --> server component projection.
Back to math again, we have, for example for mutexes:
F1 (l) : PFS_wait_locker l --> PFS_mutex m = l->m_target.m_mutex
F1_to_2 (m) : PFS_mutex m --> PFS_mutex_class i = m->m_class
F2_to_3 (i) : PFS_mutex_class i --> const char *component =
substring(i->m_name, ...)
Per components aggregates are not implemented, this is just an illustration.
F1 alone defines this aggregate:
EVENTS_WAITS_HISTORY_INFINITE --> EVENTS_WAITS_SUMMARY_BY_INSTANCE
(or MUTEX_INSTANCE)
F1_to_2 alone could define this aggregate:
EVENTS_WAITS_SUMMARY_BY_INSTANCE --> EVENTS_WAITS_SUMMARY_BY_EVENT_NAME
Alternatively, using function composition, with
F2 = F1_to_2 o F1, F2 defines:
EVENTS_WAITS_HISTORY_INFINITE --> EVENTS_WAITS_SUMMARY_BY_EVENT_NAME
Likewise, F2_to_3 defines:
EVENTS_WAITS_SUMMARY_BY_EVENT_NAME --> EVENTS_WAITS_SUMMARY_BY_COMPONENT
and F3 = F2_to_3 o F1_to_2 o F1 defines:
EVENTS_WAITS_HISTORY_INFINITE --> EVENTS_WAITS_SUMMARY_BY_COMPONENT
What has all this to do with the code ?
Functions (or aggregates) such as F3 are not implemented as is.
Instead, they are decomposed into F2_to_3 o F1_to_2 o F1,
and each intermediate aggregate is stored into an internal buffer.
This allows to support every F1, F2, F3 aggregates from shared
internal buffers, where computation already performed to compute F2
is reused when computing F3.
@section OBJECT_GRAPH Object graph
In terms of object instances, or records, pointers between
different buffers define an object instance graph.
For example, assuming the following scenario:
- A mutex class "M" is instrumented, the instrument name
is "wait/sync/mutex/sql/M"
- This mutex instrument has been instantiated twice,
mutex instances are noted M-1 and M-2
- Threads T-A and T-B are locking mutex instance M-1
- Threads T-C and T-D are locking mutex instance M-2
The performance schema will record the following data:
- EVENTS_WAITS_CURRENT has 4 rows, one for each mutex locker
- EVENTS_WAITS_SUMMARY_BY_INSTANCE shows 2 rows, for M-1 and M-2
- EVENTS_WAITS_SUMMARY_BY_EVENT_NAME shows 1 row, for M
The graph of structures will look like:
@verbatim
PFS_wait_locker (T-A, M-1) ----------
|
v
PFS_mutex (M-1)
- m_wait_stat ------------
^ |
| |
PFS_wait_locker (T-B, M-1) ---------- |
v
PFS_mutex_class (M)
- m_wait_stat
PFS_wait_locker (T-C, M-2) ---------- ^
| |
v |
PFS_mutex (M-2) |
- m_wait_stat ------------
^
|
PFS_wait_locker (T-D, M-2) ----------
|| || ||
|| || ||
vv vv vv
EVENTS_WAITS_CURRENT ..._SUMMARY_BY_INSTANCE ..._SUMMARY_BY_EVENT_NAME
@endverbatim
@section ON_THE_FLY On the fly aggregates
'On the fly' aggregates are computed during the code execution.
This is necessary because the data the aggregate is based on is volatile,
and can not be kept indefinitely.
With on the fly aggregates:
- the writer thread does all the computation
- the reader thread accesses the result directly
This model is to be avoided if possible, due to the overhead
caused when instrumenting code.
@section HIGHER_LEVEL Higher level aggregates
'Higher level' aggregates are implemented on demand only.
The code executing a SELECT from the aggregate table is
collecting data from multiple internal buffers to produce the result.
With higher level aggregates:
- the reader thread does all the computation
- the writer thread has no overhead.
@section MIXED Mixed level aggregates
The 'Mixed' model is a compromise between 'On the fly' and 'Higher level'
aggregates, for internal buffers that are not permanent.
While an object is present in a buffer, the higher level model is used.
When an object is about to be destroyed, statistics are saved into
a 'parent' buffer with a longer life cycle, to follow the on the fly model.
With mixed aggregates:
- the reader thread does a lot of complex computation,
- the writer thread has minimal overhead, on destroy events.
@section IMPL_WAIT Implementation for waits aggregates
For waits, the tables that contain aggregated wait data are:
- EVENTS_WAITS_SUMMARY_BY_ACCOUNT_BY_EVENT_NAME
- EVENTS_WAITS_SUMMARY_BY_HOST_BY_EVENT_NAME
- EVENTS_WAITS_SUMMARY_BY_INSTANCE
- EVENTS_WAITS_SUMMARY_BY_THREAD_BY_EVENT_NAME
- EVENTS_WAITS_SUMMARY_BY_USER_BY_EVENT_NAME
- EVENTS_WAITS_SUMMARY_GLOBAL_BY_EVENT_NAME
- FILE_SUMMARY_BY_EVENT_NAME
- FILE_SUMMARY_BY_INSTANCE
- SOCKET_SUMMARY_BY_INSTANCE
- SOCKET_SUMMARY_BY_EVENT_NAME
- OBJECTS_SUMMARY_GLOBAL_BY_TYPE
The instrumented code that generates wait events consists of:
- mutexes (mysql_mutex_t)
- rwlocks (mysql_rwlock_t)
- conditions (mysql_cond_t)
- file io (MYSQL_FILE)
- socket io (MYSQL_SOCKET)
- table io
- table lock
- idle
The flow of data between aggregates tables varies for each instrumentation.
@subsection IMPL_WAIT_MUTEX Mutex waits
@verbatim
mutex_locker(T, M)
|
| [1]
|
|-> pfs_mutex(M) =====>> [B], [C]
| |
| | [2]
| |
| |-> pfs_mutex_class(M.class) =====>> [C]
|
|-> pfs_thread(T).event_name(M) =====>> [A], [D], [E], [F]
|
| [3]
|
3a |-> pfs_account(U, H).event_name(M) =====>> [D], [E], [F]
. |
. | [4-RESET]
. |
3b .....+-> pfs_user(U).event_name(M) =====>> [E]
. |
3c .....+-> pfs_host(H).event_name(M) =====>> [F]
@endverbatim
How to read this diagram:
- events that occur during the instrumented code execution are noted with numbers,
as in [1]. Code executed by these events has an impact on overhead.
- events that occur during TRUNCATE TABLE operations are noted with numbers,
followed by "-RESET", as in [4-RESET].
Code executed by these events has no impact on overhead,
since they are executed by independent monitoring sessions.
- events that occur when a reader extracts data from a performance schema table
are noted with letters, as in [A]. The name of the table involved,
and the method that builds a row are documented. Code executed by these events
has no impact on the instrumentation overhead. Note that the table
implementation may pull data from different buffers.
- nominal code paths are in plain lines. A "nominal" code path corresponds to
cases where the performance schema buffers are sized so that no records are lost.
- degenerate code paths are in dotted lines. A "degenerate" code path corresponds
to edge cases where parent buffers are full, which forces the code to aggregate to
grandparents directly.
Implemented as:
- [1] @c start_mutex_wait_v1(), @c end_mutex_wait_v1()
- [2] @c destroy_mutex_v1()
- [3] @c aggregate_thread_waits()
- [4] @c PFS_account::aggregate_waits()
- [A] EVENTS_WAITS_SUMMARY_BY_THREAD_BY_EVENT_NAME,
@c table_ews_by_thread_by_event_name::make_row()
- [B] EVENTS_WAITS_SUMMARY_BY_INSTANCE,
@c table_events_waits_summary_by_instance::make_mutex_row()
- [C] EVENTS_WAITS_SUMMARY_GLOBAL_BY_EVENT_NAME,
@c table_ews_global_by_event_name::make_mutex_row()
- [D] EVENTS_WAITS_SUMMARY_BY_ACCOUNT_BY_EVENT_NAME,
@c table_ews_by_account_by_event_name::make_row()
- [E] EVENTS_WAITS_SUMMARY_BY_USER_BY_EVENT_NAME,
@c table_ews_by_user_by_event_name::make_row()
- [F] EVENTS_WAITS_SUMMARY_BY_HOST_BY_EVENT_NAME,
@c table_ews_by_host_by_event_name::make_row()
Table EVENTS_WAITS_SUMMARY_BY_INSTANCE is an 'on the fly' aggregate,
because the data is collected on the fly by [1] and stored into a buffer,
pfs_mutex. The table implementation [B] simply reads the results directly
from this buffer.
Table EVENTS_WAITS_SUMMARY_GLOBAL_BY_EVENT_NAME is a 'mixed' aggregate,
because some data is collected on the fly [1],
some data is preserved with [2] at a later time in the life cycle,
and two different buffers pfs_mutex and pfs_mutex_class are used to store the
statistics collected. The table implementation [C] is more complex, since
it reads from two buffers pfs_mutex and pfs_mutex_class.
@subsection IMPL_WAIT_RWLOCK Rwlock waits
@verbatim
rwlock_locker(T, R)
|
| [1]
|
|-> pfs_rwlock(R) =====>> [B], [C]
| |
| | [2]
| |
| |-> pfs_rwlock_class(R.class) =====>> [C]
|
|-> pfs_thread(T).event_name(R) =====>> [A]
|
...
@endverbatim
Implemented as:
- [1] @c start_rwlock_rdwait_v1(), @c end_rwlock_rdwait_v1(), ...
- [2] @c destroy_rwlock_v1()
- [A] EVENTS_WAITS_SUMMARY_BY_THREAD_BY_EVENT_NAME,
@c table_ews_by_thread_by_event_name::make_row()
- [B] EVENTS_WAITS_SUMMARY_BY_INSTANCE,
@c table_events_waits_summary_by_instance::make_rwlock_row()
- [C] EVENTS_WAITS_SUMMARY_GLOBAL_BY_EVENT_NAME,
@c table_ews_global_by_event_name::make_rwlock_row()
@subsection IMPL_WAIT_COND Cond waits
@verbatim
cond_locker(T, C)
|
| [1]
|
|-> pfs_cond(C) =====>> [B], [C]
| |
| | [2]
| |
| |-> pfs_cond_class(C.class) =====>> [C]
|
|-> pfs_thread(T).event_name(C) =====>> [A]
|
...
@endverbatim
Implemented as:
- [1] @c start_cond_wait_v1(), @c end_cond_wait_v1()
- [2] @c destroy_cond_v1()
- [A] EVENTS_WAITS_SUMMARY_BY_THREAD_BY_EVENT_NAME,
@c table_ews_by_thread_by_event_name::make_row()
- [B] EVENTS_WAITS_SUMMARY_BY_INSTANCE,
@c table_events_waits_summary_by_instance::make_cond_row()
- [C] EVENTS_WAITS_SUMMARY_GLOBAL_BY_EVENT_NAME,
@c table_ews_global_by_event_name::make_cond_row()
@subsection IMPL_WAIT_FILE File waits
@verbatim
file_locker(T, F)
|
| [1]
|
|-> pfs_file(F) =====>> [B], [C], [D], [E]
| |
| | [2]
| |
| |-> pfs_file_class(F.class) =====>> [C], [D]
|
|-> pfs_thread(T).event_name(F) =====>> [A]
|
...
@endverbatim
Implemented as:
- [1] @c get_thread_file_name_locker_v1(), @c start_file_wait_v1(),
@c end_file_wait_v1(), ...
- [2] @c close_file_v1()
- [A] EVENTS_WAITS_SUMMARY_BY_THREAD_BY_EVENT_NAME,
@c table_ews_by_thread_by_event_name::make_row()
- [B] EVENTS_WAITS_SUMMARY_BY_INSTANCE,
@c table_events_waits_summary_by_instance::make_file_row()
- [C] EVENTS_WAITS_SUMMARY_GLOBAL_BY_EVENT_NAME,
@c table_ews_global_by_event_name::make_file_row()
- [D] FILE_SUMMARY_BY_EVENT_NAME,
@c table_file_summary_by_event_name::make_row()
- [E] FILE_SUMMARY_BY_INSTANCE,
@c table_file_summary_by_instance::make_row()
@subsection IMPL_WAIT_SOCKET Socket waits
@verbatim
socket_locker(T, S)
|
| [1]
|
|-> pfs_socket(S) =====>> [A], [B], [C], [D], [E]
|
| [2]
|
|-> pfs_socket_class(S.class) =====>> [C], [D]
|
|-> pfs_thread(T).event_name(S) =====>> [A]
|
| [3]
|
3a |-> pfs_account(U, H).event_name(S) =====>> [F], [G], [H]
. |
. | [4-RESET]
. |
3b .....+-> pfs_user(U).event_name(S) =====>> [G]
. |
3c .....+-> pfs_host(H).event_name(S) =====>> [H]
@endverbatim
Implemented as:
- [1] @c start_socket_wait_v1(), @c end_socket_wait_v1()
- [2] @c close_socket_v1()
- [3] @c aggregate_thread_waits()