Skip to content

Commit 042fc29

Browse files
committed
MDEV-19845: Adaptive spin loops
Starting with the Intel Skylake microarchitecture, the PAUSE instruction latency is about 140 clock cycles instead of the earlier 10. On AMD processors, the latency could be 10 or 50 clock cycles, depending on the microarchitecture. Because of this wide range of latencies, let us scale the loops around the PAUSE instruction based on timing results at server startup. my_cpu_relax_multiplier: New variable: How many times to invoke PAUSE in a loop. Only defined for IA-32 and AMD64. my_cpu_init(): Determine with RDTSC the time to run 16 PAUSE instructions in each of two unrolled loops, and, based on the quicker of the two runs, initialize my_cpu_relax_multiplier. This form of calibration was suggested by Mikhail Sinyavin from Intel. LF_BACKOFF(), ut_delay(): Use my_cpu_relax_multiplier when available. ut_delay(): Define inline in my_cpu.h. UT_COMPILER_BARRIER(): Remove. This does not seem to have any effect, because in our ut_delay() implementation, no computations are being performed inside the loop. The purpose of UT_COMPILER_BARRIER() was to prohibit the compiler from reordering computations. It was not emitting any code.
1 parent 620f4f8 commit 042fc29

File tree

9 files changed

+128
-83
lines changed

9 files changed

+128
-83
lines changed

config.h.cmake

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -187,8 +187,6 @@
187187
#cmakedefine HAVE_LINUX_FALLOC_H 1
188188
#cmakedefine HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE 1
189189
#cmakedefine HAVE_PREAD 1
190-
#cmakedefine HAVE_PAUSE_INSTRUCTION 1
191-
#cmakedefine HAVE_FAKE_PAUSE_INSTRUCTION 1
192190
#cmakedefine HAVE_RDTSCLL 1
193191
#cmakedefine HAVE_READ_REAL_TIME 1
194192
#cmakedefine HAVE_PTHREAD_ATTR_CREATE 1

configure.cmake

Lines changed: 0 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -758,32 +758,6 @@ IF(NOT C_HAS_inline)
758758
ENDIF()
759759
ENDIF()
760760

761-
IF(NOT CMAKE_CROSSCOMPILING AND NOT MSVC)
762-
STRING(TOLOWER ${CMAKE_SYSTEM_PROCESSOR} processor)
763-
IF(processor MATCHES "86" OR processor MATCHES "amd64" OR processor MATCHES "x64")
764-
#Check for x86 PAUSE instruction
765-
# We have to actually try running the test program, because of a bug
766-
# in Solaris on x86_64, where it wrongly reports that PAUSE is not
767-
# supported when trying to run an application. See
768-
# http://bugs.opensolaris.org/bugdatabase/printableBug.do?bug_id=6478684
769-
CHECK_C_SOURCE_RUNS("
770-
int main()
771-
{
772-
__asm__ __volatile__ (\"pause\");
773-
return 0;
774-
}" HAVE_PAUSE_INSTRUCTION)
775-
ENDIF()
776-
IF (NOT HAVE_PAUSE_INSTRUCTION)
777-
CHECK_C_SOURCE_COMPILES("
778-
int main()
779-
{
780-
__asm__ __volatile__ (\"rep; nop\");
781-
return 0;
782-
}
783-
" HAVE_FAKE_PAUSE_INSTRUCTION)
784-
ENDIF()
785-
ENDIF()
786-
787761
CHECK_SYMBOL_EXISTS(tcgetattr "termios.h" HAVE_TCGETATTR 1)
788762

789763
#

include/my_cpu.h

Lines changed: 41 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -46,10 +46,20 @@
4646
#define HMT_high()
4747
#endif
4848

49+
#if defined __i386__ || defined __x86_64__ || defined _WIN32
50+
# define HAVE_PAUSE_INSTRUCTION /* added in Intel Pentium 4 */
51+
#endif
4952

5053
static inline void MY_RELAX_CPU(void)
5154
{
52-
#ifdef HAVE_PAUSE_INSTRUCTION
55+
#ifdef _WIN32
56+
/*
57+
In the Win32 API, the x86 PAUSE instruction is executed by calling
58+
the YieldProcessor macro defined in WinNT.h. It is a CPU architecture-
59+
independent way by using YieldProcessor.
60+
*/
61+
YieldProcessor();
62+
#elif defined HAVE_PAUSE_INSTRUCTION
5363
/*
5464
According to the gcc info page, asm volatile means that the
5565
instruction has important side-effects and must not be removed.
@@ -61,16 +71,6 @@ static inline void MY_RELAX_CPU(void)
6171
#else
6272
__asm__ __volatile__ ("pause");
6373
#endif
64-
65-
#elif defined(HAVE_FAKE_PAUSE_INSTRUCTION)
66-
__asm__ __volatile__ ("rep; nop");
67-
#elif defined _WIN32
68-
/*
69-
In the Win32 API, the x86 PAUSE instruction is executed by calling
70-
the YieldProcessor macro defined in WinNT.h. It is a CPU architecture-
71-
independent way by using YieldProcessor.
72-
*/
73-
YieldProcessor();
7474
#elif defined(_ARCH_PWR8)
7575
__ppc_get_timebase();
7676
#else
@@ -81,6 +81,20 @@ static inline void MY_RELAX_CPU(void)
8181
}
8282

8383

84+
#ifdef HAVE_PAUSE_INSTRUCTION
85+
# ifdef __cplusplus
86+
extern "C" {
87+
# endif
88+
extern unsigned my_cpu_relax_multiplier;
89+
void my_cpu_init(void);
90+
# ifdef __cplusplus
91+
}
92+
# endif
93+
#else
94+
# define my_cpu_relax_multiplier 200
95+
# define my_cpu_init() /* nothing */
96+
#endif
97+
8498
/*
8599
LF_BACKOFF should be used to improve performance on hyperthreaded CPUs. Intel
86100
recommends to use it in spin loops also on non-HT machines to reduce power
@@ -94,9 +108,23 @@ static inline void MY_RELAX_CPU(void)
94108

95109
static inline int LF_BACKOFF(void)
96110
{
97-
int i;
98-
for (i= 0; i < 200; i++)
111+
unsigned i= my_cpu_relax_multiplier;
112+
while (i--)
99113
MY_RELAX_CPU();
100114
return 1;
101115
}
116+
117+
/**
118+
Run a delay loop while waiting for a shared resource to be released.
119+
@param delay originally, roughly microseconds on 100 MHz Intel Pentium
120+
*/
121+
static inline void ut_delay(unsigned delay)
122+
{
123+
unsigned i= my_cpu_relax_multiplier / 4 * delay;
124+
HMT_low();
125+
while (i--)
126+
MY_RELAX_CPU();
127+
HMT_medium();
128+
}
129+
102130
#endif

mysys/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@ SET(MYSYS_SOURCES array.c charset-def.c charset.c checksum.c my_default.c
4444
my_getncpus.c my_safehash.c my_chmod.c my_rnd.c
4545
my_uuid.c wqueue.c waiting_threads.c ma_dyncol.c ../sql-common/my_time.c
4646
my_rdtsc.c my_context.c psi_noop.c
47-
my_atomic_writes.c my_likely.c
47+
my_atomic_writes.c my_cpu.c my_likely.c
4848
file_logger.c my_dlerror.c)
4949

5050
IF (WIN32)

mysys/my_cpu.c

Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
1+
/* Copyright (c) 2019, MariaDB Corporation.
2+
3+
This program is free software; you can redistribute it and/or modify
4+
it under the terms of the GNU General Public License as published by
5+
the Free Software Foundation; version 2 of the License.
6+
7+
This program is distributed in the hope that it will be useful,
8+
but WITHOUT ANY WARRANTY; without even the implied warranty of
9+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10+
GNU General Public License for more details.
11+
12+
You should have received a copy of the GNU General Public License
13+
along with this program; if not, write to the Free Software
14+
Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */
15+
16+
#include <my_global.h>
17+
#include <my_cpu.h>
18+
19+
#ifdef HAVE_PAUSE_INSTRUCTION
20+
/** How many times to invoke PAUSE in a loop */
21+
unsigned my_cpu_relax_multiplier = 200;
22+
23+
# include <stdint.h>
24+
25+
# ifdef _MSC_VER
26+
# include <intrin.h>
27+
# else
28+
# include <x86intrin.h>
29+
# endif
30+
31+
#define PAUSE4 MY_RELAX_CPU(); MY_RELAX_CPU(); MY_RELAX_CPU(); MY_RELAX_CPU()
32+
#define PAUSE16 PAUSE4; PAUSE4; PAUSE4; PAUSE4
33+
34+
/**
35+
Initialize my_cpu_relax_multiplier.
36+
37+
Determine the duration of a PAUSE instruction by running an
38+
unrolled loop of 16 PAUSE instructions twice, and taking the
39+
faster of the two runs. In this way, even if the execution is
40+
interrupted by the operating system, it should be extremely
41+
unlikely that both loops get interrupted.
42+
43+
On the Intel Skylake microarchitecture, the PAUSE instruction takes
44+
around 140 clock cycles, while on earlier microarchitectures it could
45+
be 10 clock cycles or less. Scale the PAUSE loop counter accordingly.
46+
47+
On a pre-Skylake Intel Xeon CPU E5-2630 v4 @ 2.20GHz running an AMD64
48+
executable, the numbers would be between 172 and 220 when all the code
49+
is inlined as follows:
50+
51+
rdtsc,mov,shl,or, 16*pause,
52+
rdtsc,mov,shl,or, 16*pause,
53+
rdtsc.
54+
55+
That would yield 11 to 14 cycles per PAUSE instruction even if we
56+
(wrongly) ignore the overhead of the other instructions.
57+
58+
On a Skylake mobile processor Intel Core i7-6500U CPU @ 2.50GHz, the
59+
numbers would range from 1896 to 2410 (or 1976 if taking the minimum
60+
of two runs), yielding 118 to 151 (or 123) cycles per PAUSE instruction.
61+
62+
Let us define a threshold at roughly 30 cycles per PAUSE instruction,
63+
and use a shorter delay if the PAUSE instruction takes longer than
64+
that. In some AMD processors, the PAUSE instruction could take 40 or
65+
50 cycles. Let us use a shorter delay multiplier for them as well.
66+
67+
The 1/10 scaling factor (200/20) was derived experimentally by
68+
Mikhail Sinyavin from Intel.
69+
*/
70+
void my_cpu_init(void)
71+
{
72+
uint64_t t0, t1, t2;
73+
t0= __rdtsc();
74+
PAUSE16;
75+
t1= __rdtsc();
76+
PAUSE16;
77+
t2= __rdtsc();
78+
if (t2 - t1 > 30 * 16 && t1 - t0 > 30 * 16)
79+
my_cpu_relax_multiplier= 20;
80+
}
81+
#endif

sql/mysqld.cc

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5113,6 +5113,7 @@ static int init_server_components()
51135113
We need to call each of these following functions to ensure that
51145114
all things are initialized so that unireg_abort() doesn't fail
51155115
*/
5116+
my_cpu_init();
51165117
mdl_init();
51175118
if (tdc_init() || hostname_cache_init())
51185119
unireg_abort(1);

storage/innobase/include/ib0mutex.h

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
/*****************************************************************************
22
33
Copyright (c) 2013, 2015, Oracle and/or its affiliates. All Rights Reserved.
4-
Copyright (c) 2017, MariaDB Corporation. All Rights Reserved.
4+
Copyright (c) 2017, 2019, MariaDB Corporation.
55
66
This program is free software; you can redistribute it and/or modify it under
77
the terms of the GNU General Public License as published by the Free Software
@@ -29,8 +29,7 @@ Created 2013-03-26 Sunny Bains.
2929
#ifndef ib0mutex_h
3030
#define ib0mutex_h
3131

32-
#include "ut0ut.h"
33-
#include "ut0rnd.h"
32+
#include "my_cpu.h"
3433
#include "os0event.h"
3534
#include "sync0arr.h"
3635

storage/innobase/include/ut0ut.h

Lines changed: 1 addition & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -54,14 +54,6 @@ Created 1/20/1994 Heikki Tuuri
5454
/** Time stamp */
5555
typedef time_t ib_time_t;
5656

57-
#if defined (__GNUC__)
58-
# define UT_COMPILER_BARRIER() __asm__ __volatile__ ("":::"memory")
59-
#elif defined (_MSC_VER)
60-
# define UT_COMPILER_BARRIER() _ReadWriteBarrier()
61-
#else
62-
# define UT_COMPILER_BARRIER()
63-
#endif
64-
6557
/*********************************************************************//**
6658
Delays execution for at most max_wait_us microseconds or returns earlier
6759
if cond becomes true.
@@ -270,14 +262,7 @@ void
270262
ut_sprintf_timestamp(
271263
/*=================*/
272264
char* buf); /*!< in: buffer where to sprintf */
273-
/*************************************************************//**
274-
Runs an idle loop on CPU. The argument gives the desired delay
275-
in microseconds on 100 MHz Pentium + Visual C++.
276-
@return dummy value */
277-
void
278-
ut_delay(
279-
/*=====*/
280-
ulint delay); /*!< in: delay in microseconds on 100 MHz Pentium */
265+
281266
/*************************************************************//**
282267
Prints the contents of a memory buffer in hex and ascii. */
283268
void

storage/innobase/ut/ut0ut.cc

Lines changed: 1 addition & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
/*****************************************************************************
22
33
Copyright (c) 1994, 2017, Oracle and/or its affiliates. All Rights Reserved.
4-
Copyright (c) 2017, MariaDB Corporation.
4+
Copyright (c) 2017, 2019, MariaDB Corporation.
55
66
This program is free software; you can redistribute it and/or modify it under
77
the terms of the GNU General Public License as published by the Free Software
@@ -283,27 +283,6 @@ ut_sprintf_timestamp(
283283
#endif
284284
}
285285

286-
/*************************************************************//**
287-
Runs an idle loop on CPU. The argument gives the desired delay
288-
in microseconds on 100 MHz Pentium + Visual C++.
289-
@return dummy value */
290-
void
291-
ut_delay(
292-
/*=====*/
293-
ulint delay) /*!< in: delay in microseconds on 100 MHz Pentium */
294-
{
295-
ulint i;
296-
297-
HMT_low();
298-
299-
for (i = 0; i < delay * 50; i++) {
300-
MY_RELAX_CPU();
301-
UT_COMPILER_BARRIER();
302-
}
303-
304-
HMT_medium();
305-
}
306-
307286
/*************************************************************//**
308287
Prints the contents of a memory buffer in hex and ascii. */
309288
void

0 commit comments

Comments
 (0)