Skip to content

Commit cf2480d

Browse files
committed
MDEV-21452: Retain the watchdog only on dict_sys.mutex, for performance
Most hangs seem to involve dict_sys.mutex. While holding lock_sys.mutex we rarely acquire any buffer pool page latches, which are a frequent source of potential hangs.
1 parent ff5d306 commit cf2480d

File tree

6 files changed

+30
-148
lines changed

6 files changed

+30
-148
lines changed
Lines changed: 11 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1,25 +1,14 @@
11
connect con1,localhost,root,,;
2+
CREATE TABLE t1 (a INT PRIMARY KEY) ENGINE=InnoDB;
3+
SET DEBUG_SYNC='create_table SIGNAL stuck WAIT_FOR ever';
4+
CREATE TABLE t2 (a INT PRIMARY KEY) ENGINE=InnoDB;
25
connect con2,localhost,root,,;
3-
drop table if exists t1;
4-
connection con1;
5-
create table t1 (id integer, x integer) engine = InnoDB;
6-
insert into t1 values(0, 0);
7-
set DEBUG_DBUG='+d,fatal-semaphore-timeout';
8-
set autocommit=0;
9-
# Sending query on con1,
10-
# the session will hold lock table mutex and sleep
11-
SELECT * from t1 where id = 0 FOR UPDATE;
12-
connection con2;
13-
set autocommit=0;
14-
# Sending query on con2,
15-
# the session will be blocked on the lock table mutex and
16-
# thus be put into sync arry
17-
SELECT * from t1 where id = 0 FOR UPDATE;
6+
SET DEBUG_SYNC='now WAIT_FOR stuck';
7+
FLUSH TABLES;
8+
SELECT * FROM t1;
189
connection default;
19-
# Waitting for mysqld to crash
20-
# Mysqld crash was detected
21-
# Waitting for reconnect after mysqld restarts
22-
# Reconnected after mysqld was successfully restarted
23-
# Cleaning up before exit
24-
drop table if exists t1;
25-
# Clean exit
10+
# Waiting for mariadbd to crash
11+
# Crash was detected
12+
DROP TABLE t1, t2;
13+
Warnings:
14+
Warning 1932 Table 'test.t2' doesn't exist in engine
Lines changed: 15 additions & 62 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
1-
# Only test in debug mode since DBUG_EXECUTE_IF is used
21
--source include/have_debug.inc
2+
--source include/have_debug_sync.inc
33

44
# Can't test this with embedded server
55
--source include/not_embedded.inc
@@ -11,56 +11,17 @@
1111
--source include/have_innodb.inc
1212

1313
connect (con1,localhost,root,,);
14+
CREATE TABLE t1 (a INT PRIMARY KEY) ENGINE=InnoDB;
15+
16+
SET DEBUG_SYNC='create_table SIGNAL stuck WAIT_FOR ever';
17+
send CREATE TABLE t2 (a INT PRIMARY KEY) ENGINE=InnoDB;
18+
1419
connect (con2,localhost,root,,);
20+
SET DEBUG_SYNC='now WAIT_FOR stuck';
21+
FLUSH TABLES;
1522

16-
--disable_warnings
17-
drop table if exists t1;
18-
--enable_warnings
19-
20-
connection con1;
21-
eval create table t1 (id integer, x integer) engine = InnoDB;
22-
insert into t1 values(0, 0);
23-
24-
# Enable the debug injection.
25-
set DEBUG_DBUG='+d,fatal-semaphore-timeout';
26-
set autocommit=0;
27-
28-
# The following query will hang for an hour since the debug injection
29-
# code will sleep an hour after holding the lock table mutex
30-
--echo # Sending query on con1,
31-
--echo # the session will hold lock table mutex and sleep
32-
--send
33-
SELECT * from t1 where id = 0 FOR UPDATE;
34-
35-
# To make sure con1 holding the lock table mutex and sleeping
36-
--sleep 2
37-
38-
connection con2;
39-
set autocommit=0;
40-
41-
# The following query will be blocked on the lock table mutex held by
42-
# con1 so it will be put into sync array.
43-
--echo # Sending query on con2,
44-
--echo # the session will be blocked on the lock table mutex and
45-
--echo # thus be put into sync arry
46-
--send
47-
SELECT * from t1 where id = 0 FOR UPDATE;
48-
49-
# Waitting for mysqld to abort due to fatal semaphore timeout.
50-
# Please note that, in the master.opt file, the fatal timeout
51-
# was set to 1 second, but in mysqld debug mode, this timeout
52-
# value will be timed 10 because UNIV_DEBUG_VALGRIND is set
53-
# (see sync_array_print_long_waits_low() in storage/innobase/sync/sync0arr.cc)
54-
# so the actual timeout will be 1 * 10 = 10 seconds. Besides,
55-
# mysqld will abort after detecting this fatal timeout 10 times in
56-
# a loop with interval of 1 second (see srv_error_monitor_thread
57-
# thread in torage/innobase/srv/srv0srv.cc), so mysqld will abort
58-
# in 1 * 10 + 1 * 10 = 20 seconds after con2 being blocked on
59-
# the lock table mutex.
60-
#
61-
# P.S. the default fatal sempahore timeout is 600 seconds,
62-
# so mysqld will abort after 600 * 10 + 1 * 10 = 6010 seconds
63-
# in debug mode and 600 + 1 * 10 = 610 seconds in release mode.
23+
# The following query will be blocked on the dict_sys.mutex held by con1
24+
send SELECT * FROM t1;
6425

6526
connection default;
6627

@@ -73,10 +34,10 @@ call mtr.add_suppression(".*");
7334
# The crash is expected
7435
exec echo "restart" > $MYSQLTEST_VARDIR/tmp/mysqld.1.expect;
7536

76-
--echo # Waitting for mysqld to crash
37+
--echo # Waiting for mariadbd to crash
7738

78-
# It will take 20 seconds to detect the long semaphore and mysqld to abort.
79-
# This test will be treated as pass as long as mysqld crash/restart is dectected
39+
# It may take 20 seconds to detect the long semaphore and mysqld to abort.
40+
# This test will be treated as pass as long as mysqld crash/restart is detected
8041
# in 80 seconds.
8142
let $counter= 80;
8243
let $mysql_errno= 0;
@@ -94,8 +55,7 @@ while (!$mysql_errno)
9455
--sleep 1
9556
}
9657

97-
--echo # Mysqld crash was detected
98-
--echo # Waitting for reconnect after mysqld restarts
58+
--echo # Crash was detected
9959

10060
enable_reconnect;
10161
connection default;
@@ -105,11 +65,4 @@ connection default;
10565
# Call script that will poll the server waiting for it to be back online again
10666
source include/wait_until_connected_again.inc;
10767

108-
--echo # Reconnected after mysqld was successfully restarted
109-
110-
--echo # Cleaning up before exit
111-
--disable_warnings
112-
drop table if exists t1;
113-
--enable_warnings
114-
115-
--echo # Clean exit
68+
DROP TABLE t1, t2;

storage/innobase/include/lock0lock.h

Lines changed: 1 addition & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -706,15 +706,9 @@ class lock_sys_t
706706
{
707707
bool m_initialised;
708708

709-
/** The my_hrtime_coarse().val of the oldest mutex_lock_wait() start, or 0 */
710-
std::atomic<ulonglong> mutex_wait_start;
711-
712709
/** mutex protecting the locks */
713710
MY_ALIGNED(CACHE_LINE_SIZE) mysql_mutex_t mutex;
714711
public:
715-
/** Diagnostic message for exceeding the mutex_lock_wait() timeout */
716-
static const char fatal_msg[];
717-
718712
/** record locks */
719713
hash_table_t rec_hash;
720714
/** predicate locks for SPATIAL INDEX */
@@ -747,10 +741,6 @@ class lock_sys_t
747741

748742
bool is_initialised() { return m_initialised; }
749743

750-
private:
751-
/** Acquire lock_sys.mutex */
752-
ATTRIBUTE_NOINLINE void mutex_lock_wait();
753-
public:
754744
#ifdef HAVE_PSI_MUTEX_INTERFACE
755745
/** Try to acquire lock_sys.mutex */
756746
ATTRIBUTE_NOINLINE int mutex_trylock();
@@ -762,7 +752,7 @@ class lock_sys_t
762752
/** Try to acquire lock_sys.mutex */
763753
int mutex_trylock() { return mysql_mutex_trylock(&mutex); }
764754
/** Aqcuire lock_sys.mutex */
765-
void mutex_lock() { if (mutex_trylock()) mutex_lock_wait(); }
755+
void mutex_lock() { mysql_mutex_lock(&mutex); }
766756
/** Release lock_sys.mutex */
767757
void mutex_unlock() { mysql_mutex_unlock(&mutex); }
768758
#endif
@@ -771,11 +761,6 @@ class lock_sys_t
771761
/** Assert that mutex_lock() has not been invoked */
772762
void mutex_assert_unlocked() const { mysql_mutex_assert_not_owner(&mutex); }
773763

774-
/** @return the my_hrtime_coarse().val of the oldest mutex_lock_wait() start,
775-
assuming that requests are served on a FIFO basis */
776-
ulonglong oldest_wait() const
777-
{ return mutex_wait_start.load(std::memory_order_relaxed); }
778-
779764
/** Wait for a lock to be granted */
780765
void wait_lock(lock_t **lock, mysql_cond_t *cond)
781766
{ while (*lock) mysql_cond_wait(cond, &mutex); }

storage/innobase/lock/lock0lock.cc

Lines changed: 1 addition & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -304,12 +304,6 @@ static bool lock_rec_validate_page(const buf_block_t *block, bool latched)
304304
/* The lock system */
305305
lock_sys_t lock_sys;
306306

307-
/** Diagnostic message for exceeding the mutex_lock_wait() timeout */
308-
const char lock_sys_t::fatal_msg[]=
309-
"innodb_fatal_semaphore_wait_threshold was exceeded for lock_sys.mutex. "
310-
"Please refer to "
311-
"https://mariadb.com/kb/en/how-to-produce-a-full-stack-trace-for-mysqld/";
312-
313307
/** We store info on the latest deadlock error to this buffer. InnoDB
314308
Monitor will then fetch it and print */
315309
static bool lock_deadlock_found = false;
@@ -476,36 +470,12 @@ void lock_sys_t::create(ulint n_cells)
476470
timeout_timer_active = false;
477471
}
478472

479-
void lock_sys_t::mutex_lock_wait()
480-
{
481-
ulonglong now= my_hrtime_coarse().val, old= 0;
482-
if (mutex_wait_start.compare_exchange_strong
483-
(old, now, std::memory_order_relaxed, std::memory_order_relaxed))
484-
{
485-
mysql_mutex_lock(&mutex);
486-
mutex_wait_start.store(0, std::memory_order_relaxed);
487-
return;
488-
}
489-
490-
ut_ad(old);
491-
/* We could have old > now due to our use of my_hrtime_coarse(). */
492-
ulong waited= old <= now ? static_cast<ulong>((now - old) / 1000000) : 0;
493-
const ulong threshold= srv_fatal_semaphore_wait_threshold;
494-
495-
if (waited >= threshold)
496-
ib::fatal() << fatal_msg;
497-
498-
if (waited > threshold / 4)
499-
ib::warn() << "A long wait (" << waited
500-
<< " seconds) was observed for lock_sys.mutex";
501-
mysql_mutex_lock(&mutex);
502-
}
503473

504474
#ifdef HAVE_PSI_MUTEX_INTERFACE
505475
/** Try to acquire lock_sys.mutex */
506476
int lock_sys_t::mutex_trylock() { return mysql_mutex_trylock(&mutex); }
507477
/** Acquire lock_sys.mutex */
508-
void lock_sys_t::mutex_lock() { if (mutex_trylock()) mutex_lock_wait(); }
478+
void lock_sys_t::mutex_lock() { mysql_mutex_lock(&mutex); }
509479
/** Release lock_sys.mutex */
510480
void lock_sys_t::mutex_unlock() { mysql_mutex_unlock(&mutex); }
511481
#endif
@@ -3571,9 +3541,6 @@ lock_table(
35713541

35723542
lock_sys.mutex_lock();
35733543

3574-
DBUG_EXECUTE_IF("fatal-semaphore-timeout",
3575-
{ os_thread_sleep(3600000000LL); });
3576-
35773544
/* We have to check if the new lock is compatible with any locks
35783545
other transactions have in the table lock queue. */
35793546

storage/innobase/row/row0mysql.cc

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2304,6 +2304,8 @@ row_create_table_for_mysql(
23042304
ut_d(dict_sys.assert_locked());
23052305
ut_ad(trx->dict_operation_lock_mode == RW_X_LATCH);
23062306

2307+
DEBUG_SYNC_C("create_table");
2308+
23072309
DBUG_EXECUTE_IF(
23082310
"ib_create_table_fail_at_start_of_row_create_table_for_mysql",
23092311
goto err_exit;

storage/innobase/srv/srv0srv.cc

Lines changed: 0 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1349,20 +1349,6 @@ void srv_monitor_task(void*)
13491349
const ulonglong now = my_hrtime_coarse().val;
13501350
const ulong threshold = srv_fatal_semaphore_wait_threshold;
13511351

1352-
if (ulonglong start = lock_sys.oldest_wait()) {
1353-
ulong waited = static_cast<ulong>((now - start) / 1000000);
1354-
if (waited >= threshold) {
1355-
ib::fatal() << lock_sys.fatal_msg;
1356-
}
1357-
1358-
if (waited == threshold / 4
1359-
|| waited == threshold / 2
1360-
|| waited == threshold / 4 * 3) {
1361-
ib::warn() << "Long wait (" << waited
1362-
<< " seconds) for lock_sys.mutex";
1363-
}
1364-
}
1365-
13661352
if (ulonglong start = dict_sys.oldest_wait()) {
13671353
ulong waited = static_cast<ulong>((now - start) / 1000000);
13681354
if (waited >= threshold) {

0 commit comments

Comments
 (0)