Skip to content

Commit db5d1cd

Browse files
committed
MDEV-34857: Implement --slave-abort-blocking-timeout
If a slave replicating an event has waited for more than @@slave_abort_blocking_timeout for a conflicting metadata lock held by a non-replication thread, the blocking query is killed to allow replication to proceed and not be blocked indefinitely by a user query. Reviewed-by: Monty <monty@mariadb.org> Signed-off-by: Kristian Nielsen <knielsen@knielsen-hq.org>
1 parent 669d8ff commit db5d1cd

13 files changed

+246
-17
lines changed

mysql-test/main/mysqld--help.result

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1301,6 +1301,11 @@ The following specify which files/extra groups are read (specified before remain
13011301
--skip-show-database
13021302
Don't allow 'SHOW DATABASE' commands
13031303
--skip-slave-start If set, slave is not autostarted
1304+
--slave-abort-blocking-timeout=#
1305+
Maximum time a slave DDL will wait for a blocking SELECT
1306+
or other user query until that query will be aborted. The
1307+
argument will be treated as a decimal value with
1308+
nanosecond precision
13041309
--slave-compressed-protocol
13051310
Use compression on master/slave protocol
13061311
--slave-connections-needed-for-purge=#
@@ -1948,6 +1953,7 @@ skip-name-resolve FALSE
19481953
skip-networking FALSE
19491954
skip-show-database FALSE
19501955
skip-slave-start FALSE
1956+
slave-abort-blocking-timeout 3.1536e+07
19511957
slave-compressed-protocol FALSE
19521958
slave-connections-needed-for-purge 1
19531959
slave-ddl-exec-mode IDEMPOTENT
Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
include/master-slave.inc
2+
[connection master]
3+
*** Testcase to show how a long-running SELECT can block replication from proceeding
4+
*** past a DDL. Intention to implement a timeout after which such SELECT can be
5+
*** killed.
6+
connection master;
7+
CREATE TABLE t1 (a INT PRIMARY KEY, b INT) ENGINE=InnoDB;
8+
INSERT INTO t1 SELECT seq, 100+seq FROM seq_1_to_20;
9+
connection slave;
10+
include/stop_slave.inc
11+
SELECT @@GLOBAL.slave_abort_blocking_timeout;
12+
@@GLOBAL.slave_abort_blocking_timeout
13+
31536000.000000
14+
SET @old_abort_timeout= @@slave_abort_blocking_timeout;
15+
SET GLOBAL slave_abort_blocking_timeout= -1;
16+
Warnings:
17+
Warning 1292 Truncated incorrect slave_abort_blocking_timeout value: '-1'
18+
SELECT @@GLOBAL.slave_abort_blocking_timeout;
19+
@@GLOBAL.slave_abort_blocking_timeout
20+
0.000000
21+
SET GLOBAL slave_abort_blocking_timeout= 1.0;
22+
SELECT @@GLOBAL.slave_abort_blocking_timeout;
23+
@@GLOBAL.slave_abort_blocking_timeout
24+
1.000000
25+
connection server_2;
26+
SELECT X.a, SLEEP(IF((X.b MOD 2)=0, 0.4, 0.6)) FROM t1 X CROSS JOIN t1 Y;
27+
connection slave;
28+
connection master;
29+
UPDATE t1 SET b=b+1000 WHERE a=1;
30+
ALTER TABLE t1 ADD INDEX b_idx(b);
31+
UPDATE t1 SET b=b+1000 WHERE a=20;
32+
connection slave;
33+
include/start_slave.inc
34+
connection server_2;
35+
ERROR 70100: Query execution was interrupted
36+
connection slave;
37+
SHOW CREATE TABLE t1;
38+
Table t1
39+
Create Table CREATE TABLE `t1` (
40+
`a` int(11) NOT NULL,
41+
`b` int(11) DEFAULT NULL,
42+
PRIMARY KEY (`a`),
43+
KEY `b_idx` (`b`)
44+
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_uca1400_ai_ci
45+
include/stop_slave.inc
46+
SET GLOBAL slave_abort_blocking_timeout= 0;
47+
SELECT @@GLOBAL.slave_abort_blocking_timeout;
48+
@@GLOBAL.slave_abort_blocking_timeout
49+
0.000000
50+
connection server_2;
51+
SELECT X.a, SLEEP(IF((X.b MOD 2)=0, 0.4, 0.6)) FROM t1 X CROSS JOIN t1 Y;
52+
connection slave;
53+
connection master;
54+
UPDATE t1 SET b=b+1000 WHERE a=1;
55+
ALTER TABLE t1 DROP INDEX b_idx;
56+
UPDATE t1 SET b=b+1000 WHERE a=20;
57+
connection slave;
58+
include/start_slave.inc
59+
connection server_2;
60+
ERROR 70100: Query execution was interrupted
61+
connection slave;
62+
SHOW CREATE TABLE t1;
63+
Table t1
64+
Create Table CREATE TABLE `t1` (
65+
`a` int(11) NOT NULL,
66+
`b` int(11) DEFAULT NULL,
67+
PRIMARY KEY (`a`)
68+
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_uca1400_ai_ci
69+
include/stop_slave.inc
70+
SET GLOBAL slave_abort_blocking_timeout= @old_abort_timeout;
71+
include/start_slave.inc
72+
connection master;
73+
DROP TABLE t1;
74+
include/rpl_end.inc
Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,85 @@
1+
--source include/have_innodb.inc
2+
--source include/have_sequence.inc
3+
--source include/have_binlog_format_mixed.inc
4+
--source include/master-slave.inc
5+
6+
--echo *** Testcase to show how a long-running SELECT can block replication from proceeding
7+
--echo *** past a DDL. Intention to implement a timeout after which such SELECT can be
8+
--echo *** killed.
9+
10+
--connection master
11+
CREATE TABLE t1 (a INT PRIMARY KEY, b INT) ENGINE=InnoDB;
12+
INSERT INTO t1 SELECT seq, 100+seq FROM seq_1_to_20;
13+
14+
--sync_slave_with_master
15+
16+
--source include/stop_slave.inc
17+
SELECT @@GLOBAL.slave_abort_blocking_timeout;
18+
SET @old_abort_timeout= @@slave_abort_blocking_timeout;
19+
SET GLOBAL slave_abort_blocking_timeout= -1;
20+
SELECT @@GLOBAL.slave_abort_blocking_timeout;
21+
SET GLOBAL slave_abort_blocking_timeout= 1.0;
22+
SELECT @@GLOBAL.slave_abort_blocking_timeout;
23+
--connection server_2
24+
# Start a SELECT that will run for long.
25+
send SELECT X.a, SLEEP(IF((X.b MOD 2)=0, 0.4, 0.6)) FROM t1 X CROSS JOIN t1 Y;
26+
27+
--connection slave
28+
# Wait for the SELECT to have started so it will block the coming DDL
29+
# from replicating.
30+
--let $wait_condition= SELECT COUNT(*)=1 FROM INFORMATION_SCHEMA.PROCESSLIST WHERE state = 'User sleep'
31+
--source include/wait_condition.inc
32+
33+
--connection master
34+
UPDATE t1 SET b=b+1000 WHERE a=1;
35+
ALTER TABLE t1 ADD INDEX b_idx(b);
36+
UPDATE t1 SET b=b+1000 WHERE a=20;
37+
38+
--save_master_pos
39+
--connection slave
40+
--source include/start_slave.inc
41+
--sync_with_master
42+
43+
--connection server_2
44+
--error ER_QUERY_INTERRUPTED
45+
reap;
46+
47+
--connection slave
48+
query_vertical SHOW CREATE TABLE t1;
49+
50+
# Do it again to test that a timeout of 0 also works to abort user queries.
51+
--source include/stop_slave.inc
52+
SET GLOBAL slave_abort_blocking_timeout= 0;
53+
SELECT @@GLOBAL.slave_abort_blocking_timeout;
54+
--connection server_2
55+
send SELECT X.a, SLEEP(IF((X.b MOD 2)=0, 0.4, 0.6)) FROM t1 X CROSS JOIN t1 Y;
56+
57+
--connection slave
58+
--let $wait_condition= SELECT COUNT(*)=1 FROM INFORMATION_SCHEMA.PROCESSLIST WHERE state = 'User sleep'
59+
--source include/wait_condition.inc
60+
61+
--connection master
62+
UPDATE t1 SET b=b+1000 WHERE a=1;
63+
ALTER TABLE t1 DROP INDEX b_idx;
64+
UPDATE t1 SET b=b+1000 WHERE a=20;
65+
66+
--save_master_pos
67+
--connection slave
68+
--source include/start_slave.inc
69+
--sync_with_master
70+
71+
--connection server_2
72+
--error ER_QUERY_INTERRUPTED
73+
reap;
74+
75+
--connection slave
76+
query_vertical SHOW CREATE TABLE t1;
77+
78+
79+
--source include/stop_slave.inc
80+
SET GLOBAL slave_abort_blocking_timeout= @old_abort_timeout;
81+
--source include/start_slave.inc
82+
83+
--connection master
84+
DROP TABLE t1;
85+
--source include/rpl_end.inc

mysql-test/suite/sys_vars/r/sysvars_server_notembedded.result

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4012,6 +4012,16 @@ NUMERIC_BLOCK_SIZE NULL
40124012
ENUM_VALUE_LIST OFF,ON
40134013
READ_ONLY YES
40144014
COMMAND_LINE_ARGUMENT OPTIONAL
4015+
VARIABLE_NAME SLAVE_ABORT_BLOCKING_TIMEOUT
4016+
VARIABLE_SCOPE GLOBAL
4017+
VARIABLE_TYPE DOUBLE
4018+
VARIABLE_COMMENT Maximum time a slave DDL will wait for a blocking SELECT or other user query until that query will be aborted. The argument will be treated as a decimal value with nanosecond precision
4019+
NUMERIC_MIN_VALUE 0
4020+
NUMERIC_MAX_VALUE 31536000
4021+
NUMERIC_BLOCK_SIZE NULL
4022+
ENUM_VALUE_LIST NULL
4023+
READ_ONLY NO
4024+
COMMAND_LINE_ARGUMENT REQUIRED
40154025
VARIABLE_NAME SLAVE_COMPRESSED_PROTOCOL
40164026
VARIABLE_SCOPE GLOBAL
40174027
VARIABLE_TYPE BOOLEAN

sql/mdl.cc

Lines changed: 39 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -613,7 +613,7 @@ class MDL_lock
613613

614614
bool needs_notification(const MDL_ticket *ticket) const
615615
{ return m_strategy->needs_notification(ticket); }
616-
void notify_conflicting_locks(MDL_context *ctx)
616+
void notify_conflicting_locks(MDL_context *ctx, bool abort_blocking)
617617
{
618618
for (const auto &conflicting_ticket : m_granted)
619619
{
@@ -624,7 +624,8 @@ class MDL_lock
624624

625625
ctx->get_owner()->
626626
notify_shared_lock(conflicting_ctx->get_owner(),
627-
conflicting_ctx->get_needs_thr_lock_abort());
627+
conflicting_ctx->get_needs_thr_lock_abort(),
628+
abort_blocking);
628629
}
629630
}
630631
}
@@ -2361,10 +2362,10 @@ MDL_context::acquire_lock(MDL_request *mdl_request, double lock_wait_timeout)
23612362

23622363
/*
23632364
Don't break conflicting locks if timeout is 0 as 0 is used
2364-
To check if there is any conflicting locks...
2365+
to check if there are any conflicting locks...
23652366
*/
23662367
if (lock->needs_notification(ticket) && lock_wait_timeout)
2367-
lock->notify_conflicting_locks(this);
2368+
lock->notify_conflicting_locks(this, false);
23682369

23692370
/*
23702371
Ensure that if we are trying to get an exclusive lock for a slave
@@ -2397,14 +2398,44 @@ MDL_context::acquire_lock(MDL_request *mdl_request, double lock_wait_timeout)
23972398

23982399
find_deadlock();
23992400

2400-
struct timespec abs_timeout, abs_shortwait;
2401+
struct timespec abs_timeout, abs_shortwait, abs_abort_blocking_timeout;
2402+
bool abort_blocking_enabled= false;
2403+
double abort_blocking_timeout= slave_abort_blocking_timeout;
2404+
if (abort_blocking_timeout < lock_wait_timeout &&
2405+
m_owner->get_thd()->rgi_slave)
2406+
{
2407+
/*
2408+
After @@slave_abort_blocking_timeout seconds, kill non-replication
2409+
queries that are blocking a replication event (such as an ALTER TABLE)
2410+
from proceeding.
2411+
*/
2412+
set_timespec_nsec(abs_abort_blocking_timeout,
2413+
(ulonglong)(abort_blocking_timeout * 1000000000ULL));
2414+
abort_blocking_enabled= true;
2415+
}
24012416
set_timespec_nsec(abs_timeout,
24022417
(ulonglong)(lock_wait_timeout * 1000000000ULL));
2403-
set_timespec(abs_shortwait, 1);
24042418
wait_status= MDL_wait::EMPTY;
24052419

2406-
while (cmp_timespec(abs_shortwait, abs_timeout) <= 0)
2420+
for (;;)
24072421
{
2422+
bool abort_blocking= false;
2423+
set_timespec(abs_shortwait, 1);
2424+
if (abort_blocking_enabled &&
2425+
cmp_timespec(abs_shortwait, abs_abort_blocking_timeout) >= 0)
2426+
{
2427+
/*
2428+
If a slave DDL has waited for --slave-abort-blocking-timeout, then notify
2429+
any blocking SELECT once before continuing to wait until the full
2430+
timeout.
2431+
*/
2432+
abs_shortwait= abs_abort_blocking_timeout;
2433+
abort_blocking= true;
2434+
abort_blocking_enabled= false;
2435+
}
2436+
else if (cmp_timespec(abs_shortwait, abs_timeout) > 0)
2437+
break;
2438+
24082439
/* abs_timeout is far away. Wait a short while and notify locks. */
24092440
wait_status= m_wait.timed_wait(m_owner, &abs_shortwait, FALSE,
24102441
mdl_request->key.get_wait_state_name());
@@ -2425,9 +2456,8 @@ MDL_context::acquire_lock(MDL_request *mdl_request, double lock_wait_timeout)
24252456

24262457
mysql_prlock_wrlock(&lock->m_rwlock);
24272458
if (lock->needs_notification(ticket))
2428-
lock->notify_conflicting_locks(this);
2459+
lock->notify_conflicting_locks(this, abort_blocking);
24292460
mysql_prlock_unlock(&lock->m_rwlock);
2430-
set_timespec(abs_shortwait, 1);
24312461
}
24322462
if (wait_status == MDL_wait::EMPTY)
24332463
wait_status= m_wait.timed_wait(m_owner, &abs_timeout, TRUE,

sql/mdl.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -110,7 +110,8 @@ class MDL_context_owner
110110
@see THD::notify_shared_lock()
111111
*/
112112
virtual bool notify_shared_lock(MDL_context_owner *in_use,
113-
bool needs_thr_lock_abort) = 0;
113+
bool needs_thr_lock_abort,
114+
bool needs_non_slave_abort) = 0;
114115
};
115116

116117
/**

sql/mysqld.cc

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -494,6 +494,7 @@ uint internal_slave_connections_needed_for_purge;
494494
ulong slave_max_allowed_packet= 0;
495495
double slave_max_statement_time_double;
496496
ulonglong slave_max_statement_time;
497+
double slave_abort_blocking_timeout;
497498
ulonglong binlog_stmt_cache_size=0;
498499
ulonglong max_binlog_stmt_cache_size=0;
499500
ulonglong test_flags;

sql/mysqld.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -242,6 +242,7 @@ extern ulong max_binlog_size;
242242
extern ulong slave_max_allowed_packet;
243243
extern ulonglong slave_max_statement_time;
244244
extern double slave_max_statement_time_double;
245+
extern double slave_abort_blocking_timeout;
245246
extern ulong opt_binlog_rows_event_max_size;
246247
extern ulong binlog_row_metadata;
247248
extern my_bool opt_binlog_gtid_index;

sql/privilege.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -598,6 +598,8 @@ constexpr privilege_t PRIV_SET_SYSTEM_GLOBAL_VAR_SLAVE_MAX_ALLOWED_PACKET=
598598
REPL_SLAVE_ADMIN_ACL;
599599
constexpr privilege_t PRIV_SET_SYSTEM_GLOBAL_VAR_SLAVE_MAX_STATEMENT_TIME=
600600
REPL_SLAVE_ADMIN_ACL;
601+
constexpr privilege_t PRIV_SET_SYSTEM_GLOBAL_VAR_SLAVE_ABORT_BLOCKING_TIMEOUT=
602+
REPL_SLAVE_ADMIN_ACL;
601603
constexpr privilege_t PRIV_SET_SYSTEM_GLOBAL_VAR_SLAVE_NET_TIMEOUT=
602604
REPL_SLAVE_ADMIN_ACL;
603605
constexpr privilege_t PRIV_SET_SYSTEM_GLOBAL_VAR_SLAVE_PARALLEL_MAX_QUEUED=

sql/sql_base.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1025,7 +1025,7 @@ void close_thread_table(THD *thd, TABLE **table_ptr)
10251025
thd->handler_stats.add(file->handler_stats);
10261026
}
10271027
/*
1028-
This look is needed to allow THD::notify_shared_lock() to
1028+
This lock is needed to allow THD::notify_shared_lock() to
10291029
traverse the thd->open_tables list without having to worry that
10301030
some of the tables are removed from under it
10311031
*/

0 commit comments

Comments
 (0)