Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
MDEV-10653: SHOW SLAVE STATUS Can Deadlock an Errored Slave
AKA rpl.rpl_parallel, binlog_encryption.rpl_parallel fails in buildbot with timeout in include A replication parallel worker thread can deadlock with another connection running SHOW SLAVE STATUS. That is, if the replication worker thread is in do_gco_wait() and is killed, it will already hold the LOCK_parallel_entry, and during error reporting, try to grab the err_lock. SHOW SLAVE STATUS, however, grabs these locks in reverse order. It will initially grab the err_lock, and then try to grab LOCK_parallel_entry. This leads to a deadlock when both threads have grabbed their first lock without the second. This patch implements the MDEV-31894 proposed fix to optimize the workers_idle() check to compare the last in-use relay log’s queued_count==dequeued_count for idleness. This removes the need for workers_idle() to grab LOCK_parallel_entry, as these values are atomically updated. Huge thanks to Kristian Nielsen for diagnosing the problem! Reviewed By: ============ Kristian Nielsen <knielsen@knielsen-hq.org> Andrei Elkin <andrei.elkin@mariadb.com>
- Loading branch information
Showing
6 changed files
with
203 additions
and
19 deletions.
There are no files selected for viewing
66 changes: 66 additions & 0 deletions
66
mysql-test/suite/rpl/r/rpl_deadlock_show_slave_status.result
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,66 @@ | ||
include/master-slave.inc | ||
[connection master] | ||
# | ||
# Initialize test data | ||
connection master; | ||
create table t1 (a int) engine=innodb; | ||
insert into t1 values (1); | ||
include/save_master_gtid.inc | ||
connection slave; | ||
include/sync_with_master_gtid.inc | ||
include/stop_slave.inc | ||
call mtr.add_suppression("Connection was killed"); | ||
call mtr.add_suppression("Commit failed due to failure of an earlier commit on which this one depends"); | ||
set @save_parallel_threads= @@global.slave_parallel_threads; | ||
set @save_parallel_mode= @@global.slave_parallel_mode; | ||
set @save_transaction_retries= @@global.slave_transaction_retries; | ||
set @save_innodb_lock_wait_timeout= @@global.innodb_lock_wait_timeout; | ||
set @@global.slave_parallel_threads= 2; | ||
set @@global.slave_parallel_mode= CONSERVATIVE; | ||
set @@global.slave_transaction_retries= 0; | ||
set @@global.innodb_lock_wait_timeout= 10; | ||
# Grabbing lock on innodb row to force future replication transaction to wait (and eventually timeout) | ||
BEGIN; | ||
select * from t1 where a=1 for update; | ||
a | ||
1 | ||
connection master; | ||
set @old_dbug= @@session.debug_dbug; | ||
set @@session.debug_dbug="+d,binlog_force_commit_id"; | ||
SET @commit_id= 10000; | ||
update t1 set a=2 where a=1; | ||
SET @commit_id= 10001; | ||
insert into t1 values (3); | ||
set @@session.debug_dbug= @old_dbug; | ||
connection slave; | ||
start slave; | ||
# Waiting for first transaction to start (and be held at innodb row lock).. | ||
# Waiting for next transaction to start and hold at do_gco_wait().. | ||
connection slave1; | ||
set @@session.debug_dbug="+d,hold_sss_with_err_lock"; | ||
show slave status; | ||
connection slave; | ||
set debug_sync="now wait_for sss_got_err_lock"; | ||
kill <TID of worker in do_gco_wait>; | ||
set debug_sync="now signal sss_continue"; | ||
connection slave1; | ||
# Waiting for SHOW SLAVE STATUS to complete.. | ||
# ..done | ||
connection slave; | ||
ROLLBACK; | ||
include/wait_for_slave_sql_error.inc [errno=1927] | ||
# | ||
# Cleanup | ||
connection master; | ||
drop table t1; | ||
include/save_master_gtid.inc | ||
connection slave; | ||
set debug_sync= "RESET"; | ||
set @@global.slave_parallel_threads= @save_parallel_threads; | ||
set @@global.slave_parallel_mode= @save_parallel_mode; | ||
set @@global.slave_transaction_retries= @save_transaction_retries; | ||
set @@global.innodb_lock_wait_timeout= @save_innodb_lock_wait_timeout; | ||
start slave sql_thread; | ||
include/sync_with_master_gtid.inc | ||
include/rpl_end.inc | ||
# End of rpl_deadlock_show_slave_status.test |
121 changes: 121 additions & 0 deletions
121
mysql-test/suite/rpl/t/rpl_deadlock_show_slave_status.test
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,121 @@ | ||
# | ||
# Verify that SHOW SLAVE STATUS will not cause deadlocks on the replica. | ||
# A deadlock has been seen in do_gco_wait if the thread is killed, as it will | ||
# hold the LOCK_parallel_entry, and during error reporting, try to grab the | ||
# err_lock. Prior to MDEV-10653, SHOW SLAVE STATUS would grab these locks in | ||
# the reverse order, as calling workers_idle() used to grab LOCK_parallel_entry | ||
# with the err_lock already grabbed (though the MDEV-10653 patch changed the | ||
# workles_idle() implementation to remove the need for locking the | ||
# parallel_entry). | ||
# | ||
# References: | ||
# MDEV-10653: SHOW SLAVE STATUS Can Deadlock an Errored Slave | ||
# | ||
|
||
--source include/master-slave.inc | ||
--source include/have_innodb.inc | ||
--source include/have_debug.inc | ||
--source include/have_binlog_format_row.inc | ||
|
||
--echo # | ||
--echo # Initialize test data | ||
--connection master | ||
create table t1 (a int) engine=innodb; | ||
insert into t1 values (1); | ||
--source include/save_master_gtid.inc | ||
|
||
--connection slave | ||
--source include/sync_with_master_gtid.inc | ||
--source include/stop_slave.inc | ||
|
||
call mtr.add_suppression("Connection was killed"); | ||
call mtr.add_suppression("Commit failed due to failure of an earlier commit on which this one depends"); | ||
|
||
set @save_parallel_threads= @@global.slave_parallel_threads; | ||
set @save_parallel_mode= @@global.slave_parallel_mode; | ||
set @save_transaction_retries= @@global.slave_transaction_retries; | ||
set @save_innodb_lock_wait_timeout= @@global.innodb_lock_wait_timeout; | ||
|
||
set @@global.slave_parallel_threads= 2; | ||
set @@global.slave_parallel_mode= CONSERVATIVE; | ||
set @@global.slave_transaction_retries= 0; | ||
set @@global.innodb_lock_wait_timeout= 10; | ||
|
||
--echo # Grabbing lock on innodb row to force future replication transaction to wait (and eventually timeout) | ||
BEGIN; | ||
select * from t1 where a=1 for update; | ||
|
||
--connection master | ||
|
||
set @old_dbug= @@session.debug_dbug; | ||
set @@session.debug_dbug="+d,binlog_force_commit_id"; | ||
|
||
|
||
# GCO 1 | ||
SET @commit_id= 10000; | ||
# T1 | ||
update t1 set a=2 where a=1; | ||
|
||
# GCO 2 | ||
SET @commit_id= 10001; | ||
# T2 | ||
insert into t1 values (3); | ||
|
||
set @@session.debug_dbug= @old_dbug; | ||
|
||
--connection slave | ||
start slave; | ||
|
||
--echo # Waiting for first transaction to start (and be held at innodb row lock).. | ||
--let $wait_condition= SELECT count(*)=1 FROM information_schema.processlist WHERE state LIKE 'Update_rows_log_event::find_row(%)' and command LIKE 'Slave_worker'; | ||
--source include/wait_condition.inc | ||
|
||
--echo # Waiting for next transaction to start and hold at do_gco_wait().. | ||
--let $wait_condition= SELECT count(*)=1 FROM information_schema.processlist WHERE state LIKE 'Waiting for prior transaction to start commit%' and command LIKE 'Slave_worker'; | ||
--source include/wait_condition.inc | ||
|
||
--connection slave1 | ||
set @@session.debug_dbug="+d,hold_sss_with_err_lock"; | ||
--send show slave status | ||
|
||
--connection slave | ||
set debug_sync="now wait_for sss_got_err_lock"; | ||
|
||
--let $t2_tid= `SELECT ID FROM INFORMATION_SCHEMA.PROCESSLIST WHERE STATE LIKE 'Waiting for prior transaction to start commit%'` | ||
--replace_result $t2_tid "<TID of worker in do_gco_wait>" | ||
--eval kill $t2_tid | ||
--let $wait_condition= SELECT count(*)=1 FROM information_schema.processlist WHERE command LIKE 'Killed'; | ||
--source include/wait_condition.inc | ||
|
||
set debug_sync="now signal sss_continue"; | ||
|
||
--connection slave1 | ||
--echo # Waiting for SHOW SLAVE STATUS to complete.. | ||
--disable_result_log | ||
--reap | ||
--enable_result_log | ||
--echo # ..done | ||
|
||
--connection slave | ||
ROLLBACK; | ||
--let $slave_sql_errno= 1927 | ||
--source include/wait_for_slave_sql_error.inc | ||
|
||
|
||
--echo # | ||
--echo # Cleanup | ||
--connection master | ||
drop table t1; | ||
--source include/save_master_gtid.inc | ||
|
||
--connection slave | ||
set debug_sync= "RESET"; | ||
set @@global.slave_parallel_threads= @save_parallel_threads; | ||
set @@global.slave_parallel_mode= @save_parallel_mode; | ||
set @@global.slave_transaction_retries= @save_transaction_retries; | ||
set @@global.innodb_lock_wait_timeout= @save_innodb_lock_wait_timeout; | ||
start slave sql_thread; | ||
--source include/sync_with_master_gtid.inc | ||
|
||
--source include/rpl_end.inc | ||
--echo # End of rpl_deadlock_show_slave_status.test |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters