Skip to content

Commit

Permalink
MDEV-10653: SHOW SLAVE STATUS Can Deadlock an Errored Slave
Browse files Browse the repository at this point in the history
AKA rpl.rpl_parallel, binlog_encryption.rpl_parallel fails in
buildbot with timeout in include

A replication parallel worker thread can deadlock with another
connection running SHOW SLAVE STATUS. That is, if the replication
worker thread is in do_gco_wait() and is killed, it will already
hold the LOCK_parallel_entry, and during error reporting, try to
grab the err_lock. SHOW SLAVE STATUS, however, grabs these locks in
reverse order. It will initially grab the err_lock, and then try to
grab LOCK_parallel_entry. This leads to a deadlock when both threads
have grabbed their first lock without the second.

This patch implements the MDEV-31894 proposed fix to optimize the
workers_idle() check to compare the last in-use relay log’s
queued_count==dequeued_count for idleness. This removes the need for
workers_idle() to grab LOCK_parallel_entry, as these values are
atomically updated.

Huge thanks to Kristian Nielsen for diagnosing the problem!

Reviewed By:
============
Kristian Nielsen <knielsen@knielsen-hq.org>
Andrei Elkin <andrei.elkin@mariadb.com>
  • Loading branch information
bnestere committed Dec 11, 2023
1 parent 5ca63b2 commit 8dad514
Show file tree
Hide file tree
Showing 6 changed files with 203 additions and 19 deletions.
66 changes: 66 additions & 0 deletions mysql-test/suite/rpl/r/rpl_deadlock_show_slave_status.result
@@ -0,0 +1,66 @@
include/master-slave.inc
[connection master]
#
# Initialize test data
connection master;
create table t1 (a int) engine=innodb;
insert into t1 values (1);
include/save_master_gtid.inc
connection slave;
include/sync_with_master_gtid.inc
include/stop_slave.inc
call mtr.add_suppression("Connection was killed");
call mtr.add_suppression("Commit failed due to failure of an earlier commit on which this one depends");
set @save_parallel_threads= @@global.slave_parallel_threads;
set @save_parallel_mode= @@global.slave_parallel_mode;
set @save_transaction_retries= @@global.slave_transaction_retries;
set @save_innodb_lock_wait_timeout= @@global.innodb_lock_wait_timeout;
set @@global.slave_parallel_threads= 2;
set @@global.slave_parallel_mode= CONSERVATIVE;
set @@global.slave_transaction_retries= 0;
set @@global.innodb_lock_wait_timeout= 10;
# Grabbing lock on innodb row to force future replication transaction to wait (and eventually timeout)
BEGIN;
select * from t1 where a=1 for update;
a
1
connection master;
set @old_dbug= @@session.debug_dbug;
set @@session.debug_dbug="+d,binlog_force_commit_id";
SET @commit_id= 10000;
update t1 set a=2 where a=1;
SET @commit_id= 10001;
insert into t1 values (3);
set @@session.debug_dbug= @old_dbug;
connection slave;
start slave;
# Waiting for first transaction to start (and be held at innodb row lock)..
# Waiting for next transaction to start and hold at do_gco_wait()..
connection slave1;
set @@session.debug_dbug="+d,hold_sss_with_err_lock";
show slave status;
connection slave;
set debug_sync="now wait_for sss_got_err_lock";
kill <TID of worker in do_gco_wait>;
set debug_sync="now signal sss_continue";
connection slave1;
# Waiting for SHOW SLAVE STATUS to complete..
# ..done
connection slave;
ROLLBACK;
include/wait_for_slave_sql_error.inc [errno=1927]
#
# Cleanup
connection master;
drop table t1;
include/save_master_gtid.inc
connection slave;
set debug_sync= "RESET";
set @@global.slave_parallel_threads= @save_parallel_threads;
set @@global.slave_parallel_mode= @save_parallel_mode;
set @@global.slave_transaction_retries= @save_transaction_retries;
set @@global.innodb_lock_wait_timeout= @save_innodb_lock_wait_timeout;
start slave sql_thread;
include/sync_with_master_gtid.inc
include/rpl_end.inc
# End of rpl_deadlock_show_slave_status.test
121 changes: 121 additions & 0 deletions mysql-test/suite/rpl/t/rpl_deadlock_show_slave_status.test
@@ -0,0 +1,121 @@
#
# Verify that SHOW SLAVE STATUS will not cause deadlocks on the replica.
# A deadlock has been seen in do_gco_wait if the thread is killed, as it will
# hold the LOCK_parallel_entry, and during error reporting, try to grab the
# err_lock. Prior to MDEV-10653, SHOW SLAVE STATUS would grab these locks in
# the reverse order, as calling workers_idle() used to grab LOCK_parallel_entry
# with the err_lock already grabbed (though the MDEV-10653 patch changed the
# workles_idle() implementation to remove the need for locking the
# parallel_entry).
#
# References:
# MDEV-10653: SHOW SLAVE STATUS Can Deadlock an Errored Slave
#

--source include/master-slave.inc
--source include/have_innodb.inc
--source include/have_debug.inc
--source include/have_binlog_format_row.inc

--echo #
--echo # Initialize test data
--connection master
create table t1 (a int) engine=innodb;
insert into t1 values (1);
--source include/save_master_gtid.inc

--connection slave
--source include/sync_with_master_gtid.inc
--source include/stop_slave.inc

call mtr.add_suppression("Connection was killed");
call mtr.add_suppression("Commit failed due to failure of an earlier commit on which this one depends");

set @save_parallel_threads= @@global.slave_parallel_threads;
set @save_parallel_mode= @@global.slave_parallel_mode;
set @save_transaction_retries= @@global.slave_transaction_retries;
set @save_innodb_lock_wait_timeout= @@global.innodb_lock_wait_timeout;

set @@global.slave_parallel_threads= 2;
set @@global.slave_parallel_mode= CONSERVATIVE;
set @@global.slave_transaction_retries= 0;
set @@global.innodb_lock_wait_timeout= 10;

--echo # Grabbing lock on innodb row to force future replication transaction to wait (and eventually timeout)
BEGIN;
select * from t1 where a=1 for update;

--connection master

set @old_dbug= @@session.debug_dbug;
set @@session.debug_dbug="+d,binlog_force_commit_id";


# GCO 1
SET @commit_id= 10000;
# T1
update t1 set a=2 where a=1;

# GCO 2
SET @commit_id= 10001;
# T2
insert into t1 values (3);

set @@session.debug_dbug= @old_dbug;

--connection slave
start slave;

--echo # Waiting for first transaction to start (and be held at innodb row lock)..
--let $wait_condition= SELECT count(*)=1 FROM information_schema.processlist WHERE state LIKE 'Update_rows_log_event::find_row(%)' and command LIKE 'Slave_worker';
--source include/wait_condition.inc

--echo # Waiting for next transaction to start and hold at do_gco_wait()..
--let $wait_condition= SELECT count(*)=1 FROM information_schema.processlist WHERE state LIKE 'Waiting for prior transaction to start commit%' and command LIKE 'Slave_worker';
--source include/wait_condition.inc

--connection slave1
set @@session.debug_dbug="+d,hold_sss_with_err_lock";
--send show slave status

--connection slave
set debug_sync="now wait_for sss_got_err_lock";

--let $t2_tid= `SELECT ID FROM INFORMATION_SCHEMA.PROCESSLIST WHERE STATE LIKE 'Waiting for prior transaction to start commit%'`
--replace_result $t2_tid "<TID of worker in do_gco_wait>"
--eval kill $t2_tid
--let $wait_condition= SELECT count(*)=1 FROM information_schema.processlist WHERE command LIKE 'Killed';
--source include/wait_condition.inc

set debug_sync="now signal sss_continue";

--connection slave1
--echo # Waiting for SHOW SLAVE STATUS to complete..
--disable_result_log
--reap
--enable_result_log
--echo # ..done

--connection slave
ROLLBACK;
--let $slave_sql_errno= 1927
--source include/wait_for_slave_sql_error.inc


--echo #
--echo # Cleanup
--connection master
drop table t1;
--source include/save_master_gtid.inc

--connection slave
set debug_sync= "RESET";
set @@global.slave_parallel_threads= @save_parallel_threads;
set @@global.slave_parallel_mode= @save_parallel_mode;
set @@global.slave_transaction_retries= @save_transaction_retries;
set @@global.innodb_lock_wait_timeout= @save_innodb_lock_wait_timeout;
start slave sql_thread;
--source include/sync_with_master_gtid.inc

--source include/rpl_end.inc
--echo # End of rpl_deadlock_show_slave_status.test
19 changes: 3 additions & 16 deletions sql/rpl_parallel.cc
Expand Up @@ -2537,23 +2537,10 @@ rpl_parallel::stop_during_until()


bool
rpl_parallel::workers_idle()
rpl_parallel::workers_idle(Relay_log_info *rli)
{
struct rpl_parallel_entry *e;
uint32 i, max_i;

max_i= domain_hash.records;
for (i= 0; i < max_i; ++i)
{
bool active;
e= (struct rpl_parallel_entry *)my_hash_element(&domain_hash, i);
mysql_mutex_lock(&e->LOCK_parallel_entry);
active= e->current_sub_id > e->last_committed_sub_id;
mysql_mutex_unlock(&e->LOCK_parallel_entry);
if (active)
break;
}
return (i == max_i);
return rli->last_inuse_relaylog->queued_count ==
rli->last_inuse_relaylog->dequeued_count;
}


Expand Down
3 changes: 2 additions & 1 deletion sql/rpl_parallel.h
Expand Up @@ -369,9 +369,10 @@ struct rpl_parallel {
rpl_parallel_entry *find(uint32 domain_id);
void wait_for_done(THD *thd, Relay_log_info *rli);
void stop_during_until();
bool workers_idle();
int wait_for_workers_idle(THD *thd);
int do_event(rpl_group_info *serial_rgi, Log_event *ev, ulonglong event_size);

static bool workers_idle(Relay_log_info *rli);
};


Expand Down
2 changes: 1 addition & 1 deletion sql/rpl_rli.h
Expand Up @@ -622,7 +622,7 @@ struct inuse_relaylog {
rpl_gtid *relay_log_state;
uint32 relay_log_state_count;
/* Number of events in this relay log queued for worker threads. */
int64 queued_count;
Atomic_counter<int64> queued_count;
/* Number of events completed by worker threads. */
Atomic_counter<int64> dequeued_count;
/* Set when all events have been read from a relaylog. */
Expand Down
11 changes: 10 additions & 1 deletion sql/slave.cc
Expand Up @@ -3123,6 +3123,14 @@ static bool send_show_master_info_data(THD *thd, Master_info *mi, bool full,
mysql_mutex_lock(&mi->err_lock);
/* err_lock is to protect mi->rli.last_error() */
mysql_mutex_lock(&mi->rli.err_lock);

DBUG_EXECUTE_IF("hold_sss_with_err_lock", {
DBUG_ASSERT(!debug_sync_set_action(
thd, STRING_WITH_LEN("now SIGNAL sss_got_err_lock "
"WAIT_FOR sss_continue")));
DBUG_SET("-d,hold_sss_with_err_lock");
});

protocol->store(mi->host, &my_charset_bin);
protocol->store(mi->user, &my_charset_bin);
protocol->store((uint32) mi->port);
Expand Down Expand Up @@ -3197,7 +3205,8 @@ static bool send_show_master_info_data(THD *thd, Master_info *mi, bool full,
while the slave is processing ignored events, such as those skipped
due to slave_skip_counter.
*/
if (mi->using_parallel() && idle && !mi->rli.parallel.workers_idle())
if (mi->using_parallel() && idle &&
!rpl_parallel::workers_idle(&mi->rli))
idle= false;
}
if (idle)
Expand Down

0 comments on commit 8dad514

Please sign in to comment.