Skip to content

Commit

Permalink
MDEV-29369: rpl.rpl_semi_sync_shutdown_await_ack fails regularly with…
Browse files Browse the repository at this point in the history
… Result content mismatch

This test was prone to failures for a few reasons, summarized below:

 1) MDEV-32168 introduced “only_running_threads=1” to
slave_stop.inc, which allowed the stop logic to bypass an
attempting-to-reconnect IO thread. That is, the IO thread could
realize the master shutdown in `read_event()`, and thereby call into
`try_to_reconnect()`. This would leave the IO thread up when the
test expected it to be stopped. Fixed by explicitly stopping the
IO thread and allowing an error state, as the above case would
lead to errno 2003.

 2) On slow systems (or those running profiling tools, e.g. MSAN),
the waiting-for-ack transaction can complete before the system
processes the `SHUTDOWN WAIT FOR ALL SLAVES`. There was shutdown
preparation logic in-between the transaction and shutdown itself,
which contributes to this problem. This patch also moves this
preparation logic before the transaction, so there is less to do
in-between the calls.

 3) Changed work-around for MDEV-28141 to use debug_sync instead
of sleep delay, as it was still possible to hit the bug on very
slow systems.

 4) Masked MTR variable reset with disable/enable query log

Reviewed By:
============
Kristian Nielsen <knielsen@knielsen-hq.org>
  • Loading branch information
bnestere committed Feb 12, 2024
1 parent ee89558 commit 03d1346
Show file tree
Hide file tree
Showing 5 changed files with 190 additions and 149 deletions.
174 changes: 90 additions & 84 deletions mysql-test/suite/rpl/r/rpl_semi_sync_shutdown_await_ack.result
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,15 @@ call mtr.add_suppression("reply failed");
call mtr.add_suppression("Replication event checksum verification");
call mtr.add_suppression("Relay log write failure");
call mtr.add_suppression("Failed to kill the active semi-sync connection");
set @sav_enabled_server_2= @@GLOBAL.rpl_semi_sync_slave_enabled;
set @sav_server_2_dbug= @@GLOBAL.debug_dbug;
connection server_3;
call mtr.add_suppression("reply failed");
call mtr.add_suppression("Replication event checksum verification");
call mtr.add_suppression("Relay log write failure");
call mtr.add_suppression("Failed to kill the active semi-sync connection");
set @sav_enabled_server_3= @@GLOBAL.rpl_semi_sync_slave_enabled;
set @sav_server_3_dbug= @@GLOBAL.debug_dbug;
connection server_1;
CREATE TABLE t1 (a int);
connection server_2;
Expand All @@ -40,15 +44,15 @@ connection server_1;
#-- Enable semi-sync on slaves
let slave_last= 3
connection server_2;
set global rpl_semi_sync_slave_enabled = 1;
include/stop_slave.inc
set global rpl_semi_sync_slave_enabled = 1;
include/start_slave.inc
show status like 'Rpl_semi_sync_slave_status';
Variable_name Value
Rpl_semi_sync_slave_status ON
connection server_3;
set global rpl_semi_sync_slave_enabled = 1;
include/stop_slave.inc
set global rpl_semi_sync_slave_enabled = 1;
include/start_slave.inc
show status like 'Rpl_semi_sync_slave_status';
Variable_name Value
Expand All @@ -67,24 +71,20 @@ show status like 'Rpl_semi_sync_master_clients';
Variable_name Value
Rpl_semi_sync_master_clients 2
#-- Prepare servers to simulate delay or error
connection server_1;
SET @@GLOBAL.debug_dbug= "";
connection server_2;
SET @@GLOBAL.debug_dbug= "+d,simulate_delay_semisync_slave_reply";
connection server_3;
SET @@GLOBAL.debug_dbug= "+d,simulate_delay_semisync_slave_reply";
#--
#-- Test begins
connection server_1_con2;
#-- Give enough time after timeout/ack received to query yes_tx/no_tx
SET @@GLOBAL.debug_dbug= "+d,delay_shutdown_phase_2_after_semisync_wait";
connection server_1;
#-- Begin semi-sync transaction
INSERT INTO t1 VALUES (1);
connection server_1_con2;
#-- Wait until master recognizes a connection is awaiting semi-sync ACK
show status like 'Rpl_semi_sync_master_wait_sessions';
Variable_name Value
Rpl_semi_sync_master_wait_sessions 1
#-- Give enough time after timeout/ack received to query yes_tx/no_tx
SET @@GLOBAL.debug_dbug= "+d,delay_shutdown_phase_2_after_semisync_wait";
#-- Begin master shutdown
SHUTDOWN WAIT FOR ALL SLAVES;
connection server_1;
Expand All @@ -111,22 +111,19 @@ count(*)=1
#-- Re-synchronize slaves with master and disable semi-sync
#-- Stop slaves
connection server_2;
SET @@GLOBAL.debug_dbug= "";
SET @@GLOBAL.rpl_semi_sync_slave_enabled= 0;
include/stop_slave.inc
include/stop_slave_io.inc
include/stop_slave_sql.inc
SET @@GLOBAL.debug_dbug= @sav_server_2_dbug;
SET @@GLOBAL.rpl_semi_sync_slave_enabled= @sav_enabled_server_2;
connection server_3;
SET @@GLOBAL.debug_dbug= "";
SET @@GLOBAL.rpl_semi_sync_slave_enabled= 0;
include/stop_slave.inc
include/stop_slave_io.inc
include/stop_slave_sql.inc
SET @@GLOBAL.debug_dbug= @sav_server_3_dbug;
SET @@GLOBAL.rpl_semi_sync_slave_enabled= @sav_enabled_server_3;
#-- Bring the master back up
connection server_1_con2;
connection default;
connection server_1;
SET @@GLOBAL.debug_dbug= "";
SET @@GLOBAL.rpl_semi_sync_master_enabled = 0;
show status like 'Rpl_semi_sync_master_status';
Variable_name Value
Rpl_semi_sync_master_status OFF
TRUNCATE TABLE t1;
#-- Bring slaves back up
connection server_2;
Expand Down Expand Up @@ -157,15 +154,15 @@ connection server_1;
#-- Enable semi-sync on slaves
let slave_last= 3
connection server_2;
set global rpl_semi_sync_slave_enabled = 1;
include/stop_slave.inc
set global rpl_semi_sync_slave_enabled = 1;
include/start_slave.inc
show status like 'Rpl_semi_sync_slave_status';
Variable_name Value
Rpl_semi_sync_slave_status ON
connection server_3;
set global rpl_semi_sync_slave_enabled = 1;
include/stop_slave.inc
set global rpl_semi_sync_slave_enabled = 1;
include/start_slave.inc
show status like 'Rpl_semi_sync_slave_status';
Variable_name Value
Expand All @@ -184,24 +181,20 @@ show status like 'Rpl_semi_sync_master_clients';
Variable_name Value
Rpl_semi_sync_master_clients 2
#-- Prepare servers to simulate delay or error
connection server_1;
SET @@GLOBAL.debug_dbug= "+d,mysqld_delay_kill_threads_phase_1";
connection server_2;
SET @@GLOBAL.debug_dbug= "+d,corrupt_queue_event";
SET @@GLOBAL.debug_dbug= "+d,corrupt_queue_event,delay_semisync_kill_connection_for_mdev_28141";
connection server_3;
SET @@GLOBAL.debug_dbug= "+d,corrupt_queue_event";
SET @@GLOBAL.debug_dbug= "+d,corrupt_queue_event,delay_semisync_kill_connection_for_mdev_28141";
#--
#-- Test begins
connection server_1_con2;
#-- Give enough time after timeout/ack received to query yes_tx/no_tx
SET @@GLOBAL.debug_dbug= "+d,delay_shutdown_phase_2_after_semisync_wait";
connection server_1;
#-- Begin semi-sync transaction
INSERT INTO t1 VALUES (1);
connection server_1_con2;
#-- Wait until master recognizes a connection is awaiting semi-sync ACK
show status like 'Rpl_semi_sync_master_wait_sessions';
Variable_name Value
Rpl_semi_sync_master_wait_sessions 1
#-- Give enough time after timeout/ack received to query yes_tx/no_tx
SET @@GLOBAL.debug_dbug= "+d,delay_shutdown_phase_2_after_semisync_wait";
#-- Begin master shutdown
SHUTDOWN WAIT FOR ALL SLAVES;
connection server_1;
Expand All @@ -226,24 +219,33 @@ count(*)=0
1
#
#-- Re-synchronize slaves with master and disable semi-sync
#-- FIXME: workaround for MDEV-28141, preventing errored replicas from
# killing their semi-sync connections
connection server_2;
set debug_sync= "now wait_for at_semisync_kill_connection";
set debug_sync= "now signal continue_semisync_kill_connection";
# Wait for debug_sync signal to have been received before issuing RESET
set debug_sync= "reset";
connection server_3;
set debug_sync= "now wait_for at_semisync_kill_connection";
set debug_sync= "now signal continue_semisync_kill_connection";
# Wait for debug_sync signal to have been received before issuing RESET
set debug_sync= "reset";
#-- Stop slaves
connection server_2;
SET @@GLOBAL.debug_dbug= "";
SET @@GLOBAL.rpl_semi_sync_slave_enabled= 0;
include/stop_slave.inc
include/stop_slave_io.inc
include/stop_slave_sql.inc
SET @@GLOBAL.debug_dbug= @sav_server_2_dbug;
SET @@GLOBAL.rpl_semi_sync_slave_enabled= @sav_enabled_server_2;
connection server_3;
SET @@GLOBAL.debug_dbug= "";
SET @@GLOBAL.rpl_semi_sync_slave_enabled= 0;
include/stop_slave.inc
include/stop_slave_io.inc
include/stop_slave_sql.inc
SET @@GLOBAL.debug_dbug= @sav_server_3_dbug;
SET @@GLOBAL.rpl_semi_sync_slave_enabled= @sav_enabled_server_3;
#-- Bring the master back up
connection server_1_con2;
connection default;
connection server_1;
SET @@GLOBAL.debug_dbug= "";
SET @@GLOBAL.rpl_semi_sync_master_enabled = 0;
show status like 'Rpl_semi_sync_master_status';
Variable_name Value
Rpl_semi_sync_master_status OFF
TRUNCATE TABLE t1;
#-- Bring slaves back up
connection server_2;
Expand Down Expand Up @@ -275,15 +277,15 @@ connection server_1;
#-- Enable semi-sync on slaves
let slave_last= 3
connection server_2;
set global rpl_semi_sync_slave_enabled = 1;
include/stop_slave.inc
set global rpl_semi_sync_slave_enabled = 1;
include/start_slave.inc
show status like 'Rpl_semi_sync_slave_status';
Variable_name Value
Rpl_semi_sync_slave_status ON
connection server_3;
set global rpl_semi_sync_slave_enabled = 1;
include/stop_slave.inc
set global rpl_semi_sync_slave_enabled = 1;
include/start_slave.inc
show status like 'Rpl_semi_sync_slave_status';
Variable_name Value
Expand All @@ -302,24 +304,20 @@ show status like 'Rpl_semi_sync_master_clients';
Variable_name Value
Rpl_semi_sync_master_clients 2
#-- Prepare servers to simulate delay or error
connection server_1;
SET @@GLOBAL.debug_dbug= "+d,mysqld_delay_kill_threads_phase_1";
connection server_2;
SET @@GLOBAL.debug_dbug= "+d,corrupt_queue_event";
SET @@GLOBAL.debug_dbug= "+d,corrupt_queue_event,delay_semisync_kill_connection_for_mdev_28141";
connection server_3;
SET @@GLOBAL.debug_dbug= "+d,simulate_delay_semisync_slave_reply";
#--
#-- Test begins
connection server_1_con2;
#-- Give enough time after timeout/ack received to query yes_tx/no_tx
SET @@GLOBAL.debug_dbug= "+d,delay_shutdown_phase_2_after_semisync_wait";
connection server_1;
#-- Begin semi-sync transaction
INSERT INTO t1 VALUES (1);
connection server_1_con2;
#-- Wait until master recognizes a connection is awaiting semi-sync ACK
show status like 'Rpl_semi_sync_master_wait_sessions';
Variable_name Value
Rpl_semi_sync_master_wait_sessions 1
#-- Give enough time after timeout/ack received to query yes_tx/no_tx
SET @@GLOBAL.debug_dbug= "+d,delay_shutdown_phase_2_after_semisync_wait";
#-- Begin master shutdown
SHUTDOWN WAIT FOR ALL SLAVES;
connection server_1;
Expand All @@ -344,24 +342,28 @@ count(*)=1
1
#
#-- Re-synchronize slaves with master and disable semi-sync
#-- FIXME: workaround for MDEV-28141, preventing errored replicas from
# killing their semi-sync connections
connection server_2;
set debug_sync= "now wait_for at_semisync_kill_connection";
set debug_sync= "now signal continue_semisync_kill_connection";
# Wait for debug_sync signal to have been received before issuing RESET
set debug_sync= "reset";
#-- Stop slaves
connection server_2;
SET @@GLOBAL.debug_dbug= "";
SET @@GLOBAL.rpl_semi_sync_slave_enabled= 0;
include/stop_slave.inc
include/stop_slave_io.inc
include/stop_slave_sql.inc
SET @@GLOBAL.debug_dbug= @sav_server_2_dbug;
SET @@GLOBAL.rpl_semi_sync_slave_enabled= @sav_enabled_server_2;
connection server_3;
SET @@GLOBAL.debug_dbug= "";
SET @@GLOBAL.rpl_semi_sync_slave_enabled= 0;
include/stop_slave.inc
include/stop_slave_io.inc
include/stop_slave_sql.inc
SET @@GLOBAL.debug_dbug= @sav_server_3_dbug;
SET @@GLOBAL.rpl_semi_sync_slave_enabled= @sav_enabled_server_3;
#-- Bring the master back up
connection server_1_con2;
connection default;
connection server_1;
SET @@GLOBAL.debug_dbug= "";
SET @@GLOBAL.rpl_semi_sync_master_enabled = 0;
show status like 'Rpl_semi_sync_master_status';
Variable_name Value
Rpl_semi_sync_master_status OFF
TRUNCATE TABLE t1;
#-- Bring slaves back up
connection server_2;
Expand Down Expand Up @@ -399,15 +401,15 @@ connection server_1;
#-- Enable semi-sync on slaves
let slave_last= 3
connection server_2;
set global rpl_semi_sync_slave_enabled = 1;
include/stop_slave.inc
set global rpl_semi_sync_slave_enabled = 1;
include/start_slave.inc
show status like 'Rpl_semi_sync_slave_status';
Variable_name Value
Rpl_semi_sync_slave_status ON
connection server_3;
set global rpl_semi_sync_slave_enabled = 1;
include/stop_slave.inc
set global rpl_semi_sync_slave_enabled = 1;
include/start_slave.inc
show status like 'Rpl_semi_sync_slave_status';
Variable_name Value
Expand All @@ -426,24 +428,20 @@ show status like 'Rpl_semi_sync_master_clients';
Variable_name Value
Rpl_semi_sync_master_clients 2
#-- Prepare servers to simulate delay or error
connection server_1;
SET @@GLOBAL.debug_dbug= "+d,mysqld_delay_kill_threads_phase_1";
connection server_2;
SET @@GLOBAL.debug_dbug= "+d,corrupt_queue_event,slave_delay_killing_semisync_connection";
SET @@GLOBAL.debug_dbug= "+d,corrupt_queue_event,delay_semisync_kill_connection_for_mdev_28141";
connection server_3;
SET @@GLOBAL.debug_dbug= "+d,simulate_delay_semisync_slave_reply";
#--
#-- Test begins
connection server_1_con2;
#-- Give enough time after timeout/ack received to query yes_tx/no_tx
SET @@GLOBAL.debug_dbug= "+d,delay_shutdown_phase_2_after_semisync_wait";
connection server_1;
#-- Begin semi-sync transaction
INSERT INTO t1 VALUES (1);
connection server_1_con2;
#-- Wait until master recognizes a connection is awaiting semi-sync ACK
show status like 'Rpl_semi_sync_master_wait_sessions';
Variable_name Value
Rpl_semi_sync_master_wait_sessions 1
#-- Give enough time after timeout/ack received to query yes_tx/no_tx
SET @@GLOBAL.debug_dbug= "+d,delay_shutdown_phase_2_after_semisync_wait";
#-- Begin master shutdown
SHUTDOWN WAIT FOR ALL SLAVES;
connection server_1;
Expand All @@ -468,24 +466,28 @@ count(*)=1
1
#
#-- Re-synchronize slaves with master and disable semi-sync
#-- FIXME: workaround for MDEV-28141, preventing errored replicas from
# killing their semi-sync connections
connection server_2;
set debug_sync= "now wait_for at_semisync_kill_connection";
set debug_sync= "now signal continue_semisync_kill_connection";
# Wait for debug_sync signal to have been received before issuing RESET
set debug_sync= "reset";
#-- Stop slaves
connection server_2;
SET @@GLOBAL.debug_dbug= "";
SET @@GLOBAL.rpl_semi_sync_slave_enabled= 0;
include/stop_slave.inc
include/stop_slave_io.inc
include/stop_slave_sql.inc
SET @@GLOBAL.debug_dbug= @sav_server_2_dbug;
SET @@GLOBAL.rpl_semi_sync_slave_enabled= @sav_enabled_server_2;
connection server_3;
SET @@GLOBAL.debug_dbug= "";
SET @@GLOBAL.rpl_semi_sync_slave_enabled= 0;
include/stop_slave.inc
include/stop_slave_io.inc
include/stop_slave_sql.inc
SET @@GLOBAL.debug_dbug= @sav_server_3_dbug;
SET @@GLOBAL.rpl_semi_sync_slave_enabled= @sav_enabled_server_3;
#-- Bring the master back up
connection server_1_con2;
connection default;
connection server_1;
SET @@GLOBAL.debug_dbug= "";
SET @@GLOBAL.rpl_semi_sync_master_enabled = 0;
show status like 'Rpl_semi_sync_master_status';
Variable_name Value
Rpl_semi_sync_master_status OFF
TRUNCATE TABLE t1;
#-- Bring slaves back up
connection server_2;
Expand All @@ -509,9 +511,13 @@ COUNT(*)=0
#############################
connection server_2;
include/stop_slave.inc
SET @@GLOBAL.rpl_semi_sync_slave_enabled = @sav_enabled_server_2;
SET @@GLOBAL.debug_dbug= @sav_server_2_dbug;
include/start_slave.inc
connection server_3;
include/stop_slave.inc
SET @@GLOBAL.rpl_semi_sync_slave_enabled = @sav_enabled_server_3;
SET @@GLOBAL.debug_dbug= @sav_server_3_dbug;
include/start_slave.inc
connection server_1;
drop table t1;
Expand Down

0 comments on commit 03d1346

Please sign in to comment.