Skip to content
Permalink
Browse files

Fix for MDEV-9670 server_id mysteriously set to 0

Problem was that in a circular replication setup the master remembers
position to events it has generated itself when reading from a slave.
If there are no new events in the queue from the slave, a
Gtid_list_log_event is generated to remember the last skipped event.
The problem happens if there is a network delay and we generate a
Gtid_list_log_event in the middle of the transaction, in which case there
will be an implicit comment and a new transaction with serverid=0 will be
logged.

The fix was to not generate any Gtid_list_log_events in the middle of a
transaction.
  • Loading branch information...
andrelkin authored and montywi committed Jul 2, 2017
1 parent 46d6f74 commit 946a07e8a85c5553b3906b46052b8044cb70a480
@@ -0,0 +1,30 @@
include/rpl_init.inc [topology=1->2->1]
include/rpl_connect.inc [creating M4]
include/rpl_connect.inc [creating M2]
SET @old_debug= @@global.debug;
STOP SLAVE;
SET GLOBAL debug_dbug= "+d,dbug.rows_events_to_delay_relay_logging";
START SLAVE IO_THREAD;
include/wait_for_slave_io_to_start.inc
CREATE TABLE t1 (a INT AUTO_INCREMENT PRIMARY KEY, b VARCHAR(30000)) ENGINE=innodb;
INSERT INTO `t1` VALUES (null, repeat('a', 1024)), (null, repeat('b', 1024));
SET debug_sync='now WAIT_FOR start_sql_thread';
START SLAVE SQL_THREAD;
include/show_binlog_events.inc
Log_name Pos Event_type Server_id End_log_pos Info
slave-bin.000001 # Gtid # # GTID #-#-#
slave-bin.000001 # Query # # use `test`; CREATE TABLE t1 (a INT AUTO_INCREMENT PRIMARY KEY, b VARCHAR(30000)) ENGINE=innodb
SET debug_sync='now SIGNAL go_on_relay_logging';
include/show_binlog_events.inc
Log_name Pos Event_type Server_id End_log_pos Info
slave-bin.000001 # Gtid # # GTID #-#-#
slave-bin.000001 # Query # # use `test`; CREATE TABLE t1 (a INT AUTO_INCREMENT PRIMARY KEY, b VARCHAR(30000)) ENGINE=innodb
slave-bin.000001 # Gtid # # BEGIN GTID #-#-#
slave-bin.000001 # Table_map # # table_id: # (test.t1)
slave-bin.000001 # Write_rows_v1 # # table_id: #
slave-bin.000001 # Write_rows_v1 # # table_id: # flags: STMT_END_F
slave-bin.000001 # Xid # # COMMIT /* XID */
drop table t1;
SET GLOBAL debug_dbug= @old_debug;
SET debug_sync='RESET';
include/rpl_end.inc
@@ -0,0 +1,30 @@
!include ../my.cnf

[mysqld.1]
gtid-domain-id=4
server-id=4
#
log-slave-updates
slave-parallel-threads=0
gtid-strict-mode=1
gtid-ignore-duplicates=1

#
# Max-size row events to minimum with the idea to create
# a number of Rows_log_event per Query.
#
binlog-row-event-max-size=1024

[mysqld.2]
gtid-domain-id=2
server-id=2
#
log-slave-updates
slave-parallel-threads=0
gtid-strict-mode=1
gtid-ignore-duplicates=1
binlog-row-event-max-size=1024
# The slave will be initialized with a @@global.dbug-var value
skip-slave-start=1


@@ -0,0 +1,104 @@
#
# Testing chain/circular replication scenario of MDEV-9670
# The effect of the bug was that we got a commit with a GTID with server_id
#

--source include/have_binlog_format_row.inc
--source include/have_innodb.inc
--source include/have_debug.inc
--source include/have_debug_sync.inc

--let $rpl_topology= 1->2->1
--source include/rpl_init.inc

--let $rpl_connection_name= M4
--let $rpl_server_number= 1
--source include/rpl_connect.inc

--let $rpl_connection_name= M2
--let $rpl_server_number= 2
--source include/rpl_connect.inc

# The parameter reflects binlog-row-event-max-size @cnf.
--let $row_size=1024

SET @old_debug= @@global.debug;

--connection M2
STOP SLAVE;
SET GLOBAL debug_dbug= "+d,dbug.rows_events_to_delay_relay_logging";
START SLAVE IO_THREAD;
--source include/wait_for_slave_io_to_start.inc

--connection M2
# This query also creates a Gtid event group whose Gtid will remain in
# ignored status for too long causing a following group split.

CREATE TABLE t1 (a INT AUTO_INCREMENT PRIMARY KEY, b VARCHAR(30000)) ENGINE=innodb;
--sync_slave_with_master M4

# This INSERT will be logged as two Write_log events which the buggy
# slave applier would split.

--connection M4
eval INSERT INTO `t1` VALUES (null, repeat('a', $row_size)), (null, repeat('b', $row_size));

# START M2 IO thread and wait for its signal to follow with the SQL
# thread start. At this moment the SQL thread shall be having 2 and
# "half" groups to execute. The "hafl" one would be committed by the
# buggy applier after which the IO is released to queue the rest of
# the 3rd group which the SQL thread commits separately to complete
# the split.

--connection M2

# wait for IO signal to start the SQL thread. IO will be hanging upon that.
SET debug_sync='now WAIT_FOR start_sql_thread';

# Now the slave server has relay log whose last group is incomplete.
# An unfixed slave server would go to "insert" a "fake"
# Gtid_list_log_event event which actually would commit the incomplete
# group. However before to actual commit do_apply_event() hits some assert.
# In the fixed server the fake Gtid_list_log_event is *not* inserted
# in the middle of a group.
START SLAVE SQL_THREAD;

# Sleep for a little time to give SQL thread a chance to commit while
# the IO thread is hanging (see
# DBUG_EXECUTE_IF("dbug.rows_events_to_delay_relay_logging"...) in
# queue_event). Alternatively to reproduce the case when buggy slave
# wait for the 1st group commit

#--let $count= 1
#--let $table= t1
#--source include/wait_until_rows_count.inc

--sleep 2

# Demonstrate either no split group in the correct slave or the 1nd
# group in the buggy one
--source include/show_binlog_events.inc

# Release the IO thread
SET debug_sync='now SIGNAL go_on_relay_logging';

# Sync servers
--sync_slave_with_master M4
--connection M4
--sync_slave_with_master M2
--connection M2

# Demonstrate replication goes correctly not to create any split, or
# the 2nd group in the buggy slave
--source include/show_binlog_events.inc

#
# Cleanup
#
--connection M4
drop table t1;

--connection M2
SET GLOBAL debug_dbug= @old_debug;
SET debug_sync='RESET';
--source include/rpl_end.inc
@@ -2363,6 +2363,7 @@ static void write_ignored_events_info_to_relay_log(THD *thd, Master_info *mi)
}
if (rli->ign_gtids.count())
{
DBUG_ASSERT(!rli->is_in_group()); // Ensure no active transaction
glev= new Gtid_list_log_event(&rli->ign_gtids,
Gtid_list_log_event::FLAG_IGN_GTIDS);
rli->ign_gtids.reset();
@@ -5317,7 +5318,9 @@ static int queue_event(Master_info* mi,const char* buf, ulong event_len)
bool gtid_skip_enqueue= false;
bool got_gtid_event= false;
rpl_gtid event_gtid;

#ifndef DBUG_OFF
static uint dbug_rows_event_count= 0;
#endif
/*
FD_q must have been prepared for the first R_a event
inside get_master_version_and_clock()
@@ -5384,6 +5387,26 @@ static int queue_event(Master_info* mi,const char* buf, ulong event_len)
(uchar)buf[EVENT_TYPE_OFFSET] != FORMAT_DESCRIPTION_EVENT /* a way to escape */)
DBUG_RETURN(queue_old_event(mi,buf,event_len));

#ifdef ENABLED_DEBUG_SYNC
/*
A (+d,dbug.rows_events_to_delay_relay_logging)-test is supposed to
create a few Write_log_events and after receiving the 1st of them
the IO thread signals to launch the SQL thread, and sets itself to
wait for a release signal.
*/
DBUG_EXECUTE_IF("dbug.rows_events_to_delay_relay_logging",
if ((buf[EVENT_TYPE_OFFSET] == WRITE_ROWS_EVENT_V1 ||
buf[EVENT_TYPE_OFFSET] == WRITE_ROWS_EVENT) &&
++dbug_rows_event_count == 2)
{
const char act[]=
"now SIGNAL start_sql_thread "
"WAIT_FOR go_on_relay_logging";
DBUG_ASSERT(debug_sync_service);
DBUG_ASSERT(!debug_sync_set_action(current_thd,
STRING_WITH_LEN(act)));
};);
#endif
mysql_mutex_lock(&mi->data_lock);

switch ((uchar)buf[EVENT_TYPE_OFFSET]) {
@@ -6588,9 +6611,12 @@ static Log_event* next_event(rpl_group_info *rgi, ulonglong *event_size)
DBUG_RETURN(ev);
}

if (rli->ign_gtids.count())
if (rli->ign_gtids.count() && !rli->is_in_group())
{
/* We generate and return a Gtid_list, to update gtid_slave_pos. */
/*
We generate and return a Gtid_list, to update gtid_slave_pos,
unless being in the middle of a group.
*/
DBUG_PRINT("info",("seeing ignored end gtids"));
ev= new Gtid_list_log_event(&rli->ign_gtids,
Gtid_list_log_event::FLAG_IGN_GTIDS);

0 comments on commit 946a07e

Please sign in to comment.
You can’t perform that action at this time.