Skip to content

Commit 1bcdc3e

Browse files
committed
MDEV-27697 slave must recognize incomplete replication event group
In cases of a faulty master or an incorrect binlog event producer, that slave is working with, sends an incomplete group of events slave must react with an error to not to log into the relay-log any new events that do not belong to the incomplete group. Fixed with extending received event properties check when slave connects to master in gtid mode. Specifically for the event that can be a part of a group its relay-logging is permitted only when its position within the group is validated. Otherwise slave IO thread stops with ER_SLAVE_RELAY_LOG_WRITE_FAILURE.
1 parent 907e4c6 commit 1bcdc3e

File tree

6 files changed

+292
-26
lines changed

6 files changed

+292
-26
lines changed

mysql-test/extra/rpl_tests/rpl_parallel.inc

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1872,6 +1872,7 @@ SET GLOBAL slave_parallel_threads=10;
18721872
SET GLOBAL slave_parallel_threads=1;
18731873
SET @old_dbug= @@GLOBAL.debug_dbug;
18741874
SET GLOBAL debug_dbug="+d,slave_discard_xid_for_gtid_0_x_1000";
1875+
CALL mtr.add_suppression("Unexpected break of being relay-logged GTID");
18751876

18761877
--connection server_1
18771878
INSERT INTO t2 VALUES (101);
Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
include/master-slave.inc
2+
[connection master]
3+
connection slave;
4+
call mtr.add_suppression("Unexpected break of being relay-logged GTID 0-27697-1000");
5+
call mtr.add_suppression("Relay log write failure: could not queue event from master");
6+
call mtr.add_suppression("The current group of events starts with a non-GTID");
7+
include/stop_slave.inc
8+
CHANGE MASTER TO MASTER_USE_GTID=slave_pos;
9+
include/start_slave.inc
10+
connection master;
11+
CREATE TABLE t (a INT) ENGINE=innodb;
12+
INSERT INTO t VALUES(1);
13+
### A. Simulate an unnoticeable loss of Xid event
14+
connection slave;
15+
SET @@global.debug_dbug="+d,slave_discard_xid_for_gtid_0_x_1000";
16+
connection master;
17+
SET @@gtid_seq_no=1000;
18+
set @@server_id=27697;
19+
INSERT INTO t VALUES(1000);
20+
set @@server_id=default;
21+
INSERT INTO t VALUES(1001);
22+
## Prove the error occurs.
23+
connection slave;
24+
include/wait_for_slave_io_error.inc [errno=1595]
25+
## Prove the slave recovers after the simulation condtion is lifted.
26+
SET @@global.debug_dbug=default;
27+
include/start_slave.inc
28+
### B. Do the same to GTID event.
29+
connection slave;
30+
SET @@global.debug_dbug="+d,slave_discard_gtid_0_x_1002";
31+
connection master;
32+
SET @@gtid_seq_no=1002;
33+
set @@server_id=27697;
34+
INSERT INTO t VALUES(1002);
35+
set @@server_id=default;
36+
INSERT INTO t VALUES(1003);
37+
## Prove the error occurs.
38+
connection slave;
39+
include/wait_for_slave_io_error.inc [errno=1595]
40+
## Prove the slave recovers after the simulation condtion is lifted.
41+
SET @@global.debug_dbug=default;
42+
include/start_slave.inc
43+
connection master;
44+
connection slave;
45+
include/diff_tables.inc [master:t,slave:t]
46+
"===== Clean up ====="
47+
connection slave;
48+
include/stop_slave.inc
49+
CHANGE MASTER TO MASTER_USE_GTID=no;
50+
include/start_slave.inc
51+
connection master;
52+
DROP TABLE t;
53+
SET GLOBAL LOG_WARNINGS=default;
54+
include/rpl_end.inc

mysql-test/suite/rpl/r/rpl_parallel.result

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1378,6 +1378,7 @@ include/stop_slave.inc
13781378
SET GLOBAL slave_parallel_threads=1;
13791379
SET @old_dbug= @@GLOBAL.debug_dbug;
13801380
SET GLOBAL debug_dbug="+d,slave_discard_xid_for_gtid_0_x_1000";
1381+
CALL mtr.add_suppression("Unexpected break of being relay-logged GTID");
13811382
connection server_1;
13821383
INSERT INTO t2 VALUES (101);
13831384
INSERT INTO t2 VALUES (102);
Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,97 @@
1+
# ==== Purpose ====
2+
#
3+
# Test verifies that replicated transaction boundaries are set properly
4+
# at receiving from master time.
5+
#
6+
# ==== Implementation ====
7+
#
8+
# A. Simulate an unnoticeable loss of Xid event to observe a slave error,
9+
# then restart slave to recover from the failure.
10+
# B. Do the same to GTID event.
11+
#
12+
# ==== References ====
13+
#
14+
# MDEV-27697 slave must recognize incomplete replication event group
15+
#
16+
--source include/have_binlog_format_mixed.inc
17+
--source include/have_innodb.inc
18+
--source include/have_debug.inc
19+
--source include/master-slave.inc
20+
21+
--connection slave
22+
call mtr.add_suppression("Unexpected break of being relay-logged GTID 0-27697-1000");
23+
call mtr.add_suppression("Relay log write failure: could not queue event from master");
24+
call mtr.add_suppression("The current group of events starts with a non-GTID");
25+
26+
--source include/stop_slave.inc
27+
CHANGE MASTER TO MASTER_USE_GTID=slave_pos;
28+
--source include/start_slave.inc
29+
30+
--connection master
31+
CREATE TABLE t (a INT) ENGINE=innodb;
32+
INSERT INTO t VALUES(1);
33+
save_master_pos;
34+
35+
--echo ### A. Simulate an unnoticeable loss of Xid event
36+
--sync_slave_with_master
37+
SET @@global.debug_dbug="+d,slave_discard_xid_for_gtid_0_x_1000";
38+
39+
--connection master
40+
SET @@gtid_seq_no=1000;
41+
set @@server_id=27697;
42+
INSERT INTO t VALUES(1000);
43+
set @@server_id=default;
44+
INSERT INTO t VALUES(1001);
45+
46+
--echo ## Prove the error occurs.
47+
--connection slave
48+
# ER_SLAVE_RELAY_LOG_WRITE_FAILURE
49+
--let $slave_io_errno = 1595
50+
--source include/wait_for_slave_io_error.inc
51+
## EOP
52+
53+
--echo ## Prove the slave recovers after the simulation condtion is lifted.
54+
SET @@global.debug_dbug=default;
55+
--source include/start_slave.inc
56+
57+
--echo ### B. Do the same to GTID event.
58+
--connection slave
59+
SET @@global.debug_dbug="+d,slave_discard_gtid_0_x_1002";
60+
61+
--connection master
62+
SET @@gtid_seq_no=1002;
63+
set @@server_id=27697;
64+
INSERT INTO t VALUES(1002);
65+
set @@server_id=default;
66+
INSERT INTO t VALUES(1003);
67+
68+
--echo ## Prove the error occurs.
69+
--connection slave
70+
# ER_SLAVE_RELAY_LOG_WRITE_FAILURE
71+
--let $slave_io_errno = 1595
72+
--source include/wait_for_slave_io_error.inc
73+
## EOP
74+
75+
--echo ## Prove the slave recovers after the simulation condtion is lifted.
76+
SET @@global.debug_dbug=default;
77+
--source include/start_slave.inc
78+
79+
--connection master
80+
save_master_pos;
81+
82+
--sync_slave_with_master
83+
## EOP
84+
85+
--let $diff_tables=master:t,slave:t
86+
--source include/diff_tables.inc
87+
88+
--echo "===== Clean up ====="
89+
--connection slave
90+
--source include/stop_slave.inc
91+
CHANGE MASTER TO MASTER_USE_GTID=no;
92+
--source include/start_slave.inc
93+
94+
--connection master
95+
DROP TABLE t;
96+
SET GLOBAL LOG_WARNINGS=default;
97+
--source include/rpl_end.inc

sql/rpl_gtid.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
extern const LEX_STRING rpl_gtid_slave_state_table_name;
2727

2828
class String;
29+
#define PARAM_GTID(G) G.domain_id, G.server_id, G.seq_no
2930

3031
struct rpl_gtid
3132
{

sql/slave.cc

Lines changed: 138 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -6196,23 +6196,75 @@ static int queue_event(Master_info* mi,const char* buf, ulong event_len)
61966196
}
61976197
}
61986198

6199-
if (unlikely(mi->gtid_reconnect_event_skip_count))
6200-
{
6201-
goto default_action;
6202-
}
6203-
62046199
/*
6205-
We have successfully queued to relay log everything before this GTID, so
6200+
Unless the previous group is malformed,
6201+
we have successfully queued to relay log everything before this GTID, so
62066202
in case of reconnect we can start from after any previous GTID.
6207-
(Normally we would have updated gtid_current_pos earlier at the end of
6208-
the previous event group, but better leave an extra check here for
6209-
safety).
6203+
(We must have updated gtid_current_pos earlier at the end of
6204+
the previous event group. Unless ...)
62106205
*/
6211-
if (mi->events_queued_since_last_gtid)
6206+
if (unlikely(mi->events_queued_since_last_gtid >
6207+
mi->gtid_reconnect_event_skip_count))
62126208
{
6213-
mi->gtid_current_pos.update(&mi->last_queued_gtid);
6214-
mi->events_queued_since_last_gtid= 0;
6209+
/*
6210+
...unless the last group has not been completed. An assert below
6211+
can be satisfied only with the strict mode that ensures
6212+
against "genuine" gtid duplicates.
6213+
*/
6214+
rpl_gtid *gtid_in_slave_state=
6215+
mi->gtid_current_pos.find(mi->last_queued_gtid.domain_id);
6216+
6217+
// Slave gtid state must not have updated yet to the last received gtid.
6218+
DBUG_ASSERT((mi->using_gtid == Master_info::USE_GTID_NO ||
6219+
!opt_gtid_strict_mode) ||
6220+
(!gtid_in_slave_state ||
6221+
!(*gtid_in_slave_state == mi->last_queued_gtid)));
6222+
6223+
DBUG_EXECUTE_IF("slave_discard_xid_for_gtid_0_x_1000",
6224+
{
6225+
/* Inject an event group that is missing its XID commit event. */
6226+
if (mi->last_queued_gtid.domain_id == 0 &&
6227+
mi->last_queued_gtid.seq_no == 1000)
6228+
{
6229+
sql_print_warning(
6230+
"Unexpected break of being relay-logged GTID %u-%u-%llu "
6231+
"event group by the current GTID event %u-%u-%llu",
6232+
PARAM_GTID(mi->last_queued_gtid),PARAM_GTID(event_gtid));
6233+
DBUG_SET("-d,slave_discard_xid_for_gtid_0_x_1000");
6234+
goto dbug_gtid_accept;
6235+
}
6236+
});
6237+
error= ER_SLAVE_RELAY_LOG_WRITE_FAILURE;
6238+
sql_print_error("Unexpected break of being relay-logged GTID %u-%u-%llu "
6239+
"event group by the current GTID event %u-%u-%llu",
6240+
PARAM_GTID(mi->last_queued_gtid),PARAM_GTID(event_gtid));
6241+
goto err;
6242+
}
6243+
else if (unlikely(mi->gtid_reconnect_event_skip_count > 0))
6244+
{
6245+
if (mi->gtid_reconnect_event_skip_count ==
6246+
mi->events_queued_since_last_gtid)
6247+
{
6248+
DBUG_ASSERT(event_gtid == mi->last_queued_gtid);
6249+
6250+
goto default_action;
6251+
}
6252+
6253+
DBUG_ASSERT(0);
62156254
}
6255+
// else_likely{...
6256+
#ifndef DBUG_OFF
6257+
dbug_gtid_accept:
6258+
DBUG_EXECUTE_IF("slave_discard_gtid_0_x_1002",
6259+
{
6260+
if (mi->last_queued_gtid.server_id == 27697 &&
6261+
mi->last_queued_gtid.seq_no == 1002)
6262+
{
6263+
DBUG_SET("-d,slave_discard_gtid_0_x_1002");
6264+
goto skip_relay_logging;
6265+
}
6266+
});
6267+
#endif
62166268
mi->last_queued_gtid= event_gtid;
62176269
mi->last_queued_gtid_standalone=
62186270
(gtid_flag & Gtid_log_event::FL_STANDALONE) != 0;
@@ -6222,6 +6274,7 @@ static int queue_event(Master_info* mi,const char* buf, ulong event_len)
62226274

62236275
++mi->events_queued_since_last_gtid;
62246276
inc_pos= event_len;
6277+
// ...} eof else_likely
62256278
}
62266279
break;
62276280
/*
@@ -6274,6 +6327,12 @@ static int queue_event(Master_info* mi,const char* buf, ulong event_len)
62746327
case XID_EVENT:
62756328
DBUG_EXECUTE_IF("slave_discard_xid_for_gtid_0_x_1000",
62766329
{
6330+
if (mi->last_queued_gtid.server_id == 27697 &&
6331+
mi->last_queued_gtid.seq_no == 1000)
6332+
{
6333+
DBUG_SET("-d,slave_discard_xid_for_gtid_0_x_1000");
6334+
goto skip_relay_logging;
6335+
}
62776336
/* Inject an event group that is missing its XID commit event. */
62786337
if (mi->last_queued_gtid.domain_id == 0 &&
62796338
mi->last_queued_gtid.seq_no == 1000)
@@ -6319,15 +6378,48 @@ static int queue_event(Master_info* mi,const char* buf, ulong event_len)
63196378
}
63206379
};);
63216380

6322-
if (mi->using_gtid != Master_info::USE_GTID_NO && mi->gtid_event_seen)
6381+
if (mi->using_gtid != Master_info::USE_GTID_NO)
63236382
{
6324-
if (unlikely(mi->gtid_reconnect_event_skip_count))
6383+
if (likely(mi->gtid_event_seen))
63256384
{
6326-
--mi->gtid_reconnect_event_skip_count;
6327-
gtid_skip_enqueue= true;
6385+
if (unlikely(mi->gtid_reconnect_event_skip_count))
6386+
{
6387+
if (!got_gtid_event &&
6388+
mi->gtid_reconnect_event_skip_count ==
6389+
mi->events_queued_since_last_gtid)
6390+
goto gtid_not_start; // the 1st re-sent must be gtid
6391+
6392+
--mi->gtid_reconnect_event_skip_count;
6393+
gtid_skip_enqueue= true;
6394+
}
6395+
else if (likely(mi->events_queued_since_last_gtid))
6396+
{
6397+
DBUG_ASSERT(!got_gtid_event);
6398+
6399+
++mi->events_queued_since_last_gtid;
6400+
}
6401+
else if (Log_event::is_group_event((Log_event_type) (uchar)
6402+
buf[EVENT_TYPE_OFFSET]))
6403+
{
6404+
goto gtid_not_start; // no first gtid event in this group
6405+
}
6406+
}
6407+
else if (Log_event::is_group_event((Log_event_type) (uchar)
6408+
buf[EVENT_TYPE_OFFSET]))
6409+
{
6410+
gtid_not_start:
6411+
6412+
DBUG_ASSERT(!got_gtid_event);
6413+
6414+
error= ER_SLAVE_RELAY_LOG_WRITE_FAILURE;
6415+
sql_print_error("The current group of events starts with "
6416+
"a non-GTID %s event; "
6417+
"the last seen GTID is %u-%u-%llu",
6418+
Log_event::get_type_str((Log_event_type) (uchar)
6419+
buf[EVENT_TYPE_OFFSET]),
6420+
mi->last_queued_gtid);
6421+
goto err;
63286422
}
6329-
else if (mi->events_queued_since_last_gtid)
6330-
++mi->events_queued_since_last_gtid;
63316423
}
63326424

63336425
if (!is_compress_event)
@@ -6500,15 +6592,35 @@ static int queue_event(Master_info* mi,const char* buf, ulong event_len)
65006592
Query_log_event::peek_is_commit_rollback(buf, event_len,
65016593
checksum_alg))))))
65026594
{
6503-
/*
6504-
The whole of the current event group is queued. So in case of
6505-
reconnect we can start from after the current GTID.
6506-
*/
6507-
mi->gtid_current_pos.update(&mi->last_queued_gtid);
6508-
mi->events_queued_since_last_gtid= 0;
6595+
DBUG_ASSERT(mi->events_queued_since_last_gtid > 1);
65096596

6510-
/* Reset the domain_id_filter flag. */
6511-
mi->domain_id_filter.reset_filter();
6597+
if (unlikely(gtid_skip_enqueue))
6598+
{
6599+
error= ER_SLAVE_RELAY_LOG_WRITE_FAILURE;
6600+
sql_print_error("Recieved a group closing %s event "
6601+
"at %llu position in the group while there are "
6602+
"still %llu events to skip upon reconnecting; "
6603+
"the last seen GTID is %u-%u-%llu",
6604+
Log_event::get_type_str((Log_event_type) (uchar)
6605+
buf[EVENT_TYPE_OFFSET]),
6606+
(mi->events_queued_since_last_gtid -
6607+
mi->gtid_reconnect_event_skip_count),
6608+
mi->events_queued_since_last_gtid,
6609+
mi->last_queued_gtid);
6610+
goto err;
6611+
}
6612+
else
6613+
{
6614+
/*
6615+
The whole of the current event group is queued. So in case of
6616+
reconnect we can start from after the current GTID.
6617+
*/
6618+
mi->gtid_current_pos.update(&mi->last_queued_gtid);
6619+
mi->events_queued_since_last_gtid= 0;
6620+
6621+
/* Reset the domain_id_filter flag. */
6622+
mi->domain_id_filter.reset_filter();
6623+
}
65126624
}
65136625

65146626
skip_relay_logging:

0 commit comments

Comments
 (0)