Skip to content

Commit

Permalink
MDEV-6589: Incorrect relay log start position when restarting SQL thread after error in parallel replication
Browse files Browse the repository at this point in the history

The problem occurs in parallel replication in GTID mode, when we are using
multiple replication domains. In this case, if the SQL thread stops, the
slave GTID position may refer to a different point in the relay log for each
domain.

The bug was that when the SQL thread was stopped and restarted (but the IO
thread was kept running), the SQL thread would resume applying the relay log
from the point of the most advanced replication domain, silently skipping all
earlier events within other domains. This caused replication corruption.

This patch solves the problem by storing, when the SQL thread stops with
multiple parallel replication domains active, the current GTID
position. Additionally, the current position in the relay logs is moved back
to a point known to be earlier than the current position of any replication
domain. Then when the SQL thread restarts from the earlier position, GTIDs
encountered are compared against the stored GTID position. Any GTID that was
already applied before the stop is skipped to avoid duplicate apply.

This patch should have no effect if multi-domain GTID parallel replication is
not used. Similarly, if both the SQL and IO threads are stopped and restarted,
the patch has no effect, as in this case the existing relay logs are removed
and re-fetched from the master at the current global @@gtid_slave_pos.
  • Loading branch information
knielsen committed Mar 4, 2015
1 parent fb71449 commit ad0d203
Show file tree
Hide file tree
Showing 10 changed files with 541 additions and 16 deletions.
147 changes: 147 additions & 0 deletions mysql-test/suite/rpl/r/rpl_parallel_mdev6589.result
Original file line number Diff line number Diff line change
@@ -0,0 +1,147 @@
include/master-slave.inc
[connection master]
SET @old_parallel_threads=@@GLOBAL.slave_parallel_threads;
include/stop_slave.inc
SET GLOBAL slave_parallel_threads=10;
CHANGE MASTER TO master_use_gtid=current_pos;
include/start_slave.inc
*** MDEV-6589: Incorrect relay log start position when restarting SQL thread after error in parallel replication ***
ALTER TABLE mysql.gtid_slave_pos ENGINE=InnoDB;
CREATE TABLE t1 (a int PRIMARY KEY) ENGINE=MyISAM;
CREATE TABLE t2 (a int PRIMARY KEY) ENGINE=InnoDB;
INSERT INTO t1 VALUES (1);
INSERT INTO t2 VALUES (1);
SELECT * FROM t1;
a
1
SELECT * FROM t2;
a
1
SET sql_log_bin=0;
BEGIN;
INSERT INTO t2 VALUES (5);
SET gtid_domain_id=0;
INSERT INTO t1 VALUES (2);
INSERT INTO t2 VALUES (3);
FLUSH LOGS;
INSERT INTO t1 VALUES (4);
SET gtid_domain_id=1;
INSERT INTO t2 VALUES (5);
SET gtid_domain_id=0;
INSERT INTO t1 VALUES (6);
INSERT INTO t1 VALUES (7);
SET gtid_domain_id=2;
INSERT INTO t2 VALUES (8);
INSERT INTO t1 VALUES (9);
FLUSH LOGS;
SET gtid_domain_id=3;
INSERT INTO t2 VALUES (10);
INSERT INTO t1 VALUES (11);
SET gtid_domain_id=1;
INSERT INTO t1 VALUES (12);
INSERT INTO t2 VALUES (13);
SET gtid_domain_id=0;
INSERT INTO t2 VALUES (14);
FLUSH LOGS;
SET gtid_domain_id=3;
INSERT INTO t2 VALUES (15);
SET gtid_domain_id=2;
INSERT INTO t2 VALUES (16);
SET gtid_domain_id=0;
INSERT INTO t1 VALUES (17);
SET @gtid0 = @@last_gtid;
SET gtid_domain_id=2;
INSERT INTO t1 VALUES (18);
SET @gtid2 = @@last_gtid;
SET gtid_domain_id=3;
INSERT INTO t1 VALUES (19);
SET @gtid3 = @@last_gtid;
SELECT * FROM t1 ORDER BY a;
a
1
2
4
6
7
9
11
12
17
18
19
SELECT * FROM t2 ORDER BY a;
a
1
3
5
8
10
13
14
15
16
include/save_master_gtid.inc
SELECT MASTER_GTID_WAIT('WAIT_POS');
MASTER_GTID_WAIT('WAIT_POS')
0
COMMIT;
SET sql_log_bin=1;
include/wait_for_slave_sql_error.inc [errno=1062]
SELECT * FROM t1 ORDER BY a;
a
1
2
4
6
7
9
11
17
18
19
SELECT * FROM t2 ORDER BY a;
a
1
3
5
8
10
14
15
16
SET sql_log_bin=0;
DELETE FROM t2 WHERE a=5;
SET sql_log_bin=1;
include/start_slave.inc
include/sync_with_master_gtid.inc
SELECT * FROM t1 ORDER BY a;
a
1
2
4
6
7
9
11
12
17
18
19
SELECT * FROM t2 ORDER BY a;
a
1
3
5
8
10
13
14
15
16
include/stop_slave.inc
SET GLOBAL slave_parallel_threads=@old_parallel_threads;
include/start_slave.inc
SET DEBUG_SYNC= 'RESET';
DROP TABLE t1,t2;
SET DEBUG_SYNC= 'RESET';
include/rpl_end.inc
132 changes: 132 additions & 0 deletions mysql-test/suite/rpl/t/rpl_parallel_mdev6589.test
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
--source include/have_innodb.inc
--source include/have_debug.inc
--source include/have_debug_sync.inc
--source include/master-slave.inc

# Configure the slave for the test: remember the current number of parallel
# worker threads so the clean-up section can restore it, then (with the slave
# stopped) switch to 10 parallel threads and GTID mode (current_pos) before
# starting the slave again.
--connection server_2
SET @old_parallel_threads=@@GLOBAL.slave_parallel_threads;
--source include/stop_slave.inc
SET GLOBAL slave_parallel_threads=10;
CHANGE MASTER TO master_use_gtid=current_pos;
--source include/start_slave.inc


--echo *** MDEV-6589: Incorrect relay log start position when restarting SQL thread after error in parallel replication ***

--connection server_1
ALTER TABLE mysql.gtid_slave_pos ENGINE=InnoDB;
CREATE TABLE t1 (a int PRIMARY KEY) ENGINE=MyISAM;
CREATE TABLE t2 (a int PRIMARY KEY) ENGINE=InnoDB;
INSERT INTO t1 VALUES (1);
INSERT INTO t2 VALUES (1);
--save_master_pos

--connection server_2
--sync_with_master
SELECT * FROM t1;
SELECT * FROM t2;

# Block one domain, which we will later cause to give an error. And let some
# other domains proceed so we can check that after restart, the slave is able
# to correctly restart each domain in a separate position.

--connect (con_temp1,127.0.0.1,root,,test,$SERVER_MYPORT_2,)
SET sql_log_bin=0;
BEGIN;
INSERT INTO t2 VALUES (5);

--connection server_1
SET gtid_domain_id=0;
INSERT INTO t1 VALUES (2);
INSERT INTO t2 VALUES (3);
FLUSH LOGS;
INSERT INTO t1 VALUES (4);

SET gtid_domain_id=1;
# This query will be blocked on the slave, and later give a duplicate key error.
INSERT INTO t2 VALUES (5);

SET gtid_domain_id=0;
INSERT INTO t1 VALUES (6);
INSERT INTO t1 VALUES (7);

SET gtid_domain_id=2;
INSERT INTO t2 VALUES (8);
INSERT INTO t1 VALUES (9);
FLUSH LOGS;

SET gtid_domain_id=3;
INSERT INTO t2 VALUES (10);
INSERT INTO t1 VALUES (11);

# These cannot be replicated before the error, as a prior commit is blocked.
SET gtid_domain_id=1;
INSERT INTO t1 VALUES (12);
INSERT INTO t2 VALUES (13);

SET gtid_domain_id=0;
INSERT INTO t2 VALUES (14);
FLUSH LOGS;

SET gtid_domain_id=3;
INSERT INTO t2 VALUES (15);

SET gtid_domain_id=2;
INSERT INTO t2 VALUES (16);

SET gtid_domain_id=0;
INSERT INTO t1 VALUES (17);
SET @gtid0 = @@last_gtid;
SET gtid_domain_id=2;
INSERT INTO t1 VALUES (18);
SET @gtid2 = @@last_gtid;
SET gtid_domain_id=3;
INSERT INTO t1 VALUES (19);
SET @gtid3 = @@last_gtid;
--let $wait_pos= `SELECT CONCAT(@gtid0, ",", @gtid2, ",", @gtid3)`

SELECT * FROM t1 ORDER BY a;
SELECT * FROM t2 ORDER BY a;
--source include/save_master_gtid.inc


--connection server_2
# First wait for domains 0, 2, and 3 to complete.
--replace_result $wait_pos WAIT_POS
eval SELECT MASTER_GTID_WAIT('$wait_pos');

# Then release the row lock, and wait for the domain 1 to fail with
# duplicate key error.
--connection con_temp1
COMMIT;
SET sql_log_bin=1;

--connection server_2
--let $slave_sql_errno= 1062
--source include/wait_for_slave_sql_error.inc

# Table contents on the slave after the SQL thread stopped with the duplicate
# key error. The later domain 1 events (rows 12 and 13) must not be applied
# yet, while the events of domains 0, 2 and 3 are all present.
SELECT * FROM t1 ORDER BY a;
SELECT * FROM t2 ORDER BY a;

# Remove the conflicting row that was inserted locally on the slave (with
# binlogging disabled for the statement), then restart the slave and wait
# until it has reached the GTID position saved on the master.
SET sql_log_bin=0;
DELETE FROM t2 WHERE a=5;
SET sql_log_bin=1;
--source include/start_slave.inc
--source include/sync_with_master_gtid.inc

# After the restart every event must have been applied exactly once,
# including the domain 1 events that were pending when the error occurred.
SELECT * FROM t1 ORDER BY a;
SELECT * FROM t2 ORDER BY a;


# Clean up.
--connection server_2
--source include/stop_slave.inc
SET GLOBAL slave_parallel_threads=@old_parallel_threads;
--source include/start_slave.inc
SET DEBUG_SYNC= 'RESET';

--connection server_1
DROP TABLE t1,t2;
SET DEBUG_SYNC= 'RESET';

--source include/rpl_end.inc
3 changes: 1 addition & 2 deletions sql/log.cc
Original file line number Diff line number Diff line change
Expand Up @@ -4134,8 +4134,7 @@ int MYSQL_BIN_LOG::purge_first_log(Relay_log_info* rli, bool included)
included= 1;
to_purge_if_included= my_strdup(ir->name, MYF(0));
}
my_atomic_rwlock_destroy(&ir->inuse_relaylog_atomic_lock);
my_free(ir);
rli->free_inuse_relaylog(ir);
ir= next;
}
rli->inuse_relaylog_list= ir;
Expand Down
21 changes: 21 additions & 0 deletions sql/rpl_gtid.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1089,6 +1089,27 @@ rpl_binlog_state::load(struct rpl_gtid *list, uint32 count)
}


/*
  Iteration callback used by rpl_binlog_state::load(rpl_slave_state *).

  `data` is the rpl_binlog_state object being loaded; `gtid` is the entry
  handed over by the iterator. The GTID is recorded in the binlog state
  without strict checking, and without taking the binlog state lock (the
  caller is expected to hold it already).

  Returns whatever update_nolock() returns.
*/
static int rpl_binlog_state_load_cb(rpl_gtid *gtid, void *data)
{
  return static_cast<rpl_binlog_state *>(data)->update_nolock(gtid, false);
}


/*
  Replace the contents of this binlog state with the GTIDs found in a slave
  state.

  The existing state is discarded first, then every GTID that
  slave_pos->iterate() reports is added through rpl_binlog_state_load_cb().
  Both steps happen while LOCK_binlog_state is held.

  Returns false on success, true if the iteration reported a failure (in
  which case the state may be only partially loaded).
*/
bool
rpl_binlog_state::load(rpl_slave_state *slave_pos)
{
  mysql_mutex_lock(&LOCK_binlog_state);
  reset_nolock();
  /* No extra GTIDs are passed to the iterator (NULL list, zero count). */
  const bool failed=
    slave_pos->iterate(rpl_binlog_state_load_cb, this, NULL, 0) != 0;
  mysql_mutex_unlock(&LOCK_binlog_state);

  return failed;
}


rpl_binlog_state::~rpl_binlog_state()
{
free();
Expand Down
1 change: 1 addition & 0 deletions sql/rpl_gtid.h
Original file line number Diff line number Diff line change
Expand Up @@ -235,6 +235,7 @@ struct rpl_binlog_state
void reset();
void free();
bool load(struct rpl_gtid *list, uint32 count);
bool load(rpl_slave_state *slave_pos);
int update_nolock(const struct rpl_gtid *gtid, bool strict);
int update(const struct rpl_gtid *gtid, bool strict);
int update_with_next_gtid(uint32 domain_id, uint32 server_id,
Expand Down
Loading

0 comments on commit ad0d203

Please sign in to comment.