Skip to content

Commit 4111a53

Browse files
sjaakolaJan Lindström
authored andcommitted
MDEV-21096 async slave crash with gtid_log_pos table access (#1413)
The original crash happened when the async replication IO thread was updating the mysql.gtid_slave_pos table. Operations on this table should remain node-local, but it appears that the protection (the THD::wsrep_ignore_table flag) that prevents wsrep replication for this table was missing for InnoDB write_row() and update_row(). It was somewhat difficult to reproduce the issue, because mtr seems to create the affected table mysql.gtid_slave_pos with the Aria engine, and Aria engine operations are not replicated anyhow. It looks, though, as if in a release installation the mysql.gtid_slave_pos table uses the InnoDB engine. It was possible to trigger a somewhat related problem by running the test galera.galera_as_slave_gtid with the configuration gtid_pos_auto_engines=InnoDB. However, this test mode causes an earlier crash when the replication background thread creates an additional table, mysql.gtid_slave_pos_InnoDB; this table creation triggered wsrep TOI replication, which also failed with an assertion. In fact, the async replication IO and background threads should not replicate anything to the cluster. This pull request contains a new test, galera.galera_as_slave_gtid_auto_engine, which basically just runs galera.galera_as_slave_gtid with the configuration gtid_pos_auto_engines=InnoDB. The test galera.galera_as_slave_gtid is also modified for better code reuse. The actual fix for MDEV-21096 is in storage/innobase/handler/ha_innodb.cc, where the THD::wsrep_ignore_table flag is now honored before wsrep key population. There is an additional fix in sql/service_wsrep.cc, where the async replication IO and background threads are marked as non-local. This fences these threads out of wsrep replication altogether. Note that this change actually makes the use of THD::wsrep_ignore_table redundant. We may want to refactor THD::wsrep_ignore_table out in the future if there is no other use case for it in sight.
1 parent f952882 commit 4111a53

File tree

7 files changed

+163
-79
lines changed

7 files changed

+163
-79
lines changed
Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
connection node_2;
2+
connection node_1;
3+
connect node_3, 127.0.0.1, root, , test, $NODE_MYPORT_3;
4+
connection node_2;
5+
START SLAVE;
6+
connection node_3;
7+
CREATE TABLE t1 (f1 INTEGER PRIMARY KEY) ENGINE=InnoDB;
8+
INSERT INTO t1 VALUES(1);
9+
SELECT LENGTH(@@global.gtid_binlog_state) > 1;
10+
LENGTH(@@global.gtid_binlog_state) > 1
11+
1
12+
connection node_2;
13+
gtid_binlog_state_equal
14+
1
15+
connection node_1;
16+
SELECT COUNT(*) = 1 FROM t1;
17+
COUNT(*) = 1
18+
1
19+
gtid_binlog_state_equal
20+
1
21+
connection node_3;
22+
DROP TABLE t1;
23+
connection node_1;
24+
connection node_2;
25+
STOP SLAVE;
26+
RESET SLAVE ALL;
27+
#cleanup
28+
connection node_1;
29+
set global wsrep_on=OFF;
30+
reset master;
31+
set global wsrep_on=ON;
32+
connection node_2;
33+
set global wsrep_on=OFF;
34+
reset master;
35+
set global wsrep_on=ON;
36+
connection node_3;
37+
reset master;
38+
connection node_2;
39+
DROP TABLE mysql.gtid_slave_pos_InnoDB;
40+
CALL mtr.add_suppression("The automatically created table");
Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,86 @@
1+
#
2+
# Test Galera as a slave to a MariaDB master using GTIDs
3+
#
4+
# suite/galera/galera_2nodes_as_slave.cnf describes the setup of the nodes
5+
# suite/galera/t/galera_as_slave_gtid.cnf has the GTID options
6+
#
7+
# In addition to performing DDL and DML, we check that the gtid of the master is preserved inside the cluster
8+
#
9+
10+
--source include/have_innodb.inc
11+
--source include/galera_cluster.inc
12+
13+
# As node #3 is not a Galera node, and galera_cluster.inc does not open a connection to it,
14+
# we open the node_3 connection here
15+
--connect node_3, 127.0.0.1, root, , test, $NODE_MYPORT_3
16+
17+
--connection node_2
18+
--disable_query_log
19+
--eval CHANGE MASTER TO MASTER_HOST='127.0.0.1', MASTER_USER='root', MASTER_PORT=$NODE_MYPORT_3;
20+
--enable_query_log
21+
START SLAVE;
22+
23+
--connection node_3
24+
CREATE TABLE t1 (f1 INTEGER PRIMARY KEY) ENGINE=InnoDB;
25+
INSERT INTO t1 VALUES(1);
26+
27+
SELECT LENGTH(@@global.gtid_binlog_state) > 1;
28+
--let $gtid_binlog_state_node1 = `SELECT @@global.gtid_binlog_state;`
29+
30+
--connection node_2
31+
--let $wait_condition = SELECT COUNT(*) = 1 FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_NAME = 't1';
32+
--source include/wait_condition.inc
33+
34+
--let $wait_condition = SELECT COUNT(*) = 1 FROM t1;
35+
--source include/wait_condition.inc
36+
37+
--disable_query_log
38+
39+
--eval SELECT '$gtid_binlog_state_node1' = @@global.gtid_binlog_state AS gtid_binlog_state_equal;
40+
#--eval SELECT GTID_SUBSET('$gtid_executed_node1', @@global.gtid_executed) AS gtid_executed_equal;
41+
42+
--enable_query_log
43+
44+
--connection node_1
45+
SELECT COUNT(*) = 1 FROM t1;
46+
47+
--disable_query_log
48+
--eval SELECT '$gtid_binlog_state_node1' = @@global.gtid_binlog_state AS gtid_binlog_state_equal;
49+
#--eval SELECT GTID_SUBSET('$gtid_executed_node1', @@global.gtid_executed) AS gtid_executed_equal;
50+
--enable_query_log
51+
52+
--connection node_3
53+
DROP TABLE t1;
54+
55+
#
56+
# Unfortunately without the sleep below the following statement fails with "query returned no rows", which
57+
# is difficult to understand given that it is an aggregate query. A "query execution was interrupted"
58+
# warning is also reported by MTR, which is also weird.
59+
#
60+
61+
--sleep 1
62+
63+
--connection node_1
64+
--let $wait_condition = SELECT COUNT(*) = 0 FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_NAME = 't1';
65+
--source include/wait_condition.inc
66+
67+
--connection node_2
68+
--let $wait_condition = SELECT COUNT(*) = 0 FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_NAME = 't1';
69+
--source include/wait_condition.inc
70+
71+
STOP SLAVE;
72+
RESET SLAVE ALL;
73+
74+
--echo #cleanup
75+
--connection node_1
76+
set global wsrep_on=OFF;
77+
reset master;
78+
set global wsrep_on=ON;
79+
80+
--connection node_2
81+
set global wsrep_on=OFF;
82+
reset master;
83+
set global wsrep_on=ON;
84+
85+
--connection node_3
86+
reset master;

mysql-test/suite/galera/t/galera_as_slave_gtid.test

Lines changed: 1 addition & 77 deletions
Original file line numberDiff line numberDiff line change
@@ -7,80 +7,4 @@
77
# In addition to performing DDL and DML, we check that the gtid of the master is preserved inside the cluster
88
#
99

10-
--source include/have_innodb.inc
11-
--source include/galera_cluster.inc
12-
13-
# As node #3 is not a Galera node, and galera_cluster.inc does not open connetion to it
14-
# we open the node_3 connection here
15-
--connect node_3, 127.0.0.1, root, , test, $NODE_MYPORT_3
16-
17-
--connection node_2
18-
--disable_query_log
19-
--eval CHANGE MASTER TO MASTER_HOST='127.0.0.1', MASTER_USER='root', MASTER_PORT=$NODE_MYPORT_3;
20-
--enable_query_log
21-
START SLAVE;
22-
23-
--connection node_3
24-
CREATE TABLE t1 (f1 INTEGER PRIMARY KEY) ENGINE=InnoDB;
25-
INSERT INTO t1 VALUES(1);
26-
27-
SELECT LENGTH(@@global.gtid_binlog_state) > 1;
28-
--let $gtid_binlog_state_node1 = `SELECT @@global.gtid_binlog_state;`
29-
30-
--connection node_2
31-
--let $wait_condition = SELECT COUNT(*) = 1 FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_NAME = 't1';
32-
--source include/wait_condition.inc
33-
34-
--let $wait_condition = SELECT COUNT(*) = 1 FROM t1;
35-
--source include/wait_condition.inc
36-
37-
--disable_query_log
38-
39-
--eval SELECT '$gtid_binlog_state_node1' = @@global.gtid_binlog_state AS gtid_binlog_state_equal;
40-
#--eval SELECT GTID_SUBSET('$gtid_executed_node1', @@global.gtid_executed) AS gtid_executed_equal;
41-
42-
--enable_query_log
43-
44-
--connection node_1
45-
SELECT COUNT(*) = 1 FROM t1;
46-
47-
--disable_query_log
48-
--eval SELECT '$gtid_binlog_state_node1' = @@global.gtid_binlog_state AS gtid_binlog_state_equal;
49-
#--eval SELECT GTID_SUBSET('$gtid_executed_node1', @@global.gtid_executed) AS gtid_executed_equal;
50-
--enable_query_log
51-
52-
--connection node_3
53-
DROP TABLE t1;
54-
55-
#
56-
# Unfortunately without the sleep below the following statement fails with "query returned no rows", which
57-
# is difficult to understand given that it is an aggregate query. A "query execution was interrupted"
58-
# warning is also reported by MTR, which is also weird.
59-
#
60-
61-
--sleep 1
62-
63-
--connection node_1
64-
--let $wait_condition = SELECT COUNT(*) = 0 FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_NAME = 't1';
65-
--source include/wait_condition.inc
66-
67-
--connection node_2
68-
--let $wait_condition = SELECT COUNT(*) = 0 FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_NAME = 't1';
69-
--source include/wait_condition.inc
70-
71-
STOP SLAVE;
72-
RESET SLAVE ALL;
73-
74-
--echo #cleanup
75-
--connection node_1
76-
set global wsrep_on=OFF;
77-
reset master;
78-
set global wsrep_on=ON;
79-
80-
--connection node_2
81-
set global wsrep_on=OFF;
82-
reset master;
83-
set global wsrep_on=ON;
84-
85-
--connection node_3
86-
reset master;
10+
--source galera_as_slave_gtid.inc
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
!include ../galera_2nodes_as_slave.cnf
2+
3+
[mysqld]
4+
log-bin=mysqld-bin
5+
log-slave-updates
6+
binlog-format=ROW
7+
8+
gtid_pos_auto_engines=InnoDB
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
#
2+
# Test Galera as a slave to a MariaDB master using GTIDs
3+
#
4+
# suite/galera/galera_2nodes_as_slave.cnf describes the setup of the nodes
5+
# suite/galera/t/galera_as_slave_gtid.cnf has the GTID options
6+
#
7+
# In addition to performing DDL and DML, we check that the gtid of the master is preserved inside the cluster
8+
#
9+
10+
--source galera_as_slave_gtid.inc
11+
12+
--connection node_2
13+
DROP TABLE mysql.gtid_slave_pos_InnoDB;
14+
CALL mtr.add_suppression("The automatically created table");

sql/service_wsrep.cc

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -112,7 +112,17 @@ extern "C" my_bool wsrep_get_debug()
112112

113113
extern "C" my_bool wsrep_thd_is_local(const THD *thd)
114114
{
115-
return thd->wsrep_cs().mode() == wsrep::client_state::m_local;
115+
/*
116+
the async replication IO and background threads have nothing to replicate in the cluster;
117+
mark them as non-local here to prevent write set population and replication
118+
119+
the async replication SQL thread applies client transactions from the MariaDB master,
120+
and those transactions will be replicated into the cluster
121+
*/
122+
return (
123+
thd->system_thread != SYSTEM_THREAD_SLAVE_BACKGROUND &&
124+
thd->system_thread != SYSTEM_THREAD_SLAVE_IO &&
125+
thd->wsrep_cs().mode() == wsrep::client_state::m_local);
116126
}
117127

118128
extern "C" my_bool wsrep_thd_is_applying(const THD *thd)

storage/innobase/handler/ha_innodb.cc

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8199,6 +8199,7 @@ ha_innobase::write_row(
81998199
if (!error_result
82008200
&& wsrep_on(m_user_thd)
82018201
&& wsrep_thd_is_local(m_user_thd)
8202+
&& !wsrep_thd_ignore_table(m_user_thd)
82028203
&& !wsrep_consistency_check(m_user_thd)
82038204
&& (thd_sql_command(m_user_thd) != SQLCOM_CREATE_TABLE)
82048205
&& (thd_sql_command(m_user_thd) != SQLCOM_LOAD ||
@@ -8909,7 +8910,8 @@ ha_innobase::update_row(
89098910
#ifdef WITH_WSREP
89108911
if (error == DB_SUCCESS
89118912
&& wsrep_on(m_user_thd)
8912-
&& wsrep_thd_is_local(m_user_thd)) {
8913+
&& wsrep_thd_is_local(m_user_thd)
8914+
&& !wsrep_thd_ignore_table(m_user_thd)) {
89138915

89148916
DBUG_PRINT("wsrep", ("update row key"));
89158917

0 commit comments

Comments
 (0)