Skip to content

Commit a1bc50e

Browse files
committed
MDEV-37453 Parallel slave worker crashes during Backup at retrying
In the BASE of this patch when a slave parallel worker proceeds from the wait-for-prior-commit stage into retrying it may have its backup-lock related sub-state, specifically `THD::backup_commit_lock`, not reset, that is the pointer dangling. That caused segfault at the pointer's dereferencing in the worker retrying. The reason THD::backup_commit_lock is left dangling was unexpected state of THD having non-NULL of `THD::backup_commit_lock` and NULL of `mdl_backup.ticket`. This combination turns out possible when the slave worker is killed for retry *and* few instruction later it does not succeed to (re-)acquire the Backup MDL at exiting from `MYSQL_BIN_LOG::queue_for_group_commit()`. While it did not succeed it also did not expose that fact with timing out from `MDL_context::acquire_lock` and it did not have to, as it before to start waiting it found itself killed. The bug is fixed with amendment of `backup_commit_lock` reset condition at the end of `ha_commit_trans()`. The amended reset remains careful to affect only the stack allocated lock. A test is added to confirm the fixes with reproducing all stages described above. In the patch BASE it causes the segfault.
1 parent 1c9caba commit a1bc50e

File tree

3 files changed

+153
-3
lines changed

3 files changed

+153
-3
lines changed
Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
include/master-slave.inc
2+
[connection master]
3+
#
4+
# MDEV-37453 Parallel Replication Crash During Backup
5+
#
6+
connection master;
7+
CREATE TABLE t1 (a INT PRIMARY KEY, b INT) ENGINE = innodb;
8+
INSERT INTO t1 VALUES (1, 0);
9+
INSERT INTO t1 VALUES (2, 0);
10+
connection slave;
11+
include/stop_slave.inc
12+
ALTER TABLE mysql.gtid_slave_pos ENGINE=InnoDB;
13+
SET @old_parallel_threads = @@GLOBAL.slave_parallel_threads;
14+
SET @old_parallel_mode = @@GLOBAL.slave_parallel_mode;
15+
SET @@global.slave_parallel_threads= 2;
16+
SET @@global.slave_parallel_mode = 'optimistic';
17+
connection master;
18+
begin /* trx1 */;
19+
delete from t1 where a = 1;
20+
update t1 set b = 1 where a = 2;
21+
commit;
22+
begin /* trx2 */;
23+
delete from t1 where a = 2;
24+
commit;
25+
connect aux_slave,127.0.0.1,root,,test,$SLAVE_MYPORT,;
26+
BEGIN;
27+
DELETE FROM t1 WHERE a = 1;
28+
connection slave;
29+
include/start_slave.inc
30+
connection aux_slave;
31+
connect backup_slave,127.0.0.1,root,,test,$SLAVE_MYPORT,;
32+
BACKUP STAGE START;
33+
BACKUP STAGE BLOCK_COMMIT;
34+
connection aux_slave;
35+
ROLLBACK;
36+
connection aux_slave;
37+
connection backup_slave;
38+
BACKUP STAGE END;
39+
connection slave;
40+
include/diff_tables.inc [master:t1,slave:t1]
41+
connection slave;
42+
include/stop_slave.inc
43+
SET @@global.slave_parallel_threads= @old_parallel_threads;
44+
SET @@global.slave_parallel_mode = @old_parallel_mode;
45+
include/start_slave.inc
46+
connection server_1;
47+
DROP TABLE t1;
48+
include/rpl_end.inc
49+
# End of the tests
Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,99 @@
1+
--source include/have_innodb.inc
2+
--source include/have_binlog_format_mixed.inc
3+
--source include/master-slave.inc
4+
5+
--echo #
6+
--echo # MDEV-37453 Parallel Replication Crash During Backup
7+
--echo #
8+
9+
# Retrying after parallel conflict transition must be able to do that
10+
# cleanly despite possible "hard" Backup MDL lock in the way.
11+
# The retrying transaction will complete successfully.
12+
# The plot:
13+
# Two transactions are run by two parallel workers. The 2nd (in binlog order)
14+
# transaction depends on the 1st.
15+
# 1. Block the 1st and let the 2nd reach
16+
# Waiting-for-Prior-Transaction-to-Commit (WfPTtC).
17+
# 2. At this point issue BACKUP commands of which BLOCK_COMMIT will
18+
# later force the 2nd transaction to wait for the BACKUP MDL lock.
19+
# 3. Release locks to the 1st transaction which would kick the 2nd out
20+
# of waiting-for-prior-commit only to return negative from the
21+
# BACKUP MDL acquisition attempt (found itself killed).
22+
# 4. Finally, prove of safety: the 2nd transaction retries successfully.
23+
24+
--connection master
25+
CREATE TABLE t1 (a INT PRIMARY KEY, b INT) ENGINE = innodb;
26+
INSERT INTO t1 VALUES (1, 0);
27+
INSERT INTO t1 VALUES (2, 0);
28+
--sync_slave_with_master
29+
--source include/stop_slave.inc
30+
ALTER TABLE mysql.gtid_slave_pos ENGINE=InnoDB;
31+
SET @old_parallel_threads = @@GLOBAL.slave_parallel_threads;
32+
SET @old_parallel_mode = @@GLOBAL.slave_parallel_mode;
33+
SET @@global.slave_parallel_threads= 2;
34+
SET @@global.slave_parallel_mode = 'optimistic';
35+
36+
--connection master
37+
begin /* trx1 */;
38+
delete from t1 where a = 1;
39+
update t1 set b = 1 where a = 2;
40+
commit;
41+
begin /* trx2 */;
42+
delete from t1 where a = 2;
43+
commit;
44+
45+
--save_master_pos
46+
47+
--connect (aux_slave,127.0.0.1,root,,test,$SLAVE_MYPORT,)
48+
BEGIN;
49+
# block the 1st worker and wait for the 2nd ready to commit
50+
DELETE FROM t1 WHERE a = 1;
51+
52+
--connection slave
53+
--source include/start_slave.inc
54+
55+
--connection aux_slave
56+
--let $wait_condition= SELECT COUNT(*) = 1 FROM information_schema.processlist WHERE state = "Waiting for prior transaction to commit"
57+
--source include/wait_condition.inc
58+
59+
# While the 1st worker is locked out run backup
60+
--connect (backup_slave,127.0.0.1,root,,test,$SLAVE_MYPORT,)
61+
BACKUP STAGE START;
62+
BACKUP STAGE BLOCK_COMMIT;
63+
64+
# release the 1st work
65+
--connection aux_slave
66+
let $status_var= Slave_retried_transactions;
67+
let $status_var_value= query_get_value(SHOW STATUS LIKE '$status_var', Value, 1);
68+
--sleep 1
69+
ROLLBACK;
70+
71+
# that will kick the 2nd out of the current WfPTtC into next retry one
72+
--connection aux_slave
73+
--let $wait_condition= SELECT COUNT(*) = 1 FROM information_schema.processlist WHERE state = "Waiting for prior transaction to commit"
74+
--source include/wait_condition.inc
75+
76+
let $status_var_comparsion= >;
77+
--source include/wait_for_status_var.inc
78+
79+
--connection backup_slave
80+
BACKUP STAGE END;
81+
82+
--connection slave
83+
--sync_with_master
84+
85+
--let $diff_tables= master:t1,slave:t1
86+
--source include/diff_tables.inc
87+
88+
# Clean up.
89+
--connection slave
90+
--source include/stop_slave.inc
91+
SET @@global.slave_parallel_threads= @old_parallel_threads;
92+
SET @@global.slave_parallel_mode = @old_parallel_mode;
93+
--source include/start_slave.inc
94+
95+
--connection server_1
96+
DROP TABLE t1;
97+
98+
--source include/rpl_end.inc
99+
--echo # End of the tests

sql/handler.cc

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2072,15 +2072,17 @@ int ha_commit_trans(THD *thd, bool all)
20722072
thd->rgi_slave->is_parallel_exec);
20732073
}
20742074
end:
2075-
if (mdl_backup.ticket)
2075+
// reset the pointer to the ticket when it's stack instantiated
2076+
if (thd->backup_commit_lock == &mdl_backup)
20762077
{
20772078
/*
20782079
We do not always immediately release transactional locks
20792080
after ha_commit_trans() (see uses of ha_enable_transaction()),
20802081
thus we release the commit blocker lock as soon as it's
20812082
not needed.
2082-
*/
2083-
thd->mdl_context.release_lock(mdl_backup.ticket);
2083+
*/
2084+
if (mdl_backup.ticket)
2085+
thd->mdl_context.release_lock(mdl_backup.ticket);
20842086
thd->backup_commit_lock= 0;
20852087
}
20862088
#ifdef WITH_WSREP

0 commit comments

Comments
 (0)