MDEV-37453 Parallel slave worker crashes during Backup at retrying

emoonrain · emoonrain · commit a1bc50e18af6 · 2025-10-17T15:46:53.000+03:00
Before this patch, when a parallel slave worker proceeded from
the wait-for-prior-commit stage into retrying, its backup-lock related
sub-state, specifically `THD::backup_commit_lock`, could be left
un-reset, that is, as a dangling pointer.

That caused a segfault when the retrying worker dereferenced the pointer.

`THD::backup_commit_lock` was left dangling because of an unexpected
THD state: a non-NULL `THD::backup_commit_lock` together with a NULL
`mdl_backup.ticket`.
This combination turns out to be possible when the slave worker is killed
for retry *and*, a few instructions later, it does not succeed
in (re-)acquiring the Backup MDL on exiting from
   `MYSQL_BIN_LOG::queue_for_group_commit()`.
Although the acquisition did not succeed, that fact was not exposed as a
timeout from `MDL_context::acquire_lock`, nor did it have to be:
the worker found itself killed before it started waiting.

The bug is fixed by amending the `backup_commit_lock` reset condition
at the end of `ha_commit_trans()`. The amended reset remains careful to
affect only the stack-allocated lock.

A test is added to confirm the fix by reproducing all the stages
described above. Without the patch, the test causes the segfault.
diff --git a/mysql-test/suite/rpl/r/rpl_parallel_backup_worker_retry.result b/mysql-test/suite/rpl/r/rpl_parallel_backup_worker_retry.result
@@ -0,0 +1,49 @@
+include/master-slave.inc
+[connection master]
+#
+# MDEV-37453 Parallel Replication Crash During Backup
+#
+connection master;
+CREATE TABLE t1 (a INT PRIMARY KEY, b INT) ENGINE = innodb;
+INSERT INTO  t1 VALUES (1, 0);
+INSERT INTO  t1 VALUES (2, 0);
+connection slave;
+include/stop_slave.inc
+ALTER TABLE mysql.gtid_slave_pos ENGINE=InnoDB;
+SET @old_parallel_threads  = @@GLOBAL.slave_parallel_threads;
+SET @old_parallel_mode     = @@GLOBAL.slave_parallel_mode;
+SET @@global.slave_parallel_threads= 2;
+SET @@global.slave_parallel_mode   = 'optimistic';
+connection master;
+begin /* trx1 */;
+delete from t1 where a = 1;
+update t1 set b = 1 where a = 2;
+commit;
+begin /* trx2 */;
+delete from t1 where a = 2;
+commit;
+connect  aux_slave,127.0.0.1,root,,test,$SLAVE_MYPORT,;
+BEGIN;
+DELETE FROM t1 WHERE a = 1;
+connection slave;
+include/start_slave.inc
+connection aux_slave;
+connect  backup_slave,127.0.0.1,root,,test,$SLAVE_MYPORT,;
+BACKUP STAGE START;
+BACKUP STAGE BLOCK_COMMIT;
+connection aux_slave;
+ROLLBACK;
+connection aux_slave;
+connection backup_slave;
+BACKUP STAGE END;
+connection slave;
+include/diff_tables.inc [master:t1,slave:t1]
+connection slave;
+include/stop_slave.inc
+SET @@global.slave_parallel_threads= @old_parallel_threads;
+SET @@global.slave_parallel_mode   = @old_parallel_mode;
+include/start_slave.inc
+connection server_1;
+DROP TABLE t1;
+include/rpl_end.inc
+# End of the tests
diff --git a/mysql-test/suite/rpl/t/rpl_parallel_backup_worker_retry.test b/mysql-test/suite/rpl/t/rpl_parallel_backup_worker_retry.test
@@ -0,0 +1,99 @@
+--source include/have_innodb.inc
+--source include/have_binlog_format_mixed.inc
+--source include/master-slave.inc
+
+--echo #
+--echo # MDEV-37453 Parallel Replication Crash During Backup
+--echo #
+
+# A retry after a parallel conflict must be able to proceed
+# cleanly despite a possible "hard" Backup MDL lock in the way.
+# The retrying transaction will complete successfully.
+# The plot:
+# Two transactions are run by two parallel workers. The 2nd (in binlog order)
+# transaction depends on the 1st.
+# 1. Block the 1st and let the 2nd reach
+#    Waiting-for-Prior-Transaction-to-Commit (WfPTtC).
+# 2. At this point issue BACKUP commands of which BLOCK_COMMIT will
+#    later force the 2nd transaction to wait for the BACKUP MDL lock.
+# 3. Release the locks blocking the 1st transaction, which kicks the 2nd
+#    out of waiting-for-prior-commit only to return negative from the
+#    BACKUP MDL acquisition attempt (found itself killed).
+# 4. Finally, proof of safety: the 2nd transaction retries successfully.
+
+--connection master
+CREATE TABLE t1 (a INT PRIMARY KEY, b INT) ENGINE = innodb;
+INSERT INTO  t1 VALUES (1, 0);
+INSERT INTO  t1 VALUES (2, 0);
+--sync_slave_with_master
+--source include/stop_slave.inc
+ALTER TABLE mysql.gtid_slave_pos ENGINE=InnoDB;
+SET @old_parallel_threads  = @@GLOBAL.slave_parallel_threads;
+SET @old_parallel_mode     = @@GLOBAL.slave_parallel_mode;
+SET @@global.slave_parallel_threads= 2;
+SET @@global.slave_parallel_mode   = 'optimistic';
+
+--connection master
+begin /* trx1 */;
+  delete from t1 where a = 1;
+  update t1 set b = 1 where a = 2;
+commit;
+begin /* trx2 */;
+  delete from t1 where a = 2;
+commit;
+
+--save_master_pos
+
+--connect (aux_slave,127.0.0.1,root,,test,$SLAVE_MYPORT,)
+BEGIN;
+# block the 1st worker and wait until the 2nd is ready to commit
+  DELETE FROM t1 WHERE a = 1;
+
+--connection slave
+--source include/start_slave.inc
+
+--connection aux_slave
+--let $wait_condition= SELECT COUNT(*) = 1 FROM information_schema.processlist WHERE state = "Waiting for prior transaction to commit"
+--source include/wait_condition.inc
+
+# While the 1st worker is locked out run backup
+--connect (backup_slave,127.0.0.1,root,,test,$SLAVE_MYPORT,)
+BACKUP STAGE START;
+BACKUP STAGE BLOCK_COMMIT;
+
+# release the 1st worker
+--connection aux_slave
+let $status_var= Slave_retried_transactions;
+let $status_var_value= query_get_value(SHOW STATUS LIKE '$status_var', Value, 1);
+--sleep 1
+ROLLBACK;
+
+# that will kick the 2nd out of the current WfPTtC into the WfPTtC of its next retry
+--connection aux_slave
+--let $wait_condition= SELECT COUNT(*) = 1 FROM information_schema.processlist WHERE state = "Waiting for prior transaction to commit"
+--source include/wait_condition.inc
+
+let $status_var_comparsion= >;
+--source include/wait_for_status_var.inc
+
+--connection backup_slave
+BACKUP STAGE END;
+
+--connection slave
+--sync_with_master
+
+--let $diff_tables= master:t1,slave:t1
+--source include/diff_tables.inc
+
+# Clean up.
+--connection slave
+--source include/stop_slave.inc
+SET @@global.slave_parallel_threads= @old_parallel_threads;
+SET @@global.slave_parallel_mode   = @old_parallel_mode;
+--source include/start_slave.inc
+
+--connection server_1
+DROP TABLE t1;
+
+--source include/rpl_end.inc
+--echo # End of the tests
diff --git a/sql/handler.cc b/sql/handler.cc
@@ -2072,15 +2072,17 @@ int ha_commit_trans(THD *thd, bool all)
                 thd->rgi_slave->is_parallel_exec);
   }
 end:
-  if (mdl_backup.ticket)
+  // Reset backup_commit_lock only if it points to the stack-local mdl_backup
+  if (thd->backup_commit_lock == &mdl_backup)
   {
     /*
       We do not always immediately release transactional locks
       after ha_commit_trans() (see uses of ha_enable_transaction()),
       thus we release the commit blocker lock as soon as it's
       not needed.
-    */
-    thd->mdl_context.release_lock(mdl_backup.ticket);
+     */
+    if (mdl_backup.ticket)
+      thd->mdl_context.release_lock(mdl_backup.ticket);
     thd->backup_commit_lock= 0;
   }
 #ifdef WITH_WSREP