Skip to content

Commit 16ea692

Browse files
committed
MDEV-23586 Mariabackup: GTID saved for replication in 10.4.14 is wrong
MDEV-21953 deadlock between BACKUP STAGE BLOCK_COMMIT and parallel replication Fixed by partly reverting MDEV-21953 to put back MDL_BACKUP_COMMIT locking before log_and_order. The original problem for MDEV-21953 was that while a thread was waiting in for another threads to commit in 'log_and_order', it had the MDL_BACKUP_COMMIT lock. The backup thread was waiting to get the MDL_BACKUP_WAIT_COMMIT lock, which blocks all new MDL_BACKUP_COMMIT locks. This causes a deadlock as the waited-for thread can never get past the MDL_BACKUP_COMMIT lock in ha_commit_trans. The main part of the bug fix is to release the MDL_BACKUP_COMMIT lock while a thread is waiting for other 'previous' threads to commit. This ensures that no transactional thread keeps MDL_BACKUP_COMMIT while waiting, which ensures that there are no deadlocks anymore.
1 parent 3cdbaa0 commit 16ea692

File tree

6 files changed

+134
-59
lines changed

6 files changed

+134
-59
lines changed

sql/handler.cc

Lines changed: 52 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -114,7 +114,7 @@ static TYPELIB known_extensions= {0,"known_exts", NULL, NULL};
114114
uint known_extensions_id= 0;
115115

116116
static int commit_one_phase_2(THD *thd, bool all, THD_TRANS *trans,
117-
bool is_real_trans, bool rw_trans);
117+
bool is_real_trans);
118118

119119

120120
static plugin_ref ha_default_plugin(THD *thd)
@@ -1490,9 +1490,40 @@ int ha_commit_trans(THD *thd, bool all)
14901490
/* rw_trans is TRUE when we in a transaction changing data */
14911491
bool rw_trans= is_real_trans &&
14921492
(rw_ha_count > (thd->is_current_stmt_binlog_disabled()?0U:1U));
1493+
MDL_request mdl_backup;
14931494
DBUG_PRINT("info", ("is_real_trans: %d rw_trans: %d rw_ha_count: %d",
14941495
is_real_trans, rw_trans, rw_ha_count));
14951496

1497+
if (rw_trans)
1498+
{
1499+
/*
1500+
Acquire a metadata lock which will ensure that COMMIT is blocked
1501+
by an active FLUSH TABLES WITH READ LOCK (and vice versa:
1502+
COMMIT in progress blocks FTWRL).
1503+
1504+
We allow the owner of FTWRL to COMMIT; we assume that it knows
1505+
what it does.
1506+
*/
1507+
mdl_backup.init(MDL_key::BACKUP, "", "", MDL_BACKUP_COMMIT, MDL_EXPLICIT);
1508+
1509+
if (!WSREP(thd))
1510+
{
1511+
if (thd->mdl_context.acquire_lock(&mdl_backup,
1512+
thd->variables.lock_wait_timeout))
1513+
{
1514+
ha_rollback_trans(thd, all);
1515+
DBUG_RETURN(1);
1516+
}
1517+
thd->backup_commit_lock= &mdl_backup;
1518+
}
1519+
DEBUG_SYNC(thd, "ha_commit_trans_after_acquire_commit_lock");
1520+
1521+
/* Use shortcut as we already have the MDL_BACKUP_COMMIT lock */
1522+
ha_maria::implicit_commit(thd, TRUE);
1523+
}
1524+
else
1525+
ha_maria_implicit_commit(thd, TRUE);
1526+
14961527
if (rw_trans &&
14971528
opt_readonly &&
14981529
!(thd->security_ctx->master_access & SUPER_ACL) &&
@@ -1532,7 +1563,7 @@ int ha_commit_trans(THD *thd, bool all)
15321563
// Here, the call will not commit inside InnoDB. It is only working
15331564
// around closing thd->transaction.stmt open by TR_table::open().
15341565
if (all)
1535-
commit_one_phase_2(thd, false, &thd->transaction.stmt, false, false);
1566+
commit_one_phase_2(thd, false, &thd->transaction.stmt, false);
15361567
}
15371568
}
15381569
#endif
@@ -1552,7 +1583,7 @@ int ha_commit_trans(THD *thd, bool all)
15521583
goto wsrep_err;
15531584
}
15541585
#endif /* WITH_WSREP */
1555-
error= ha_commit_one_phase(thd, all, rw_trans);
1586+
error= ha_commit_one_phase(thd, all);
15561587
#ifdef WITH_WSREP
15571588
if (run_wsrep_hooks)
15581589
error= error || wsrep_after_commit(thd, all);
@@ -1604,7 +1635,7 @@ int ha_commit_trans(THD *thd, bool all)
16041635

16051636
if (!is_real_trans)
16061637
{
1607-
error= commit_one_phase_2(thd, all, trans, is_real_trans, rw_trans);
1638+
error= commit_one_phase_2(thd, all, trans, is_real_trans);
16081639
goto done;
16091640
}
16101641
#ifdef WITH_WSREP
@@ -1622,7 +1653,7 @@ int ha_commit_trans(THD *thd, bool all)
16221653
DEBUG_SYNC(thd, "ha_commit_trans_after_log_and_order");
16231654
DBUG_EXECUTE_IF("crash_commit_after_log", DBUG_SUICIDE(););
16241655

1625-
error= commit_one_phase_2(thd, all, trans, is_real_trans, rw_trans) ? 2 : 0;
1656+
error= commit_one_phase_2(thd, all, trans, is_real_trans) ? 2 : 0;
16261657
#ifdef WITH_WSREP
16271658
if (run_wsrep_hooks && (error || (error = wsrep_after_commit(thd, all))))
16281659
{
@@ -1685,6 +1716,17 @@ int ha_commit_trans(THD *thd, bool all)
16851716
thd->rgi_slave->is_parallel_exec);
16861717
}
16871718
end:
1719+
if (mdl_backup.ticket)
1720+
{
1721+
/*
1722+
We do not always immediately release transactional locks
1723+
after ha_commit_trans() (see uses of ha_enable_transaction()),
1724+
thus we release the commit blocker lock as soon as it's
1725+
not needed.
1726+
*/
1727+
thd->mdl_context.release_lock(mdl_backup.ticket);
1728+
}
1729+
thd->backup_commit_lock= 0;
16881730
#ifdef WITH_WSREP
16891731
if (wsrep_is_active(thd) && is_real_trans && !error &&
16901732
(rw_ha_count == 0 || all) &&
@@ -1699,8 +1741,8 @@ int ha_commit_trans(THD *thd, bool all)
16991741

17001742
/**
17011743
@note
1702-
This function does not care about global read lock. A caller should.
1703-
However backup locks are handled in commit_one_phase_2.
1744+
This function does not care about global read lock or backup locks,
1745+
the caller should.
17041746
17051747
@param[in] all Is set in case of explicit commit
17061748
(COMMIT statement), or implicit commit
@@ -1709,7 +1751,7 @@ int ha_commit_trans(THD *thd, bool all)
17091751
autocommit=1.
17101752
*/
17111753

1712-
int ha_commit_one_phase(THD *thd, bool all, bool rw_trans)
1754+
int ha_commit_one_phase(THD *thd, bool all)
17131755
{
17141756
THD_TRANS *trans=all ? &thd->transaction.all : &thd->transaction.stmt;
17151757
/*
@@ -1735,50 +1777,21 @@ int ha_commit_one_phase(THD *thd, bool all, bool rw_trans)
17351777
if ((res= thd->wait_for_prior_commit()))
17361778
DBUG_RETURN(res);
17371779
}
1738-
res= commit_one_phase_2(thd, all, trans, is_real_trans, rw_trans);
1780+
res= commit_one_phase_2(thd, all, trans, is_real_trans);
17391781
DBUG_RETURN(res);
17401782
}
17411783

17421784

17431785
static int
1744-
commit_one_phase_2(THD *thd, bool all, THD_TRANS *trans, bool is_real_trans,
1745-
bool rw_trans)
1786+
commit_one_phase_2(THD *thd, bool all, THD_TRANS *trans, bool is_real_trans)
17461787
{
17471788
int error= 0;
17481789
uint count= 0;
17491790
Ha_trx_info *ha_info= trans->ha_list, *ha_info_next;
1750-
MDL_request mdl_request;
17511791
DBUG_ENTER("commit_one_phase_2");
17521792
if (is_real_trans)
17531793
DEBUG_SYNC(thd, "commit_one_phase_2");
17541794

1755-
if (rw_trans)
1756-
{
1757-
/*
1758-
Acquire a metadata lock which will ensure that COMMIT is blocked
1759-
by an active FLUSH TABLES WITH READ LOCK (and vice versa:
1760-
COMMIT in progress blocks FTWRL).
1761-
1762-
We allow the owner of FTWRL to COMMIT; we assume that it knows
1763-
what it does.
1764-
*/
1765-
mdl_request.init(MDL_key::BACKUP, "", "", MDL_BACKUP_COMMIT, MDL_EXPLICIT);
1766-
1767-
if (!WSREP(thd) &&
1768-
thd->mdl_context.acquire_lock(&mdl_request,
1769-
thd->variables.lock_wait_timeout))
1770-
{
1771-
my_error(ER_ERROR_DURING_COMMIT, MYF(0), 1);
1772-
ha_rollback_trans(thd, all);
1773-
DBUG_RETURN(1);
1774-
}
1775-
DEBUG_SYNC(thd, "ha_commit_trans_after_acquire_commit_lock");
1776-
}
1777-
1778-
#if defined(WITH_ARIA_STORAGE_ENGINE) && MYSQL_VERSION_ID < 100500
1779-
ha_maria::implicit_commit(thd, TRUE);
1780-
#endif
1781-
17821795
if (ha_info)
17831796
{
17841797
for (; ha_info; ha_info= ha_info_next)
@@ -1807,16 +1820,6 @@ commit_one_phase_2(THD *thd, bool all, THD_TRANS *trans, bool is_real_trans,
18071820
#endif
18081821
}
18091822
}
1810-
if (mdl_request.ticket)
1811-
{
1812-
/*
1813-
We do not always immediately release transactional locks
1814-
after ha_commit_trans() (see uses of ha_enable_transaction()),
1815-
thus we release the commit blocker lock as soon as it's
1816-
not needed.
1817-
*/
1818-
thd->mdl_context.release_lock(mdl_request.ticket);
1819-
}
18201823

18211824
/* Free resources and perform other cleanup even for 'empty' transactions. */
18221825
if (is_real_trans)

sql/handler.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5024,7 +5024,7 @@ int ha_change_key_cache(KEY_CACHE *old_key_cache, KEY_CACHE *new_key_cache);
50245024
/* transactions: interface to handlerton functions */
50255025
int ha_start_consistent_snapshot(THD *thd);
50265026
int ha_commit_or_rollback_by_xid(XID *xid, bool commit);
5027-
int ha_commit_one_phase(THD *thd, bool all, bool rw_trans);
5027+
int ha_commit_one_phase(THD *thd, bool all);
50285028
int ha_commit_trans(THD *thd, bool all);
50295029
int ha_rollback_trans(THD *thd, bool all);
50305030
int ha_prepare(THD *thd);

sql/log.cc

Lines changed: 50 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -6380,11 +6380,25 @@ bool MYSQL_BIN_LOG::write(Log_event *event_info, my_bool *with_annotate)
63806380

63816381
if (direct)
63826382
{
6383+
/* We come here only for incident events */
63836384
int res;
63846385
uint64 commit_id= 0;
6386+
MDL_request mdl_request;
63856387
DBUG_PRINT("info", ("direct is set"));
6388+
DBUG_ASSERT(!thd->backup_commit_lock);
6389+
6390+
mdl_request.init(MDL_key::BACKUP, "", "", MDL_BACKUP_COMMIT, MDL_EXPLICIT);
6391+
thd->mdl_context.acquire_lock(&mdl_request,
6392+
thd->variables.lock_wait_timeout);
6393+
thd->backup_commit_lock= &mdl_request;
6394+
63866395
if ((res= thd->wait_for_prior_commit()))
6396+
{
6397+
if (mdl_request.ticket)
6398+
thd->mdl_context.release_lock(mdl_request.ticket);
6399+
thd->backup_commit_lock= 0;
63876400
DBUG_RETURN(res);
6401+
}
63886402
file= &log_file;
63896403
my_org_b_tell= my_b_tell(file);
63906404
mysql_mutex_lock(&LOCK_log);
@@ -6399,7 +6413,11 @@ bool MYSQL_BIN_LOG::write(Log_event *event_info, my_bool *with_annotate)
63996413
commit_name.length);
64006414
commit_id= entry->val_int(&null_value);
64016415
});
6402-
if (write_gtid_event(thd, true, using_trans, commit_id))
6416+
res= write_gtid_event(thd, true, using_trans, commit_id);
6417+
if (mdl_request.ticket)
6418+
thd->mdl_context.release_lock(mdl_request.ticket);
6419+
thd->backup_commit_lock= 0;
6420+
if (res)
64036421
goto err;
64046422
}
64056423
else
@@ -7461,7 +7479,11 @@ MYSQL_BIN_LOG::queue_for_group_commit(group_commit_entry *orig_entry)
74617479
group_commit_entry *entry, *orig_queue, *last;
74627480
wait_for_commit *cur;
74637481
wait_for_commit *wfc;
7482+
bool backup_lock_released= 0;
7483+
int result= 0;
7484+
THD *thd= orig_entry->thd;
74647485
DBUG_ENTER("MYSQL_BIN_LOG::queue_for_group_commit");
7486+
DBUG_ASSERT(thd == current_thd);
74657487

74667488
/*
74677489
Check if we need to wait for another transaction to commit before us.
@@ -7493,6 +7515,21 @@ MYSQL_BIN_LOG::queue_for_group_commit(group_commit_entry *orig_entry)
74937515
{
74947516
PSI_stage_info old_stage;
74957517

7518+
/*
7519+
Release MDL_BACKUP_COMMIT LOCK while waiting for other threads to
7520+
commit.
7521+
This is needed to avoid deadlock between the other threads (which not
7522+
yet have the MDL_BACKUP_COMMIT_LOCK) and any threads using
7523+
BACKUP LOCK BLOCK_COMMIT.
7524+
*/
7525+
if (thd->backup_commit_lock && thd->backup_commit_lock->ticket &&
7526+
!backup_lock_released)
7527+
{
7528+
backup_lock_released= 1;
7529+
thd->mdl_context.release_lock(thd->backup_commit_lock->ticket);
7530+
thd->backup_commit_lock->ticket= 0;
7531+
}
7532+
74967533
/*
74977534
By setting wfc->opaque_pointer to our own entry, we mark that we are
74987535
ready to commit, but waiting for another transaction to commit before
@@ -7553,7 +7590,8 @@ MYSQL_BIN_LOG::queue_for_group_commit(group_commit_entry *orig_entry)
75537590
wfc->wakeup_error= ER_QUERY_INTERRUPTED;
75547591
my_message(wfc->wakeup_error,
75557592
ER_THD(orig_entry->thd, wfc->wakeup_error), MYF(0));
7556-
DBUG_RETURN(-1);
7593+
result= -1;
7594+
goto end;
75577595
}
75587596
}
75597597
orig_entry->thd->EXIT_COND(&old_stage);
@@ -7567,12 +7605,13 @@ MYSQL_BIN_LOG::queue_for_group_commit(group_commit_entry *orig_entry)
75677605
then there is nothing else to do.
75687606
*/
75697607
if (orig_entry->queued_by_other)
7570-
DBUG_RETURN(0);
7608+
goto end;
75717609

75727610
if (wfc && wfc->wakeup_error)
75737611
{
75747612
my_error(ER_PRIOR_COMMIT_FAILED, MYF(0));
7575-
DBUG_RETURN(-1);
7613+
result= -1;
7614+
goto end;
75767615
}
75777616

75787617
/* Now enqueue ourselves in the group commit queue. */
@@ -7733,7 +7772,13 @@ MYSQL_BIN_LOG::queue_for_group_commit(group_commit_entry *orig_entry)
77337772

77347773
DBUG_PRINT("info", ("Queued for group commit as %s",
77357774
(orig_queue == NULL) ? "leader" : "participant"));
7736-
DBUG_RETURN(orig_queue == NULL);
7775+
result= orig_queue == NULL;
7776+
7777+
end:
7778+
if (backup_lock_released)
7779+
thd->mdl_context.acquire_lock(thd->backup_commit_lock,
7780+
thd->variables.lock_wait_timeout);
7781+
DBUG_RETURN(result);
77377782
}
77387783

77397784
bool

sql/sql_class.cc

Lines changed: 27 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1271,6 +1271,7 @@ void THD::init()
12711271
first_successful_insert_id_in_prev_stmt_for_binlog= 0;
12721272
first_successful_insert_id_in_cur_stmt= 0;
12731273
current_backup_stage= BACKUP_FINISHED;
1274+
backup_commit_lock= 0;
12741275
#ifdef WITH_WSREP
12751276
wsrep_last_query_id= 0;
12761277
wsrep_xid.null();
@@ -1383,7 +1384,7 @@ void THD::init_for_queries()
13831384
set_time();
13841385
/*
13851386
We don't need to call ha_enable_transaction() as we can't have
1386-
any active transactions that has to be commited
1387+
any active transactions that has to be committed
13871388
*/
13881389
transaction.on= TRUE;
13891390

@@ -7386,16 +7387,33 @@ wait_for_commit::register_wait_for_prior_commit(wait_for_commit *waitee)
73867387
}
73877388

73887389

7389-
/*
7390-
Wait for commit of another transaction to complete, as already registered
7390+
/**
7391+
Waits for commit of another transaction to complete, as already registered
73917392
with register_wait_for_prior_commit(). If the commit already completed,
73927393
returns immediately.
7394+
7395+
If thd->backup_commit_lock is set, release it while waiting for other threads
73937396
*/
7397+
73947398
int
73957399
wait_for_commit::wait_for_prior_commit2(THD *thd)
73967400
{
73977401
PSI_stage_info old_stage;
73987402
wait_for_commit *loc_waitee;
7403+
bool backup_lock_released= 0;
7404+
7405+
/*
7406+
Release MDL_BACKUP_COMMIT LOCK while waiting for other threads to commit
7407+
This is needed to avoid deadlock between the other threads (which not
7408+
yet have the MDL_BACKUP_COMMIT_LOCK) and any threads using
7409+
BACKUP LOCK BLOCK_COMMIT.
7410+
*/
7411+
if (thd->backup_commit_lock && thd->backup_commit_lock->ticket)
7412+
{
7413+
backup_lock_released= 1;
7414+
thd->mdl_context.release_lock(thd->backup_commit_lock->ticket);
7415+
thd->backup_commit_lock->ticket= 0;
7416+
}
73997417

74007418
mysql_mutex_lock(&LOCK_wait_commit);
74017419
DEBUG_SYNC(thd, "wait_for_prior_commit_waiting");
@@ -7445,10 +7463,16 @@ wait_for_commit::wait_for_prior_commit2(THD *thd)
74457463
use within enter_cond/exit_cond.
74467464
*/
74477465
DEBUG_SYNC(thd, "wait_for_prior_commit_killed");
7466+
if (backup_lock_released)
7467+
thd->mdl_context.acquire_lock(thd->backup_commit_lock,
7468+
thd->variables.lock_wait_timeout);
74487469
return wakeup_error;
74497470

74507471
end:
74517472
thd->EXIT_COND(&old_stage);
7473+
if (backup_lock_released)
7474+
thd->mdl_context.acquire_lock(thd->backup_commit_lock,
7475+
thd->variables.lock_wait_timeout);
74527476
return wakeup_error;
74537477
}
74547478

sql/sql_class.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2200,7 +2200,10 @@ class THD: public THD_count, /* this must be first */
22002200
rpl_io_thread_info *rpl_io_info;
22012201
rpl_sql_thread_info *rpl_sql_info;
22022202
} system_thread_info;
2203+
/* Used for BACKUP LOCK */
22032204
MDL_ticket *mdl_backup_ticket, *mdl_backup_lock;
2205+
/* Used to register that thread has a MDL_BACKUP_WAIT_COMMIT lock */
2206+
MDL_request *backup_commit_lock;
22042207

22052208
void reset_for_next_command(bool do_clear_errors= 1);
22062209
/*

0 commit comments

Comments
 (0)