Skip to content

Commit

Permalink
MDEV-22593 - InnoDB: don't take trx_sys.mutex in ReadView::open()
Browse files Browse the repository at this point in the history
This was the last abuse of trx_sys.mutex, which is now exclusively
protecting trx_sys.trx_list.

This global acquisition was also potential scalability bottleneck for
oltp_read_write benchmark. Although it didn't expose itself as such due
to larger scalability issues.

Replaced trx_sys.mutex based synchronisation between ReadView creator
thread and purge coordinator thread performing latest view clone with
ReadView::m_mutex.

It also allowed to simplify tri-state view m_state down to boolean
m_open flag.

For performance reasons trx->read_view.close() is left as atomic relaxed
store, so that we don't have to waste resources for otherwise meaningless
mutex acquisition.
  • Loading branch information
Sergey Vojtovich committed May 26, 2020
1 parent 8569dac commit 50b0ce4
Show file tree
Hide file tree
Showing 22 changed files with 265 additions and 320 deletions.
310 changes: 149 additions & 161 deletions storage/innobase/include/read0types.h

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions storage/innobase/include/sync0sync.h
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,7 @@ extern mysql_pfs_key_t thread_mutex_key;
extern mysql_pfs_key_t zip_pad_mutex_key;
extern mysql_pfs_key_t row_drop_list_mutex_key;
extern mysql_pfs_key_t rw_trx_hash_element_mutex_key;
extern mysql_pfs_key_t read_view_mutex_key;
#endif /* UNIV_PFS_MUTEX */

#ifdef UNIV_PFS_RWLOCK
Expand Down
4 changes: 3 additions & 1 deletion storage/innobase/include/sync0types.h
Original file line number Diff line number Diff line change
Expand Up @@ -147,7 +147,7 @@ V
lock_sys_mutex Mutex protecting lock_sys_t
|
V
trx_sys.mutex Mutex protecting trx_sys_t
trx_sys.mutex Mutex protecting trx_sys.trx_list
|
V
Threads mutex Background thread scheduling mutex
Expand Down Expand Up @@ -221,6 +221,7 @@ enum latch_level_t {
SYNC_THREADS,
SYNC_TRX,
SYNC_RW_TRX_HASH_ELEMENT,
SYNC_READ_VIEW,
SYNC_TRX_SYS,
SYNC_LOCK_SYS,
SYNC_LOCK_WAIT_SYS,
Expand Down Expand Up @@ -368,6 +369,7 @@ enum latch_id_t {
LATCH_ID_FIL_CRYPT_DATA_MUTEX,
LATCH_ID_FIL_CRYPT_THREADS_MUTEX,
LATCH_ID_RW_TRX_HASH_ELEMENT,
LATCH_ID_READ_VIEW,
LATCH_ID_TEST_MUTEX,
LATCH_ID_MAX = LATCH_ID_TEST_MUTEX
};
Expand Down
29 changes: 25 additions & 4 deletions storage/innobase/include/trx0purge.h
Original file line number Diff line number Diff line change
Expand Up @@ -128,18 +128,18 @@ class purge_sys_t
public:
/** latch protecting view, m_enabled */
MY_ALIGNED(CACHE_LINE_SIZE)
rw_lock_t latch;
mutable rw_lock_t latch;
private:
/** The purge will not remove undo logs which are >= this view */
MY_ALIGNED(CACHE_LINE_SIZE)
ReadViewBase view;
/** whether purge is enabled; protected by latch and std::atomic */
std::atomic<bool> m_enabled;
/** number of pending stop() calls without resume() */
Atomic_counter<int32_t> m_paused;
public:
que_t* query; /*!< The query graph which will do the
parallelized purge operation */
MY_ALIGNED(CACHE_LINE_SIZE)
ReadView view; /*!< The purge will not remove undo logs
which are >= this view (purge view) */

/** Iterator to the undo log records of committed transactions */
struct iterator
Expand Down Expand Up @@ -246,6 +246,27 @@ class purge_sys_t
void stop();
/** Resume purge at UNLOCK TABLES after FLUSH TABLES FOR EXPORT */
void resume();
/** A wrapper around ReadView::changes_visible(). */
bool changes_visible(trx_id_t id, const table_name_t &name) const
{
ut_ad(rw_lock_own(&latch, RW_LOCK_S));
return view.changes_visible(id, name);
}
/** A wrapper around ReadView::low_limit_no(). */
trx_id_t low_limit_no() const
{
#if 0 /* Unfortunately we don't hold this assertion, see MDEV-22718. */
ut_ad(rw_lock_own(&latch, RW_LOCK_S));
#endif
return view.low_limit_no();
}
/** A wrapper around trx_sys_t::clone_oldest_view(). */
void clone_oldest_view()
{
rw_lock_x_lock(&latch);
trx_sys.clone_oldest_view(&view);
rw_lock_x_unlock(&latch);
}
};

/** The global data structure coordinating a purge */
Expand Down
6 changes: 3 additions & 3 deletions storage/innobase/include/trx0sys.h
Original file line number Diff line number Diff line change
Expand Up @@ -814,7 +814,7 @@ class trx_sys_t
*/
MY_ALIGNED(CACHE_LINE_SIZE) Atomic_counter<uint32_t> rseg_history_len;

/** Mutex protecting trx_list. */
/** Mutex protecting trx_list AND NOTHING ELSE. */
MY_ALIGNED(CACHE_LINE_SIZE) mutable TrxSysMutex mutex;

/** List of all transactions. */
Expand Down Expand Up @@ -1086,7 +1086,7 @@ class trx_sys_t
in. This function is called by purge thread to determine whether it should
purge the delete marked record or not.
*/
void clone_oldest_view();
void clone_oldest_view(ReadViewBase *view) const;


/** @return the number of active views */
Expand All @@ -1098,7 +1098,7 @@ class trx_sys_t
for (const trx_t *trx= UT_LIST_GET_FIRST(trx_list); trx;
trx= UT_LIST_GET_NEXT(trx_list, trx))
{
if (trx->read_view.get_state() == READ_VIEW_STATE_OPEN)
if (trx->read_view.is_open())
++count;
}
mutex_exit(&mutex);
Expand Down
17 changes: 8 additions & 9 deletions storage/innobase/include/trx0trx.h
Original file line number Diff line number Diff line change
Expand Up @@ -241,8 +241,7 @@ trx_commit_step(
que_thr_t* thr); /*!< in: query thread */

/**********************************************************************//**
Prints info about a transaction.
Caller must hold trx_sys.mutex. */
Prints info about a transaction. */
void
trx_print_low(
/*==========*/
Expand All @@ -262,7 +261,6 @@ trx_print_low(

/**********************************************************************//**
Prints info about a transaction.
The caller must hold lock_sys.mutex and trx_sys.mutex.
When possible, use trx_print() instead. */
void
trx_print_latched(
Expand Down Expand Up @@ -304,7 +302,7 @@ trx_set_dict_operation(

/**********************************************************************//**
Determines if a transaction is in the given state.
The caller must hold trx_sys.mutex, or it must be the thread
The caller must hold trx->mutex, or it must be the thread
that is serving a running transaction.
A running RW transaction must be in trx_sys.rw_trx_hash.
@return TRUE if trx->state == state */
Expand Down Expand Up @@ -740,9 +738,10 @@ struct trx_t {
max trx id shortly before the
transaction is moved to
COMMITTED_IN_MEMORY state.
Protected by trx_sys_t::mutex
when trx is in rw_trx_hash. Initially
set to TRX_ID_MAX. */
Accessed exclusively by trx owner
thread. Should be removed in favour of
trx->rw_trx_hash_element->no.
Initially set to TRX_ID_MAX. */

/** State of the trx from the point of view of concurrency control
and the valid state transitions.
Expand Down Expand Up @@ -783,7 +782,7 @@ struct trx_t {
XA (2PC) transactions are always treated as non-autocommit.
Transitions to ACTIVE or NOT_STARTED occur when transaction
is not in rw_trx_hash (no trx_sys.mutex needed).
is not in rw_trx_hash.
Autocommit non-locking read-only transactions move between states
without holding any mutex. They are not in rw_trx_hash.
Expand All @@ -799,7 +798,7 @@ struct trx_t {
in rw_trx_hash.
ACTIVE->PREPARED->COMMITTED is only possible when trx is in rw_trx_hash.
The transition ACTIVE->PREPARED is protected by trx_sys.mutex.
The transition ACTIVE->PREPARED is protected by trx->mutex.
ACTIVE->COMMITTED is possible when the transaction is in
rw_trx_hash.
Expand Down
2 changes: 1 addition & 1 deletion storage/innobase/include/trx0trx.ic
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ Created 3/26/1996 Heikki Tuuri

/**********************************************************************//**
Determines if a transaction is in the given state.
The caller must hold trx_sys.mutex, or it must be the thread
The caller must hold trx->mutex, or it must be the thread
that is serving a running transaction.
A running RW transaction must be in trx_sys.rw_trx_hash.
@return TRUE if trx->state == state */
Expand Down
14 changes: 3 additions & 11 deletions storage/innobase/lock/lock0lock.cc
Original file line number Diff line number Diff line change
Expand Up @@ -4595,15 +4595,7 @@ lock_trx_print_wait_and_mvcc_state(FILE* file, const trx_t* trx, time_t now)
fprintf(file, "---");

trx_print_latched(file, trx, 600);

/* Note: read_view->get_state() check is race condition. But it
should "kind of work" because read_view is freed only at shutdown.
Worst thing that may happen is that it'll get transferred to
another thread and print wrong values. */

if (trx->read_view.get_state() == READ_VIEW_STATE_OPEN) {
trx->read_view.print_limits(file);
}
trx->read_view.print_limits(file);

if (trx->lock.que_state == TRX_QUE_LOCK_WAIT) {

Expand Down Expand Up @@ -5169,8 +5161,8 @@ lock_validate()
(lock_validate_table_locks), 0);

/* Iterate over all the record locks and validate the locks. We
don't want to hog the lock_sys_t::mutex and the trx_sys_t::mutex.
Release both mutexes during the validation check. */
don't want to hog the lock_sys_t::mutex. Release it during the
validation check. */

for (ulint i = 0; i < hash_get_n_cells(lock_sys.rec_hash); i++) {
ib_uint64_t limit = 0;
Expand Down
128 changes: 44 additions & 84 deletions storage/innobase/read/read0read.cc
Original file line number Diff line number Diff line change
Expand Up @@ -161,16 +161,6 @@ but it will never be dereferenced, because the purge view is older
than any active transaction.
For details see: row_vers_old_has_index_entry() and row_purge_poss_sec()
Some additional issues:
What if trx_sys.view_list == NULL and some transaction T1 and Purge both
try to open read_view at same time. Only one can acquire trx_sys.mutex.
In which order will the views be opened? Should it matter? If no, why?
The order does not matter. No new transactions can be created and no running
RW transaction can commit or rollback (or free views). AC-NL-RO transactions
will mark their views as closed but not actually free their views.
*/


Expand All @@ -180,7 +170,7 @@ will mark their views as closed but not actually free their views.
@param[in,out] trx transaction
*/
inline void ReadView::snapshot(trx_t *trx)
inline void ReadViewBase::snapshot(trx_t *trx)
{
trx_sys.snapshot_ids(trx, &m_ids, &m_low_limit_id, &m_low_limit_no);
std::sort(m_ids.begin(), m_ids.end());
Expand All @@ -196,74 +186,52 @@ inline void ReadView::snapshot(trx_t *trx)
View becomes visible to purge thread.
@param[in,out] trx transaction
Reuses closed view if there were no read-write transactions since (and at)
its creation time.
Original comment states: there is an inherent race here between purge
and this thread.
To avoid this race we should've checked trx_sys.get_max_trx_id() and
set m_open atomically under ReadView::m_mutex protection. But we're cutting
edges to achieve greater performance.
There're at least two types of concurrent threads interested in this
value: purge coordinator thread (see trx_sys_t::clone_oldest_view()) and
InnoDB monitor thread (see lock_trx_print_wait_and_mvcc_state()).
What bad things can happen because we allow this race?
Speculative execution may reorder state change before get_max_trx_id().
In this case purge thread has short gap to clone outdated view. Which is
probably not that bad: it just won't be able to purge things that it was
actually allowed to purge for a short while.
This thread may as well get suspended after trx_sys.get_max_trx_id() and
before m_open is set. New read-write transaction may get started, committed
and purged meanwhile. It is acceptable as well, since this view doesn't see
it.
*/
void ReadView::open(trx_t *trx)
{
ut_ad(this == &trx->read_view);
switch (state())
{
case READ_VIEW_STATE_OPEN:
if (is_open())
ut_ad(!srv_read_only_mode);
return;
case READ_VIEW_STATE_CLOSED:
if (srv_read_only_mode)
return;
/*
Reuse closed view if there were no read-write transactions since (and at)
its creation time.
Original comment states: there is an inherent race here between purge
and this thread.
To avoid this race we should've checked trx_sys.get_max_trx_id() and
set state to READ_VIEW_STATE_OPEN atomically under trx_sys.mutex
protection. But we're cutting edges to achieve great scalability.
There're at least two types of concurrent threads interested in this
value: purge coordinator thread (see trx_sys_t::clone_oldest_view()) and
InnoDB monitor thread (see lock_trx_print_wait_and_mvcc_state()).
What bad things can happen because we allow this race?
Speculative execution may reorder state change before get_max_trx_id().
In this case purge thread has short gap to clone outdated view. Which is
probably not that bad: it just won't be able to purge things that it was
actually allowed to purge for a short while.
This thread may as well get suspended after trx_sys.get_max_trx_id() and
before state is set to READ_VIEW_STATE_OPEN. New read-write transaction
may get started, committed and purged meanwhile. It is acceptable as
well, since this view doesn't see it.
*/
if (trx_is_autocommit_non_locking(trx) && m_ids.empty() &&
m_low_limit_id == trx_sys.get_max_trx_id())
goto reopen;

/*
Can't reuse view, take new snapshot.
Alas this empty critical section is simplest way to make sure concurrent
purge thread completed snapshot copy. Of course purge thread may come
again and try to copy once again after we release this mutex, but in
this case it is guaranteed to see READ_VIEW_STATE_REGISTERED and thus
it'll skip this view.
This critical section can be replaced with new state, which purge thread
would set to inform us to wait until it completes snapshot. However it'd
complicate m_state even further.
*/
mutex_enter(&trx_sys.mutex);
mutex_exit(&trx_sys.mutex);
m_state.store(READ_VIEW_STATE_SNAPSHOT, std::memory_order_relaxed);
break;
default:
ut_ad(0);
else if (likely(!srv_read_only_mode))
{
m_creator_trx_id= trx->id;
if (trx_is_autocommit_non_locking(trx) && empty() &&
low_limit_id() == trx_sys.get_max_trx_id())
m_open.store(true, std::memory_order_relaxed);
else
{
mutex_enter(&m_mutex);
snapshot(trx);
m_open.store(true, std::memory_order_relaxed);
mutex_exit(&m_mutex);
}
}

snapshot(trx);
reopen:
m_creator_trx_id= trx->id;
m_state.store(READ_VIEW_STATE_OPEN, std::memory_order_release);
}


Expand All @@ -274,21 +242,13 @@ void ReadView::open(trx_t *trx)
in. This function is called by purge thread to determine whether it should
purge the delete marked record or not.
*/
void trx_sys_t::clone_oldest_view()
void trx_sys_t::clone_oldest_view(ReadViewBase *view) const
{
purge_sys.view.snapshot(0);
view->snapshot(nullptr);
mutex_enter(&mutex);
/* Find oldest view. */
for (const trx_t *trx= UT_LIST_GET_FIRST(trx_list); trx;
trx= UT_LIST_GET_NEXT(trx_list, trx))
{
uint32_t state;

while ((state= trx->read_view.get_state()) == READ_VIEW_STATE_SNAPSHOT)
ut_delay(1);

if (state == READ_VIEW_STATE_OPEN)
purge_sys.view.copy(trx->read_view);
}
trx->read_view.append_to(view);
mutex_exit(&mutex);
}
1 change: 0 additions & 1 deletion storage/innobase/row/row0row.cc
Original file line number Diff line number Diff line change
Expand Up @@ -435,7 +435,6 @@ row_build_low(
ut_ad(rec != NULL);
ut_ad(heap != NULL);
ut_ad(dict_index_is_clust(index));
ut_ad(!mutex_own(&trx_sys.mutex));
ut_ad(!col_map || col_table);

if (!offsets) {
Expand Down
Loading

0 comments on commit 50b0ce4

Please sign in to comment.