Commit 9654b81

MDEV-37482: Contention on btr_sea::partition::latch
To reduce contention between insert, erase and search, let us mimic
commit b08448d (MDEV-20612). That is, btr_sea::partition::insert() and
btr_sea::partition::erase() will use a combination of a shared
btr_sea::partition::latch and a tiny page_hash_latch that is pushed down
to the btr_sea::hash_table::array. An exclusive btr_sea::partition::latch
will be used in the final part of btr_search_drop_page_hash_index(), where
we must guarantee that all entries will be removed, as well as in
operations that affect an entire adaptive hash index partition.

btr_sea::hash_chain: Chain of ahi_node hash buckets.

btr_sea::hash_table: A hash table that includes page_hash_latch
interleaved with hash_chain.

page_hash_latch::try_lock(): Attempt to acquire an exclusive latch without
waiting.

btr_search_guess_on_hash(): Acquire also the page_hash_latch in order to
prevent a concurrent modification of the hash bucket chain that our lookup
is traversing.

ha_insert_for_fold(): Remove. Invoke btr_sea::partition::insert() directly.

btr_sea::partition::erase(): Add template<bool ex> for indicating whether
an exclusive or a shared btr_sea::partition::latch is being held. If the
ex=false operation fails to free the memory,
btr_search_update_hash_on_delete() will retry with ex=true.

btr_sea::partition::cleanup_after_erase(): Add an overload for the case
where instead of holding an exclusive latch, we hold a shared latch along
with a page_hash_latch. When not holding an exclusive latch, we may fail
to free the memory, and the caller has to retry with an exclusive latch.

btr_sea::partition::cleanup_after_erase_start(),
btr_sea::partition::cleanup_after_erase_finish(): Split from
cleanup_after_erase() to reduce the amount of code duplication.

btr_sea::partition::blocks_mutex: Protect only the linked list of blocks.
The spare block will exclusively be updated via Atomic_relaxed::exchange().

btr_sea::partition::rollback_insert(): Free the spare block in the
unlikely event that the adaptive hash index has been disabled after our
invocation of btr_sea::partition::prepare_insert().

ha_remove_all_nodes_to_page(): Merged to the only caller
btr_search_drop_page_hash_index().

ssux_lock_impl::wr_rd_downgrade(): Downgrade an X latch to S.

srw_lock_debug::wr_rd_downgrade(), srw_lock_impl::wr_rd_downgrade():
Downgrade from exclusive to shared. This operation is unavailable if
_WIN32 or SUX_LOCK_GENERIC is defined.

btr_search_build_page_hash_index(): Downgrade from exclusive to shared
part.latch before starting to insert records into the adaptive hash index.
In multi-batch operation, we preserve the last fr[] value in order to
ensure the correct operation when buf_block_t::LEFT_SIDE is not set.
1 parent 429c5b1 commit 9654b81
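The two-level latching scheme described in the commit message can be
summarized with a small standard-library sketch. This is an illustration
only: std::shared_mutex stands in for btr_sea::partition::latch,
std::mutex for page_hash_latch, and the names partition_model,
modify_one_chain() and modify_whole_partition() are invented for the
example rather than taken from the patch.

#include <mutex>
#include <shared_mutex>

// Standard-library analogue (not the InnoDB primitives) of the scheme:
// single-chain writers take the partition latch in shared mode plus the
// small latch of the one chain they modify; partition-wide operations
// take the partition latch exclusively.
struct partition_model
{
  std::shared_mutex latch;    // plays the role of btr_sea::partition::latch
  std::mutex chain_latch[8];  // one small latch per group of hash chains

  // insert()/erase() style operation: independent chains can be modified
  // in parallel because only the per-chain latch is exclusive.
  void modify_one_chain(size_t chain)
  {
    std::shared_lock<std::shared_mutex> s(latch);
    std::lock_guard<std::mutex> g(chain_latch[chain % 8]);
    /* link or unlink an ahi_node in this chain */
  }

  // Final pass of btr_search_drop_page_hash_index() or other operations
  // on a whole partition: exclusive latch, no chain latches needed.
  void modify_whole_partition()
  {
    std::unique_lock<std::shared_mutex> x(latch);
    /* guaranteed to observe and remove every entry */
  }
};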

8 files changed: +684 −289 lines changed


storage/innobase/btr/btr0sea.cc

Lines changed: 463 additions & 260 deletions
Large diffs are not rendered by default.
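Since the btr0sea.cc diff is not rendered, here is a rough sketch of the
delete path based only on the commit message and the declarations added to
btr0sea.h (shown below). The wrapper name ahi_erase_example() and the exact
call sequence are assumptions for illustration, not the actual
btr_search_update_hash_on_delete() code.

#include "btr0sea.h"

// Hypothetical wrapper illustrating the retry protocol from the commit
// message: first attempt under a shared partition latch (erase<false>() is
// assumed to coordinate with the per-chain page_hash_latch internally);
// if the block allocation cannot be shrunk there, retry under an
// exclusive latch.
static void ahi_erase_example(btr_sea::partition &part, uint32_t fold,
                              const rec_t *rec)
{
  part.latch.rd_lock(SRW_LOCK_CALL);
  // erase() releases the partition latch before returning
  switch (part.erase<false>(part.table.cell_get(fold), rec)) {
  case btr_sea::partition::ERASE_RETRY:
    // could not free memory while only sharing the latch; retry exclusively
    part.latch.wr_lock(SRW_LOCK_CALL);
    part.erase<true>(part.table.cell_get(fold), rec);
    break;
  case btr_sea::partition::ERASED:
  case btr_sea::partition::NOT_ERASED:
    break;
  }
}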

storage/innobase/include/btr0sea.h

Lines changed: 150 additions & 10 deletions
@@ -119,14 +119,107 @@ struct btr_sea
   @param resize whether buf_pool_t::resize() is the caller */
   ATTRIBUTE_COLD void enable(bool resize= false) noexcept;
 
+  /** Hash cell chain in hash_table */
+  struct hash_chain
+  {
+    /** pointer to the first block */
+    ahi_node *first;
+
+    /** Find an element.
+    @param u unary predicate
+    @return the first matching element
+    @retval nullptr if not found */
+    template<typename UnaryPred>
+    inline ahi_node *find(UnaryPred u) const noexcept;
+
+    /** Search for a pointer to an element.
+    @param u unary predicate
+    @return pointer to the first matching element,
+    or to the last element in the chain */
+    template<typename UnaryPred>
+    inline ahi_node **search(UnaryPred u) noexcept;
+  };
+
+  /** Hash table with singly-linked overflow lists.
+  Based on @see buf_pool_t::page_hash_table */
+  struct hash_table
+  {
+    static_assert(CPU_LEVEL1_DCACHE_LINESIZE >= 64, "less than 64 bytes");
+    static_assert(!(CPU_LEVEL1_DCACHE_LINESIZE & 63),
+                  "not a multiple of 64 bytes");
+
+    /** Number of array[] elements per page_hash_latch.
+    Must be one less than a power of 2. */
+#if 0
+    static constexpr size_t ELEMENTS_PER_LATCH= 64 / sizeof(void*) - 1;
+
+    /** Extra padding. FIXME: Is this ever useful to be nonzero?
+    Long time ago, some testing on an ARMv8 implementation seemed
+    to suggest so, but this has not been validated recently. */
+    static constexpr size_t EMPTY_SLOTS_PER_LATCH=
+      ((CPU_LEVEL1_DCACHE_LINESIZE / 64) - 1) * (64 / sizeof(void*));
+#else
+    static constexpr size_t ELEMENTS_PER_LATCH=
+      CPU_LEVEL1_DCACHE_LINESIZE / sizeof(void*) - 1;
+    static constexpr size_t EMPTY_SLOTS_PER_LATCH= 0;
+#endif
+
+    /** number of payload elements in array[] */
+    Atomic_relaxed<ulint> n_cells;
+    /** the hash table, with pad(n_cells) elements, aligned to L1 cache size */
+    hash_chain *array;
+
+    /** Create the hash table.
+    @param n the lower bound of n_cells */
+    inline void create(ulint n) noexcept;
+
+    /** Free the hash table. */
+    void free() noexcept { aligned_free(array); array= nullptr; }
+
+    /** @return the index of an array element */
+    ulint calc_hash(ulint fold) const noexcept
+    { return calc_hash(fold, n_cells); }
+    /** @return raw array index converted to padded index */
+    static ulint pad(ulint h) noexcept
+    {
+      ulint latches= h / ELEMENTS_PER_LATCH;
+      ulint empty_slots= latches * EMPTY_SLOTS_PER_LATCH;
+      return 1 + latches + empty_slots + h;
+    }
+  private:
+    /** @return the index of an array element */
+    static ulint calc_hash(ulint fold, ulint n_cells) noexcept
+    {
+      return pad(fold % n_cells);
+    }
+  public:
+    /** @return the latch covering a hash table chain */
+    static page_hash_latch &lock_get(hash_chain &chain) noexcept
+    {
+      static_assert(!((ELEMENTS_PER_LATCH + 1) & ELEMENTS_PER_LATCH),
+                    "must be one less than a power of 2");
+      const size_t addr= reinterpret_cast<size_t>(&chain);
+      ut_ad(addr & (ELEMENTS_PER_LATCH * sizeof chain));
+      return *reinterpret_cast<page_hash_latch*>
+        (addr & ~(ELEMENTS_PER_LATCH * sizeof chain));
+    }
+
+    /** Get a hash table slot. */
+    hash_chain &cell_get(ulint fold) const
+    { return array[calc_hash(fold, n_cells)]; }
+  };
+
   /** Partition of the hash table */
   struct partition
   {
-    /** latch protecting table */
-    alignas(CPU_LEVEL1_DCACHE_LINESIZE) srw_spin_lock latch;
+    /** latch protecting table: either an exclusive latch, or
+    a shared latch combined with lock_get() */
+    alignas(CPU_LEVEL1_DCACHE_LINESIZE)
+    IF_DBUG(srw_lock_debug,srw_spin_lock) latch;
     /** map of CRC-32C of rec prefix to rec_t* in buf_page_t::frame */
-    hash_table_t table;
-    /** latch protecting blocks, spare; may be acquired while holding latch */
+    hash_table table;
+    /** protects blocks; acquired while holding latch
+    and possibly table.lock_get() */
     srw_mutex blocks_mutex;
     /** allocated blocks */
     UT_LIST_BASE_NODE_T(buf_page_t) blocks;
@@ -141,15 +234,50 @@ struct btr_sea
 
     inline void free() noexcept;
 
+    /** @return the number of allocated buffer pool blocks */
+    TPOOL_SUPPRESS_TSAN size_t get_blocks() const noexcept
+    { return UT_LIST_GET_LEN(blocks) + !!spare; }
+
     /** Ensure that there is a spare block for a future insert() */
     void prepare_insert() noexcept;
 
-    /** Clean up after erasing an AHI node
-    @param erase node being erased
+    /** Undo prepare_insert() in case !btr_search.enabled */
+    void rollback_insert() noexcept;
+
+  private:
+    /** Start cleanup_after_erase()
+    @return the last allocated element */
+    inline ahi_node *cleanup_after_erase_start() noexcept;
+    /** Finish cleanup_after_erase().
+    We reduce the allocated size in UT_LIST_GET_LAST(blocks)->free_offset.
+    If that size reaches 0, the last block will be removed from blocks,
+    and a block may have to be freed by our caller.
+    @return buffer block to be freed
+    @retval nullptr if no buffer block was freed */
+    buf_block_t *cleanup_after_erase_finish() noexcept;
+  public:
+    __attribute__((nonnull))
+    /** Clean up after erasing an AHI node, while the caller is
+    holding an exclusive latch. Unless "erase" is the last allocated
+    element, we will swap it with the last allocated element.
+    Finally, we return via cleanup_after_erase_finish().
+    @param erase node being erased
     @return buffer block to be freed
     @retval nullptr if no buffer block was freed */
     buf_block_t *cleanup_after_erase(ahi_node *erase) noexcept;
 
+    __attribute__((nonnull))
+    /** Clean up after erasing an AHI node. This is similar to
+    cleanup_after_erase(ahi_node*), except that the operation may fail.
+    @param erase node being erased
+    @param l the latch held together with shared latch
+    @return buffer block to be freed
+    @retval nullptr if no buffer block was freed
+    @retval -1 if we fail to shrink the allocation and erasing
+    needs to be retried while holding an exclusive latch */
+    buf_block_t *cleanup_after_erase(ahi_node *erase, page_hash_latch *l)
+      noexcept;
+
     __attribute__((nonnull))
 # if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
     /** Insert or replace an entry into the hash table.
@@ -164,11 +292,23 @@ struct btr_sea
     void insert(uint32_t fold, const rec_t *rec) noexcept;
 # endif
 
-    /** Delete a pointer to a record if it exists.
-    @param fold CRC-32C of rec prefix
+    /** erase() return value */
+    enum erase_status{
+      /** must retry with exclusive latch */
+      ERASE_RETRY= -1,
+      /** the pointer to the record was erased */
+      ERASED= 0,
+      /** nothing was erased */
+      NOT_ERASED= 1
+    };
+
+    /** Delete a pointer to a record if it exists, and release the latch.
+    @tparam ex true=holding exclusive latch, false=shared latch
+    @param cell hash table cell that may contain the CRC-32C of rec prefix
     @param rec B-tree leaf page record
-    @return whether a record existed and was removed */
-    inline bool erase(uint32_t fold, const rec_t *rec) noexcept;
+    @return status */
+    template<bool ex>
+    erase_status erase(hash_chain &cell, const rec_t *rec) noexcept;
   };
 
   /** innodb_adaptive_hash_index_parts */
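To make the padded indexing concrete, here is a small self-contained
program that re-derives hash_table::pad() with the constants from the live
(#else) branch above, assuming a 64-byte cache line and 8-byte pointers.
It only illustrates the array layout; it is not code from the patch.

#include <cstdio>
#include <cstddef>

// Same constants as the #else branch of btr_sea::hash_table above,
// assuming CPU_LEVEL1_DCACHE_LINESIZE == 64 and sizeof(void*) == 8.
static constexpr size_t ELEMENTS_PER_LATCH= 64 / sizeof(void*) - 1; // 7
static constexpr size_t EMPTY_SLOTS_PER_LATCH= 0;

// Reimplementation of hash_table::pad(): slot 0 of every group of
// ELEMENTS_PER_LATCH + 1 slots is reserved for the page_hash_latch.
static size_t pad(size_t h)
{
  size_t latches= h / ELEMENTS_PER_LATCH;
  size_t empty_slots= latches * EMPTY_SLOTS_PER_LATCH;
  return 1 + latches + empty_slots + h;
}

int main()
{
  // Raw chain indices 0..6 map to padded slots 1..7; raw index 7 skips
  // slot 8 (the next group's latch) and lands in slot 9, and so on.
  for (size_t h= 0; h < 10; h++)
    std::printf("raw %zu -> padded slot %zu\n", h, pad(h));
  return 0;
}

With these constants, every group of ELEMENTS_PER_LATCH + 1 slots occupies
one cache line, and the leading slot holds the page_hash_latch that
lock_get() recovers by masking the low bits of the chain's address.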

storage/innobase/include/buf0buf.h

Lines changed: 4 additions & 3 deletions
@@ -473,7 +473,8 @@ class buf_page_t
   protected by buf_pool.page_hash.lock_get() */
   buf_page_t *hash;
   /** for state()==MEMORY that are part of recv_sys.pages and
-  protected by recv_sys.mutex */
+  protected by recv_sys.mutex, or part of btr_sea::partition::table
+  and protected by btr_sea::partition::blocks_mutex */
   struct {
     /** number of recv_sys.pages entries stored in the block */
     uint16_t used_records;
@@ -868,7 +869,7 @@ struct buf_block_t{
   Atomic_relaxed<uint16_t> n_hash_helps;
 # if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
   /** number of pointers from the btr_sea::partition::table;
-  !n_pointers == !index */
+  !index implies n_pointers == 0 */
   Atomic_counter<uint16_t> n_pointers;
 # define assert_block_ahi_empty(block) ut_a(!(block)->n_pointers)
 # define assert_block_ahi_valid(b) ut_a((b)->index || !(b)->n_pointers)
@@ -878,7 +879,7 @@ struct buf_block_t{
 # endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
   /** index for which the adaptive hash index has been created,
   or nullptr if the page does not exist in the index.
-  Protected by btr_sea::partition::latch. */
+  May be modified while holding exclusive btr_sea::partition::latch. */
   Atomic_relaxed<dict_index_t*> index;
   /* @} */
 #else /* BTR_CUR_HASH_ADAPT */

storage/innobase/include/buf0types.h

Lines changed: 4 additions & 0 deletions
@@ -192,6 +192,8 @@ class page_hash_latch : private rw_lock
   inline void lock_shared() noexcept;
   /** Acquire an exclusive lock */
   inline void lock() noexcept;
+  /** @return whether an exclusive lock was acquired without waiting */
+  bool try_lock() noexcept { return write_trylock(); }
 
   /** @return whether an exclusive lock is being held by any thread */
   bool is_write_locked() const noexcept { return rw_lock::is_write_locked(); }
@@ -215,6 +217,7 @@ class page_hash_latch
   void lock_shared() noexcept { lk.rd_lock(); }
   void unlock_shared() noexcept { lk.rd_unlock(); }
   void lock() noexcept { lk.wr_lock(); }
+  bool try_lock() noexcept { return lk.wr_lock_try(); }
   void unlock() noexcept { lk.wr_unlock(); }
   bool is_write_locked() const noexcept { return lk.is_write_locked(); }
   bool is_locked() const noexcept { return lk.is_locked(); }
@@ -229,6 +232,7 @@ class page_hash_latch
   void lock_shared() noexcept { lock(); }
   void unlock_shared() noexcept { unlock(); }
   void lock() noexcept { lk.wr_lock(); }
+  bool try_lock() noexcept { return lk.wr_lock_try(); }
   void unlock() noexcept { lk.wr_unlock(); }
   bool is_locked() const noexcept { return lk.is_locked(); }
   bool is_write_locked() const noexcept { return is_locked(); }
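In all three variants the new try_lock() simply forwards to a non-blocking
exclusive acquisition. A hedged usage sketch follows; the helper name and
the fallback policy are illustrative only, since the real call sites are in
the (not rendered) btr0sea.cc changes.

#include "buf0types.h"

// Illustrative only: a caller that prefers not to wait on a contended
// chain latch, leaving the fallback (waiting, or retrying under an
// exclusive partition latch) to the caller.
static bool modify_chain_if_uncontended(page_hash_latch &chain_latch)
{
  if (!chain_latch.try_lock())
    return false;           // contended; caller takes a slower path
  /* ... relink the hash chain guarded by chain_latch ... */
  chain_latch.unlock();
  return true;
}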

storage/innobase/include/srw_lock.h

Lines changed: 35 additions & 9 deletions
@@ -34,6 +34,16 @@ this program; if not, write to the Free Software Foundation, Inc.,
 # define SUX_LOCK_GENERIC /* Use dummy implementation for debugging purposes */
 #endif
 
+#ifndef UNIV_PFS_RWLOCK
+# define SRW_LOCK_INIT(key) init()
+# define SRW_LOCK_ARGS(file, line) /* nothing */
+# define SRW_LOCK_CALL /* nothing */
+#else
+# define SRW_LOCK_INIT(key) init(key)
+# define SRW_LOCK_ARGS(file, line) file, line
+# define SRW_LOCK_CALL __FILE__, __LINE__
+#endif
+
 /** An exclusive-only variant of srw_lock */
 template<bool spinloop>
 class pthread_mutex_wrapper final
@@ -310,6 +320,7 @@ class ssux_lock_impl
     ut_ad(lk < WRITER);
     u_unlock();
   }
+  void wr_rd_downgrade() noexcept { wr_u_downgrade(); u_rd_downgrade(); }
 
   void rd_unlock() noexcept
   {
@@ -415,16 +426,9 @@ typedef ssux_lock_impl<true> srw_spin_lock_low;
 #endif
 
 #ifndef UNIV_PFS_RWLOCK
-# define SRW_LOCK_INIT(key) init()
-# define SRW_LOCK_ARGS(file, line) /* nothing */
-# define SRW_LOCK_CALL /* nothing */
 typedef srw_lock_low srw_lock;
 typedef srw_spin_lock_low srw_spin_lock;
 #else
-# define SRW_LOCK_INIT(key) init(key)
-# define SRW_LOCK_ARGS(file, line) file, line
-# define SRW_LOCK_CALL __FILE__, __LINE__
-
 /** Slim shared-update-exclusive lock with PERFORMANCE_SCHEMA instrumentation */
 class ssux_lock
 {
@@ -556,6 +560,23 @@ class srw_lock_impl
       PSI_RWLOCK_CALL(unlock_rwlock)(pfs_psi);
     lock.wr_unlock();
   }
+# if defined _WIN32 || defined SUX_LOCK_GENERIC
+# else
+  void wr_rd_downgrade(const char *file, unsigned line) noexcept
+  {
+    if (psi_likely(pfs_psi != nullptr))
+    {
+      PSI_RWLOCK_CALL(unlock_rwlock)(pfs_psi);
+      PSI_rwlock_locker_state state;
+      if (PSI_rwlock_locker *locker=
+          PSI_RWLOCK_CALL(start_rwlock_rdwait)
+          (&state, pfs_psi, PSI_RWLOCK_READLOCK, file, line))
+        PSI_RWLOCK_CALL(end_rwlock_rdwait)(locker, 0);
+    }
+
+    lock.wr_rd_downgrade();
+  }
+#endif
   bool rd_lock_try() noexcept { return lock.rd_lock_try(); }
   bool wr_lock_try() noexcept { return lock.wr_lock_try(); }
   void lock_shared() noexcept { return rd_lock(SRW_LOCK_CALL); }
@@ -595,20 +616,25 @@ class srw_lock_debug : private srw_lock
   void SRW_LOCK_INIT(mysql_pfs_key_t key) noexcept;
   void destroy() noexcept;
 
-#ifndef SUX_LOCK_GENERIC
+# ifndef SUX_LOCK_GENERIC
   /** @return whether any lock may be held by any thread */
   bool is_locked_or_waiting() const noexcept
   { return srw_lock::is_locked_or_waiting(); }
   /** @return whether an exclusive lock may be held by any thread */
   bool is_write_locked() const noexcept { return srw_lock::is_write_locked(); }
-#endif
+# endif
 
   /** Acquire an exclusive lock */
   void wr_lock(SRW_LOCK_ARGS(const char *file, unsigned line)) noexcept;
   /** @return whether an exclusive lock was acquired */
   bool wr_lock_try() noexcept;
   /** Release after wr_lock() */
   void wr_unlock() noexcept;
+# if defined _WIN32 || defined SUX_LOCK_GENERIC
+# else
+  /** Downgrade wr_lock() to rd_lock() */
+  void wr_rd_downgrade(SRW_LOCK_ARGS(const char*,unsigned)) noexcept;
+# endif
   /** Acquire a shared lock */
   void rd_lock(SRW_LOCK_ARGS(const char *file, unsigned line)) noexcept;
   /** @return whether a shared lock was acquired */
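A sketch of how the new downgrade could be used, matching the pattern the
commit message describes for btr_search_build_page_hash_index(). The helper
name is hypothetical, and the guard mirrors the one in the header, since
the operation does not exist in _WIN32 or SUX_LOCK_GENERIC builds.

#include "srw_lock.h"

#if !defined _WIN32 && !defined SUX_LOCK_GENERIC
// Assumed usage pattern only; the actual code lives in the (not shown)
// btr0sea.cc changes.
static void build_with_downgrade_example(srw_spin_lock &latch)
{
  latch.wr_lock(SRW_LOCK_CALL);         // exclusive: set up metadata
  /* ... work that still requires exclusion ... */
  latch.wr_rd_downgrade(SRW_LOCK_CALL); // keep a shared latch, with no
                                        // window where the latch is free
  /* ... insert the records into the adaptive hash index ... */
  latch.rd_unlock();
}
#endif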

storage/innobase/srv/srv0srv.cc

Lines changed: 3 additions & 5 deletions
@@ -748,7 +748,8 @@ srv_printf_innodb_monitor(
 		part.blocks_mutex.wr_lock();
 		fprintf(file, "Hash table size " ULINTPF
 			", node heap has " ULINTPF " buffer(s)\n",
-			part.table.n_cells, part.blocks.count + !!part.spare);
+			size_t{part.table.n_cells},
+			part.blocks.count + !!part.spare);
 		part.blocks_mutex.wr_unlock();
 	}
 
@@ -830,10 +831,7 @@ srv_export_innodb_status(void)
 
 	ulint mem_adaptive_hash = 0;
 	for (ulong i = 0; i < btr_search.n_parts; i++) {
-		btr_sea::partition& part= btr_search.parts[i];
-		part.blocks_mutex.wr_lock();
-		mem_adaptive_hash += part.blocks.count + !!part.spare;
-		part.blocks_mutex.wr_unlock();
+		mem_adaptive_hash += btr_search.parts[i].get_blocks();
 	}
 	mem_adaptive_hash <<= srv_page_size_shift;
 	btr_search.parts[0].latch.rd_lock(SRW_LOCK_CALL);

storage/innobase/sync/srw_lock.cc

Lines changed: 12 additions & 0 deletions
@@ -669,6 +669,18 @@ void srw_lock_debug::wr_unlock() noexcept
   srw_lock::wr_unlock();
 }
 
+# if defined _WIN32 || defined SUX_LOCK_GENERIC
+# else
+void srw_lock_debug::wr_rd_downgrade
+  (SRW_LOCK_ARGS(const char *file, unsigned line)) noexcept
+{
+  ut_ad(have_wr());
+  writer.store(0, std::memory_order_relaxed);
+  readers_register();
+  srw_lock::wr_rd_downgrade(SRW_LOCK_ARGS(file, line));
+}
+# endif
+
 void srw_lock_debug::readers_register() noexcept
 {
   readers_lock.wr_lock();
