Skip to content

Commit 60ed479

Browse files
committed
MDEV-26004 Excessive wait times in buf_LRU_get_free_block()
buf_LRU_get_free_block(): Initially wait for a single block to be freed, as signaled by buf_pool.done_free. Only if that fails and no LRU eviction flushing batch is already running do we initiate a flushing batch, which should serve all threads that are currently waiting in buf_LRU_get_free_block(). Note: In an extreme case, this may introduce a performance regression at larger numbers of connections. We observed this in sysbench oltp_update_index with a 512MiB buffer pool, 4GiB of data on fast NVMe, and 1000 concurrent connections, on a 20-thread CPU. The contention point appears to be buf_pool.mutex, and the improvement would turn into a regression somewhere beyond 32 concurrent connections. On slower storage, no such regression was observed; instead, the throughput improved and the maximum latency was reduced. The excessive waits were pointed out by Vladislav Vaintroub.
1 parent 6441bc6 commit 60ed479

File tree

4 files changed

+30
-12
lines changed

4 files changed

+30
-12
lines changed

storage/innobase/buf/buf0buf.cc

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1467,6 +1467,7 @@ bool buf_pool_t::create()
14671467
pthread_cond_init(&done_flush_LRU, nullptr);
14681468
pthread_cond_init(&done_flush_list, nullptr);
14691469
pthread_cond_init(&do_flush_list, nullptr);
1470+
pthread_cond_init(&done_free, nullptr);
14701471

14711472
try_LRU_scan= true;
14721473

@@ -1532,6 +1533,7 @@ void buf_pool_t::close()
15321533
pthread_cond_destroy(&done_flush_LRU);
15331534
pthread_cond_destroy(&done_flush_list);
15341535
pthread_cond_destroy(&do_flush_list);
1536+
pthread_cond_destroy(&done_free);
15351537

15361538
ut_free(chunks);
15371539
chunks= nullptr;

storage/innobase/buf/buf0flu.cc

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -392,18 +392,19 @@ void buf_page_write_complete(const IORequest &request)
392392
rw_lock_sx_unlock_gen(&((buf_block_t*) bpage)->lock, BUF_IO_WRITE);
393393

394394
if (request.is_LRU())
395+
{
395396
buf_LRU_free_page(bpage, true);
396-
else
397-
ut_ad(!temp);
398397

399-
if (request.is_LRU())
400-
{
401398
ut_ad(buf_pool.n_flush_LRU_);
402399
if (!--buf_pool.n_flush_LRU_)
400+
{
403401
pthread_cond_broadcast(&buf_pool.done_flush_LRU);
402+
pthread_cond_signal(&buf_pool.done_free);
403+
}
404404
}
405405
else
406406
{
407+
ut_ad(!temp);
407408
ut_ad(buf_pool.n_flush_list_);
408409
if (!--buf_pool.n_flush_list_)
409410
pthread_cond_broadcast(&buf_pool.done_flush_list);
@@ -1717,7 +1718,10 @@ ulint buf_flush_LRU(ulint max_n)
17171718
mysql_mutex_unlock(&buf_pool.mutex);
17181719

17191720
if (!n_flushing)
1721+
{
17201722
pthread_cond_broadcast(&buf_pool.done_flush_LRU);
1723+
pthread_cond_signal(&buf_pool.done_free);
1724+
}
17211725

17221726
buf_dblwr.flush_buffered_writes();
17231727

storage/innobase/buf/buf0lru.cc

Lines changed: 18 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -401,7 +401,7 @@ we put it to free list to be used.
401401
402402
@param have_mutex whether buf_pool.mutex is already being held
403403
@return the free control block, in state BUF_BLOCK_MEMORY */
404-
buf_block_t* buf_LRU_get_free_block(bool have_mutex)
404+
buf_block_t *buf_LRU_get_free_block(bool have_mutex)
405405
{
406406
ulint n_iterations = 0;
407407
ulint flush_failures = 0;
@@ -413,6 +413,7 @@ buf_block_t* buf_LRU_get_free_block(bool have_mutex)
413413
mysql_mutex_lock(&buf_pool.mutex);
414414
got_mutex:
415415
buf_LRU_check_size_of_non_data_objects();
416+
buf_block_t* block;
416417

417418
DBUG_EXECUTE_IF("ib_lru_force_no_free_page",
418419
if (!buf_lru_free_blocks_error_printed) {
@@ -421,7 +422,8 @@ buf_block_t* buf_LRU_get_free_block(bool have_mutex)
421422

422423
retry:
423424
/* If there is a block in the free list, take it */
424-
if (buf_block_t* block = buf_LRU_get_free_only()) {
425+
if ((block = buf_LRU_get_free_only()) != nullptr) {
426+
got_block:
425427
if (!have_mutex) {
426428
mysql_mutex_unlock(&buf_pool.mutex);
427429
}
@@ -446,10 +448,19 @@ buf_block_t* buf_LRU_get_free_block(bool have_mutex)
446448
buf_pool.try_LRU_scan = false;
447449
}
448450

451+
for (;;) {
452+
if ((block = buf_LRU_get_free_only()) != nullptr) {
453+
goto got_block;
454+
}
455+
if (!buf_pool.n_flush_LRU_) {
456+
break;
457+
}
458+
my_cond_wait(&buf_pool.done_free, &buf_pool.mutex.m_mutex);
459+
}
460+
449461
#ifndef DBUG_OFF
450462
not_found:
451463
#endif
452-
buf_flush_wait_batch_end(true);
453464
mysql_mutex_unlock(&buf_pool.mutex);
454465

455466
if (n_iterations > 20 && !buf_lru_free_blocks_error_printed
@@ -477,13 +488,11 @@ buf_block_t* buf_LRU_get_free_block(bool have_mutex)
477488
}
478489

479490
/* No free block was found: try to flush the LRU list.
480-
This call will flush one page from the LRU and put it on the
481-
free list. That means that the free block is up for grabs for
482-
all user threads.
491+
The freed blocks will be up for grabs for all threads.
483492
484-
TODO: A more elegant way would have been to return the freed
493+
TODO: A more elegant way would have been to return one freed
485494
up block to the caller here but the code that deals with
486-
removing the block from page_hash and LRU_list is fairly
495+
removing the block from buf_pool.page_hash and buf_pool.LRU is fairly
487496
involved (particularly in case of ROW_FORMAT=COMPRESSED pages). We
488497
can do that in a separate patch sometime in future. */
489498

@@ -1027,6 +1036,7 @@ buf_LRU_block_free_non_file_page(
10271036
} else {
10281037
UT_LIST_ADD_FIRST(buf_pool.free, &block->page);
10291038
ut_d(block->page.in_free_list = true);
1039+
pthread_cond_signal(&buf_pool.done_free);
10301040
}
10311041

10321042
MEM_NOACCESS(block->frame, srv_page_size);

storage/innobase/include/buf0buf.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2020,6 +2020,8 @@ class buf_pool_t
20202020
UT_LIST_BASE_NODE_T(buf_page_t) free;
20212021
/*!< base node of the free
20222022
block list */
2023+
/** signaled each time when the free list grows; protected by mutex */
2024+
pthread_cond_t done_free;
20232025

20242026
UT_LIST_BASE_NODE_T(buf_page_t) withdraw;
20252027
/*!< base node of the withdraw

0 commit comments

Comments
 (0)