Skip to content

Commit 77bebe9

Browse files
MDEV-36226 Stall and crash when page cleaner fails to generate free pages during Async flush
During regular iteration the page cleaner does flush from flush list with some flush target and then goes for generating free pages from LRU tail. When asynchronous flush is triggered i.e. when 7/8 th of the LSN margin is filled in the redo log, the flush target for flush list is set to innodb_io_capacity_max. If it could flush all, the flush bandwidth for LRU flush is currently set to zero. If the LRU tail has dirty pages, page cleaner ends up freeing no pages in one iteration. The scenario could repeat across multiple iterations till async flush target is reached. During this time the DB system is starved of free pages resulting in apparent stall and in some cases dict_sys latch fatal error. Fix: In page cleaner iteration, before LRU flush, ensure we provide enough flush limit so that freeing pages is no blocked by dirty pages in LRU tail. Log IO and flush state if double write flush wait is long. Reviewed by: Marko Mäkelä
1 parent 51cc77f commit 77bebe9

File tree

9 files changed

+143
-8
lines changed

9 files changed

+143
-8
lines changed

mysql-test/mariadb-test-run.pl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4464,6 +4464,7 @@ ($$)
44644464
qr/InnoDB: innodb_open_files .* should not be greater than/,
44654465
qr/InnoDB: Trying to delete tablespace.*but there are.*pending/,
44664466
qr/InnoDB: Tablespace 1[0-9]* was not found at .*, and innodb_force_recovery was set/,
4467+
qr/InnoDB: Long wait \([0-9]+ seconds\) for double-write buffer flush/,
44674468
qr/Slave: Unknown table 't1' .* 1051/,
44684469
qr/Slave SQL:.*(Internal MariaDB error code: [[:digit:]]+|Query:.*)/,
44694470
qr/slave SQL thread aborted/,

storage/innobase/buf/buf0dblwr.cc

Lines changed: 50 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -603,20 +603,67 @@ static void buf_dblwr_check_block(const buf_page_t *bpage) noexcept
603603
}
604604
#endif /* UNIV_DEBUG */
605605

606+
ATTRIBUTE_COLD void buf_dblwr_t::print_info() const noexcept
607+
{
608+
mysql_mutex_assert_owner(&mutex);
609+
const slot *flush_slot= active_slot == &slots[0] ? &slots[1] : &slots[0];
610+
611+
sql_print_information("InnoDB: Double Write State\n"
612+
"-------------------\n"
613+
"Batch running : %s\n"
614+
"Active Slot - first_free: %zu reserved: %zu\n"
615+
"Flush Slot - first_free: %zu reserved: %zu\n"
616+
"-------------------",
617+
(batch_running ? "true" : "false"),
618+
active_slot->first_free, active_slot->reserved,
619+
flush_slot->first_free, flush_slot->reserved);
620+
}
621+
606622
bool buf_dblwr_t::flush_buffered_writes(const ulint size) noexcept
607623
{
608624
mysql_mutex_assert_owner(&mutex);
609625
ut_ad(size == block_size());
610626

611-
for (;;)
627+
const size_t max_count= 60 * 60;
628+
const size_t first_log_count= 30;
629+
const size_t fatal_threshold=
630+
static_cast<size_t>(srv_fatal_semaphore_wait_threshold);
631+
size_t log_count= first_log_count;
632+
633+
for (ulong count= 0;;)
612634
{
613635
if (!active_slot->first_free)
614636
return false;
615637
if (!batch_running)
616638
break;
617-
my_cond_wait(&cond, &mutex.m_mutex);
618-
}
619639

640+
timespec abstime;
641+
set_timespec(abstime, 1);
642+
my_cond_timedwait(&cond, &mutex.m_mutex, &abstime);
643+
644+
if (count > fatal_threshold)
645+
{
646+
buf_pool.print_flush_info();
647+
print_info();
648+
ib::fatal() << "InnoDB: Long wait (" << count
649+
<< " seconds) for double-write buffer flush.";
650+
}
651+
else if (++count < first_log_count && !(count % 5))
652+
{
653+
sql_print_information("InnoDB: Long wait (%zu seconds) for double-write"
654+
" buffer flush.", count);
655+
buf_pool.print_flush_info();
656+
print_info();
657+
}
658+
else if (!(count % log_count))
659+
{
660+
sql_print_warning("InnoDB: Long wait (%zu seconds) for double-write"
661+
" buffer flush.", count);
662+
buf_pool.print_flush_info();
663+
print_info();
664+
log_count= log_count >= max_count ? max_count : log_count * 2;
665+
}
666+
}
620667
ut_ad(active_slot->reserved == active_slot->first_free);
621668
ut_ad(!flushing_buffered_writes);
622669

storage/innobase/buf/buf0flu.cc

Lines changed: 73 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1374,7 +1374,10 @@ static void buf_flush_LRU_list_batch(ulint max, flush_counters_t *n) noexcept
13741374
break;
13751375
}
13761376

1377-
if (neighbors && space->is_rotational())
1377+
if (neighbors && space->is_rotational() &&
1378+
/* Skip neighbourhood flush from LRU list if we haven't yet reached
1379+
half of the free page target. */
1380+
UT_LIST_GET_LEN(buf_pool.free) * 2 >= free_limit)
13781381
n->flushed+= buf_flush_try_neighbors(space, page_id, bpage,
13791382
neighbors == 1,
13801383
n->flushed, max);
@@ -1763,8 +1766,16 @@ static ulint buf_flush_LRU(ulint max_n) noexcept
17631766
buf_do_LRU_batch(max_n, &n);
17641767

17651768
ulint pages= n.flushed;
1769+
ulint evicted= n.evicted;
17661770

1767-
if (n.evicted)
1771+
/* If we have exhausted flush quota, it is likely we exited before
1772+
generating enough free pages. Call once more with 0 flush to generate
1773+
free pages immediately as required. */
1774+
if (pages >= max_n)
1775+
buf_do_LRU_batch(0, &n);
1776+
1777+
evicted+= n.evicted;
1778+
if (evicted)
17681779
{
17691780
buf_pool.try_LRU_scan= true;
17701781
pthread_cond_broadcast(&buf_pool.done_free);
@@ -2339,6 +2350,11 @@ static void buf_flush_page_cleaner() noexcept
23392350
DBUG_EXECUTE_IF("ib_page_cleaner_sleep",
23402351
{
23412352
std::this_thread::sleep_for(std::chrono::seconds(1));
2353+
/* Cover the logging code in debug mode. */
2354+
buf_pool.print_flush_info();
2355+
buf_dblwr.lock();
2356+
buf_dblwr.print_info();
2357+
buf_dblwr.unlock();
23422358
});
23432359
lsn_limit= buf_flush_sync_lsn;
23442360

@@ -2543,6 +2559,10 @@ static void buf_flush_page_cleaner() noexcept
25432559

25442560
n= srv_max_io_capacity;
25452561
n= n >= n_flushed ? n - n_flushed : 0;
2562+
/* It is critical to generate free pages to keep the system alive. Make
2563+
sure we are not hindered by dirty pages in LRU tail. */
2564+
n= std::max<ulint>(n, std::min<ulint>(srv_max_io_capacity,
2565+
buf_pool.LRU_scan_depth));
25462566
goto LRU_flush;
25472567
}
25482568

@@ -2581,10 +2601,11 @@ ATTRIBUTE_COLD void buf_pool_t::LRU_warn() noexcept
25812601
mysql_mutex_assert_owner(&mutex);
25822602
try_LRU_scan= false;
25832603
if (!LRU_warned.test_and_set(std::memory_order_acquire))
2604+
{
25842605
sql_print_warning("InnoDB: Could not free any blocks in the buffer pool!"
2585-
" %zu blocks are in use and %zu free."
2586-
" Consider increasing innodb_buffer_pool_size.",
2587-
UT_LIST_GET_LEN(LRU), UT_LIST_GET_LEN(free));
2606+
" Consider increasing innodb_buffer_pool_size.");
2607+
buf_pool.print_flush_info();
2608+
}
25882609
}
25892610

25902611
/** Initialize page_cleaner. */
@@ -2664,6 +2685,53 @@ void buf_flush_sync() noexcept
26642685
thd_wait_end(nullptr);
26652686
}
26662687

2688+
ATTRIBUTE_COLD void buf_pool_t::print_flush_info() const noexcept
2689+
{
2690+
/* We do dirty read of UT_LIST count variable. */
2691+
size_t lru_size= UT_LIST_GET_LEN(LRU);
2692+
size_t dirty_size= UT_LIST_GET_LEN(flush_list);
2693+
size_t free_size= UT_LIST_GET_LEN(free);
2694+
size_t dirty_pct= lru_size ? dirty_size * 100 / (lru_size + free_size) : 0;
2695+
sql_print_information("InnoDB: Buffer Pool pages\n"
2696+
"-------------------\n"
2697+
"LRU Pages : %zu\n"
2698+
"Free Pages : %zu\n"
2699+
"Dirty Pages: %zu : %zu%%\n"
2700+
"-------------------",
2701+
lru_size, free_size, dirty_size, dirty_pct);
2702+
2703+
lsn_t lsn= log_sys.get_lsn();
2704+
lsn_t clsn= log_sys.last_checkpoint_lsn;
2705+
sql_print_information("InnoDB: LSN flush parameters\n"
2706+
"-------------------\n"
2707+
"System LSN : %" PRIu64 "\n"
2708+
"Checkpoint LSN: %" PRIu64 "\n"
2709+
"Flush ASync LSN: %" PRIu64 "\n"
2710+
"Flush Sync LSN: %" PRIu64 "\n"
2711+
"-------------------",
2712+
lsn, clsn, buf_flush_async_lsn.load(), buf_flush_sync_lsn.load());
2713+
2714+
lsn_t age= lsn - clsn;
2715+
lsn_t age_pct= log_sys.max_checkpoint_age
2716+
? age * 100 / log_sys.max_checkpoint_age : 0;
2717+
sql_print_information("InnoDB: LSN age parameters\n"
2718+
"-------------------\n"
2719+
"Current Age : %" PRIu64 " : %" PRIu64 "%%\n"
2720+
"Max Age(Async): %" PRIu64 "\n"
2721+
"Max Age(Sync) : %" PRIu64 "\n"
2722+
"Capacity : %" PRIu64 "\n"
2723+
"-------------------",
2724+
age, age_pct, log_sys.max_modified_age_async, log_sys.max_checkpoint_age,
2725+
log_sys.log_capacity);
2726+
2727+
sql_print_information("InnoDB: Pending IO count\n"
2728+
"-------------------\n"
2729+
"Pending Read : %zu\n"
2730+
"Pending Write: %zu\n"
2731+
"-------------------",
2732+
os_aio_pending_reads_approx(), os_aio_pending_writes_approx());
2733+
}
2734+
26672735
#ifdef UNIV_DEBUG
26682736
/** Functor to validate the flush list. */
26692737
struct Check {

storage/innobase/dict/dict0dict.cc

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@ Created 1/8/1996 Heikki Tuuri
4444
#include "btr0cur.h"
4545
#include "btr0sea.h"
4646
#include "buf0buf.h"
47+
#include "buf0flu.h"
4748
#include "data0type.h"
4849
#include "dict0boot.h"
4950
#include "dict0load.h"
@@ -1024,7 +1025,10 @@ void dict_sys_t::lock_wait(SRW_LOCK_ARGS(const char *file, unsigned line)) noexc
10241025
const ulong threshold= srv_fatal_semaphore_wait_threshold;
10251026

10261027
if (waited >= threshold)
1028+
{
1029+
buf_pool.print_flush_info();
10271030
ib::fatal() << fatal_msg;
1031+
}
10281032

10291033
if (waited > threshold / 4)
10301034
ib::warn() << "A long wait (" << waited

storage/innobase/include/buf0buf.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1951,6 +1951,9 @@ class buf_pool_t
19511951
/** Issue a warning that we could not free up buffer pool pages. */
19521952
ATTRIBUTE_COLD void LRU_warn() noexcept;
19531953

1954+
/** Print buffer pool flush state information. */
1955+
ATTRIBUTE_COLD void print_flush_info() const noexcept;
1956+
19541957
private:
19551958
/** Temporary memory for page_compressed and encrypted I/O */
19561959
struct io_buf_t

storage/innobase/include/buf0dblwr.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -159,6 +159,9 @@ class buf_dblwr_t
159159
my_cond_wait(&cond, &mutex.m_mutex);
160160
mysql_mutex_unlock(&mutex);
161161
}
162+
163+
/** Print double write state information. */
164+
ATTRIBUTE_COLD void print_info() const noexcept;
162165
};
163166

164167
/** The doublewrite buffer */

storage/innobase/include/os0file.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1021,6 +1021,8 @@ size_t os_aio_pending_reads() noexcept;
10211021
size_t os_aio_pending_reads_approx() noexcept;
10221022
/** @return number of pending writes */
10231023
size_t os_aio_pending_writes() noexcept;
1024+
/** @return approximate number of pending writes */
1025+
size_t os_aio_pending_writes_approx() noexcept;
10241026

10251027
/** Wait until there are no pending asynchronous writes.
10261028
@param declare whether the wait will be declared in tpool */

storage/innobase/os/os0file.cc

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3481,6 +3481,12 @@ size_t os_aio_pending_writes() noexcept
34813481
return pending;
34823482
}
34833483

3484+
/** @return approximate number of pending writes */
3485+
size_t os_aio_pending_writes_approx() noexcept
3486+
{
3487+
return write_slots->pending_io_count();
3488+
}
3489+
34843490
/** Wait until all pending asynchronous reads have completed.
34853491
@param declare whether the wait will be declared in tpool */
34863492
void os_aio_wait_until_no_pending_reads(bool declare) noexcept

storage/innobase/srv/srv0srv.cc

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1220,6 +1220,7 @@ void srv_monitor_task(void*)
12201220
now -= start;
12211221
ulong waited = static_cast<ulong>(now / 1000000);
12221222
if (waited >= threshold) {
1223+
buf_pool.print_flush_info();
12231224
ib::fatal() << dict_sys.fatal_msg;
12241225
}
12251226

0 commit comments

Comments
 (0)