Skip to content

Commit a7f0d79

Browse files
committed
MDEV-35155: Small innodb_log_file_size may lead to excessive writing
Any InnoDB write workload is marking data pages in the buffer pool dirty. To make the changes durable, it suffices to write to the write-ahead log to facilitate crash recovery. The longer we keep pages dirty in the buffer pool, the better, because the same pages could be modified again and a single write to the file system could cover several changes. Eventually, we must write out dirty pages, so that we can advance the log checkpoint, that is, allow the start of the recovery log to be discarded. (On a clean shutdown, a full checkpoint to the end of the log will be made.) A write workload can be bound by innodb_buffer_pool_size (we must write out changes and evict data pages to make room for others) or by innodb_log_file_size (we must advance the log checkpoint before the tail of the circular ib_logfile0 would overwrite the previous checkpoint). In innodb_log_file_size bound workloads, we failed to set an optimal target for the next checkpoint LSN. If we write out too few pages, then all writer threads may occasionally be blocked in log_free_check() while the buf_flush_page_cleaner() thread is resolving the situation. If we write out too many pages, then the I/O subsystem will be loaded unnecessarily and there will be some write amplification in case some of the unnecessarily written pages would be modified soon afterwards. log_close(): Return the target LSN for buf_flush_ahead(lsn), bitwise-ORed with the "furious" flag, or the special values 0 indicating that no flushing is needed, which is the usual case. log_checkpoint_margin(): Use a similar page checkpoint target as log_close() for the !not_furious case. mtr_flush_ahead(): A wrapper for buf_flush_ahead(). mtr_t::commit_log_release(): Make some common code non-inline in order to reduce code duplication. buf_flush_ahead(lsn, furious=false): Avoid an unnecessary wake-up of the page cleaner if it is scheduled to wake up once per second. Co-developed-by: Alessandro Vetere Reviewed by: Vladislav Lesin
1 parent e436481 commit a7f0d79

File tree

4 files changed

+91
-61
lines changed

4 files changed

+91
-61
lines changed

storage/innobase/buf/buf0flu.cc

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2191,10 +2191,24 @@ ATTRIBUTE_COLD void buf_flush_ahead(lsn_t lsn, bool furious) noexcept
21912191
if (limit < lsn)
21922192
{
21932193
limit= lsn;
2194-
buf_pool.page_cleaner_set_idle(false);
2195-
pthread_cond_signal(&buf_pool.do_flush_list);
21962194
if (furious)
2195+
{
2196+
/* Request any concurrent threads to wait for this batch to complete,
2197+
in log_free_check(). */
21972198
log_sys.set_check_for_checkpoint();
2199+
/* Immediately wake up buf_flush_page_cleaner(), even when it
2200+
is in the middle of a 1-second my_cond_timedwait(). */
2201+
wake:
2202+
buf_pool.page_cleaner_set_idle(false);
2203+
pthread_cond_signal(&buf_pool.do_flush_list);
2204+
}
2205+
else if (buf_pool.page_cleaner_idle())
2206+
/* In non-furious mode, concurrent writes to the log will remain
2207+
possible, and we are gently requesting buf_flush_page_cleaner()
2208+
to do more work to avoid a later call with furious=true.
2209+
We will only wake the buf_flush_page_cleaner() from an indefinite
2210+
my_cond_wait(), but we will not disturb the regular 1-second sleep. */
2211+
goto wake;
21982212
}
21992213
mysql_mutex_unlock(&buf_pool.flush_list_mutex);
22002214
}

storage/innobase/include/mtr0mtr.h

Lines changed: 12 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -645,17 +645,6 @@ struct mtr_t {
645645
/** Note that log_sys.latch is no longer being held exclusively. */
646646
void flag_wr_unlock() noexcept { ut_ad(m_latch_ex); m_latch_ex= false; }
647647

648-
/** type of page flushing is needed during commit() */
649-
enum page_flush_ahead
650-
{
651-
/** no need to trigger page cleaner */
652-
PAGE_FLUSH_NO= 0,
653-
/** asynchronous flushing is needed */
654-
PAGE_FLUSH_ASYNC,
655-
/** furious flushing is needed */
656-
PAGE_FLUSH_SYNC
657-
};
658-
659648
private:
660649
/** Handle any pages that were freed during the mini-transaction. */
661650
void process_freed_pages();
@@ -702,29 +691,31 @@ struct mtr_t {
702691
/** Commit the mini-transaction log.
703692
@tparam pmem log_sys.is_mmap()
704693
@param mtr mini-transaction
705-
@param lsns {start_lsn,flush_ahead} */
694+
@param lsns {start_lsn,flush_ahead_lsn} */
706695
template<bool pmem>
707-
static void commit_log(mtr_t *mtr, std::pair<lsn_t,page_flush_ahead> lsns)
708-
noexcept;
696+
static void commit_log(mtr_t *mtr, std::pair<lsn_t,lsn_t> lsns) noexcept;
697+
698+
/** Release log_sys.latch. */
699+
void commit_log_release() noexcept;
709700

710701
/** Append the redo log records to the redo log buffer.
711-
@return {start_lsn,flush_ahead} */
712-
std::pair<lsn_t,page_flush_ahead> do_write();
702+
@return {start_lsn,flush_ahead_lsn} */
703+
std::pair<lsn_t,lsn_t> do_write() noexcept;
713704

714705
/** Append the redo log records to the redo log buffer.
715706
@tparam mmap log_sys.is_mmap()
716707
@param mtr mini-transaction
717708
@param len number of bytes to write
718-
@return {start_lsn,flush_ahead} */
709+
@return {start_lsn,flush_ahead_lsn} */
719710
template<bool mmap> static
720-
std::pair<lsn_t,page_flush_ahead> finish_writer(mtr_t *mtr, size_t len);
711+
std::pair<lsn_t,lsn_t> finish_writer(mtr_t *mtr, size_t len);
721712

722713
/** The applicable variant of commit_log() */
723-
static void (*commit_logger)(mtr_t *, std::pair<lsn_t,page_flush_ahead>);
714+
static void (*commit_logger)(mtr_t *, std::pair<lsn_t,lsn_t>);
724715
/** The applicable variant of finish_writer() */
725-
static std::pair<lsn_t,page_flush_ahead> (*finisher)(mtr_t *, size_t);
716+
static std::pair<lsn_t,lsn_t> (*finisher)(mtr_t *, size_t);
726717

727-
std::pair<lsn_t,page_flush_ahead> finish_write(size_t len)
718+
std::pair<lsn_t,lsn_t> finish_write(size_t len)
728719
{ return finisher(this, len); }
729720
public:
730721
/** Update finisher when spin_wait_delay is changing to or from 0. */

storage/innobase/log/log0log.cc

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1304,10 +1304,10 @@ ATTRIBUTE_COLD static void log_checkpoint_margin() noexcept
13041304
}
13051305

13061306
const lsn_t lsn= log_sys.get_lsn();
1307-
const lsn_t checkpoint= log_sys.last_checkpoint_lsn;
1308-
const lsn_t sync_lsn= checkpoint + log_sys.max_checkpoint_age;
1307+
const lsn_t max_age= log_sys.max_checkpoint_age;
1308+
const lsn_t age= lsn_t(lsn - log_sys.last_checkpoint_lsn);
13091309

1310-
if (lsn <= sync_lsn)
1310+
if (age <= max_age)
13111311
{
13121312
#ifndef DBUG_OFF
13131313
skip_checkpoint:
@@ -1320,7 +1320,7 @@ ATTRIBUTE_COLD static void log_checkpoint_margin() noexcept
13201320
log_sys.latch.wr_unlock();
13211321

13221322
/* We must wait to prevent the tail of the log overwriting the head. */
1323-
buf_flush_wait_flushed(std::min(sync_lsn, checkpoint + (1U << 20)));
1323+
buf_flush_wait_flushed(lsn - max_age);
13241324
/* Sleep to avoid a thundering herd */
13251325
std::this_thread::sleep_for(std::chrono::milliseconds(10));
13261326
}

storage/innobase/mtr/mtr0mtr.cc

Lines changed: 59 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -40,10 +40,10 @@ Created 11/26/1995 Heikki Tuuri
4040
#include "my_cpu.h"
4141

4242
#ifdef HAVE_PMEM
43-
void (*mtr_t::commit_logger)(mtr_t *, std::pair<lsn_t,page_flush_ahead>);
43+
void (*mtr_t::commit_logger)(mtr_t *, std::pair<lsn_t,lsn_t>);
4444
#endif
4545

46-
std::pair<lsn_t,mtr_t::page_flush_ahead> (*mtr_t::finisher)(mtr_t *, size_t);
46+
std::pair<lsn_t,lsn_t> (*mtr_t::finisher)(mtr_t *, size_t);
4747

4848
void mtr_t::finisher_update()
4949
{
@@ -335,9 +335,25 @@ void mtr_t::release()
335335
m_memo.clear();
336336
}
337337

338+
ATTRIBUTE_NOINLINE void mtr_t::commit_log_release() noexcept
339+
{
340+
if (m_latch_ex)
341+
{
342+
log_sys.latch.wr_unlock();
343+
m_latch_ex= false;
344+
}
345+
else
346+
log_sys.latch.rd_unlock();
347+
}
348+
349+
static ATTRIBUTE_NOINLINE ATTRIBUTE_COLD
350+
void mtr_flush_ahead(lsn_t flush_lsn) noexcept
351+
{
352+
buf_flush_ahead(flush_lsn, bool(flush_lsn & 1));
353+
}
354+
338355
template<bool mmap>
339-
void mtr_t::commit_log(mtr_t *mtr, std::pair<lsn_t,page_flush_ahead> lsns)
340-
noexcept
356+
void mtr_t::commit_log(mtr_t *mtr, std::pair<lsn_t,lsn_t> lsns) noexcept
341357
{
342358
size_t modified= 0;
343359

@@ -378,25 +394,12 @@ void mtr_t::commit_log(mtr_t *mtr, std::pair<lsn_t,page_flush_ahead> lsns)
378394
buf_pool.page_cleaner_wakeup();
379395
mysql_mutex_unlock(&buf_pool.flush_list_mutex);
380396

381-
if (mtr->m_latch_ex)
382-
{
383-
log_sys.latch.wr_unlock();
384-
mtr->m_latch_ex= false;
385-
}
386-
else
387-
log_sys.latch.rd_unlock();
388-
397+
mtr->commit_log_release();
389398
mtr->release();
390399
}
391400
else
392401
{
393-
if (mtr->m_latch_ex)
394-
{
395-
log_sys.latch.wr_unlock();
396-
mtr->m_latch_ex= false;
397-
}
398-
else
399-
log_sys.latch.rd_unlock();
402+
mtr->commit_log_release();
400403

401404
for (auto it= mtr->m_memo.rbegin(); it != mtr->m_memo.rend(); )
402405
{
@@ -456,8 +459,11 @@ void mtr_t::commit_log(mtr_t *mtr, std::pair<lsn_t,page_flush_ahead> lsns)
456459

457460
mariadb_increment_pages_updated(modified);
458461

459-
if (UNIV_UNLIKELY(lsns.second != PAGE_FLUSH_NO))
460-
buf_flush_ahead(mtr->m_commit_lsn, lsns.second == PAGE_FLUSH_SYNC);
462+
if (UNIV_UNLIKELY(lsns.second != 0))
463+
{
464+
ut_ad(lsns.second < mtr->m_commit_lsn);
465+
mtr_flush_ahead(lsns.second);
466+
}
461467
}
462468

463469
/** Commit a mini-transaction. */
@@ -480,7 +486,7 @@ void mtr_t::commit()
480486
}
481487

482488
ut_ad(!srv_read_only_mode);
483-
std::pair<lsn_t,page_flush_ahead> lsns{do_write()};
489+
std::pair<lsn_t,lsn_t> lsns{do_write()};
484490
process_freed_pages();
485491
#ifdef HAVE_PMEM
486492
commit_logger(this, lsns);
@@ -971,24 +977,44 @@ std::pair<lsn_t,byte*> log_t::append_prepare(size_t size, bool ex) noexcept
971977

972978
/** Finish appending data to the log.
973979
@param lsn the end LSN of the log record
974-
@return whether buf_flush_ahead() will have to be invoked */
975-
static mtr_t::page_flush_ahead log_close(lsn_t lsn) noexcept
980+
@return lsn for invoking buf_flush_ahead() on, with "furious" flag in the LSB
981+
@retval 0 if buf_flush_ahead() will not have to be invoked */
982+
static lsn_t log_close(lsn_t lsn) noexcept
976983
{
977984
ut_ad(log_sys.latch_have_any());
978985

979986
const lsn_t checkpoint_age= lsn - log_sys.last_checkpoint_lsn;
987+
const lsn_t max_age= log_sys.max_modified_age_async;
980988

981989
if (UNIV_UNLIKELY(checkpoint_age >= log_sys.log_capacity) &&
982990
/* silence message on create_log_file() after the log had been deleted */
983991
checkpoint_age != lsn)
984992
log_overwrite_warning(lsn);
985-
else if (UNIV_LIKELY(checkpoint_age <= log_sys.max_modified_age_async))
986-
return mtr_t::PAGE_FLUSH_NO;
987-
else if (UNIV_LIKELY(checkpoint_age <= log_sys.max_checkpoint_age))
988-
return mtr_t::PAGE_FLUSH_ASYNC;
989-
990-
log_sys.set_check_for_checkpoint();
991-
return mtr_t::PAGE_FLUSH_SYNC;
993+
else if (UNIV_LIKELY(checkpoint_age <= max_age))
994+
return 0;
995+
996+
/* The last checkpoint is too old. Let us set an appropriate
997+
checkpoint age target, that is, a checkpoint LSN target that is the
998+
current LSN minus the maximum age. Let us see if are exceeding the
999+
log_checkpoint_margin() limit that will involve a synchronous wait
1000+
in each write operation. */
1001+
1002+
const bool furious{checkpoint_age >= log_sys.max_checkpoint_age};
1003+
1004+
/* If furious==true, we could set a less aggressive target
1005+
(lsn - log_sys.max_checkpoint_age) instead of what we will be using
1006+
in both cases (lsn - log_sys.max_checkpoint_age_async).
1007+
1008+
The aim of the more aggressive target is that mtr_flush_ahead() will
1009+
request more progress in buf_flush_page_cleaner() sooner, so that it
1010+
will be less likely that several threads will end up waiting in
1011+
log_checkpoint_margin(). That function will use the less aggressive
1012+
limit (lsn - log_sys.max_checkpoint_age) in order to minimize the
1013+
synchronous wait time. */
1014+
if (furious)
1015+
log_sys.set_check_for_checkpoint();
1016+
1017+
return ((lsn - max_age) & ~lsn_t{1}) | lsn_t{furious};
9921018
}
9931019

9941020
inline void mtr_t::page_checksum(const buf_page_t &bpage)
@@ -1034,7 +1060,7 @@ inline void mtr_t::page_checksum(const buf_page_t &bpage)
10341060
m_log.close(l + 4);
10351061
}
10361062

1037-
std::pair<lsn_t,mtr_t::page_flush_ahead> mtr_t::do_write()
1063+
std::pair<lsn_t,lsn_t> mtr_t::do_write() noexcept
10381064
{
10391065
ut_ad(!recv_no_log_write);
10401066
ut_ad(is_logged());
@@ -1186,8 +1212,7 @@ inline void log_t::append(byte *&d, const void *s, size_t size) noexcept
11861212
}
11871213

11881214
template<bool mmap>
1189-
std::pair<lsn_t,mtr_t::page_flush_ahead>
1190-
mtr_t::finish_writer(mtr_t *mtr, size_t len)
1215+
std::pair<lsn_t,lsn_t> mtr_t::finish_writer(mtr_t *mtr, size_t len)
11911216
{
11921217
ut_ad(log_sys.is_latest());
11931218
ut_ad(!recv_no_log_write);

0 commit comments

Comments
 (0)