Skip to content

Commit

Permalink
MDEV-33112 innodb_undo_log_truncate=ON is blocking page write
Browse files Browse the repository at this point in the history
When innodb_undo_log_truncate=ON causes an InnoDB undo tablespace
to be truncated, we must guarantee that the undo tablespace will
be rebuilt atomically: After mtr_t::commit_shrink() has durably
written the mini-transaction that rebuilds the undo tablespace,
we must not write any old pages to the tablespace.

To guarantee this, in trx_purge_truncate_history() we used to
traverse the entire buf_pool.flush_list in order to acquire
exclusive latches on all pages for the undo tablespace that
reside in the buffer pool, so that those pages cannot be written
and will be evicted during mtr_t::commit_shrink(). However, this
traversal may interfere with the page writing activity of
buf_flush_page_cleaner(). It would be better to lazily discard
the old pages of the truncated undo tablespace.

fil_space_t::is_being_truncated, fil_space_t::clear_stopping(): Remove.

fil_space_t::create_lsn: A new field, identifying the LSN of the
latest rebuild of a tablespace.

buf_page_t::flush(), buf_flush_try_neighbors(): Evict pages whose
FIL_PAGE_LSN is below fil_space_t::create_lsn.

mtr_t::commit_shrink(): Update fil_space_t::create_lsn and
fil_space_t::size right before the log is durably written and the
tablespace file is truncated.

fsp_page_create(), trx_purge_truncate_history(): Simplify the logic.

Reviewed by: Thirunarayanan Balathandayuthapani, Vladislav Lesin
Performance tested by: Axel Schwenke
Correctness tested by: Matthias Leich
  • Loading branch information
dr-m committed Jan 10, 2024
1 parent 593278f commit 3613fb2
Show file tree
Hide file tree
Showing 8 changed files with 94 additions and 226 deletions.
51 changes: 34 additions & 17 deletions storage/innobase/buf/buf0flu.cc
Original file line number Diff line number Diff line change
Expand Up @@ -792,16 +792,20 @@ bool buf_page_t::flush(bool evict, fil_space_t *space)
ut_ad(space->referenced());

const auto s= state();
ut_a(s >= FREED);

const lsn_t lsn=
mach_read_from_8(my_assume_aligned<8>
(FIL_PAGE_LSN + (zip.data ? zip.data : frame)));
ut_ad(lsn
? lsn >= oldest_modification() || oldest_modification() == 2
: space->purpose != FIL_TYPE_TABLESPACE);

if (s < UNFIXED)
{
ut_a(s >= FREED);
if (UNIV_LIKELY(space->purpose == FIL_TYPE_TABLESPACE))
{
const lsn_t lsn=
mach_read_from_8(my_assume_aligned<8>
(FIL_PAGE_LSN + (zip.data ? zip.data : frame)));
ut_ad(lsn >= oldest_modification());
freed:
if (lsn > log_sys.get_flushed_lsn())
{
mysql_mutex_unlock(&buf_pool.mutex);
Expand All @@ -813,6 +817,12 @@ bool buf_page_t::flush(bool evict, fil_space_t *space)
return false;
}

if (UNIV_UNLIKELY(lsn < space->get_create_lsn()))
{
ut_ad(space->purpose == FIL_TYPE_TABLESPACE);
goto freed;
}

ut_d(const auto f=) zip.fix.fetch_add(WRITE_FIX - UNFIXED);
ut_ad(f >= UNFIXED);
ut_ad(f < READ_FIX);
Expand Down Expand Up @@ -907,16 +917,9 @@ bool buf_page_t::flush(bool evict, fil_space_t *space)

if ((s & LRU_MASK) == REINIT || !space->use_doublewrite())
{
if (UNIV_LIKELY(space->purpose == FIL_TYPE_TABLESPACE))
{
const lsn_t lsn=
mach_read_from_8(my_assume_aligned<8>(FIL_PAGE_LSN +
(write_frame ? write_frame
: frame)));
ut_ad(lsn >= oldest_modification());
if (lsn > log_sys.get_flushed_lsn())
log_write_up_to(lsn, true);
}
if (UNIV_LIKELY(space->purpose == FIL_TYPE_TABLESPACE) &&
lsn > log_sys.get_flushed_lsn())
log_write_up_to(lsn, true);
space->io(IORequest{type, this, slot}, physical_offset(), size,
write_frame, this);
}
Expand Down Expand Up @@ -1096,11 +1099,25 @@ static ulint buf_flush_try_neighbors(fil_space_t *space,
bool contiguous, bool evict,
ulint n_flushed, ulint n_to_flush)
{
mysql_mutex_unlock(&buf_pool.mutex);

ut_ad(space->id == page_id.space());
ut_ad(bpage->id() == page_id);

{
const lsn_t lsn=
mach_read_from_8(my_assume_aligned<8>
(FIL_PAGE_LSN +
(bpage->zip.data ? bpage->zip.data : bpage->frame)));
ut_ad(lsn >= bpage->oldest_modification());
if (UNIV_UNLIKELY(lsn < space->get_create_lsn()))
{
ut_a(!bpage->flush(evict, space));
mysql_mutex_unlock(&buf_pool.mutex);
return 0;
}
}

mysql_mutex_unlock(&buf_pool.mutex);

ulint count= 0;
page_id_t id= page_id;
page_id_t high= buf_flush_check_neighbors(*space, id, contiguous, evict);
Expand Down
15 changes: 5 additions & 10 deletions storage/innobase/fil/fil0fil.cc
Original file line number Diff line number Diff line change
Expand Up @@ -558,7 +558,7 @@ fil_space_extend_must_retry(
ut_ad(UT_LIST_GET_LAST(space->chain) == node);
ut_ad(size >= FIL_IBD_FILE_INITIAL_SIZE);
ut_ad(node->space == space);
ut_ad(space->referenced() || space->is_being_truncated);
ut_ad(space->referenced());

*success = space->size >= size;

Expand Down Expand Up @@ -647,8 +647,7 @@ fil_space_extend_must_retry(
default:
ut_ad(space->purpose == FIL_TYPE_TABLESPACE
|| space->purpose == FIL_TYPE_IMPORT);
if (space->purpose == FIL_TYPE_TABLESPACE
&& !space->is_being_truncated) {
if (space->purpose == FIL_TYPE_TABLESPACE) {
goto do_flush;
}
break;
Expand Down Expand Up @@ -733,12 +732,10 @@ bool fil_space_extend(fil_space_t *space, uint32_t size)
bool success= false;
const bool acquired= space->acquire();
mysql_mutex_lock(&fil_system.mutex);
if (acquired || space->is_being_truncated)
{
if (acquired)
while (fil_space_extend_must_retry(space, UT_LIST_GET_LAST(space->chain),
size, &success))
mysql_mutex_lock(&fil_system.mutex);
}
mysql_mutex_unlock(&fil_system.mutex);
if (acquired)
space->release();
Expand Down Expand Up @@ -3058,11 +3055,9 @@ fil_space_validate_for_mtr_commit(
ut_ad(!is_predefined_tablespace(space->id));

/* We are serving mtr_commit(). While there is an active
mini-transaction, we should have !space->stop_new_ops. This is
mini-transaction, we should have !space->is_stopping(). This is
guaranteed by meta-data locks or transactional locks. */
ut_ad(!space->is_stopping()
|| space->is_being_truncated /* fil_truncate_prepare() */
|| space->referenced());
ut_ad(!space->is_stopping() || space->referenced());
}
#endif /* UNIV_DEBUG */

Expand Down
77 changes: 5 additions & 72 deletions storage/innobase/fsp/fsp0fsp.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1034,77 +1034,11 @@ static
buf_block_t*
fsp_page_create(fil_space_t *space, page_no_t offset, mtr_t *mtr)
{
buf_block_t *block;

if (UNIV_UNLIKELY(space->is_being_truncated))
{
const page_id_t page_id{space->id, offset};
uint32_t state;
block= mtr->get_already_latched(page_id, MTR_MEMO_PAGE_X_FIX);
if (block)
goto have_latch;
else
{
buf_pool_t::hash_chain &chain=
buf_pool.page_hash.cell_get(page_id.fold());
mysql_mutex_lock(&buf_pool.mutex);
block= reinterpret_cast<buf_block_t*>
(buf_pool.page_hash.get(page_id, chain));
if (!block)
{
mysql_mutex_unlock(&buf_pool.mutex);
goto create;
}
}

if (!mtr->have_x_latch(*block))
{
const bool got{block->page.lock.x_lock_try()};
mysql_mutex_unlock(&buf_pool.mutex);
if (!got)
{
block->page.lock.x_lock();
const page_id_t id{block->page.id()};
if (UNIV_UNLIKELY(id != page_id))
{
ut_ad(id.is_corrupted());
block->page.lock.x_unlock();
goto create;
}
}
state= block->page.fix() + 1;
mtr->memo_push(block, MTR_MEMO_PAGE_X_FIX);
}
else
{
mysql_mutex_unlock(&buf_pool.mutex);
have_latch:
state= block->page.state();
}

ut_ad(state > buf_page_t::FREED);
ut_ad(state < buf_page_t::READ_FIX);
ut_ad((state & buf_page_t::LRU_MASK) != buf_page_t::IBUF_EXIST);
ut_ad(block->page.lock.x_lock_count() == 1);
ut_ad(block->page.frame);
#ifdef BTR_CUR_HASH_ADAPT
ut_ad(!block->index);
#endif

block->page.set_reinit(state < buf_page_t::UNFIXED
? buf_page_t::FREED
: (state & buf_page_t::LRU_MASK));
}
else
{
create:
buf_block_t *free_block= buf_LRU_get_free_block(false);
block= buf_page_create(space, static_cast<uint32_t>(offset),
space->zip_size(), mtr, free_block);
if (UNIV_UNLIKELY(block != free_block))
buf_pool.free_block(free_block);
}

buf_block_t *free_block= buf_LRU_get_free_block(false);
buf_block_t *block= buf_page_create(space, static_cast<uint32_t>(offset),
space->zip_size(), mtr, free_block);
if (UNIV_UNLIKELY(block != free_block))
buf_pool.free_block(free_block);
fsp_init_file_page(space, block, mtr);
return block;
}
Expand Down Expand Up @@ -1799,7 +1733,6 @@ fseg_create(fil_space_t *space, ulint byte_offset, mtr_t *mtr, dberr_t *err,

ut_d(const auto x = block->page.lock.x_lock_count());
ut_ad(x || block->page.lock.not_recursive());
ut_ad(x == 1 || space->is_being_truncated);
ut_ad(x <= 2);
ut_ad(!fil_page_get_type(block->page.frame));
mtr->write<1>(*block, FIL_PAGE_TYPE + 1 + block->page.frame,
Expand Down
21 changes: 8 additions & 13 deletions storage/innobase/include/fil0fil.h
Original file line number Diff line number Diff line change
Expand Up @@ -362,8 +362,6 @@ struct fil_space_t final
Protected by log_sys.mutex.
If and only if this is nonzero, the
tablespace will be in named_spaces. */
/** whether undo tablespace truncation is in progress */
bool is_being_truncated;
fil_type_t purpose;/*!< purpose */
UT_LIST_BASE_NODE_T(fil_node_t) chain;
/*!< base node for the file chain */
Expand Down Expand Up @@ -442,13 +440,21 @@ struct fil_space_t final
/** LSN of freeing last page; protected by freed_range_mutex */
lsn_t last_freed_lsn;

/** LSN of undo tablespace creation or 0; protected by latch */
lsn_t create_lsn;
public:
/** @return whether doublewrite buffering is needed */
inline bool use_doublewrite() const;

/** @return whether a page has been freed */
inline bool is_freed(uint32_t page);

/** Set create_lsn. */
inline void set_create_lsn(lsn_t lsn);

/** @return the latest tablespace rebuild LSN, or 0 */
lsn_t get_create_lsn() const { return create_lsn; }

/** Apply freed_ranges to the file.
@param writable whether the file is writable
@return number of pages written or hole-punched */
Expand Down Expand Up @@ -526,9 +532,6 @@ struct fil_space_t final
/** Note that operations on the tablespace must stop. */
inline void set_stopping();

/** Note that operations on the tablespace can resume after truncation */
inline void clear_stopping();

/** Drop the tablespace and wait for any pending operations to cease
@param id tablespace identifier
@param detached_handle pointer to file to be closed later, or nullptr
Expand Down Expand Up @@ -1625,14 +1628,6 @@ inline void fil_space_t::set_stopping()
#endif
}

/** Note that operations on the tablespace can resume after truncation,
by subtracting the STOPPING_WRITES flag from n_pending.
The caller must hold fil_system.mutex. */
inline void fil_space_t::clear_stopping()
{
mysql_mutex_assert_owner(&fil_system.mutex);
static_assert(STOPPING_WRITES == 1U << 30, "compatibility");
/* fetch_sub() returns the value before the subtraction; it is only
needed for the assertion below. */
ut_d(auto n=) n_pending.fetch_sub(STOPPING_WRITES, std::memory_order_relaxed);
/* Of the STOPPING bits, exactly STOPPING_WRITES must have been set. */
ut_ad((n & STOPPING) == STOPPING_WRITES);
}

/** Flush pending writes from the file system cache to the file. */
template<bool have_reference> inline void fil_space_t::flush()
{
Expand Down
5 changes: 3 additions & 2 deletions storage/innobase/include/mtr0mtr.h
Original file line number Diff line number Diff line change
Expand Up @@ -89,8 +89,9 @@ struct mtr_t {
{ auto s= m_memo.size(); rollback_to_savepoint(s - 1, s); }

/** Commit a mini-transaction that is shrinking a tablespace.
@param space tablespace that is being shrunk */
ATTRIBUTE_COLD void commit_shrink(fil_space_t &space);
@param space tablespace that is being shrunk
@param size new size in pages */
ATTRIBUTE_COLD void commit_shrink(fil_space_t &space, uint32_t size);

/** Commit a mini-transaction that is deleting or renaming a file.
@param space tablespace that is being renamed or deleted
Expand Down
2 changes: 1 addition & 1 deletion storage/innobase/include/srv0srv.h
Original file line number Diff line number Diff line change
Expand Up @@ -271,7 +271,7 @@ extern my_bool srv_undo_log_truncate;
extern my_bool srv_prefix_index_cluster_optimization;

/** Default size of UNDO tablespace (10MiB for innodb_page_size=16k) */
constexpr ulint SRV_UNDO_TABLESPACE_SIZE_IN_PAGES= (10U << 20) /
constexpr uint32_t SRV_UNDO_TABLESPACE_SIZE_IN_PAGES= (10U << 20) /
UNIV_PAGE_SIZE_DEF;

extern char* srv_log_group_home_dir;
Expand Down
38 changes: 25 additions & 13 deletions storage/innobase/mtr/mtr0mtr.cc
Original file line number Diff line number Diff line change
Expand Up @@ -258,9 +258,21 @@ void mtr_t::rollback_to_savepoint(ulint begin, ulint end)
m_memo.erase(m_memo.begin() + begin, m_memo.begin() + end);
}

/** Set create_lsn, the LSN of the latest rebuild of this tablespace.
The caller must hold log_sys.mutex.
@param lsn   the value to store in create_lsn */
inline void fil_space_t::set_create_lsn(lsn_t lsn)
{
#ifndef SUX_LOCK_GENERIC
/* create_lsn is protected by latch, which must be held in write mode. */
ut_ad(latch.is_write_locked());
#endif
/* Concurrent log_checkpoint_low() must be impossible. */
mysql_mutex_assert_owner(&log_sys.mutex);
create_lsn= lsn;
}

/** Commit a mini-transaction that is shrinking a tablespace.
@param space tablespace that is being shrunk */
void mtr_t::commit_shrink(fil_space_t &space)
@param space tablespace that is being shrunk
@param size new size in pages */
void mtr_t::commit_shrink(fil_space_t &space, uint32_t size)
{
ut_ad(is_active());
ut_ad(!is_inside_ibuf());
Expand All @@ -278,16 +290,23 @@ void mtr_t::commit_shrink(fil_space_t &space)
const lsn_t start_lsn= do_write().first;
ut_d(m_log.erase());

fil_node_t *file= UT_LIST_GET_LAST(space.chain);
mysql_mutex_lock(&log_sys.flush_order_mutex);
mysql_mutex_lock(&fil_system.mutex);
ut_ad(file->is_open());
space.size= file->size= size;
space.set_create_lsn(m_commit_lsn);
mysql_mutex_unlock(&fil_system.mutex);

space.clear_freed_ranges();

/* Durably write the reduced FSP_SIZE before truncating the data file. */
log_write_and_flush();

os_file_truncate(space.chain.start->name, space.chain.start->handle,
os_offset_t{space.size} << srv_page_size_shift, true);

space.clear_freed_ranges();
os_offset_t{size} << srv_page_size_shift, true);

const page_id_t high{space.id, space.size};
const page_id_t high{space.id, size};

for (mtr_memo_slot_t &slot : m_memo)
{
Expand Down Expand Up @@ -331,13 +350,6 @@ void mtr_t::commit_shrink(fil_space_t &space)

mysql_mutex_unlock(&log_sys.flush_order_mutex);

mysql_mutex_lock(&fil_system.mutex);
ut_ad(space.is_being_truncated);
ut_ad(space.is_stopping_writes());
space.clear_stopping();
space.is_being_truncated= false;
mysql_mutex_unlock(&fil_system.mutex);

release();
release_resources();
srv_stats.log_write_requests.inc();
Expand Down
Loading

0 comments on commit 3613fb2

Please sign in to comment.