Skip to content
Permalink
Browse files
MDEV-18976 Implement OPT_PAGE_CHECKSUM log record for improved validation

We will introduce an optional log record OPT_PAGE_CHECKSUM for recording
page checksums, so that more inconsistencies on crash recovery may be
caught.

mtr_t::page_checksum(const buf_page_t&): Write OPT_PAGE_CHECKSUM
(currently not for ROW_FORMAT=COMPRESSED pages).

mtr_t::do_write(): Write OPT_PAGE_CHECKSUM records for all pages
(currently, in debug builds only).

mtr_t::is_logged(): Return whether log should be written.

mtr_t::set_log_mode_sub(const mtr_t&): Set the logging mode of
a sub-minitransaction when another mini-transaction is holding
latches on some modified pages. When creating or freeing BLOB pages,
we may only write OPT_PAGE_CHECKSUM records in the main mini-transaction,
after all changes have been written to the log.

MTR_LOG_SUB: Log mode for a sub-mini-transaction.

mtr_t::free(): Define non-inline, and invoke MarkFreed.

MarkFreed: For any matching page in the mini-transaction log,
change the first entry to say MTR_MEMO_PAGE_X_MODIFY and any subsequent
entries to MTR_MEMO_PAGE_X_FIX.

FindModified: Simplify a condition. MTR_MEMO_MODIFY can only be set
if MTR_MEMO_PAGE_X_FIX or MTR_MEMO_PAGE_SX_FIX are set.

FindBlockX: Consider also MTR_MEMO_PAGE_X_MODIFY.

recv_sys_t::parse(): Store OPT_PAGE_CHECKSUM records.

log_phys_t::apply(): Validate OPT_PAGE_CHECKSUM records.

log_phys_t::page_checksum(): Validate an OPT_PAGE_CHECKSUM record.

Tested by: Matthias Leich
  • Loading branch information
dr-m committed Jun 6, 2022
1 parent cc4eabc commit 4179f93
Show file tree
Hide file tree
Showing 12 changed files with 275 additions and 124 deletions.
@@ -6943,7 +6943,7 @@ btr_store_big_rec_extern_fields(

mtr.start();
index->set_modified(mtr);
mtr.set_log_mode(btr_mtr->get_log_mode());
mtr.set_log_mode_sub(*btr_mtr);
mtr.memo_push(rec_block, MTR_MEMO_PAGE_X_FIX);

rec_block->page.fix();
@@ -7287,7 +7287,7 @@ btr_free_externally_stored_field(

mtr.start();
mtr.set_spaces(*local_mtr);
mtr.set_log_mode(local_mtr->get_log_mode());
mtr.set_log_mode_sub(*local_mtr);

ut_ad(!index->table->is_temporary()
|| local_mtr->get_log_mode() == MTR_LOG_NO_REDO);
@@ -1483,7 +1483,7 @@ inline void mtr_t::log_file_op(mfile_type_t type, ulint space_id,
ut_ad(!strcmp(&path[strlen(path) - strlen(DOT_IBD)], DOT_IBD));

flag_modified();
if (m_log_mode != MTR_LOG_ALL)
if (!is_logged())
return;
m_last= nullptr;

@@ -475,26 +475,20 @@ updating an allocation bitmap page.
@param[in] mtr mini-transaction */
void fil_space_t::modify_check(const mtr_t& mtr) const
{
/* Each branch asserts, via ut_ad(), which values of the tablespace
purpose are acceptable for the logging mode reported by
mtr.get_log_mode(). */
switch (mtr.get_log_mode()) {
case MTR_LOG_NONE:
/* These modes are only allowed within a non-bitmap page
when there is a higher-level redo log record written. */
ut_ad(purpose == FIL_TYPE_TABLESPACE
|| purpose == FIL_TYPE_TEMPORARY);
break;
case MTR_LOG_NO_REDO:
ut_ad(purpose == FIL_TYPE_TEMPORARY
|| purpose == FIL_TYPE_IMPORT);
return;
case MTR_LOG_ALL:
/* We may only write redo log for a persistent
tablespace. */
ut_ad(purpose == FIL_TYPE_TABLESPACE);
ut_ad(mtr.is_named_space(id));
return;
}

ut_ad("invalid log mode" == 0);
switch (mtr.get_log_mode()) {
case MTR_LOG_NONE:
/* These modes are only allowed within a non-bitmap page
when there is a higher-level redo log record written. */
ut_ad(purpose == FIL_TYPE_TABLESPACE || purpose == FIL_TYPE_TEMPORARY);
break;
case MTR_LOG_NO_REDO:
ut_ad(purpose == FIL_TYPE_TEMPORARY || purpose == FIL_TYPE_IMPORT);
break;
default:
/* Every mode other than MTR_LOG_NONE and MTR_LOG_NO_REDO ends up here.
NOTE(review): presumably this is "default" rather than "case MTR_LOG_ALL"
so that MTR_LOG_SUB, which also logs all operations, is checked the
same way - confirm. */
/* We may only write redo log for a persistent tablespace. */
ut_ad(purpose == FIL_TYPE_TABLESPACE);
ut_ad(mtr.is_named_space(id));
}
}
#endif

@@ -24,8 +24,7 @@ The database buffer pool high-level routines
Created 11/5/1995 Heikki Tuuri
*******************************************************/

#ifndef buf0buf_h
#define buf0buf_h
#pragma once

/** Magic value to use instead of checksums when they are disabled */
#define BUF_NO_CHECKSUM_MAGIC 0xDEADBEEFUL
@@ -2201,5 +2200,3 @@ struct CheckUnzipLRUAndLRUList {
#include "buf0buf.inl"

#endif /* !UNIV_INNOCHECKSUM */

#endif
@@ -1278,8 +1278,9 @@ struct fil_addr_t {

/** For the first page in a system tablespace data file(ibdata*, not *.ibd):
the file has been flushed to disk at least up to this lsn
For other pages: 32-bit key version used to encrypt the page + 32-bit checksum
or 64 bites of zero if no encryption */
For other pages of tablespaces not in innodb_checksum_algorithm=full_crc32
format: 32-bit key version used to encrypt the page + 32-bit checksum
or 64 bits of zero if no encryption */
#define FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION 26U

/** This overloads FIL_PAGE_FILE_FLUSH_LSN for RTREE Split Sequence Number */
@@ -196,7 +196,7 @@ inline bool mtr_t::write(const buf_block_t &block, void *ptr, V val)
}
byte *p= static_cast<byte*>(ptr);
const byte *const end= p + l;
if (w != FORCED && m_log_mode == MTR_LOG_ALL)
if (w != FORCED && is_logged())
{
const byte *b= buf;
while (*p++ == *b++)
@@ -224,7 +224,7 @@ inline void mtr_t::memset(const buf_block_t &b, ulint ofs, ulint len, byte val)
{
ut_ad(len);
set_modified(b);
if (m_log_mode != MTR_LOG_ALL)
if (!is_logged())
return;

static_assert(MIN_4BYTE > UNIV_PAGE_SIZE_MAX, "consistency");
@@ -261,7 +261,7 @@ inline void mtr_t::memset(const buf_block_t &b, ulint ofs, size_t len,
ut_ad(size);
ut_ad(len > size); /* use mtr_t::memcpy() for shorter writes */
set_modified(b);
if (m_log_mode != MTR_LOG_ALL)
if (!is_logged())
return;

static_assert(MIN_4BYTE > UNIV_PAGE_SIZE_MAX, "consistency");
@@ -319,7 +319,7 @@ inline void mtr_t::memcpy_low(const buf_block_t &block, uint16_t offset,
{
ut_ad(len);
set_modified(block);
if (m_log_mode != MTR_LOG_ALL)
if (!is_logged())
return;
if (len < mtr_buf_t::MAX_DATA_SIZE - (1 + 3 + 3 + 5 + 5))
{
@@ -354,7 +354,7 @@ inline void mtr_t::memmove(const buf_block_t &b, ulint d, ulint s, ulint len)
ut_ad(d + len <= ulint(srv_page_size));

set_modified(b);
if (m_log_mode != MTR_LOG_ALL)
if (!is_logged())
return;
static_assert(MIN_4BYTE > UNIV_PAGE_SIZE_MAX, "consistency");
size_t lenlen= (len < MIN_2BYTE ? 1 : len < MIN_3BYTE ? 2 : 3);
@@ -387,7 +387,7 @@ template<byte type>
inline byte *mtr_t::log_write(const page_id_t id, const buf_page_t *bpage,
size_t len, bool alloc, size_t offset)
{
static_assert(!(type & 15) && type != RESERVED && type != OPTION &&
static_assert(!(type & 15) && type != RESERVED &&
type <= FILE_CHECKPOINT, "invalid type");
ut_ad(type >= FILE_CREATE || is_named_space(id.space()));
ut_ad(!bpage || bpage->id() == id);
@@ -491,7 +491,7 @@ inline void mtr_t::memcpy(const buf_block_t &b, void *dest, const void *str,
ut_ad(ut_align_down(dest, srv_page_size) == b.page.frame);
char *d= static_cast<char*>(dest);
const char *s= static_cast<const char*>(str);
if (w != FORCED && m_log_mode == MTR_LOG_ALL)
if (w != FORCED && is_logged())
{
ut_ad(len);
const char *const end= d + len;
@@ -531,35 +531,20 @@ inline void mtr_t::init(buf_block_t *b)

b->page.set_reinit(b->page.state() & buf_page_t::LRU_MASK);

if (m_log_mode != MTR_LOG_ALL)
{
ut_ad(m_log_mode == MTR_LOG_NONE || m_log_mode == MTR_LOG_NO_REDO);
if (!is_logged())
return;
}

m_log.close(log_write<INIT_PAGE>(b->page.id(), &b->page));
m_last_offset= FIL_PAGE_TYPE;
}

/** Free a page: emit a FREE_PAGE log record for it when the
mini-transaction is in MTR_LOG_ALL mode.
@param[in]	space	tablespace that contains the page to be freed
@param[in]	offset	offset of the page to be freed */
inline void mtr_t::free(fil_space_t &space, uint32_t offset)
{
  ut_ad(is_named_space(&space));
  ut_ad(!m_freed_space || m_freed_space == &space);

  /* Nothing is written in any logging mode other than MTR_LOG_ALL. */
  if (m_log_mode != MTR_LOG_ALL)
    return;

  m_log.close(log_write<FREE_PAGE>({space.id, offset}, nullptr));
}

/** Write an EXTENDED log record.
@param block buffer pool page
@param type extended record subtype; @see mrec_ext_t */
inline void mtr_t::log_write_extended(const buf_block_t &block, byte type)
{
set_modified(block);
if (m_log_mode != MTR_LOG_ALL)
if (!is_logged())
return;
byte *l= log_write<EXTENDED>(block.page.id(), &block.page, 1, true);
*l++= type;
@@ -586,7 +571,7 @@ inline void mtr_t::page_delete(const buf_block_t &block, ulint prev_rec)
ut_ad(!block.zip_size());
ut_ad(prev_rec < block.physical_size());
set_modified(block);
if (m_log_mode != MTR_LOG_ALL)
if (!is_logged())
return;
size_t len= (prev_rec < MIN_2BYTE ? 2 : prev_rec < MIN_3BYTE ? 3 : 4);
byte *l= log_write<EXTENDED>(block.page.id(), &block.page, len, true);
@@ -613,7 +598,7 @@ inline void mtr_t::page_delete(const buf_block_t &block, ulint prev_rec,
ut_ad(hdr_size < MIN_3BYTE);
ut_ad(prev_rec < block.physical_size());
ut_ad(data_size < block.physical_size());
if (m_log_mode != MTR_LOG_ALL)
if (!is_logged())
return;
size_t len= prev_rec < MIN_2BYTE ? 2 : prev_rec < MIN_3BYTE ? 3 : 4;
len+= hdr_size < MIN_2BYTE ? 1 : 2;
@@ -645,7 +630,7 @@ inline void mtr_t::undo_append(const buf_block_t &block,
{
ut_ad(len > 2);
set_modified(block);
if (m_log_mode != MTR_LOG_ALL)
if (!is_logged())
return;
const bool small= len + 1 < mtr_buf_t::MAX_DATA_SIZE - (1 + 3 + 3 + 5 + 5);
byte *end= log_write<EXTENDED>(block.page.id(), &block.page, len + 1, small);
@@ -668,7 +653,7 @@ inline void mtr_t::undo_append(const buf_block_t &block,
@param id first page identifier that will not be in the file */
inline void mtr_t::trim_pages(const page_id_t id)
{
if (m_log_mode != MTR_LOG_ALL)
if (!is_logged())
return;
byte *l= log_write<EXTENDED>(id, nullptr, 1, true);
*l++= TRIM_PAGES;
@@ -136,10 +136,18 @@ struct mtr_t {
mtr_log_t get_log_mode() const
{
  /* MTR_LOG_NO_REDO is the largest value that m_log_mode may hold. */
  ut_ad(m_log_mode <= MTR_LOG_NO_REDO);
  static_assert(MTR_LOG_ALL == 0, "efficiency");
  const mtr_log_t mode= static_cast<mtr_log_t>(m_log_mode);
  return mode;
}

/** @return whether log is to be written for changes */
bool is_logged() const
{
  /* The assertions below guarantee that the MTR_LOG_NONE bits are clear
  in MTR_LOG_ALL and MTR_LOG_SUB but not in MTR_LOG_NO_REDO, so that one
  bit test separates the modes that write log from those that do not. */
  static_assert(MTR_LOG_ALL == 0, "efficiency");
  static_assert((MTR_LOG_NONE & MTR_LOG_NO_REDO) != 0, "efficiency");
  static_assert((MTR_LOG_NONE & MTR_LOG_SUB) == 0, "efficiency");
  return (m_log_mode & MTR_LOG_NONE) == 0;
}

/** Change the logging mode.
@param mode logging mode
@return old mode */
@@ -150,6 +158,15 @@ struct mtr_t {
return old_mode;
}

/** Derive the log mode of a sub-minitransaction from its parent.
@param mtr   parent mini-transaction */
void set_log_mode_sub(const mtr_t &mtr)
{
  /* Combining MTR_LOG_SUB with MTR_LOG_NO_REDO must yield MTR_LOG_NO_REDO
  again, so that a parent that does not write redo log keeps that mode. */
  static_assert((MTR_LOG_SUB | MTR_LOG_NO_REDO) == MTR_LOG_NO_REDO, "");
  /* The parent must be in one of these two modes. */
  ut_ad(mtr.m_log_mode == MTR_LOG_ALL || mtr.m_log_mode == MTR_LOG_NO_REDO);
  m_log_mode= mtr.m_log_mode | MTR_LOG_SUB;
}

/** Check if we are holding a block latch in exclusive mode
@param block buffer pool block to search for */
bool have_x_latch(const buf_block_t &block) const;
@@ -372,6 +389,9 @@ struct mtr_t {
/** @return whether neither the memo nor the log contains anything */
bool is_empty() const { return !(m_memo.size() || m_log.size()); }

/** Write an OPT_PAGE_CHECKSUM record. */
inline void page_checksum(const buf_page_t &bpage);

/** Write request types */
enum write_type
{
@@ -470,9 +490,9 @@ struct mtr_t {
@param[in,out] b buffer page */
void init(buf_block_t *b);
/** Free a page.
@param[in] space tablespace contains page to be freed
@param[in] offset page offset to be freed */
inline void free(fil_space_t &space, uint32_t offset);
@param space tablespace
@param offset offset of the page to be freed */
void free(const fil_space_t &space, uint32_t offset);
/** Write log for partly initializing a B-tree or R-tree page.
@param block B-tree or R-tree page
@param comp false=ROW_FORMAT=REDUNDANT, true=COMPACT or DYNAMIC */
@@ -41,6 +41,11 @@ enum mtr_log_t {
Set for attempting modification of a ROW_FORMAT=COMPRESSED page. */
MTR_LOG_NONE,

/** Log all operations, but do not write any OPT_PAGE_CHECKSUM
records because some of the modified pages were also modified
by another mini-transaction that did not write its log yet. */
MTR_LOG_SUB,

/** Don't generate REDO log but add dirty pages to flush list */
MTR_LOG_NO_REDO
};
@@ -77,12 +82,8 @@ type. The following record types refer to data pages:
RESERVED (6): reserved for future use; a subtype code
(encoded immediately after the length) would be written
to reserve code space for further extensions
OPTION (7): optional record that may be ignored; a subtype code
(encoded immediately after the length) would distinguish actual
usage, such as:
* MDEV-18976 page checksum record
* binlog record
* SQL statement (at the start of statement)
OPTION (7): optional record that may be ignored; a subtype @see mrec_opt
(encoded after the page identifier) would distinguish actual usage
Bits 3..0 indicate the redo log record length, excluding the first
byte, but including additional length bytes and any other bytes,
@@ -229,9 +230,7 @@ enum mrec_type_t
/** Reserved for future use. */
RESERVED= 0x60,
/** Optional record that may be ignored in crash recovery.
A subtype code will be encoded immediately after the length.
Possible subtypes would include a MDEV-18976 page checksum record,
a binlog record, or an SQL statement. */
A subtype (@see mrec_opt) will be encoded after the page identifier. */
OPTION= 0x70
};

@@ -283,6 +282,15 @@ enum mrec_ext_t
};


/** Recognized OPTION record subtypes.
The subtype is encoded after the page identifier of an OPTION record
(@see mrec_type_t). */
enum mrec_opt
{
/** page checksum at the end of the mini-transaction;
written by mtr_t::page_checksum() */
OPT_PAGE_CHECKSUM= 0
/* Other possible subtypes: a binlog record, or an SQL statement. */
};


/** Redo log record types for file-level operations. These bit
patterns will be written to redo log files, so the existing codes or
their interpretation on crash recovery must not be changed. */

0 comments on commit 4179f93

Please sign in to comment.