Skip to content

Commit

Permalink
MDEV-28766: SET GLOBAL innodb_log_file_buffering
Browse files Browse the repository at this point in the history
In commit c4c8830 (MDEV-28111) we disabled
the file system cache on the InnoDB write-ahead log file (ib_logfile0)
by default on Linux.

It turns out that especially with innodb_flush_log_at_trx_commit=2,
writing to the log via the file system cache typically improves throughput,
especially on slow storage or at a small number of concurrent transactions.
For other values of innodb_flush_log_at_trx_commit, direct writes were
observed to be mostly but not always faster. Whether it pays off to
disable the file system cache on the log may depend on the type of storage,
the workload, and the operating system kernel version.

On Linux and Microsoft Windows, we will introduce the settable Boolean
global variable innodb_log_file_buffering that indicates whether the
file system cache on the redo log file is enabled. The default value is
innodb_log_file_buffering=OFF. If the server is started up with
innodb_flush_log_at_trx_commit=2, the value will be changed to
innodb_log_file_buffering=ON.

When a persistent memory interface is being used for the log,
the value cannot be changed from innodb_log_file_buffering=OFF.
On Linux, when the physical block size cannot be determined
to be a power of 2 between 64 and 4096 bytes, the file system cache
cannot be disabled, and innodb_log_file_buffering=ON cannot be changed.

Server log messages will indicate whether the file system cache is
enabled for the redo log:

[Note] InnoDB: Buffered log writes (block size=512 bytes)
[Note] InnoDB: File system buffers for log disabled (block size=512 bytes)

After this change, the startup parameter innodb_flush_method will no
longer control whether O_DIRECT will be set on the redo log on Linux.

On other operating systems that support O_DIRECT, no interface has been
implemented for controlling the file system cache for the redo log.
The innodb_flush_method values O_DIRECT, O_DIRECT_NO_FSYNC, O_DSYNC
will enable O_DIRECT for data files, not the log.

Tested by: Matthias Leich, Axel Schwenke
  • Loading branch information
dr-m committed Jun 14, 2022
1 parent 813986a commit 4c0cd95
Show file tree
Hide file tree
Showing 6 changed files with 139 additions and 38 deletions.
3 changes: 2 additions & 1 deletion mysql-test/suite/sys_vars/r/sysvars_innodb.result
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ variable_name not in (
'innodb_numa_interleave', # only available WITH_NUMA
'innodb_evict_tables_on_commit_debug', # one may want to override this
'innodb_use_native_aio', # default value depends on OS
'innodb_log_file_buffering', # only available on Linux and Windows
'innodb_buffer_pool_load_pages_abort') # debug build only, and is only for testing
order by variable_name;
VARIABLE_NAME INNODB_ADAPTIVE_FLUSHING
Expand Down Expand Up @@ -1020,7 +1021,7 @@ SESSION_VALUE NULL
DEFAULT_VALUE
VARIABLE_SCOPE GLOBAL
VARIABLE_TYPE VARCHAR
VARIABLE_COMMENT Path to InnoDB log files.
VARIABLE_COMMENT Path to ib_logfile0
NUMERIC_MIN_VALUE NULL
NUMERIC_MAX_VALUE NULL
NUMERIC_BLOCK_SIZE NULL
Expand Down
1 change: 1 addition & 0 deletions mysql-test/suite/sys_vars/t/sysvars_innodb.test
Original file line number Diff line number Diff line change
Expand Up @@ -12,5 +12,6 @@ select VARIABLE_NAME, SESSION_VALUE, DEFAULT_VALUE, VARIABLE_SCOPE, VARIABLE_TYP
'innodb_numa_interleave', # only available WITH_NUMA
'innodb_evict_tables_on_commit_debug', # one may want to override this
'innodb_use_native_aio', # default value depends on OS
'innodb_log_file_buffering', # only available on Linux and Windows
'innodb_buffer_pool_load_pages_abort') # debug build only, and is only for testing
order by variable_name;
30 changes: 29 additions & 1 deletion storage/innobase/handler/ha_innodb.cc
Original file line number Diff line number Diff line change
Expand Up @@ -4066,6 +4066,14 @@ static int innodb_init_params()
}
#endif

#if defined __linux__ || defined _WIN32
if (srv_flush_log_at_trx_commit == 2) {
/* Do not disable the file system cache if
innodb_flush_log_at_trx_commit=2. */
log_sys.log_buffered = true;
}
#endif

if (srv_read_only_mode) {
ib::info() << "Started in read only mode";
srv_use_doublewrite_buf = FALSE;
Expand Down Expand Up @@ -18442,6 +18450,16 @@ buffer_pool_load_abort(
}
}

#if defined __linux__ || defined _WIN32
/** Update callback of the innodb_log_file_buffering system variable.
Forwards the requested value to log_sys.set_buffered(), releasing
LOCK_global_system_variables for the duration of that call and
reacquiring it before returning.
@param thd   connection handle (not used in the body)
@param save  pointer to the requested value, read as my_bool */
static void innodb_log_file_buffering_update(THD *thd, st_mysql_sys_var*,
void *, const void *save)
{
/* NOTE(review): presumably the global mutex is dropped because
set_buffered() may wait for the log latches - confirm. */
mysql_mutex_unlock(&LOCK_global_system_variables);
log_sys.set_buffered(*static_cast<const my_bool*>(save));
/* The caller expects the mutex to be held again on return. */
mysql_mutex_lock(&LOCK_global_system_variables);
}
#endif

/** Update innodb_status_output or innodb_status_output_locks,
which control InnoDB "status monitor" output to the error log.
@param[out] var current value
Expand Down Expand Up @@ -18858,7 +18876,7 @@ static MYSQL_SYSVAR_ENUM(flush_method, srv_file_flush_method,

static MYSQL_SYSVAR_STR(log_group_home_dir, srv_log_group_home_dir,
PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
"Path to InnoDB log files.", NULL, NULL, NULL);
"Path to ib_logfile0", NULL, NULL, NULL);

static MYSQL_SYSVAR_DOUBLE(max_dirty_pages_pct, srv_max_buf_pool_modified_pct,
PLUGIN_VAR_RQCMDARG,
Expand Down Expand Up @@ -19250,6 +19268,13 @@ static MYSQL_SYSVAR_SIZE_T(log_buffer_size, log_sys.buf_size,
"Redo log buffer size in bytes.",
NULL, NULL, 16U << 20, 2U << 20, SIZE_T_MAX, 4096);

#if defined __linux__ || defined _WIN32
/* innodb_log_file_buffering: Boolean variable backed directly by
log_sys.log_buffered, with FALSE as the declared default. Run-time changes
are applied through innodb_log_file_buffering_update(). Only compiled on
Linux and Microsoft Windows. */
static MYSQL_SYSVAR_BOOL(log_file_buffering, log_sys.log_buffered,
PLUGIN_VAR_OPCMDARG,
"Whether the file system cache for ib_logfile0 is enabled",
nullptr, innodb_log_file_buffering_update, FALSE);
#endif

static MYSQL_SYSVAR_ULONGLONG(log_file_size, srv_log_file_size,
PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
"Redo log size in bytes.",
Expand Down Expand Up @@ -19692,6 +19717,9 @@ static struct st_mysql_sys_var* innobase_system_variables[]= {
MYSQL_SYSVAR(deadlock_report),
MYSQL_SYSVAR(page_size),
MYSQL_SYSVAR(log_buffer_size),
#if defined __linux__ || defined _WIN32
MYSQL_SYSVAR(log_file_buffering),
#endif
MYSQL_SYSVAR(log_file_size),
MYSQL_SYSVAR(log_group_home_dir),
MYSQL_SYSVAR(max_dirty_pages_pct),
Expand Down
17 changes: 17 additions & 0 deletions storage/innobase/include/log0log.h
Original file line number Diff line number Diff line change
Expand Up @@ -249,6 +249,16 @@ typedef srw_lock log_rwlock_t;
uint32_t format;
/** Log file */
log_file_t log;
#if defined __linux__ || defined _WIN32
/** whether file system caching is enabled for the log */
my_bool log_buffered;
# ifdef _WIN32
static constexpr bool log_maybe_unbuffered= true;
# else
/** whether file system caching may be disabled */
bool log_maybe_unbuffered;
# endif
#endif

/** Fields involved in checkpoints @{ */
lsn_t log_capacity; /*!< capacity of the log; if
Expand Down Expand Up @@ -289,10 +299,17 @@ typedef srw_lock log_rwlock_t;

bool is_opened() const noexcept { return log.is_opened(); }

static constexpr bool resize_in_progress() { return false; }

/** Rename a log file after resizing.
@return whether an error occurred */
static bool rename_resized() noexcept;

#if defined __linux__ || defined _WIN32
/** Try to enable or disable file system caching (update log_buffered) */
void set_buffered(bool buffered);
#endif

void attach(log_file_t file, os_offset_t size);

void close_file();
Expand Down
75 changes: 63 additions & 12 deletions storage/innobase/log/log0log.cc
Original file line number Diff line number Diff line change
Expand Up @@ -209,6 +209,8 @@ void log_t::attach(log_file_t file, os_offset_t size)
#if defined __linux__ || defined _WIN32
set_block_size(CPU_LEVEL1_DCACHE_LINESIZE);
#endif
log_maybe_unbuffered= true;
log_buffered= false;
return;
}
}
Expand All @@ -220,18 +222,11 @@ void log_t::attach(log_file_t file, os_offset_t size)
#endif

#if defined __linux__ || defined _WIN32
if (!block_size)
set_block_size(512);
# ifdef __linux__
else if (srv_file_flush_method != SRV_O_DSYNC &&
srv_file_flush_method != SRV_O_DIRECT &&
srv_file_flush_method != SRV_O_DIRECT_NO_FSYNC)
sql_print_information("InnoDB: Buffered log writes (block size=%u bytes)",
block_size);
#endif
else
sql_print_information("InnoDB: File system buffers for log"
" disabled (block size=%u bytes)", block_size);
sql_print_information("InnoDB: %s (block size=%u bytes)",
log_buffered
? "Buffered log writes"
: "File system buffers for log disabled",
block_size);
#endif

#ifdef HAVE_PMEM
Expand Down Expand Up @@ -327,6 +322,62 @@ void log_t::close_file()
ib::fatal() << "closing ib_logfile0 failed: " << err;
}

#if defined __linux__ || defined _WIN32
/** Acquire all latches that protect the log.
Unless log_sys.is_pmem() holds, this first retries flush_lock.acquire() and
then write_lock.acquire(), each for log_sys.get_lsn() + 1, until both report
group_commit_lock::ACQUIRED. In every case it finally takes log_sys.latch
in write mode. The counterpart is log_resize_release(). */
static void log_resize_acquire()
{
if (!log_sys.is_pmem())
{
/* Retry until the lock is actually granted to this caller;
flush_lock is taken before write_lock. */
while (flush_lock.acquire(log_sys.get_lsn() + 1, nullptr) !=
group_commit_lock::ACQUIRED);
while (write_lock.acquire(log_sys.get_lsn() + 1, nullptr) !=
group_commit_lock::ACQUIRED);
}

log_sys.latch.wr_lock(SRW_LOCK_CALL);
}

/** Release the latches that protect the log.
Releases log_sys.latch first. Unless log_sys.is_pmem() holds, it then
releases write_lock and flush_lock at their current values, and if either
release() returned a nonzero LSN, calls log_write_up_to() for the larger
of the two. The counterpart is log_resize_acquire(). */
void log_resize_release()
{
log_sys.latch.wr_unlock();

if (!log_sys.is_pmem())
{
lsn_t lsn1= write_lock.release(write_lock.value());
lsn_t lsn2= flush_lock.release(flush_lock.value());
/* NOTE(review): a nonzero return presumably denotes an LSN that some
waiter still needs written/flushed - confirm against group_commit_lock. */
if (lsn1 || lsn2)
log_write_up_to(std::max(lsn1, lsn2), true, nullptr);
}
}

/** Try to enable or disable file system caching (update log_buffered).

Nothing happens when the cache setting cannot be toggled
(!log_maybe_unbuffered), when the log is in persistent memory, or when
high_level_read_only is set. Otherwise, with all log latches held, the log
file is closed and reopened if it is currently open and the requested mode
differs from log_buffered, and the new mode is reported to the server log.
@param buffered  whether the file system cache should be enabled */
void log_t::set_buffered(bool buffered)
{
  if (!log_maybe_unbuffered || is_pmem() || high_level_read_only)
    return;

  log_resize_acquire();

  const bool must_reopen= !resize_in_progress() && is_opened() &&
    bool(log_buffered) != buffered;

  if (must_reopen)
  {
    /* Drop the current handle before reopening the file. */
    os_file_close_func(log.m_file);
    log.m_file= OS_FILE_CLOSED;

    const std::string log_path{get_log_file_path()};

    /* log_buffered must be updated before the file is reopened. */
    log_buffered= buffered;

    bool opened;
    log.m_file= os_file_create_func(log_path.c_str(), OS_FILE_OPEN,
                                    OS_FILE_NORMAL, OS_LOG_FILE,
                                    false, &opened);
    /* Failing to reopen the log file is treated as fatal. */
    ut_a(log.m_file != OS_FILE_CLOSED);

    const char *const mode= log_buffered
      ? "Buffered log writes"
      : "File system buffers for log disabled";
    sql_print_information("InnoDB: %s (block size=%u bytes)",
                          mode, block_size);
  }

  log_resize_release();
}
#endif

/** Write an aligned buffer to ib_logfile0.
@param buf buffer to be written
@param len length of data to be written
Expand Down
51 changes: 27 additions & 24 deletions storage/innobase/os/os0file.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1055,6 +1055,7 @@ os_file_create_simple_func(
we open the same file in the same mode, see man page of open(2). */
if (!srv_read_only_mode && *success) {
switch (srv_file_flush_method) {
case SRV_O_DSYNC:
case SRV_O_DIRECT:
case SRV_O_DIRECT_NO_FSYNC:
os_file_set_nocache(file, name, mode_str);
Expand Down Expand Up @@ -1240,13 +1241,13 @@ os_file_create_func(

#if (defined(UNIV_SOLARIS) && defined(DIRECTIO_ON)) || defined O_DIRECT
if (type == OS_DATA_FILE) {
# ifdef __linux__
use_o_direct:
# endif
switch (srv_file_flush_method) {
case SRV_O_DSYNC:
case SRV_O_DIRECT:
case SRV_O_DIRECT_NO_FSYNC:
# ifdef __linux__
use_o_direct:
# endif
os_file_set_nocache(file, name, mode_str);
break;
default:
Expand All @@ -1263,9 +1264,6 @@ os_file_create_func(
goto skip_o_direct;
}
MSAN_STAT_WORKAROUND(&st);
if (st.st_size & 4095) {
goto skip_o_direct;
}
if (snprintf(b, sizeof b,
"/sys/dev/block/%u:%u/queue/physical_block_size",
major(st.st_dev), minor(st.st_dev))
Expand Down Expand Up @@ -1298,11 +1296,16 @@ os_file_create_func(
if (s > 4096 || s < 64 || !ut_is_2pow(s)) {
goto skip_o_direct;
}
log_sys.log_maybe_unbuffered= true;
log_sys.set_block_size(uint32_t(s));
goto use_o_direct;
if (!log_sys.log_buffered && !(st.st_size & (s - 1))) {
goto use_o_direct;
}
} else {
skip_o_direct:
log_sys.set_block_size(0);
log_sys.log_maybe_unbuffered= false;
log_sys.log_buffered= true;
log_sys.set_block_size(512);
}
}
# endif
Expand Down Expand Up @@ -2057,17 +2060,15 @@ os_file_create_directory(
}

/** Get disk sector size for a file. */
size_t get_sector_size(HANDLE file)
static size_t get_sector_size(HANDLE file)
{
FILE_STORAGE_INFO fsi;
ULONG s= 4096;
if (GetFileInformationByHandleEx(file, FileStorageInfo, &fsi, sizeof fsi))
{
s= fsi.PhysicalBytesPerSectorForPerformance;
if (s > 4096 || s < 64 || !ut_is_2pow(s))
{
return 4096;
}
}
return s;
}
Expand Down Expand Up @@ -2165,8 +2166,9 @@ os_file_create_func(
? FILE_FLAG_OVERLAPPED : 0;

if (type == OS_LOG_FILE) {
if(srv_flush_log_at_trx_commit != 2 && !log_sys.is_opened())
if (!log_sys.is_opened() && !log_sys.log_buffered) {
attributes|= FILE_FLAG_NO_BUFFERING;
}
if (srv_file_flush_method == SRV_O_DSYNC)
attributes|= FILE_FLAG_WRITE_THROUGH;
}
Expand Down Expand Up @@ -2197,21 +2199,22 @@ os_file_create_func(
name, access, share_mode, my_win_file_secattr(),
create_flag, attributes, NULL);

if (file != INVALID_HANDLE_VALUE && type == OS_LOG_FILE
&& (attributes & FILE_FLAG_NO_BUFFERING)) {
uint32 s= (uint32_t) get_sector_size(file);
log_sys.set_block_size(uint32_t(s));
/* FIXME! remove it when backup is fixed, so that it
does not produce redo with irregular sizes.*/
if (os_file_get_size(file) % s) {
attributes &= ~FILE_FLAG_NO_BUFFERING;
create_flag = OPEN_ALWAYS;
CloseHandle(file);
continue;
*success = file != INVALID_HANDLE_VALUE;

if (*success && type == OS_LOG_FILE) {
uint32_t s = uint32_t(get_sector_size(file));
log_sys.set_block_size(s);
if (attributes & FILE_FLAG_NO_BUFFERING) {
if (os_file_get_size(file) % s) {
attributes &= ~FILE_FLAG_NO_BUFFERING;
create_flag = OPEN_ALWAYS;
CloseHandle(file);
continue;
}
log_sys.log_buffered = false;
}
}

*success = (file != INVALID_HANDLE_VALUE);
if (*success) {
break;
}
Expand Down

0 comments on commit 4c0cd95

Please sign in to comment.