Skip to content
Permalink
Browse files
MDEV-23855: Remove fil_system.LRU and reduce fil_system.mutex contention
Also fixes MDEV-23929: innodb_flush_neighbors is not being ignored
for system tablespace on SSD

When the maximum configured number of open files is exceeded, InnoDB will
close data files. We used to maintain a fil_system.LRU list and
a counter fil_node_t::n_pending to achieve this, at the huge cost
of multiple fil_system.mutex operations per I/O operation.

fil_node_open_file_low(): Implement a FIFO replacement policy:
The last opened file will be moved to the end of fil_system.space_list,
and files will be closed from the start of the list. However, we will
not move tablespaces in fil_system.space_list while
i_s_tablespaces_encryption_fill_table() is executing
(producing output for INFORMATION_SCHEMA.INNODB_TABLESPACES_ENCRYPTION)
because moving them could cause some tablespaces to be missing from the output.
We also avoid this in mariabackup --backup because datafiles_iter_next()
assumes that the ordering is not changed.

IORequest: Fold more parameters into IORequest::type.

fil_space_t::io(): Replaces fil_io().

fil_space_t::flush(): Replaces fil_flush().

OS_AIO_IBUF: Remove. We will always issue synchronous reads of the
change buffer pages in buf_read_page_low().

We will always ignore some errors for background reads.

This should reduce fil_system.mutex contention a little.

fil_node_t::complete_write(): Replaces fil_node_t::complete_io().
On both read and write completion, fil_space_t::release_for_io()
will have to be called.

fil_space_t::io(): Do not acquire fil_system.mutex in the normal
code path.

xb_delta_open_matching_space(): Do not try to open the system tablespace
which was already opened. This fixes a file sharing violation in
mariabackup --prepare --incremental.

Reviewed by: Vladislav Vaintroub
  • Loading branch information
dr-m committed Oct 26, 2020
1 parent 3a9a3be commit 45ed9dd
Show file tree
Hide file tree
Showing 39 changed files with 1,303 additions and 1,900 deletions.
@@ -93,7 +93,6 @@ xb_fil_node_close_file(
mutex_enter(&fil_system.mutex);

ut_ad(node);
ut_a(node->n_pending == 0);
ut_a(node->n_pending_flushes == 0);
ut_a(!node->being_extended);

@@ -108,20 +107,10 @@ xb_fil_node_close_file(
ut_a(ret);

node->handle = OS_FILE_CLOSED;
mutex_exit(&fil_system.mutex);

ut_a(fil_system.n_open > 0);
fil_system.n_open--;

if (node->space->purpose == FIL_TYPE_TABLESPACE &&
fil_is_user_tablespace_id(node->space->id)) {

ut_a(UT_LIST_GET_LEN(fil_system.LRU) > 0);

/* The node is in the LRU list, remove it */
UT_LIST_REMOVE(fil_system.LRU, node);
}

mutex_exit(&fil_system.mutex);
}

/************************************************************************
@@ -180,18 +169,8 @@ xb_fil_cur_open(

return(XB_FIL_CUR_SKIP);
}
mutex_enter(&fil_system.mutex);

fil_system.n_open++;

if (node->space->purpose == FIL_TYPE_TABLESPACE &&
fil_is_user_tablespace_id(node->space->id)) {

/* Put the node to the LRU list */
UT_LIST_ADD_FIRST(fil_system.LRU, node);
}

mutex_exit(&fil_system.mutex);
}

ut_ad(node->is_open());
@@ -427,7 +406,7 @@ xb_fil_cur_read(
retry_count = 10;
ret = XB_FIL_CUR_SUCCESS;

fil_space_t *space = fil_space_acquire_for_io(cursor->space_id);
fil_space_t *space = fil_space_t::get_for_io(cursor->space_id);

if (!space) {
return XB_FIL_CUR_ERROR;
@@ -3011,6 +3011,7 @@ void
xb_fil_io_init()
{
/* Create the global fil_system. The argument is 50000 when
srv_file_per_table is set and 5000 otherwise.
NOTE(review): presumably a hash table size -- confirm against the
declaration of fil_system.create(). */
fil_system.create(srv_file_per_table ? 50000 : 5000);
/* Freeze fil_system.space_list. Per the commit description, tablespaces
must not be moved within the list during mariabackup --backup, because
datafiles_iter_next() assumes that the ordering is not changed. */
fil_system.freeze_space_list = 1;
/* NOTE(review): presumably marks the tablespace ID reuse warning as
already issued so that it is not reported -- confirm. */
fil_system.space_id_reuse_warned = true;
}

@@ -3087,24 +3088,16 @@ xb_load_single_table_tablespace(
bool is_empty_file = file->exists() && file->is_empty_file();

if (err == DB_SUCCESS && file->space_id() != SRV_TMP_SPACE_ID) {
os_offset_t node_size = os_file_get_size(file->handle());
os_offset_t n_pages;

ut_a(node_size != (os_offset_t) -1);

n_pages = node_size / fil_space_t::physical_size(file->flags());

space = fil_space_create(
space = fil_space_t::create(
name, file->space_id(), file->flags(),
FIL_TYPE_TABLESPACE, NULL/* TODO: crypt_data */);

ut_a(space != NULL);

space->add(file->filepath(), OS_FILE_CLOSED, uint32_t(n_pages),
false, false);
space->add(file->filepath(), OS_FILE_CLOSED, 0, false, false);
/* By opening the tablespace, we force the node and space objects
in the cache to be populated with fields from the space header. */
space->open();
space->get_size();

if (srv_operation == SRV_OPERATION_RESTORE_DELTA
|| xb_close_files) {
@@ -3406,19 +3399,6 @@ xb_load_tablespaces()
return(DB_SUCCESS);
}

/** Set up the tablespace memory cache and fill it by scanning for and
opening the data files.
@return DB_SUCCESS or error code */
static dberr_t xb_data_files_init()
{
	xb_fil_io_init();

	const dberr_t err = xb_load_tablespaces();
	return err;
}

/** Destroy the tablespace memory cache. */
static void xb_data_files_close()
{
@@ -4607,6 +4587,22 @@ xb_delta_open_matching_space(
return file;
}

if (!info.space_id && fil_system.sys_space) {
fil_node_t *node
= UT_LIST_GET_FIRST(fil_system.sys_space->chain);
for (; node; node = UT_LIST_GET_NEXT(chain, node)) {
if (!strcmp(node->name, real_name)) {
break;
}
}
if (node && node->handle != OS_FILE_CLOSED) {
*success = true;
return node->handle;
}
msg("mariabackup: Cannot find file %s\n", real_name);
return OS_FILE_CLOSED;
}

log_mutex_enter();
if (!fil_is_user_tablespace_id(info.space_id)) {
found:
@@ -4704,8 +4700,8 @@ xb_delta_open_matching_space(
ut_ad(fil_space_t::zip_size(flags) == info.zip_size);
ut_ad(fil_space_t::physical_size(flags) == info.page_size);

if (fil_space_create(dest_space_name, info.space_id, flags,
FIL_TYPE_TABLESPACE, 0)) {
if (fil_space_t::create(dest_space_name, info.space_id, flags,
FIL_TYPE_TABLESPACE, 0)) {
*success = xb_space_create_file(real_name, info.space_id,
flags, &file);
} else {
@@ -4925,15 +4921,15 @@ xtrabackup_apply_delta(
os_file_close(src_file);
os_file_delete(0,src_path);
}
if (dst_file != OS_FILE_CLOSED)
if (dst_file != OS_FILE_CLOSED && info.space_id)
os_file_close(dst_file);
return TRUE;

error:
aligned_free(incremental_buffer);
if (src_file != OS_FILE_CLOSED)
os_file_close(src_file);
if (dst_file != OS_FILE_CLOSED)
if (dst_file != OS_FILE_CLOSED && info.space_id)
os_file_close(dst_file);
msg("Error: xtrabackup_apply_delta(): "
"failed to apply %s to %s.\n", src_path, dst_path);
@@ -5387,16 +5383,17 @@ static bool xtrabackup_prepare_func(char** argv)
srv_allow_writes_event = os_event_create(0);
os_event_set(srv_allow_writes_event);
#endif
dberr_t err = xb_data_files_init();
if (err != DB_SUCCESS) {
xb_fil_io_init();
if (dberr_t err = xb_load_tablespaces()) {
msg("mariabackup: error: xb_data_files_init() failed "
"with error %s\n", ut_strerr(err));
goto error_cleanup;
}

inc_dir_tables_hash.create(1000);

ok = xtrabackup_apply_deltas();
ok = fil_system.sys_space->open(false)
&& xtrabackup_apply_deltas();

xb_data_files_close();

@@ -5426,6 +5423,8 @@ static bool xtrabackup_prepare_func(char** argv)
goto error_cleanup;
}

fil_system.freeze_space_list = 0;

/* increase IO threads */
if (srv_n_file_io_threads < 10) {
srv_n_read_io_threads = 4;
@@ -5447,6 +5446,8 @@ static bool xtrabackup_prepare_func(char** argv)
goto error_cleanup;
}

ut_ad(!fil_system.freeze_space_list);

if (ok) {
msg("Last binlog file %s, position %lld",
trx_sys.recovered_binlog_filename,
@@ -29,6 +29,7 @@ create table t1(a int not null primary key, b char(200)) engine=innodb;
--source include/wait_condition.inc

SELECT NAME FROM INFORMATION_SCHEMA.INNODB_TABLESPACES_ENCRYPTION WHERE MIN_KEY_VERSION = 0;
--sorted_result
SELECT NAME FROM INFORMATION_SCHEMA.INNODB_TABLESPACES_ENCRYPTION WHERE MIN_KEY_VERSION <> 0;

--echo # Success!
@@ -41,6 +42,7 @@ SET GLOBAL innodb_encrypt_tables = off;
--let $wait_condition=SELECT COUNT(*) = $tables_count FROM INFORMATION_SCHEMA.INNODB_TABLESPACES_ENCRYPTION WHERE MIN_KEY_VERSION = 0 AND ROTATING_OR_FLUSHING = 0;
--source include/wait_condition.inc

--sorted_result
SELECT NAME FROM INFORMATION_SCHEMA.INNODB_TABLESPACES_ENCRYPTION WHERE MIN_KEY_VERSION = 0;
SELECT NAME FROM INFORMATION_SCHEMA.INNODB_TABLESPACES_ENCRYPTION WHERE MIN_KEY_VERSION <> 0;

@@ -51,6 +53,7 @@ SELECT NAME FROM INFORMATION_SCHEMA.INNODB_TABLESPACES_ENCRYPTION WHERE MIN_KEY_
--let $restart_parameters=--skip-file-key-management --innodb-encrypt-tables=OFF --innodb-encryption-threads=0 --innodb-tablespaces-encryption
-- source include/restart_mysqld.inc

--sorted_result
SELECT NAME FROM INFORMATION_SCHEMA.INNODB_TABLESPACES_ENCRYPTION WHERE MIN_KEY_VERSION = 0;
SELECT NAME FROM INFORMATION_SCHEMA.INNODB_TABLESPACES_ENCRYPTION WHERE MIN_KEY_VERSION <> 0;

@@ -26,6 +26,7 @@ let $restart_parameters= --innodb_encryption_threads=5 --innodb_encryption_rotat
--source include/wait_condition.inc

SELECT NAME FROM INFORMATION_SCHEMA.INNODB_TABLESPACES_ENCRYPTION WHERE MIN_KEY_VERSION = 0;
--sorted_result
SELECT NAME FROM INFORMATION_SCHEMA.INNODB_TABLESPACES_ENCRYPTION WHERE MIN_KEY_VERSION <> 0;

--echo # Restart the server with innodb_encryption_rotate_key_age= 0
@@ -45,6 +46,7 @@ create table t4 (f1 int not null)engine=innodb encrypted=NO;

SELECT NAME FROM INFORMATION_SCHEMA.INNODB_TABLESPACES_ENCRYPTION WHERE MIN_KEY_VERSION = 0;

--sorted_result
SELECT NAME FROM INFORMATION_SCHEMA.INNODB_TABLESPACES_ENCRYPTION WHERE MIN_KEY_VERSION <> 0;

--echo # Disable encryption when innodb_encryption_rotate_key_age is 0
@@ -57,6 +59,7 @@ set global innodb_encrypt_tables = OFF;
--let $wait_condition=SELECT COUNT(*) >= $tables_count FROM INFORMATION_SCHEMA.INNODB_TABLESPACES_ENCRYPTION WHERE MIN_KEY_VERSION = 0 AND ROTATING_OR_FLUSHING = 0;
--source include/wait_condition.inc

--sorted_result
SELECT NAME FROM INFORMATION_SCHEMA.INNODB_TABLESPACES_ENCRYPTION WHERE MIN_KEY_VERSION = 0;
--echo # Display only encrypted create tables (t3)
SELECT NAME FROM INFORMATION_SCHEMA.INNODB_TABLESPACES_ENCRYPTION WHERE MIN_KEY_VERSION <> 0;
@@ -73,11 +76,13 @@ set global innodb_encrypt_tables = ON;

SELECT NAME FROM INFORMATION_SCHEMA.INNODB_TABLESPACES_ENCRYPTION WHERE MIN_KEY_VERSION = 0;
--echo # Display only unencrypted create tables (t4)
--sorted_result
SELECT NAME FROM INFORMATION_SCHEMA.INNODB_TABLESPACES_ENCRYPTION WHERE MIN_KEY_VERSION <> 0;

--let $restart_parameters=
-- source include/restart_mysqld.inc

SELECT NAME FROM INFORMATION_SCHEMA.INNODB_TABLESPACES_ENCRYPTION WHERE MIN_KEY_VERSION = 0;
--sorted_result
SELECT NAME FROM INFORMATION_SCHEMA.INNODB_TABLESPACES_ENCRYPTION WHERE MIN_KEY_VERSION <> 0;
DROP TABLE t4, t3, t2, t1;
@@ -1,4 +1,4 @@
call mtr.add_suppression("InnoDB: innodb_open_files=13 is exceeded");
call mtr.add_suppression("InnoDB: innodb_open_files=.* is exceeded");
SET @save_tdc= @@GLOBAL.table_definition_cache;
SET @save_toc= @@GLOBAL.table_open_cache;
SET GLOBAL table_definition_cache= 400;
@@ -32,18 +32,6 @@ commit;
set autocommit=1;


let $success= `SELECT variable_value FROM information_schema.global_status WHERE variable_name = 'innodb_num_page_compressed_trim_op'`;

if (!$success) {
--disable_query_log
--disable_result_log
DROP PROCEDURE innodb_insert_proc;
DROP TABLE innodb_page_compressed;
--enable_query_log
--enable_result_log
--skip "Test requires TRIM";
}

DROP PROCEDURE innodb_insert_proc;
DROP TABLE innodb_page_compressed;

@@ -4,7 +4,7 @@
# This test is slow on buildbot.
--source include/big_test.inc

call mtr.add_suppression("InnoDB: innodb_open_files=13 is exceeded");
call mtr.add_suppression("InnoDB: innodb_open_files=.* is exceeded");

SET @save_tdc= @@GLOBAL.table_definition_cache;
SET @save_toc= @@GLOBAL.table_open_cache;
@@ -1,3 +1,4 @@

# Copyright (c) 2006, 2017, Oracle and/or its affiliates. All rights reserved.
# Copyright (c) 2014, 2020, MariaDB Corporation.
#
@@ -186,7 +187,6 @@ SET(INNOBASE_SOURCES
include/mtr0mtr.h
include/mtr0mtr.ic
include/mtr0types.h
include/os0api.h
include/os0event.h
include/os0file.h
include/os0file.ic
@@ -3304,21 +3304,34 @@ btr_cur_ins_lock_and_undo(

/**
Prefetch siblings of the leaf for the pessimistic operation.
@param block leaf page */
static void btr_cur_prefetch_siblings(const buf_block_t* block)
@param block leaf page
@param index index of the page */
static void btr_cur_prefetch_siblings(const buf_block_t *block,
const dict_index_t *index)
{
const page_t *page= block->frame;
ut_ad(page_is_leaf(page));
ut_ad(page_is_leaf(block->frame));

if (index->is_ibuf())
return;

const page_t *page= block->frame;
uint32_t prev= mach_read_from_4(my_assume_aligned<4>(page + FIL_PAGE_PREV));
uint32_t next= mach_read_from_4(my_assume_aligned<4>(page + FIL_PAGE_NEXT));

if (prev != FIL_NULL)
buf_read_page_background(page_id_t(block->page.id().space(), prev),
{
ut_a(index->table->space->acquire_for_io());
buf_read_page_background(index->table->space,
page_id_t(block->page.id().space(), prev),
block->zip_size(), false);
}
if (next != FIL_NULL)
buf_read_page_background(page_id_t(block->page.id().space(), next),
{
ut_a(index->table->space->acquire_for_io());
buf_read_page_background(index->table->space,
page_id_t(block->page.id().space(), next),
block->zip_size(), false);
}
}

/*************************************************************//**
@@ -3436,8 +3449,8 @@ btr_cur_optimistic_insert(

/* prefetch siblings of the leaf for the pessimistic
operation, if the page is leaf. */
if (page_is_leaf(page) && !index->is_ibuf()) {
btr_cur_prefetch_siblings(block);
if (page_is_leaf(page)) {
btr_cur_prefetch_siblings(block, index);
}
fail_err:

@@ -4575,7 +4588,7 @@ btr_cur_optimistic_update(

/* prefetch siblings of the leaf for the pessimistic
operation. */
btr_cur_prefetch_siblings(block);
btr_cur_prefetch_siblings(block, index);

return(DB_OVERFLOW);
}
@@ -4766,10 +4779,10 @@ btr_cur_optimistic_update(
}
}

if (err != DB_SUCCESS && !index->is_ibuf()) {
if (err != DB_SUCCESS) {
/* prefetch siblings of the leaf for the pessimistic
operation. */
btr_cur_prefetch_siblings(block);
btr_cur_prefetch_siblings(block, index);
}

return(err);
@@ -5481,7 +5494,7 @@ btr_cur_optimistic_delete_func(
if (!no_compress_needed) {
/* prefetch siblings of the leaf for the pessimistic
operation. */
btr_cur_prefetch_siblings(block);
btr_cur_prefetch_siblings(block, cursor->index);
goto func_exit;
}

0 comments on commit 45ed9dd

Please sign in to comment.