Skip to content

Commit 30edd55

Browse files
committed
MDEV-26029: Sparse files are inefficient on thinly provisioned storage
The MariaDB implementation of page_compressed tables for InnoDB used sparse files. In the worst case, in the data file, every data page will consist of some data followed by a hole. This may be extremely inefficient in some file systems. If the underlying storage device is thinly provisioned (can compress data on the fly), it would be good to write regular files (with sequences of NUL bytes at the end of each page_compressed block) and let the storage device take care of compressing the data. For reads, sparse file regions and regions containing NUL bytes will be indistinguishable. my_test_if_thinly_provisioned(): A new predicate for detecting thinly provisioned storage. (Not implemented yet.) innodb_atomic_writes: Correct the comment. buf_flush_page(): Support all values of fil_node_t::punch_hole. On a thinly provisioned storage device, we will always write NUL-padded innodb_page_size bytes also for page_compressed tables. buf_flush_freed_pages(): Remove a redundant condition. fil_space_t::atomic_write_supported: Remove. (This was duplicating fil_node_t::atomic_write.) fil_space_t::punch_hole: Remove. (Duplicated fil_node_t::punch_hole.) fil_node_t: Remove magic_n, and consolidate flags into bitfields. For punch_hole we introduce a third value that indicates a thinly provisioned storage device. fil_node_t::find_metadata(): Detect all attributes of the file.
1 parent b11aa0d commit 30edd55

File tree

9 files changed

+129
-184
lines changed

9 files changed

+129
-184
lines changed

include/my_sys.h

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/* Copyright (c) 2000, 2013, Oracle and/or its affiliates.
2-
Copyright (c) 2010, 2020, MariaDB Corporation.
2+
Copyright (c) 2010, 2021, MariaDB Corporation.
33
44
This program is free software; you can redistribute it and/or modify
55
it under the terms of the GNU General Public License as published by
@@ -183,10 +183,11 @@ extern BOOL my_obtain_privilege(LPCSTR lpPrivilege);
183183
#endif
184184

185185
void my_init_atomic_write(void);
186+
#define my_test_if_thinly_provisioned(A) 0
186187
#ifdef __linux__
187188
my_bool my_test_if_atomic_write(File handle, int pagesize);
188189
#else
189-
#define my_test_if_atomic_write(A, B) 0
190+
# define my_test_if_atomic_write(A, B) 0
190191
#endif /* __linux__ */
191192
extern my_bool my_may_have_atomic_write;
192193

mysql-test/suite/sys_vars/r/sysvars_innodb.result

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1754,7 +1754,7 @@ SESSION_VALUE NULL
17541754
DEFAULT_VALUE ON
17551755
VARIABLE_SCOPE GLOBAL
17561756
VARIABLE_TYPE BOOLEAN
1757-
VARIABLE_COMMENT Enable atomic writes, instead of using the doublewrite buffer, for files on devices that supports atomic writes. This option only works on Linux with either FusionIO cards using the directFS filesystem or with Shannon cards using any file system.
1757+
VARIABLE_COMMENT Enable atomic writes, instead of using the doublewrite buffer, for files on devices that supports atomic writes.
17581758
NUMERIC_MIN_VALUE NULL
17591759
NUMERIC_MAX_VALUE NULL
17601760
NUMERIC_BLOCK_SIZE NULL

storage/innobase/buf/buf0dblwr.cc

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -712,6 +712,7 @@ void buf_dblwr_t::add_to_batch(const IORequest &request, size_t size)
712712
ut_ad(request.bpage);
713713
ut_ad(request.bpage->in_file());
714714
ut_ad(request.node);
715+
ut_ad(request.node->space->purpose == FIL_TYPE_TABLESPACE);
715716
ut_ad(request.node->space->id == request.bpage->id().space());
716717
ut_ad(request.node->space->referenced());
717718
ut_ad(!srv_read_only_mode);

storage/innobase/buf/buf0flu.cc

Lines changed: 13 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -804,8 +804,6 @@ static bool buf_flush_page(buf_page_t *bpage, bool lru, fil_space_t *space)
804804
ut_ad(bpage->ready_for_flush());
805805
ut_ad((space->purpose == FIL_TYPE_TEMPORARY) ==
806806
(space == fil_system.temp_space));
807-
ut_ad(space->purpose == FIL_TYPE_TABLESPACE ||
808-
space->atomic_write_supported);
809807
ut_ad(space->referenced());
810808
ut_ad(lru || space != fil_system.temp_space);
811809

@@ -912,8 +910,16 @@ static bool buf_flush_page(buf_page_t *bpage, bool lru, fil_space_t *space)
912910
}
913911

914912
#if defined HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE || defined _WIN32
915-
if (size != orig_size && space->punch_hole)
916-
type= lru ? IORequest::PUNCH_LRU : IORequest::PUNCH;
913+
if (size != orig_size)
914+
{
915+
switch (space->chain.start->punch_hole) {
916+
case 1:
917+
type= lru ? IORequest::PUNCH_LRU : IORequest::PUNCH;
918+
break;
919+
case 2:
920+
size= orig_size;
921+
}
922+
}
917923
#endif
918924
frame=page;
919925
}
@@ -1036,8 +1042,8 @@ innodb_immediate_scrub_data_uncompressed from the freed ranges.
10361042
@param space tablespace which may contain ranges of freed pages */
10371043
static void buf_flush_freed_pages(fil_space_t *space)
10381044
{
1039-
const bool punch_hole= space->punch_hole;
1040-
if (!srv_immediate_scrub_data_uncompressed && !punch_hole)
1045+
const bool punch_hole= space->chain.start->punch_hole == 1;
1046+
if (!punch_hole && !srv_immediate_scrub_data_uncompressed)
10411047
return;
10421048
lsn_t flush_to_disk_lsn= log_sys.get_flushed_lsn();
10431049

@@ -1064,7 +1070,7 @@ static void buf_flush_freed_pages(fil_space_t *space)
10641070
(range.last - range.first + 1) * physical_size,
10651071
nullptr);
10661072
}
1067-
else if (srv_immediate_scrub_data_uncompressed)
1073+
else
10681074
{
10691075
for (os_offset_t i= range.first; i <= range.last; i++)
10701076
{

storage/innobase/fil/fil0fil.cc

Lines changed: 2 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -317,8 +317,6 @@ fil_node_t* fil_space_t::add(const char* name, pfs_os_file_t handle,
317317

318318
node->size = size;
319319

320-
node->magic_n = FIL_NODE_MAGIC_N;
321-
322320
node->init_size = size;
323321
node->max_size = max_pages;
324322

@@ -718,7 +716,6 @@ bool fil_space_extend(fil_space_t *space, uint32_t size)
718716
inline pfs_os_file_t fil_node_t::close_to_free(bool detach_handle)
719717
{
720718
mysql_mutex_assert_owner(&fil_system.mutex);
721-
ut_a(magic_n == FIL_NODE_MAGIC_N);
722719
ut_a(!being_extended);
723720

724721
if (is_open() &&
@@ -941,16 +938,6 @@ fil_space_t *fil_space_t::create(ulint id, ulint flags,
941938

942939
space->latch.SRW_LOCK_INIT(fil_space_latch_key);
943940

944-
if (space->purpose == FIL_TYPE_TEMPORARY) {
945-
/* SysTablespace::open_or_create() would pass
946-
size!=0 to fil_space_t::add(), so first_time_open
947-
would not hold in fil_node_open_file(), and we
948-
must assign this manually. We do not care about
949-
the durability or atomicity of writes to the
950-
temporary tablespace files. */
951-
space->atomic_write_supported = true;
952-
}
953-
954941
mysql_mutex_lock(&fil_system.mutex);
955942

956943
if (const fil_space_t *old_space = fil_space_get_by_id(id)) {
@@ -1951,9 +1938,6 @@ fil_rename_tablespace(
19511938
return(success);
19521939
}
19531940

1954-
/* FIXME: remove this! */
1955-
IF_WIN(, bool os_is_sparse_file_supported(os_file_t fh));
1956-
19571941
/** Create a tablespace file.
19581942
@param[in] space_id Tablespace ID
19591943
@param[in] name Tablespace name in dbname/tablename format.
@@ -2041,7 +2025,6 @@ fil_ibd_create(
20412025
}
20422026

20432027
const bool is_compressed = fil_space_t::is_compressed(flags);
2044-
bool punch_hole = is_compressed;
20452028
fil_space_crypt_t* crypt_data = nullptr;
20462029
#ifdef _WIN32
20472030
if (is_compressed) {
@@ -2060,9 +2043,6 @@ fil_ibd_create(
20602043
return NULL;
20612044
}
20622045

2063-
/* FIXME: remove this */
2064-
IF_WIN(, punch_hole = punch_hole && os_is_sparse_file_supported(file));
2065-
20662046
/* We have to write the space id to the file immediately and flush the
20672047
file to disk. This is because in crash recovery we must be aware what
20682048
tablespaces exist and what are their space id's, so that we can apply
@@ -2115,9 +2095,8 @@ fil_ibd_create(
21152095
if (fil_space_t* space = fil_space_t::create(space_id, flags,
21162096
FIL_TYPE_TABLESPACE,
21172097
crypt_data, mode)) {
2118-
space->punch_hole = punch_hole;
21192098
fil_node_t* node = space->add(path, file, size, false, true);
2120-
node->find_metadata(file);
2099+
IF_WIN(node->find_metadata(), node->find_metadata(file, true));
21212100
mtr.start();
21222101
mtr.set_named_space(space);
21232102
fsp_header_init(space, size, &mtr);
@@ -2878,7 +2857,7 @@ fil_io_t fil_space_t::io(const IORequest &type, os_offset_t offset, size_t len,
28782857
/* Punch hole is not supported, make space not to
28792858
support punch hole */
28802859
if (UNIV_UNLIKELY(err == DB_IO_NO_PUNCH_HOLE)) {
2881-
punch_hole = false;
2860+
node->punch_hole = false;
28822861
err = DB_SUCCESS;
28832862
}
28842863
goto release_sync_write;

storage/innobase/handler/ha_innodb.cc

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -18510,9 +18510,7 @@ static MYSQL_SYSVAR_BOOL(doublewrite, srv_use_doublewrite_buf,
1851018510
static MYSQL_SYSVAR_BOOL(use_atomic_writes, srv_use_atomic_writes,
1851118511
PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY,
1851218512
"Enable atomic writes, instead of using the doublewrite buffer, for files "
18513-
"on devices that supports atomic writes. "
18514-
"This option only works on Linux with either FusionIO cards using "
18515-
"the directFS filesystem or with Shannon cards using any file system.",
18513+
"on devices that supports atomic writes.",
1851618514
NULL, NULL, TRUE);
1851718515

1851818516
static MYSQL_SYSVAR_BOOL(stats_include_delete_marked,

storage/innobase/include/fil0fil.h

Lines changed: 54 additions & 64 deletions
Original file line numberDiff line numberDiff line change
@@ -424,13 +424,6 @@ struct fil_space_t final
424424
/** Checks that this tablespace needs key rotation. */
425425
bool is_in_default_encrypt;
426426

427-
/** True if the device this filespace is on supports atomic writes */
428-
bool atomic_write_supported;
429-
430-
/** True if file system storing this tablespace supports
431-
punch hole */
432-
bool punch_hole;
433-
434427
/** mutex to protect freed ranges */
435428
std::mutex freed_range_mutex;
436429

@@ -444,11 +437,7 @@ struct fil_space_t final
444437
ulint magic_n;/*!< FIL_SPACE_MAGIC_N */
445438

446439
/** @return whether doublewrite buffering is needed */
447-
bool use_doublewrite() const
448-
{
449-
return !atomic_write_supported && srv_use_doublewrite_buf &&
450-
buf_dblwr.is_initialised();
451-
}
440+
inline bool use_doublewrite() const;
452441

453442
/** Append a file to the chain of files of a space.
454443
@param[in] name file name of a file that is not open
@@ -509,6 +498,8 @@ struct fil_space_t final
509498
/** @return whether the storage device is rotational (HDD, not SSD) */
510499
inline bool is_rotational() const;
511500

501+
/** whether the tablespace discovery is being deferred during crash
502+
recovery due to incompletely written page 0 */
512503
inline bool is_deferred() const;
513504

514505
/** Open each file. Never invoked on .ibd files.
@@ -1066,60 +1057,56 @@ struct fil_space_t final
10661057
/** File node of a tablespace or the log data space */
10671058
struct fil_node_t final
10681059
{
1069-
/** tablespace containing this file */
1070-
fil_space_t* space;
1071-
/** file name; protected by fil_system.mutex and log_sys.mutex. */
1072-
char* name;
1073-
/** file handle (valid if is_open) */
1074-
pfs_os_file_t handle;
1075-
/** whether the file actually is a raw device or disk partition */
1076-
bool is_raw_disk;
1077-
/** whether the file is on non-rotational media (SSD) */
1078-
bool on_ssd;
1079-
/** size of the file in database pages (0 if not known yet);
1080-
the possible last incomplete megabyte may be ignored
1081-
if space->id == 0 */
1082-
uint32_t size;
1083-
/** initial size of the file in database pages;
1084-
FIL_IBD_FILE_INITIAL_SIZE by default */
1085-
uint32_t init_size;
1086-
/** maximum size of the file in database pages (0 if unlimited) */
1087-
uint32_t max_size;
1088-
/** whether the file is currently being extended */
1089-
Atomic_relaxed<bool> being_extended;
1090-
/** link to other files in this tablespace */
1091-
UT_LIST_NODE_T(fil_node_t) chain;
1092-
1093-
/** whether this file could use atomic write (data file) */
1094-
bool atomic_write;
1095-
1096-
/** Filesystem block size */
1097-
ulint block_size;
1098-
1099-
/** Deferring the tablespace during recovery and it
1100-
can be used to skip the validation of page0 */
1101-
bool deferred=false;
1102-
1103-
/** FIL_NODE_MAGIC_N */
1104-
ulint magic_n;
1105-
1106-
/** @return whether this file is open */
1107-
bool is_open() const
1108-
{
1109-
return(handle != OS_FILE_CLOSED);
1110-
}
1060+
/** tablespace containing this file */
1061+
fil_space_t *space;
1062+
/** file name; protected by fil_system.mutex and log_sys.mutex */
1063+
char *name;
1064+
/** file handle */
1065+
pfs_os_file_t handle;
1066+
/** whether the file is on non-rotational media (SSD) */
1067+
unsigned on_ssd:1;
1068+
/** how to write page_compressed tables
1069+
(0=do not punch holes but write minimal amount of data, 1=punch holes,
1070+
2=always write the same amount; thinly provisioned storage will compress) */
1071+
unsigned punch_hole:2;
1072+
/** whether this file could use atomic write */
1073+
unsigned atomic_write:1;
1074+
/** whether the file actually is a raw device or disk partition */
1075+
unsigned is_raw_disk:1;
1076+
/** whether the tablespace discovery is being deferred during crash
1077+
recovery due to incompletely written page 0 */
1078+
unsigned deferred:1;
1079+
1080+
/** size of the file in database pages (0 if not known yet);
1081+
the possible last incomplete megabyte may be ignored if space->id == 0 */
1082+
uint32_t size;
1083+
/** initial size of the file in database pages;
1084+
FIL_IBD_FILE_INITIAL_SIZE by default */
1085+
uint32_t init_size;
1086+
/** maximum size of the file in database pages (0 if unlimited) */
1087+
uint32_t max_size;
1088+
/** whether the file is currently being extended */
1089+
Atomic_relaxed<bool> being_extended;
1090+
/** link to other files in this tablespace */
1091+
UT_LIST_NODE_T(fil_node_t) chain;
1092+
1093+
/** Filesystem block size */
1094+
ulint block_size;
1095+
1096+
/** @return whether this file is open */
1097+
bool is_open() const { return handle != OS_FILE_CLOSED; }
11111098

1112-
/** Read the first page of a data file.
1113-
@return whether the page was found valid */
1114-
bool read_page0();
1099+
/** Read the first page of a data file.
1100+
@return whether the page was found valid */
1101+
bool read_page0();
11151102

1116-
/** Determine some file metadata when creating or reading the file.
1117-
@param file the file that is being created, or OS_FILE_CLOSED */
1118-
void find_metadata(os_file_t file = OS_FILE_CLOSED
1103+
/** Determine some file metadata when creating or reading the file.
1104+
@param file the file that is being created, or OS_FILE_CLOSED */
1105+
void find_metadata(os_file_t file= OS_FILE_CLOSED
11191106
#ifndef _WIN32
1120-
, struct stat* statbuf = NULL
1107+
, bool create= false, struct stat *statbuf= nullptr
11211108
#endif
1122-
);
1109+
);
11231110

11241111
/** Close the file handle. */
11251112
void close();
@@ -1138,8 +1125,11 @@ struct fil_node_t final
11381125
void prepare_to_close_or_detach();
11391126
};
11401127

1141-
/** Value of fil_node_t::magic_n */
1142-
#define FIL_NODE_MAGIC_N 89389
1128+
inline bool fil_space_t::use_doublewrite() const
1129+
{
1130+
return !UT_LIST_GET_FIRST(chain)->atomic_write && srv_use_doublewrite_buf &&
1131+
buf_dblwr.is_initialised();
1132+
}
11431133

11441134
inline void fil_space_t::set_imported()
11451135
{

0 commit comments

Comments
 (0)