Skip to content

Commit 21adad0

Browse files
committed
MDEV-8901: InnoDB: Punch hole is incorrectly also done on log files, causing assertion and database corruption
Analysis: The problem is that punch hole does not know the actual page size of the page, nor whether the page belongs to a data file or to a log file. Fix: Pass the file type and page size down to the OS layer to be used when trim is called. Also fix an unsafe null pointer access to the actual write_size.
1 parent 90f2c82 commit 21adad0

File tree

10 files changed

+118
-86
lines changed

10 files changed

+118
-86
lines changed

config.h.cmake

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -219,6 +219,7 @@
219219
#cmakedefine HAVE_POSIX_FALLOCATE 1
220220
#cmakedefine HAVE_LINUX_FALLOC_H 1
221221
#cmakedefine HAVE_FALLOCATE 1
222+
#cmakedefine HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE 1
222223
#cmakedefine HAVE_PREAD 1
223224
#cmakedefine HAVE_PAUSE_INSTRUCTION 1
224225
#cmakedefine HAVE_FAKE_PAUSE_INSTRUCTION 1

configure.cmake

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1058,3 +1058,22 @@ CHECK_STRUCT_HAS_MEMBER("struct dirent" d_ino "dirent.h" STRUCT_DIRENT_HAS_D_IN
10581058
CHECK_STRUCT_HAS_MEMBER("struct dirent" d_namlen "dirent.h" STRUCT_DIRENT_HAS_D_NAMLEN)
10591059
SET(SPRINTF_RETURNS_INT 1)
10601060
CHECK_INCLUDE_FILE(ucontext.h HAVE_UCONTEXT_H)
1061+
1062+
IF(NOT MSVC)
1063+
CHECK_C_SOURCE_RUNS(
1064+
"
1065+
#define _GNU_SOURCE
1066+
#include <fcntl.h>
1067+
#include <linux/falloc.h>
1068+
int main()
1069+
{
1070+
/* Ignore the return value for now. Check if the flags exist.
1071+
The return value is checked at runtime. */
1072+
fallocate(0, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, 0, 0);
1073+
1074+
return(0);
1075+
}"
1076+
HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE
1077+
)
1078+
ENDIF()
1079+

storage/innobase/fil/fil0fil.cc

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5223,9 +5223,9 @@ fil_extend_space_to_desired_size(
52235223
success = os_file_write(node->name, node->handle, buf,
52245224
offset, page_size * n_pages);
52255225
#else
5226-
success = os_aio(OS_FILE_WRITE, OS_AIO_SYNC,
5226+
success = os_aio(OS_FILE_WRITE, 0, OS_AIO_SYNC,
52275227
node->name, node->handle, buf,
5228-
offset, page_size * n_pages,
5228+
offset, page_size * n_pages, page_size,
52295229
node, NULL, 0);
52305230
#endif /* UNIV_HOTBACKUP */
52315231

@@ -5872,12 +5872,14 @@ fil_io(
58725872
/* Queue the aio request */
58735873
ret = os_aio(
58745874
type,
5875+
is_log,
58755876
mode | wake_later,
58765877
node->name,
58775878
node->handle,
58785879
buf,
58795880
offset,
58805881
len,
5882+
zip_size ? zip_size : UNIV_PAGE_SIZE,
58815883
node,
58825884
message,
58835885
write_size);

storage/innobase/include/os0file.h

Lines changed: 11 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -311,10 +311,10 @@ The wrapper functions have the prefix of "innodb_". */
311311
# define os_file_close(file) \
312312
pfs_os_file_close_func(file, __FILE__, __LINE__)
313313

314-
# define os_aio(type, mode, name, file, buf, offset, \
315-
n, message1, message2, write_size) \
316-
pfs_os_aio_func(type, mode, name, file, buf, offset, \
317-
n, message1, message2, write_size, \
314+
# define os_aio(type, is_log, mode, name, file, buf, offset, \
315+
n, page_size, message1, message2, write_size) \
316+
pfs_os_aio_func(type, is_log, mode, name, file, buf, offset, \
317+
n, page_size, message1, message2, write_size, \
318318
__FILE__, __LINE__)
319319

320320

@@ -357,10 +357,10 @@ to original un-instrumented file I/O APIs */
357357

358358
# define os_file_close(file) os_file_close_func(file)
359359

360-
# define os_aio(type, mode, name, file, buf, offset, n, message1, \
360+
# define os_aio(type, is_log, mode, name, file, buf, offset, n, page_size, message1, \
361361
message2, write_size) \
362-
os_aio_func(type, mode, name, file, buf, offset, n, \
363-
message1, message2, write_size)
362+
os_aio_func(type, is_log, mode, name, file, buf, offset, n, \
363+
page_size, message1, message2, write_size)
364364

365365
# define os_file_read(file, buf, offset, n) \
366366
os_file_read_func(file, buf, offset, n)
@@ -749,6 +749,7 @@ ibool
749749
pfs_os_aio_func(
750750
/*============*/
751751
ulint type, /*!< in: OS_FILE_READ or OS_FILE_WRITE */
752+
ulint is_log, /*!< in: 1 is OS_FILE_LOG or 0 */
752753
ulint mode, /*!< in: OS_AIO_NORMAL etc. I/O mode */
753754
const char* name, /*!< in: name of the file or path as a
754755
null-terminated string */
@@ -757,6 +758,7 @@ pfs_os_aio_func(
757758
to write */
758759
os_offset_t offset, /*!< in: file offset where to read or write */
759760
ulint n, /*!< in: number of bytes to read or write */
761+
ulint page_size, /*!< in: page size in bytes */
760762
fil_node_t* message1,/*!< in: message for the aio handler
761763
(can be used to identify a completed
762764
aio operation); ignored if mode is
@@ -1107,6 +1109,7 @@ ibool
11071109
os_aio_func(
11081110
/*========*/
11091111
ulint type, /*!< in: OS_FILE_READ or OS_FILE_WRITE */
1112+
ulint is_log, /*!< in: 1 is OS_FILE_LOG or 0 */
11101113
ulint mode, /*!< in: OS_AIO_NORMAL, ..., possibly ORed
11111114
to OS_AIO_SIMULATED_WAKE_LATER: the
11121115
last flag advises this function not to wake
@@ -1127,6 +1130,7 @@ os_aio_func(
11271130
to write */
11281131
os_offset_t offset, /*!< in: file offset where to read or write */
11291132
ulint n, /*!< in: number of bytes to read or write */
1133+
ulint page_size, /*!< in: page size in bytes */
11301134
fil_node_t* message1,/*!< in: message for the aio handler
11311135
(can be used to identify a completed
11321136
aio operation); ignored if mode is

storage/innobase/include/os0file.ic

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -199,6 +199,7 @@ ibool
199199
pfs_os_aio_func(
200200
/*============*/
201201
ulint type, /*!< in: OS_FILE_READ or OS_FILE_WRITE */
202+
ulint is_log, /*!< in: 1 is OS_FILE_LOG or 0 */
202203
ulint mode, /*!< in: OS_AIO_NORMAL etc. I/O mode */
203204
const char* name, /*!< in: name of the file or path as a
204205
null-terminated string */
@@ -207,6 +208,7 @@ pfs_os_aio_func(
207208
to write */
208209
os_offset_t offset, /*!< in: file offset where to read or write */
209210
ulint n, /*!< in: number of bytes to read or write */
211+
ulint page_size, /*!< in: page size in bytes */
210212
fil_node_t* message1,/*!< in: message for the aio handler
211213
(can be used to identify a completed
212214
aio operation); ignored if mode is
@@ -234,8 +236,8 @@ pfs_os_aio_func(
234236
: PSI_FILE_READ,
235237
src_file, src_line);
236238

237-
result = os_aio_func(type, mode, name, file, buf, offset,
238-
n, message1, message2, write_size);
239+
result = os_aio_func(type, is_log, mode, name, file, buf, offset,
240+
n, page_size, message1, message2, write_size);
239241

240242
register_pfs_file_io_end(locker, n);
241243

storage/innobase/os/os0file.cc

Lines changed: 33 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -49,9 +49,8 @@ Created 10/21/1995 Heikki Tuuri
4949
#include "buf0buf.h"
5050
#include "srv0mon.h"
5151
#include "srv0srv.h"
52-
#ifdef HAVE_POSIX_FALLOCATE
52+
#ifdef HAVE_LINUX_UNISTD_H
5353
#include "unistd.h"
54-
#include "fcntl.h"
5554
#endif
5655
#ifndef UNIV_HOTBACKUP
5756
# include "os0sync.h"
@@ -84,14 +83,10 @@ Created 10/21/1995 Heikki Tuuri
8483
#include <linux/falloc.h>
8584
#endif
8685

87-
#if defined(HAVE_FALLOCATE)
88-
#ifndef FALLOC_FL_KEEP_SIZE
89-
#define FALLOC_FL_KEEP_SIZE 0x01
90-
#endif
91-
#ifndef FALLOC_FL_PUNCH_HOLE
92-
#define FALLOC_FL_PUNCH_HOLE 0x02
93-
#endif
94-
#endif
86+
#ifdef HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE
87+
# include <fcntl.h>
88+
# include <linux/falloc.h>
89+
#endif /* HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE */
9590

9691
#ifdef HAVE_LZO
9792
#include "lzo/lzo1x.h"
@@ -209,6 +204,9 @@ struct os_aio_slot_t{
209204
write */
210205
byte* buf; /*!< buffer used in i/o */
211206
ulint type; /*!< OS_FILE_READ or OS_FILE_WRITE */
207+
ulint is_log; /*!< 1 if OS_FILE_LOG or 0 */
208+
ulint page_size; /*!< UNIV_PAGE_SIZE or zip_size */
209+
212210
os_offset_t offset; /*!< file offset in bytes */
213211
os_file_t file; /*!< file where to read or write */
214212
const char* name; /*!< file name or path */
@@ -4474,6 +4472,7 @@ os_aio_slot_t*
44744472
os_aio_array_reserve_slot(
44754473
/*======================*/
44764474
ulint type, /*!< in: OS_FILE_READ or OS_FILE_WRITE */
4475+
ulint is_log, /*!< in: 1 is OS_FILE_LOG or 0 */
44774476
os_aio_array_t* array, /*!< in: aio array */
44784477
fil_node_t* message1,/*!< in: message to be passed along with
44794478
the aio operation */
@@ -4486,6 +4485,7 @@ os_aio_array_reserve_slot(
44864485
to write */
44874486
os_offset_t offset, /*!< in: file offset */
44884487
ulint len, /*!< in: length of the block to read or write */
4488+
ulint page_size, /*!< in: page size in bytes */
44894489
ulint* write_size)/*!< in/out: Actual write size initialized
44904490
after fist successfull trim
44914491
operation for this page and if
@@ -4580,6 +4580,8 @@ os_aio_array_reserve_slot(
45804580
slot->offset = offset;
45814581
slot->io_already_done = FALSE;
45824582
slot->write_size = write_size;
4583+
slot->is_log = is_log;
4584+
slot->page_size = page_size;
45834585

45844586
if (message1) {
45854587
slot->file_block_size = fil_node_get_block_size(message1);
@@ -4836,6 +4838,7 @@ ibool
48364838
os_aio_func(
48374839
/*========*/
48384840
ulint type, /*!< in: OS_FILE_READ or OS_FILE_WRITE */
4841+
ulint is_log, /*!< in: 1 is OS_FILE_LOG or 0 */
48394842
ulint mode, /*!< in: OS_AIO_NORMAL, ..., possibly ORed
48404843
to OS_AIO_SIMULATED_WAKE_LATER: the
48414844
last flag advises this function not to wake
@@ -4856,6 +4859,7 @@ os_aio_func(
48564859
to write */
48574860
os_offset_t offset, /*!< in: file offset where to read or write */
48584861
ulint n, /*!< in: number of bytes to read or write */
4862+
ulint page_size, /*!< in: page size in bytes */
48594863
fil_node_t* message1,/*!< in: message for the aio handler
48604864
(can be used to identify a completed
48614865
aio operation); ignored if mode is
@@ -4982,8 +4986,8 @@ os_aio_func(
49824986
array = NULL; /* Eliminate compiler warning */
49834987
}
49844988

4985-
slot = os_aio_array_reserve_slot(type, array, message1, message2, file,
4986-
name, buf, offset, n, write_size);
4989+
slot = os_aio_array_reserve_slot(type, is_log, array, message1, message2, file,
4990+
name, buf, offset, n, page_size, write_size);
49874991

49884992
if (type == OS_FILE_READ) {
49894993
if (srv_use_native_aio) {
@@ -5251,7 +5255,10 @@ os_aio_windows_handle(
52515255
ret_val = ret && len == slot->len;
52525256
}
52535257

5254-
if (slot->type == OS_FILE_WRITE && srv_use_trim && os_fallocate_failed == FALSE) {
5258+
if (slot->type == OS_FILE_WRITE &&
5259+
!slot->is_log &&
5260+
srv_use_trim &&
5261+
os_fallocate_failed == FALSE) {
52555262
// Deallocate unused blocks from file system
52565263
os_file_trim(slot);
52575264
}
@@ -5345,7 +5352,10 @@ os_aio_linux_collect(
53455352
/* We have not overstepped to next segment. */
53465353
ut_a(slot->pos < end_pos);
53475354

5348-
if (slot->type == OS_FILE_WRITE && srv_use_trim && os_fallocate_failed == FALSE) {
5355+
if (slot->type == OS_FILE_WRITE &&
5356+
!slot->is_log &&
5357+
srv_use_trim &&
5358+
os_fallocate_failed == FALSE) {
53495359
// Deallocate unused blocks from file system
53505360
os_file_trim(slot);
53515361
}
@@ -6220,19 +6230,13 @@ os_file_trim(
62206230
{
62216231

62226232
size_t len = slot->len;
6223-
size_t trim_len = UNIV_PAGE_SIZE - len;
6233+
size_t trim_len = slot->page_size - len;
62246234
os_offset_t off = slot->offset + len;
62256235
size_t bsize = slot->file_block_size;
62266236

6227-
// len here should be alligned to sector size
6228-
ut_ad((trim_len % bsize) == 0);
6229-
ut_ad((len % bsize) == 0);
6230-
ut_ad(bsize != 0);
6231-
ut_ad((off % bsize) == 0);
6232-
62336237
#ifdef UNIV_TRIM_DEBUG
62346238
fprintf(stderr, "Note: TRIM: write_size %lu trim_len %lu len %lu off %lu bz %lu\n",
6235-
*slot->write_size, trim_len, len, off, bsize);
6239+
slot->write_size ? *slot->write_size : 0, trim_len, len, off, bsize);
62366240
#endif
62376241

62386242
// Nothing to do if trim length is zero or if actual write
@@ -6247,22 +6251,19 @@ os_file_trim(
62476251
*slot->write_size > 0 &&
62486252
len >= *slot->write_size)) {
62496253

6250-
#ifdef UNIV_PAGECOMPRESS_DEBUG
6251-
fprintf(stderr, "Note: TRIM: write_size %lu trim_len %lu len %lu\n",
6252-
*slot->write_size, trim_len, len);
6253-
#endif
6254+
if (slot->write_size) {
6255+
if (*slot->write_size > 0 && len >= *slot->write_size) {
6256+
srv_stats.page_compressed_trim_op_saved.inc();
6257+
}
62546258

6255-
if (*slot->write_size > 0 && len >= *slot->write_size) {
6256-
srv_stats.page_compressed_trim_op_saved.inc();
6259+
*slot->write_size = len;
62576260
}
62586261

6259-
*slot->write_size = len;
6260-
62616262
return (TRUE);
62626263
}
62636264

62646265
#ifdef __linux__
6265-
#if defined(HAVE_FALLOCATE)
6266+
#if defined(HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE)
62666267
int ret = fallocate(slot->file, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, off, trim_len);
62676268

62686269
if (ret) {
@@ -6300,7 +6301,7 @@ os_file_trim(
63006301
*slot->write_size = 0;
63016302
}
63026303

6303-
#endif /* HAVE_FALLOCATE ... */
6304+
#endif /* HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE ... */
63046305

63056306
#elif defined(_WIN32)
63066307
FILE_LEVEL_TRIM flt;

storage/xtradb/fil/fil0fil.cc

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5253,9 +5253,9 @@ fil_extend_space_to_desired_size(
52535253
success = os_file_write(node->name, node->handle, buf,
52545254
offset, page_size * n_pages);
52555255
#else
5256-
success = os_aio(OS_FILE_WRITE, OS_AIO_SYNC,
5256+
success = os_aio(OS_FILE_WRITE, 0, OS_AIO_SYNC,
52575257
node->name, node->handle, buf,
5258-
offset, page_size * n_pages,
5258+
offset, page_size * n_pages, page_size,
52595259
node, NULL, space_id, NULL, 0);
52605260
#endif /* UNIV_HOTBACKUP */
52615261

@@ -5918,12 +5918,14 @@ _fil_io(
59185918
/* Queue the aio request */
59195919
ret = os_aio(
59205920
type,
5921+
is_log,
59215922
mode | wake_later,
59225923
node->name,
59235924
node->handle,
59245925
buf,
59255926
offset,
59265927
len,
5928+
zip_size ? zip_size : UNIV_PAGE_SIZE,
59275929
node,
59285930
message,
59295931
space_id,

0 commit comments

Comments
 (0)