Skip to content

Commit 6acada7

Browse files
committed
MDEV-34062: Implement innodb_log_file_mmap on 64-bit systems
When using the default innodb_log_buffer_size=2m, mariadb-backup --backup would spend a lot of time re-reading and re-parsing the log. For reads, it would be beneficial to memory-map the entire ib_logfile0 to the address space (typically 48 bits or 256 TiB) and read it from there, both during --backup and --prepare. We will introduce the Boolean read-only parameter innodb_log_file_mmap that will be OFF by default on most platforms, to avoid aggressive read-ahead of the entire ib_logfile0 when only a tiny portion would be accessed. On Linux and FreeBSD the default is innodb_log_file_mmap=ON, because those platforms define a specific mmap(2) option for enabling such read-ahead and therefore it can be assumed that the default would be on-demand paging. This parameter will only have an impact on the initial InnoDB startup and recovery. Any writes to the log will use regular I/O, except when the ib_logfile0 is stored in a specially configured file system that is backed by persistent memory (Linux "mount -o dax"). We also experimented with allowing writes of the ib_logfile0 via a memory mapping and decided against it. A fundamental problem would be unnecessary read-before-write in case of a major page fault, that is, when a new, not yet cached, virtual memory page in the circular ib_logfile0 is being written to. There appears to be no way to tell the operating system that we do not care about the previous contents of the page, or that the page fault handler should just zero it out. Many references to HAVE_PMEM have been replaced with references to HAVE_INNODB_MMAP. The predicate log_sys.is_pmem() has been replaced with log_sys.is_mmap() && !log_sys.is_opened(). Memory-mapped regular files differ from MAP_SYNC (PMEM) mappings in that an open file handle to ib_logfile0 will be retained. In both code paths, log_sys.is_mmap() will hold. Holding a file handle open will allow log_t::clear_mmap() to disable the interface with fewer operations. 
It should be noted that ever since commit 685d958 (MDEV-14425) most 64-bit Linux platforms on our CI platforms (s390x a.k.a. IBM System Z being a notable exception) read and write /dev/shm/*/ib_logfile0 via a memory mapping, pretending that it is persistent memory (mount -o dax). So, the memory mapping based log parsing that this change is enabling by default on Linux and FreeBSD has already been extensively tested on Linux. ::log_mmap(): If a log cannot be opened as PMEM and the desired access is read-only, try to open a read-only memory mapping. xtrabackup_copy_mmap_snippet(), xtrabackup_copy_mmap_logfile(): Copy the InnoDB log in mariadb-backup --backup from a memory mapped file.
1 parent 971cf59 commit 6acada7

File tree

22 files changed

+572
-289
lines changed

22 files changed

+572
-289
lines changed

cmake/os/WindowsCache.cmake

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,6 @@ SET(HAVE_GETIFADDRS CACHE INTERNAL "")
6161
SET(HAVE_GETCWD 1 CACHE INTERNAL "")
6262
SET(HAVE_GETHOSTBYADDR_R CACHE INTERNAL "")
6363
SET(HAVE_GETHRTIME CACHE INTERNAL "")
64-
SET(HAVE_GETPAGESIZE CACHE INTERNAL "")
6564
SET(HAVE_GETPASS CACHE INTERNAL "")
6665
SET(HAVE_GETMNTENT CACHE INTERNAL "")
6766
SET(HAVE_GETMNTENT_IN_SYS_MNTAB CACHE INTERNAL "")

config.h.cmake

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -151,7 +151,6 @@
151151
#cmakedefine HAVE_GETCWD 1
152152
#cmakedefine HAVE_GETHOSTBYADDR_R 1
153153
#cmakedefine HAVE_GETHRTIME 1
154-
#cmakedefine HAVE_GETPAGESIZE 1
155154
#cmakedefine HAVE_GETPAGESIZES 1
156155
#cmakedefine HAVE_GETPASS 1
157156
#cmakedefine HAVE_GETPASSPHRASE 1

configure.cmake

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -463,7 +463,6 @@ CHECK_SYMBOL_EXISTS(madvise "sys/mman.h" HAVE_DECL_MADVISE)
463463
CHECK_SYMBOL_EXISTS(getpagesizes "sys/mman.h" HAVE_GETPAGESIZES)
464464
CHECK_SYMBOL_EXISTS(tzname "time.h" HAVE_TZNAME)
465465
CHECK_SYMBOL_EXISTS(lrand48 "stdlib.h" HAVE_LRAND48)
466-
CHECK_SYMBOL_EXISTS(getpagesize "unistd.h" HAVE_GETPAGESIZE)
467466
CHECK_SYMBOL_EXISTS(TIOCGWINSZ "sys/ioctl.h" GWINSZ_IN_SYS_IOCTL)
468467
CHECK_SYMBOL_EXISTS(FIONREAD "sys/ioctl.h" FIONREAD_IN_SYS_IOCTL)
469468
CHECK_SYMBOL_EXISTS(TIOCSTAT "sys/ioctl.h" TIOCSTAT_IN_SYS_IOCTL)

extra/mariabackup/xtrabackup.cc

Lines changed: 124 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -205,6 +205,8 @@ lsn_t checkpoint_lsn_start;
205205
lsn_t checkpoint_no_start;
206206
/** whether log_copying_thread() is active; protected by recv_sys.mutex */
207207
static bool log_copying_running;
208+
/** for --backup, target LSN to copy the log to; protected by recv_sys.mutex */
209+
lsn_t metadata_to_lsn;
208210

209211
uint xtrabackup_parallel;
210212

@@ -236,7 +238,6 @@ my_bool opt_encrypted_backup;
236238
#define XTRABACKUP_METADATA_FILENAME "xtrabackup_checkpoints"
237239
char metadata_type[30] = ""; /*[full-backuped|log-applied|incremental]*/
238240
static lsn_t metadata_from_lsn;
239-
lsn_t metadata_to_lsn;
240241
static lsn_t metadata_last_lsn;
241242

242243
static ds_file_t* dst_log_file;
@@ -282,9 +283,6 @@ my_bool xtrabackup_incremental_force_scan = FALSE;
282283
*/
283284
ulong xtrabackup_innodb_force_recovery = 0;
284285

285-
/* The flushed lsn which is read from data files */
286-
lsn_t flushed_lsn= 0;
287-
288286
ulong xb_open_files_limit= 0;
289287
char *xb_plugin_dir;
290288
char *xb_plugin_load;
@@ -1329,6 +1327,9 @@ enum options_xtrabackup
13291327
OPT_INNODB_BUFFER_POOL_FILENAME,
13301328
OPT_INNODB_LOCK_WAIT_TIMEOUT,
13311329
OPT_INNODB_LOG_BUFFER_SIZE,
1330+
#ifdef HAVE_INNODB_MMAP
1331+
OPT_INNODB_LOG_FILE_MMAP,
1332+
#endif
13321333
#if defined __linux__ || defined _WIN32
13331334
OPT_INNODB_LOG_FILE_BUFFERING,
13341335
#endif
@@ -1890,6 +1891,13 @@ struct my_option xb_server_options[] =
18901891
(G_PTR*) &log_sys.buf_size, (G_PTR*) &log_sys.buf_size, 0,
18911892
GET_UINT, REQUIRED_ARG, 2U << 20,
18921893
2U << 20, log_sys.buf_size_max, 0, 4096, 0},
1894+
#ifdef HAVE_INNODB_MMAP
1895+
{"innodb_log_file_mmap", OPT_INNODB_LOG_FILE_SIZE,
1896+
"Whether ib_logfile0 should be memory-mapped",
1897+
(G_PTR*) &log_sys.log_mmap,
1898+
(G_PTR*) &log_sys.log_mmap, 0, GET_BOOL, NO_ARG,
1899+
log_sys.log_mmap_default, 0, 0, 0, 0, 0},
1900+
#endif
18931901
#if defined __linux__ || defined _WIN32
18941902
{"innodb_log_file_buffering", OPT_INNODB_LOG_FILE_BUFFERING,
18951903
"Whether the file system cache for ib_logfile0 is enabled during --backup",
@@ -3368,25 +3376,126 @@ static my_bool xtrabackup_copy_datafile(ds_ctxt *ds_data,
33683376
return(FALSE);
33693377
}
33703378

3379+
#ifdef HAVE_INNODB_MMAP
3380+
/** Write a range of the memory-mapped log to a backup data sink.
If start > end, the range is treated as wrapping around: the bytes from
start up to log_sys.buf + log_sys.file_size are written first, and
writing then resumes at log_sys.buf + log_sys.START_OFFSET.
@param ds     destination passed to ds_write()
@param start  first byte to copy
@param end    one past the last byte to copy
@return the result of the last ds_write() call that was made; a nonzero
result of the first write of a wrapped range is returned immediately */
static int
3381+
xtrabackup_copy_mmap_snippet(ds_file_t *ds, const byte *start, const byte *end)
3382+
{
/* start > end: the range wraps; first write the part that extends up to
log_sys.buf + log_sys.file_size */
3383+
if (UNIV_UNLIKELY(start > end))
3384+
{
3385+
if (int r= ds_write(ds, start, log_sys.buf + log_sys.file_size - start))
3386+
return r;
/* the remainder is copied starting at log_sys.START_OFFSET */
3387+
start= log_sys.buf + log_sys.START_OFFSET;
3388+
}
3389+
return ds_write(ds, start, end - start);
3390+
}
3391+
3392+
/** Copy memory-mapped log until the end of the log is reached
3393+
or the log_copying_stop signal is received
3394+
@return whether the operation failed */
3395+
static bool xtrabackup_copy_mmap_logfile()
3396+
{
/* The caller must hold recv_sys.mutex. */
3397+
mysql_mutex_assert_owner(&recv_sys.mutex);
/* Begin at the offset that log_sys.calc_lsn_offset() reports for
recv_sys.lsn, and expose the whole file length to the parser. */
3398+
recv_sys.offset= size_t(log_sys.calc_lsn_offset(recv_sys.lsn));
3399+
recv_sys.len= size_t(log_sys.file_size);
/* Distance from recv_sys.offset back to the sequence byte;
8 bytes larger when the log is encrypted. */
3400+
const size_t seq_offset{log_sys.is_encrypted() ? 8U + 5U : 5U};
/* Byte written to the backup in place of a zero sequence byte. */
3401+
const char one{'\1'};
3402+
/* retry_count counts consecutive iterations in which nothing
could be parsed. */
3403+
for (unsigned retry_count{0};;)
3404+
{
3405+
recv_sys_t::parse_mtr_result r;
3406+
const byte *start= &log_sys.buf[recv_sys.offset];
3407+
/* Try to parse; if the first attempt does not return OK,
take the retry path in the else branch below. */
3408+
if (recv_sys.parse_mmap<false>(false) == recv_sys_t::OK)
3409+
{
3410+
const byte *end;
3411+
3412+
do
3413+
{
3414+
/* Set the sequence bit (the backed-up log will not wrap around) */
3415+
size_t seqo= recv_sys.offset - seq_offset;
/* The sequence byte lies before START_OFFSET:
wrap its position to the end of the file. */
3416+
if (seqo < log_sys.START_OFFSET)
3417+
seqo+= log_sys.file_size - log_sys.START_OFFSET;
3418+
const byte *seq= &log_sys.buf[seqo];
3419+
ut_ad(*seq == log_sys.get_sequence_bit(recv_sys.lsn - seq_offset));
/* A zero sequence byte is replaced: copy everything before it,
then write the byte 1 in its place. */
3420+
if (!*seq)
3421+
{
3422+
if (xtrabackup_copy_mmap_snippet(dst_log_file, start, seq) ||
3423+
ds_write(dst_log_file, &one, 1))
3424+
goto write_error;
/* Continue the pending range after the replaced byte. */
3425+
start = seq + 1;
3426+
}
3427+
}
/* Keep parsing until the result is something other than OK;
r holds that last result. */
3428+
while ((r= recv_sys.parse_mmap<false>(false)) == recv_sys_t::OK);
3429+
/* Copy what remains between start and the current parse position. */
3430+
end= &log_sys.buf[recv_sys.offset];
3431+
3432+
if (xtrabackup_copy_mmap_snippet(dst_log_file, start, end))
3433+
{
3434+
write_error:
3435+
msg("Error: write to ib_logfile0 failed");
3436+
return true;
3437+
}
3438+
/* NOTE(review): start is declared anew at the top of each outer
iteration, so this assignment appears to have no effect. */
3439+
start= end;
3440+
/* Wake up any threads waiting on scanned_lsn_cond. */
3441+
pthread_cond_broadcast(&scanned_lsn_cond);
3442+
/* The parser reported GOT_EOF: leave the loop and return false below. */
3443+
if (r == recv_sys_t::GOT_EOF)
3444+
break;
3445+
/* Something was parsed, so start counting retries afresh. */
3446+
retry_count= 0;
3447+
}
3448+
else
3449+
{
/* Once a target LSN is set, finish successfully as soon as
recv_sys.lsn has reached it. */
3450+
if (metadata_to_lsn)
3451+
{
3452+
if (metadata_to_lsn <= recv_sys.lsn)
3453+
return false;
3454+
}
/* No target LSN yet: when throttling is enabled and io_ticket
is negative, wait on wait_throttle. */
3455+
else if (xtrabackup_throttle && io_ticket-- < 0)
3456+
mysql_cond_wait(&wait_throttle, &recv_sys.mutex);
3457+
/* Log a message on the first failed attempt only. */
3458+
if (!retry_count++)
3459+
msg("Retrying read of log at LSN=" LSN_PF, recv_sys.lsn);
/* Stop retrying; the function then returns false below. */
3460+
else if (retry_count == 100)
3461+
break;
3462+
else
3463+
{
/* Wait up to 1 ms on log_copying_stop. A zero result (no timeout,
presumably meaning the condition was signalled) is reported to the
caller as failure. */
3464+
timespec abstime;
3465+
set_timespec_nsec(abstime, 1000000ULL /* 1 ms */);
3466+
if (!mysql_cond_timedwait(&log_copying_stop, &recv_sys.mutex,
3467+
&abstime))
3468+
return true;
3469+
}
3470+
}
3471+
}
3472+
/* Report the LSN reached when verbose output is enabled. */
3473+
if (verbose)
3474+
msg(">> log scanned up to (" LSN_PF ")", recv_sys.lsn);
3475+
return false;
3476+
}
3477+
#endif
3478+
33713479
/** Copy redo log until the current end of the log is reached
3372-
@return whether the operation failed */
3480+
@return whether the operation failed */
33733481
static bool xtrabackup_copy_logfile()
33743482
{
33753483
mysql_mutex_assert_owner(&recv_sys.mutex);
33763484
DBUG_EXECUTE_IF("log_checksum_mismatch", return false;);
33773485

33783486
ut_a(dst_log_file);
33793487
ut_ad(recv_sys.is_initialised());
3488+
3489+
#ifdef HAVE_INNODB_MMAP
3490+
if (log_sys.is_mmap())
3491+
return xtrabackup_copy_mmap_logfile();
3492+
#endif
33803493
const size_t sequence_offset{log_sys.is_encrypted() ? 8U + 5U : 5U};
33813494
const size_t block_size_1{log_sys.write_size - 1};
33823495

3383-
ut_ad(!log_sys.is_pmem());
3384-
3385-
{
3386-
recv_sys.offset= size_t(recv_sys.lsn - log_sys.get_first_lsn()) &
3387-
block_size_1;
3388-
recv_sys.len= 0;
3389-
}
3496+
recv_sys.offset= size_t(recv_sys.lsn - log_sys.get_first_lsn()) &
3497+
block_size_1;
3498+
recv_sys.len= 0;
33903499

33913500
for (unsigned retry_count{0};;)
33923501
{
@@ -5376,9 +5485,8 @@ static bool xtrabackup_backup_func()
53765485
goto fail;
53775486
}
53785487

5379-
if (!log_sys.create()) {
5380-
goto fail;
5381-
}
5488+
log_sys.create();
5489+
53825490
/* get current checkpoint_lsn */
53835491
{
53845492
log_sys.latch.wr_lock(SRW_LOCK_CALL);
@@ -6730,9 +6838,7 @@ static bool xtrabackup_prepare_func(char** argv)
67306838
}
67316839

67326840
recv_sys.create();
6733-
if (!log_sys.create()) {
6734-
goto error;
6735-
}
6841+
log_sys.create();
67366842
recv_sys.recovery_on = true;
67376843

67386844
xb_fil_io_init();

include/my_sys.h

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1017,11 +1017,7 @@ extern int my_win_pclose(FILE*);
10171017
#endif
10181018

10191019
/* my_getpagesize */
1020-
#ifdef HAVE_GETPAGESIZE
1021-
#define my_getpagesize() getpagesize()
1022-
#else
10231020
int my_getpagesize(void);
1024-
#endif
10251021

10261022
int my_msync(int, void *, size_t, int);
10271023

mysql-test/suite/innodb/r/log_file_size_online.result

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,12 @@ SHOW VARIABLES LIKE 'innodb_log_file_size';
1919
Variable_name Value
2020
innodb_log_file_size 4194304
2121
FOUND 1 /InnoDB: Resized log to 4\.000MiB/ in mysqld.1.err
22+
SET @save=@@GLOBAL.innodb_log_file_buffering;
23+
SET GLOBAL innodb_log_file_buffering=OFF;
24+
SET GLOBAL innodb_log_file_buffering=ON;
25+
SET GLOBAL innodb_log_file_buffering=@save;
26+
SET GLOBAL innodb_log_file_mmap=OFF;
27+
Got one of the listed errors
2228
SET GLOBAL innodb_log_file_size=5242880;
2329
connect con1,localhost,root;
2430
UPDATE t SET b='' WHERE a<10;

mysql-test/suite/innodb/t/log_file_size_online.test

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,17 @@ SHOW VARIABLES LIKE 'innodb_log_file_size';
2525
let SEARCH_PATTERN = InnoDB: Resized log to 4\\.000MiB;
2626
--source include/search_pattern_in_file.inc
2727

28+
--error 0,ER_UNKNOWN_SYSTEM_VARIABLE
29+
SET @save=@@GLOBAL.innodb_log_file_buffering;
30+
--error 0,ER_UNKNOWN_SYSTEM_VARIABLE
31+
SET GLOBAL innodb_log_file_buffering=OFF;
32+
--error 0,ER_UNKNOWN_SYSTEM_VARIABLE
33+
SET GLOBAL innodb_log_file_buffering=ON;
34+
--error 0,ER_UNKNOWN_SYSTEM_VARIABLE
35+
SET GLOBAL innodb_log_file_buffering=@save;
36+
--error ER_INCORRECT_GLOBAL_LOCAL_VAR,ER_UNKNOWN_SYSTEM_VARIABLE
37+
SET GLOBAL innodb_log_file_mmap=OFF;
38+
2839
send SET GLOBAL innodb_log_file_size=5242880;
2940

3041
--connect con1,localhost,root

mysql-test/suite/sys_vars/r/sysvars_innodb.result

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ variable_name not in (
44
'innodb_numa_interleave', # only available WITH_NUMA
55
'innodb_evict_tables_on_commit_debug', # one may want to override this
66
'innodb_use_native_aio', # default value depends on OS
7+
'innodb_log_file_mmap', # only available on 64-bit
78
'innodb_log_file_buffering', # only available on Linux and Windows
89
'innodb_buffer_pool_load_pages_abort') # debug build only, and is only for testing
910
order by variable_name;

mysql-test/suite/sys_vars/t/sysvars_innodb.test

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ select VARIABLE_NAME, SESSION_VALUE, DEFAULT_VALUE, VARIABLE_SCOPE, VARIABLE_TYP
1111
'innodb_numa_interleave', # only available WITH_NUMA
1212
'innodb_evict_tables_on_commit_debug', # one may want to override this
1313
'innodb_use_native_aio', # default value depends on OS
14+
'innodb_log_file_mmap', # only available on 64-bit
1415
'innodb_log_file_buffering', # only available on Linux and Windows
1516
'innodb_buffer_pool_load_pages_abort') # debug build only, and is only for testing
1617
order by variable_name;

mysys/my_getpagesize.c

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,6 @@
1616

1717
#include "mysys_priv.h"
1818

19-
#ifndef HAVE_GETPAGESIZE
20-
2119
#if defined _WIN32
2220

2321
int my_getpagesize(void)
@@ -27,6 +25,13 @@ int my_getpagesize(void)
2725
return si.dwPageSize;
2826
}
2927

28+
#elif defined _SC_PAGESIZE
29+
30+
/* Variant compiled when _WIN32 is not defined and _SC_PAGESIZE is:
   return the value of sysconf(_SC_PAGESIZE) converted to int.
   NOTE(review): sysconf() returns long and -1 on error; the result is
   not checked here. */
int my_getpagesize(void)
31+
{
32+
return (int)sysconf(_SC_PAGESIZE);
33+
}
34+
3035
#else
3136

3237
/* Default implementation */
@@ -36,6 +41,3 @@ int my_getpagesize(void)
3641
}
3742

3843
#endif
39-
40-
#endif
41-

mysys/my_init.c

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -151,9 +151,7 @@ my_bool my_init(void)
151151
my_umask= 0660; /* Default umask for new files */
152152
my_umask_dir= 0700; /* Default umask for new directories */
153153
my_global_flags= 0;
154-
#ifdef _SC_PAGESIZE
155-
my_system_page_size= sysconf(_SC_PAGESIZE);
156-
#endif
154+
my_system_page_size= my_getpagesize();
157155

158156
/* Default creation of new files */
159157
if ((str= getenv("UMASK")) != 0)

0 commit comments

Comments
 (0)