From 8c5d5bc5de135ed143bfe91c99fd53a8c9b4487c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Lindstr=C3=B6m?= Date: Mon, 3 Feb 2014 10:08:15 +0200 Subject: [PATCH] Fixed merge error on InnoDB page compression level handling. Merged page compression feature to XtraDB storage engine. Added feature where page compression can use lz4 compression method (innodb_use_lz4, default OFF). --- storage/innobase/CMakeLists.txt | 1 + storage/innobase/btr/btr0btr.cc | 4 +- storage/innobase/btr/btr0cur.cc | 4 +- storage/innobase/fil/fil0fil.cc | 2 +- storage/innobase/fil/fil0pagecompress.cc | 170 ++-- storage/innobase/fil/lz4.c | 822 +++++++++++++++++++ storage/innobase/fil/lz4.h | 205 +++++ storage/innobase/handler/ha_innodb.cc | 44 +- storage/innobase/include/fil0fil.h | 1 + storage/innobase/include/fsp0pagecompress.ic | 5 +- storage/innobase/include/page0zip.h | 2 +- storage/innobase/include/srv0srv.h | 7 +- storage/innobase/page/page0cur.cc | 2 +- storage/innobase/page/page0page.cc | 6 +- storage/innobase/page/page0zip.cc | 4 +- storage/innobase/srv/srv0srv.cc | 18 +- storage/xtradb/CMakeLists.txt | 4 + storage/xtradb/buf/buf0buf.cc | 23 + storage/xtradb/buf/buf0dblwr.cc | 26 +- storage/xtradb/buf/buf0flu.cc | 349 +++++++- storage/xtradb/buf/buf0rea.cc | 5 +- storage/xtradb/dict/dict0dict.cc | 1 + storage/xtradb/fil/fil0fil.cc | 152 +++- storage/xtradb/fil/fil0pagecompress.cc | 324 ++++++++ storage/xtradb/fil/lz4.c | 822 +++++++++++++++++++ storage/xtradb/fil/lz4.h | 205 +++++ storage/xtradb/handler/ha_innodb.cc | 246 +++++- storage/xtradb/handler/ha_innodb.h | 18 + storage/xtradb/handler/handler0alter.cc | 28 + storage/xtradb/include/buf0buf.h | 21 + storage/xtradb/include/buf0flu.h | 7 + storage/xtradb/include/dict0dict.h | 12 +- storage/xtradb/include/dict0dict.ic | 164 +++- storage/xtradb/include/dict0mem.h | 56 +- storage/xtradb/include/dict0pagecompress.h | 94 +++ storage/xtradb/include/dict0pagecompress.ic | 191 +++++ storage/xtradb/include/dict0types.h | 9 + 
storage/xtradb/include/fil0fil.h | 43 +- storage/xtradb/include/fil0pagecompress.h | 118 +++ storage/xtradb/include/fsp0fsp.h | 68 +- storage/xtradb/include/fsp0fsp.ic | 19 + storage/xtradb/include/fsp0pagecompress.h | 73 ++ storage/xtradb/include/fsp0pagecompress.ic | 177 ++++ storage/xtradb/include/os0file.h | 69 +- storage/xtradb/include/os0file.ic | 26 +- storage/xtradb/include/srv0mon.h | 11 + storage/xtradb/include/srv0srv.h | 62 +- storage/xtradb/log/log0log.cc | 20 +- storage/xtradb/log/log0online.cc | 6 +- storage/xtradb/log/log0recv.cc | 19 +- storage/xtradb/os/os0file.cc | 553 +++++++++++-- storage/xtradb/srv/srv0mon.cc | 68 ++ storage/xtradb/srv/srv0srv.cc | 43 +- storage/xtradb/srv/srv0start.cc | 730 +++++++++++++++- 54 files changed, 5839 insertions(+), 320 deletions(-) create mode 100644 storage/innobase/fil/lz4.c create mode 100644 storage/innobase/fil/lz4.h create mode 100644 storage/xtradb/fil/fil0pagecompress.cc create mode 100644 storage/xtradb/fil/lz4.c create mode 100644 storage/xtradb/fil/lz4.h create mode 100644 storage/xtradb/include/dict0pagecompress.h create mode 100644 storage/xtradb/include/dict0pagecompress.ic create mode 100644 storage/xtradb/include/fil0pagecompress.h create mode 100644 storage/xtradb/include/fsp0pagecompress.h create mode 100644 storage/xtradb/include/fsp0pagecompress.ic diff --git a/storage/innobase/CMakeLists.txt b/storage/innobase/CMakeLists.txt index e41d2406bd2b4..0b1043bc42159 100644 --- a/storage/innobase/CMakeLists.txt +++ b/storage/innobase/CMakeLists.txt @@ -294,6 +294,7 @@ SET(INNOBASE_SOURCES eval/eval0proc.cc fil/fil0fil.cc fil/fil0pagecompress.cc + fil/lz4.c fsp/fsp0fsp.cc fut/fut0fut.cc fut/fut0lst.cc diff --git a/storage/innobase/btr/btr0btr.cc b/storage/innobase/btr/btr0btr.cc index e3e127c3acebe..3d7dc9931468c 100644 --- a/storage/innobase/btr/btr0btr.cc +++ b/storage/innobase/btr/btr0btr.cc @@ -1923,7 +1923,7 @@ btr_page_reorganize( dict_index_t* index, /*!< in: record descriptor */ mtr_t* mtr) 
/*!< in: mtr */ { - return(btr_page_reorganize_low(FALSE, page_compression_level, + return(btr_page_reorganize_low(FALSE, page_zip_level, block, index, mtr)); } #endif /* !UNIV_HOTBACKUP */ @@ -1942,7 +1942,7 @@ btr_parse_page_reorganize( buf_block_t* block, /*!< in: page to be reorganized, or NULL */ mtr_t* mtr) /*!< in: mtr or NULL */ { - ulint level = page_compression_level; + ulint level = page_zip_level; ut_ad(ptr && end_ptr); diff --git a/storage/innobase/btr/btr0cur.cc b/storage/innobase/btr/btr0cur.cc index ecc1718877063..5feb136386747 100644 --- a/storage/innobase/btr/btr0cur.cc +++ b/storage/innobase/btr/btr0cur.cc @@ -1844,7 +1844,7 @@ btr_cur_update_alloc_zip( /* Have a local copy of the variables as these can change dynamically. */ bool log_compressed = page_log_compressed_pages; - ulint compression_level = page_compression_level; + ulint compression_level = page_zip_level; page_t* page = buf_block_get_frame(block); ut_a(page_zip == buf_block_get_page_zip(block)); @@ -4334,7 +4334,7 @@ btr_store_big_rec_extern_fields( heap = mem_heap_create(250000); page_zip_set_alloc(&c_stream, heap); - err = deflateInit2(&c_stream, page_compression_level, + err = deflateInit2(&c_stream, page_zip_level, Z_DEFLATED, 15, 7, Z_DEFAULT_STRATEGY); ut_a(err == Z_OK); } diff --git a/storage/innobase/fil/fil0fil.cc b/storage/innobase/fil/fil0fil.cc index 1718e68d6676c..3803d0a93aa82 100644 --- a/storage/innobase/fil/fil0fil.cc +++ b/storage/innobase/fil/fil0fil.cc @@ -5303,7 +5303,7 @@ fil_io( os_offset_t offset; ibool ignore_nonexistent_pages; ibool page_compressed = FALSE; - ibool page_compression_level = 0; + ulint page_compression_level = 0; is_log = type & OS_FILE_LOG; type = type & ~OS_FILE_LOG; diff --git a/storage/innobase/fil/fil0pagecompress.cc b/storage/innobase/fil/fil0pagecompress.cc index 2da9d70e1979b..10ac273955f02 100644 --- a/storage/innobase/fil/fil0pagecompress.cc +++ b/storage/innobase/fil/fil0pagecompress.cc @@ -1,6 +1,6 @@ 
/***************************************************************************** -Copyright (C) 2013 SkySQL Ab. All Rights Reserved. +Copyright (C) 2013, 2014, SkySQL Ab. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -63,6 +63,7 @@ static ulint srv_data_read, srv_data_written; #include #endif #include "row0mysql.h" +#include "lz4.h" /****************************************************************//** For page compressed pages compress the page before actual write @@ -100,7 +101,7 @@ fil_compress_page( /* If no compression level was provided to this table, use system default level */ if (level == 0) { - level = srv_compress_zlib_level; + level = page_zip_level; } #ifdef UNIV_DEBUG @@ -110,60 +111,88 @@ fil_compress_page( #endif write_size = UNIV_PAGE_SIZE - header_len; - err = compress2(out_buf+header_len, &write_size, buf, len, level); - if (err != Z_OK) { - /* If error we leave the actual page as it was */ + if (srv_use_lz4) { + err = LZ4_compress_limitedOutput((const char *)buf, (char *)out_buf+header_len, len, write_size); + write_size = err; - fprintf(stderr, - "InnoDB: Warning: Compression failed for space %lu name %s len %lu rt %d write %lu\n", - space_id, fil_space_name(space), len, err, write_size); + if (err == 0) { + /* If error we leave the actual page as it was */ + + fprintf(stderr, + "InnoDB: Warning: Compression failed for space %lu name %s len %lu rt %d write %lu\n", + space_id, fil_space_name(space), len, err, write_size); + + *out_len = len; + return (buf); + } + } else { + err = compress2(out_buf+header_len, &write_size, buf, len, level); + + if (err != Z_OK) { + /* If error we leave the actual page as it was */ + + fprintf(stderr, + "InnoDB: Warning: Compression failed for space %lu name %s len %lu rt %d write %lu\n", + space_id, fil_space_name(space), len, err, write_size); + + *out_len = len; + return (buf); + } 
+ } - *out_len = len; - return (buf); + /* Set up the page header */ + memcpy(out_buf, buf, FIL_PAGE_DATA); + /* Set up the checksum */ + mach_write_to_4(out_buf+FIL_PAGE_SPACE_OR_CHKSUM, BUF_NO_CHECKSUM_MAGIC); + /* Set up the correct page type */ + mach_write_to_2(out_buf+FIL_PAGE_TYPE, FIL_PAGE_PAGE_COMPRESSED); + /* Set up the flush lsn to be compression algorithm */ + if (srv_use_lz4) { + mach_write_to_8(out_buf+FIL_PAGE_FILE_FLUSH_LSN, FIL_PAGE_COMPRESSION_LZ4); } else { - /* Set up the page header */ - memcpy(out_buf, buf, FIL_PAGE_DATA); - /* Set up the checksum */ - mach_write_to_4(out_buf+FIL_PAGE_SPACE_OR_CHKSUM, BUF_NO_CHECKSUM_MAGIC); - /* Set up the correct page type */ - mach_write_to_2(out_buf+FIL_PAGE_TYPE, FIL_PAGE_PAGE_COMPRESSED); - /* Set up the flush lsn to be compression algorithm */ mach_write_to_8(out_buf+FIL_PAGE_FILE_FLUSH_LSN, FIL_PAGE_COMPRESSION_ZLIB); - /* Set up the actual payload lenght */ - mach_write_to_2(out_buf+FIL_PAGE_DATA, write_size); + } + /* Set up the actual payload lenght */ + mach_write_to_2(out_buf+FIL_PAGE_DATA, write_size); #ifdef UNIV_DEBUG - /* Verify */ - ut_ad(fil_page_is_compressed(out_buf)); - ut_ad(mach_read_from_4(out_buf+FIL_PAGE_SPACE_OR_CHKSUM) == BUF_NO_CHECKSUM_MAGIC); - ut_ad(mach_read_from_2(out_buf+FIL_PAGE_DATA) == write_size); + /* Verify */ + ut_ad(fil_page_is_compressed(out_buf)); + ut_ad(mach_read_from_4(out_buf+FIL_PAGE_SPACE_OR_CHKSUM) == BUF_NO_CHECKSUM_MAGIC); + ut_ad(mach_read_from_2(out_buf+FIL_PAGE_DATA) == write_size); + if (srv_use_lz4) { + ut_ad(mach_read_from_8(out_buf+FIL_PAGE_FILE_FLUSH_LSN) == FIL_PAGE_COMPRESSION_LZ4); + } else { ut_ad(mach_read_from_8(out_buf+FIL_PAGE_FILE_FLUSH_LSN) == FIL_PAGE_COMPRESSION_ZLIB); + } #endif - write_size+=header_len; - /* Actual write needs to be alligned on block size */ - if (write_size % OS_FILE_LOG_BLOCK_SIZE) { - write_size = (write_size + (OS_FILE_LOG_BLOCK_SIZE - (write_size % OS_FILE_LOG_BLOCK_SIZE))); - } + write_size+=header_len; + /* 
Actual write needs to be alligned on block size */ + if (write_size % OS_FILE_LOG_BLOCK_SIZE) { + write_size = (write_size + (OS_FILE_LOG_BLOCK_SIZE - (write_size % OS_FILE_LOG_BLOCK_SIZE))); + } #ifdef UNIV_DEBUG - fprintf(stderr, - "InnoDB: Note: Compression succeeded for space %lu name %s len %lu out_len %lu\n", - space_id, fil_space_name(space), len, write_size); + fprintf(stderr, + "InnoDB: Note: Compression succeeded for space %lu name %s len %lu out_len %lu\n", + space_id, fil_space_name(space), len, write_size); #endif + #define SECT_SIZE 512 - srv_stats.page_compression_saved.add((len - write_size)); - if ((len - write_size) > 0) { - srv_stats.page_compression_trim_sect512.add(((len - write_size) / SECT_SIZE)); - srv_stats.page_compression_trim_sect4096.add(((len - write_size) / (SECT_SIZE*8))); - } - //srv_stats.page_compressed_trim_op.inc(); - srv_stats.pages_page_compressed.inc(); - *out_len = write_size; - return(out_buf); + srv_stats.page_compression_saved.add((len - write_size)); + if ((len - write_size) > 0) { + srv_stats.page_compression_trim_sect512.add(((len - write_size) / SECT_SIZE)); + srv_stats.page_compression_trim_sect4096.add(((len - write_size) / (SECT_SIZE*8))); } + //srv_stats.page_compressed_trim_op.inc(); + srv_stats.pages_page_compressed.inc(); + *out_len = write_size; + + return(out_buf); + } /****************************************************************//** @@ -203,16 +232,30 @@ fil_decompress_page( /* Get compression algorithm */ compression_alg = mach_read_from_8(buf+FIL_PAGE_FILE_FLUSH_LSN); - if (compression_alg == FIL_PAGE_COMPRESSION_ZLIB) { - // If no buffer was given, we need to allocate temporal buffer - if (page_buf == NULL) { - in_buf = static_cast(ut_malloc(UNIV_PAGE_SIZE)); - } else { - in_buf = page_buf; - } + // If no buffer was given, we need to allocate temporal buffer + if (page_buf == NULL) { +#ifdef UNIV_DEBUG + fprintf(stderr, + "InnoDB: Note: Compression buffer not given, allocating...\n"); +#endif + in_buf 
= static_cast(ut_malloc(UNIV_PAGE_SIZE)); + } else { + in_buf = page_buf; + } + + /* Get the actual size of compressed page */ + actual_size = mach_read_from_2(buf+FIL_PAGE_DATA); + /* Check if payload size is corrupted */ + if (actual_size == 0 || actual_size > UNIV_PAGE_SIZE) { + fprintf(stderr, + "InnoDB: Corruption: We try to uncompress corrupted page\n" + "InnoDB: actual size %lu compression %s\n", + actual_size, fil_get_compression_alg_name(compression_alg)); + fflush(stderr); + ut_error; + } - /* Get the actual size of compressed page */ - actual_size = mach_read_from_2(buf+FIL_PAGE_DATA); + if (compression_alg == FIL_PAGE_COMPRESSION_ZLIB) { #ifdef UNIV_DEBUG fprintf(stderr, @@ -242,17 +285,19 @@ fil_decompress_page( "InnoDB: Note: Decompression succeeded for len %lu \n", len); #endif + } else if (compression_alg == FIL_PAGE_COMPRESSION_LZ4) { + err = LZ4_decompress_fast((const char *)buf+FIL_PAGE_DATA+FIL_PAGE_COMPRESSED_SIZE, (char *)in_buf, UNIV_PAGE_SIZE); - /* Copy the uncompressed page to the buffer pool, not - really any other options. */ - memcpy(buf, in_buf, len); + if (err != actual_size) { + fprintf(stderr, + "InnoDB: Corruption: Page is marked as compressed\n" + "InnoDB: but decompression read only %d bytes.\n" + "InnoDB: size %lu len %lu\n", + err, actual_size, len); + fflush(stderr); - // Need to free temporal buffer if no buffer was given - if (page_buf == NULL) { - ut_free(in_buf); + ut_error; } - - srv_stats.pages_page_decompressed.inc(); } else { fprintf(stderr, "InnoDB: Corruption: Page is marked as compressed\n" @@ -263,6 +308,17 @@ fil_decompress_page( fflush(stderr); ut_error; } + + srv_stats.pages_page_decompressed.inc(); + + /* Copy the uncompressed page to the buffer pool, not + really any other options. 
*/ + memcpy(buf, in_buf, len); + + // Need to free temporal buffer if no buffer was given + if (page_buf == NULL) { + ut_free(in_buf); + } } diff --git a/storage/innobase/fil/lz4.c b/storage/innobase/fil/lz4.c new file mode 100644 index 0000000000000..4e864de67d32d --- /dev/null +++ b/storage/innobase/fil/lz4.c @@ -0,0 +1,822 @@ +/* + LZ4 - Fast LZ compression algorithm + Copyright (C) 2011-2013, Yann Collet. + BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following disclaimer + in the documentation and/or other materials provided with the + distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ + You can contact the author at : + - LZ4 source repository : http://code.google.com/p/lz4/ + - LZ4 public forum : https://groups.google.com/forum/#!forum/lz4c +*/ + +//************************************** +// Tuning parameters +//************************************** +// MEMORY_USAGE : +// Memory usage formula : N->2^N Bytes (examples : 10 -> 1KB; 12 -> 4KB ; 16 -> 64KB; 20 -> 1MB; etc.) +// Increasing memory usage improves compression ratio +// Reduced memory usage can improve speed, due to cache effect +// Default value is 14, for 16KB, which nicely fits into Intel x86 L1 cache +#define MEMORY_USAGE 14 + +// HEAPMODE : +// Select how default compression functions will allocate memory for their hash table, +// in memory stack (0:default, fastest), or in memory heap (1:requires memory allocation (malloc)). +#define HEAPMODE 0 + + +//************************************** +// CPU Feature Detection +//************************************** +// 32 or 64 bits ? +#if (defined(__x86_64__) || defined(_M_X64) || defined(_WIN64) \ + || defined(__powerpc64__) || defined(__ppc64__) || defined(__PPC64__) \ + || defined(__64BIT__) || defined(_LP64) || defined(__LP64__) \ + || defined(__ia64) || defined(__itanium__) || defined(_M_IA64) ) // Detects 64 bits mode +# define LZ4_ARCH64 1 +#else +# define LZ4_ARCH64 0 +#endif + +// Little Endian or Big Endian ? 
+// Overwrite the #define below if you know your architecture endianess +#if defined (__GLIBC__) +# include +# if (__BYTE_ORDER == __BIG_ENDIAN) +# define LZ4_BIG_ENDIAN 1 +# endif +#elif (defined(__BIG_ENDIAN__) || defined(__BIG_ENDIAN) || defined(_BIG_ENDIAN)) && !(defined(__LITTLE_ENDIAN__) || defined(__LITTLE_ENDIAN) || defined(_LITTLE_ENDIAN)) +# define LZ4_BIG_ENDIAN 1 +#elif defined(__sparc) || defined(__sparc__) \ + || defined(__powerpc__) || defined(__ppc__) || defined(__PPC__) \ + || defined(__hpux) || defined(__hppa) \ + || defined(_MIPSEB) || defined(__s390__) +# define LZ4_BIG_ENDIAN 1 +#else +// Little Endian assumed. PDP Endian and other very rare endian format are unsupported. +#endif + +// Unaligned memory access is automatically enabled for "common" CPU, such as x86. +// For others CPU, such as ARM, the compiler may be more cautious, inserting unnecessary extra code to ensure aligned access property +// If you know your target CPU supports unaligned memory access, you want to force this option manually to improve performance +#if defined(__ARM_FEATURE_UNALIGNED) +# define LZ4_FORCE_UNALIGNED_ACCESS 1 +#endif + +// Define this parameter if your target system or compiler does not support hardware bit count +#if defined(_MSC_VER) && defined(_WIN32_WCE) // Visual Studio for Windows CE does not support Hardware bit count +# define LZ4_FORCE_SW_BITCOUNT +#endif + +// BIG_ENDIAN_NATIVE_BUT_INCOMPATIBLE : +// This option may provide a small boost to performance for some big endian cpu, although probably modest. +// You may set this option to 1 if data will remain within closed environment. 
+// This option is useless on Little_Endian CPU (such as x86) +//#define BIG_ENDIAN_NATIVE_BUT_INCOMPATIBLE 1 + + +//************************************** +// Compiler Options +//************************************** +#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) // C99 +/* "restrict" is a known keyword */ +#else +# define restrict // Disable restrict +#endif + +#ifdef _MSC_VER // Visual Studio +# define FORCE_INLINE static __forceinline +# include // For Visual 2005 +# if LZ4_ARCH64 // 64-bits +# pragma intrinsic(_BitScanForward64) // For Visual 2005 +# pragma intrinsic(_BitScanReverse64) // For Visual 2005 +# else // 32-bits +# pragma intrinsic(_BitScanForward) // For Visual 2005 +# pragma intrinsic(_BitScanReverse) // For Visual 2005 +# endif +# pragma warning(disable : 4127) // disable: C4127: conditional expression is constant +#else +# ifdef __GNUC__ +# define FORCE_INLINE static inline __attribute__((always_inline)) +# else +# define FORCE_INLINE static inline +# endif +#endif + +#ifdef _MSC_VER +# define lz4_bswap16(x) _byteswap_ushort(x) +#else +# define lz4_bswap16(x) ((unsigned short int) ((((x) >> 8) & 0xffu) | (((x) & 0xffu) << 8))) +#endif + +#define GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__) + +#if (GCC_VERSION >= 302) || (__INTEL_COMPILER >= 800) || defined(__clang__) +# define expect(expr,value) (__builtin_expect ((expr),(value)) ) +#else +# define expect(expr,value) (expr) +#endif + +#define likely(expr) expect((expr) != 0, 1) +#define unlikely(expr) expect((expr) != 0, 0) + + +//************************************** +// Memory routines +//************************************** +#include // malloc, calloc, free +#define ALLOCATOR(n,s) calloc(n,s) +#define FREEMEM free +#include // memset, memcpy +#define MEM_INIT memset + + +//************************************** +// Includes +//************************************** +#include "lz4.h" + + +//************************************** +// Basic Types 
+//************************************** +#if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L // C99 +# include + typedef uint8_t BYTE; + typedef uint16_t U16; + typedef uint32_t U32; + typedef int32_t S32; + typedef uint64_t U64; +#else + typedef unsigned char BYTE; + typedef unsigned short U16; + typedef unsigned int U32; + typedef signed int S32; + typedef unsigned long long U64; +#endif + +#if defined(__GNUC__) && !defined(LZ4_FORCE_UNALIGNED_ACCESS) +# define _PACKED __attribute__ ((packed)) +#else +# define _PACKED +#endif + +#if !defined(LZ4_FORCE_UNALIGNED_ACCESS) && !defined(__GNUC__) +# if defined(__IBMC__) || defined(__SUNPRO_C) || defined(__SUNPRO_CC) +# pragma pack(1) +# else +# pragma pack(push, 1) +# endif +#endif + +typedef struct { U16 v; } _PACKED U16_S; +typedef struct { U32 v; } _PACKED U32_S; +typedef struct { U64 v; } _PACKED U64_S; +typedef struct {size_t v;} _PACKED size_t_S; + +#if !defined(LZ4_FORCE_UNALIGNED_ACCESS) && !defined(__GNUC__) +# if defined(__SUNPRO_C) || defined(__SUNPRO_CC) +# pragma pack(0) +# else +# pragma pack(pop) +# endif +#endif + +#define A16(x) (((U16_S *)(x))->v) +#define A32(x) (((U32_S *)(x))->v) +#define A64(x) (((U64_S *)(x))->v) +#define AARCH(x) (((size_t_S *)(x))->v) + + +//************************************** +// Constants +//************************************** +#define LZ4_HASHLOG (MEMORY_USAGE-2) +#define HASHTABLESIZE (1 << MEMORY_USAGE) +#define HASHNBCELLS4 (1 << LZ4_HASHLOG) + +#define MINMATCH 4 + +#define COPYLENGTH 8 +#define LASTLITERALS 5 +#define MFLIMIT (COPYLENGTH+MINMATCH) +const int LZ4_minLength = (MFLIMIT+1); + +#define LZ4_64KLIMIT ((1<<16) + (MFLIMIT-1)) +#define SKIPSTRENGTH 6 // Increasing this value will make the compression run slower on incompressible data + +#define MAXD_LOG 16 +#define MAX_DISTANCE ((1 << MAXD_LOG) - 1) + +#define ML_BITS 4 +#define ML_MASK ((1U<=e; + + +//**************************** +// Private functions +//**************************** +#if 
LZ4_ARCH64 + +FORCE_INLINE int LZ4_NbCommonBytes (register U64 val) +{ +# if defined(LZ4_BIG_ENDIAN) +# if defined(_MSC_VER) && !defined(LZ4_FORCE_SW_BITCOUNT) + unsigned long r = 0; + _BitScanReverse64( &r, val ); + return (int)(r>>3); +# elif defined(__GNUC__) && (GCC_VERSION >= 304) && !defined(LZ4_FORCE_SW_BITCOUNT) + return (__builtin_clzll(val) >> 3); +# else + int r; + if (!(val>>32)) { r=4; } else { r=0; val>>=32; } + if (!(val>>16)) { r+=2; val>>=8; } else { val>>=24; } + r += (!val); + return r; +# endif +# else +# if defined(_MSC_VER) && !defined(LZ4_FORCE_SW_BITCOUNT) + unsigned long r = 0; + _BitScanForward64( &r, val ); + return (int)(r>>3); +# elif defined(__GNUC__) && (GCC_VERSION >= 304) && !defined(LZ4_FORCE_SW_BITCOUNT) + return (__builtin_ctzll(val) >> 3); +# else + static const int DeBruijnBytePos[64] = { 0, 0, 0, 0, 0, 1, 1, 2, 0, 3, 1, 3, 1, 4, 2, 7, 0, 2, 3, 6, 1, 5, 3, 5, 1, 3, 4, 4, 2, 5, 6, 7, 7, 0, 1, 2, 3, 3, 4, 6, 2, 6, 5, 5, 3, 4, 5, 6, 7, 1, 2, 4, 6, 4, 4, 5, 7, 2, 6, 5, 7, 6, 7, 7 }; + return DeBruijnBytePos[((U64)((val & -(long long)val) * 0x0218A392CDABBD3FULL)) >> 58]; +# endif +# endif +} + +#else + +FORCE_INLINE int LZ4_NbCommonBytes (register U32 val) +{ +# if defined(LZ4_BIG_ENDIAN) +# if defined(_MSC_VER) && !defined(LZ4_FORCE_SW_BITCOUNT) + unsigned long r = 0; + _BitScanReverse( &r, val ); + return (int)(r>>3); +# elif defined(__GNUC__) && (GCC_VERSION >= 304) && !defined(LZ4_FORCE_SW_BITCOUNT) + return (__builtin_clz(val) >> 3); +# else + int r; + if (!(val>>16)) { r=2; val>>=8; } else { r=0; val>>=24; } + r += (!val); + return r; +# endif +# else +# if defined(_MSC_VER) && !defined(LZ4_FORCE_SW_BITCOUNT) + unsigned long r; + _BitScanForward( &r, val ); + return (int)(r>>3); +# elif defined(__GNUC__) && (GCC_VERSION >= 304) && !defined(LZ4_FORCE_SW_BITCOUNT) + return (__builtin_ctz(val) >> 3); +# else + static const int DeBruijnBytePos[32] = { 0, 0, 3, 0, 3, 1, 3, 0, 3, 2, 2, 1, 3, 2, 0, 1, 3, 3, 1, 2, 2, 2, 2, 0, 3, 1, 
2, 0, 1, 0, 1, 1 }; + return DeBruijnBytePos[((U32)((val & -(S32)val) * 0x077CB531U)) >> 27]; +# endif +# endif +} + +#endif + + +//**************************** +// Compression functions +//**************************** +FORCE_INLINE int LZ4_hashSequence(U32 sequence, tableType_t tableType) +{ + if (tableType == byU16) + return (((sequence) * 2654435761U) >> ((MINMATCH*8)-(LZ4_HASHLOG+1))); + else + return (((sequence) * 2654435761U) >> ((MINMATCH*8)-LZ4_HASHLOG)); +} + +FORCE_INLINE int LZ4_hashPosition(const BYTE* p, tableType_t tableType) { return LZ4_hashSequence(A32(p), tableType); } + +FORCE_INLINE void LZ4_putPositionOnHash(const BYTE* p, U32 h, void* tableBase, tableType_t tableType, const BYTE* srcBase) +{ + switch (tableType) + { + case byPtr: { const BYTE** hashTable = (const BYTE**) tableBase; hashTable[h] = p; break; } + case byU32: { U32* hashTable = (U32*) tableBase; hashTable[h] = (U32)(p-srcBase); break; } + case byU16: { U16* hashTable = (U16*) tableBase; hashTable[h] = (U16)(p-srcBase); break; } + } +} + +FORCE_INLINE void LZ4_putPosition(const BYTE* p, void* tableBase, tableType_t tableType, const BYTE* srcBase) +{ + U32 h = LZ4_hashPosition(p, tableType); + LZ4_putPositionOnHash(p, h, tableBase, tableType, srcBase); +} + +FORCE_INLINE const BYTE* LZ4_getPositionOnHash(U32 h, void* tableBase, tableType_t tableType, const BYTE* srcBase) +{ + if (tableType == byPtr) { const BYTE** hashTable = (const BYTE**) tableBase; return hashTable[h]; } + if (tableType == byU32) { U32* hashTable = (U32*) tableBase; return hashTable[h] + srcBase; } + { U16* hashTable = (U16*) tableBase; return hashTable[h] + srcBase; } // default, to ensure a return +} + +FORCE_INLINE const BYTE* LZ4_getPosition(const BYTE* p, void* tableBase, tableType_t tableType, const BYTE* srcBase) +{ + U32 h = LZ4_hashPosition(p, tableType); + return LZ4_getPositionOnHash(h, tableBase, tableType, srcBase); +} + + +FORCE_INLINE int LZ4_compress_generic( + void* ctx, + const char* source, + 
char* dest, + int inputSize, + int maxOutputSize, + + limitedOutput_directive limitedOutput, + tableType_t tableType, + prefix64k_directive prefix) +{ + const BYTE* ip = (const BYTE*) source; + const BYTE* const base = (prefix==withPrefix) ? ((LZ4_Data_Structure*)ctx)->base : (const BYTE*) source; + const BYTE* const lowLimit = ((prefix==withPrefix) ? ((LZ4_Data_Structure*)ctx)->bufferStart : (const BYTE*)source); + const BYTE* anchor = (const BYTE*) source; + const BYTE* const iend = ip + inputSize; + const BYTE* const mflimit = iend - MFLIMIT; + const BYTE* const matchlimit = iend - LASTLITERALS; + + BYTE* op = (BYTE*) dest; + BYTE* const oend = op + maxOutputSize; + + int length; + const int skipStrength = SKIPSTRENGTH; + U32 forwardH; + + // Init conditions + if ((U32)inputSize > (U32)LZ4_MAX_INPUT_SIZE) return 0; // Unsupported input size, too large (or negative) + if ((prefix==withPrefix) && (ip != ((LZ4_Data_Structure*)ctx)->nextBlock)) return 0; // must continue from end of previous block + if (prefix==withPrefix) ((LZ4_Data_Structure*)ctx)->nextBlock=iend; // do it now, due to potential early exit + if ((tableType == byU16) && (inputSize>=LZ4_64KLIMIT)) return 0; // Size too large (not within 64K limit) + if (inputSize> skipStrength; + ip = forwardIp; + forwardIp = ip + step; + + if unlikely(forwardIp > mflimit) { goto _last_literals; } + + forwardH = LZ4_hashPosition(forwardIp, tableType); + ref = LZ4_getPositionOnHash(h, ctx, tableType, base); + LZ4_putPositionOnHash(ip, h, ctx, tableType, base); + + } while ((ref + MAX_DISTANCE < ip) || (A32(ref) != A32(ip))); + + // Catch up + while ((ip>anchor) && (ref > lowLimit) && unlikely(ip[-1]==ref[-1])) { ip--; ref--; } + + // Encode Literal length + length = (int)(ip - anchor); + token = op++; + if ((limitedOutput) && unlikely(op + length + (2 + 1 + LASTLITERALS) + (length/255) > oend)) return 0; // Check output limit + if (length>=(int)RUN_MASK) + { + int len = length-RUN_MASK; + *token=(RUN_MASK<= 255 ; 
len-=255) *op++ = 255; + *op++ = (BYTE)len; + } + else *token = (BYTE)(length<>8) > oend)) return 0; // Check output limit + if (length>=(int)ML_MASK) + { + *token += ML_MASK; + length -= ML_MASK; + for (; length > 509 ; length-=510) { *op++ = 255; *op++ = 255; } + if (length >= 255) { length-=255; *op++ = 255; } + *op++ = (BYTE)length; + } + else *token += (BYTE)(length); + + // Test end of chunk + if (ip > mflimit) { anchor = ip; break; } + + // Fill table + LZ4_putPosition(ip-2, ctx, tableType, base); + + // Test next position + ref = LZ4_getPosition(ip, ctx, tableType, base); + LZ4_putPosition(ip, ctx, tableType, base); + if ((ref + MAX_DISTANCE >= ip) && (A32(ref) == A32(ip))) { token = op++; *token=0; goto _next_match; } + + // Prepare next loop + anchor = ip++; + forwardH = LZ4_hashPosition(ip, tableType); + } + +_last_literals: + // Encode Last Literals + { + int lastRun = (int)(iend - anchor); + if ((limitedOutput) && (((char*)op - dest) + lastRun + 1 + ((lastRun+255-RUN_MASK)/255) > (U32)maxOutputSize)) return 0; // Check output limit + if (lastRun>=(int)RUN_MASK) { *op++=(RUN_MASK<= 255 ; lastRun-=255) *op++ = 255; *op++ = (BYTE) lastRun; } + else *op++ = (BYTE)(lastRun<hashTable, 0, sizeof(lz4ds->hashTable)); + lz4ds->bufferStart = base; + lz4ds->base = base; + lz4ds->nextBlock = base; +} + + +void* LZ4_create (const char* inputBuffer) +{ + void* lz4ds = ALLOCATOR(1, sizeof(LZ4_Data_Structure)); + LZ4_init ((LZ4_Data_Structure*)lz4ds, (const BYTE*)inputBuffer); + return lz4ds; +} + + +int LZ4_free (void* LZ4_Data) +{ + FREEMEM(LZ4_Data); + return (0); +} + + +char* LZ4_slideInputBuffer (void* LZ4_Data) +{ + LZ4_Data_Structure* lz4ds = (LZ4_Data_Structure*)LZ4_Data; + size_t delta = lz4ds->nextBlock - (lz4ds->bufferStart + 64 KB); + + if ( (lz4ds->base - delta > lz4ds->base) // underflow control + || ((size_t)(lz4ds->nextBlock - lz4ds->base) > 0xE0000000) ) // close to 32-bits limit + { + size_t deltaLimit = (lz4ds->nextBlock - 64 KB) - lz4ds->base; + 
int nH; + + for (nH=0; nH < HASHNBCELLS4; nH++) + { + if ((size_t)(lz4ds->hashTable[nH]) < deltaLimit) lz4ds->hashTable[nH] = 0; + else lz4ds->hashTable[nH] -= (U32)deltaLimit; + } + memcpy((void*)(lz4ds->bufferStart), (const void*)(lz4ds->nextBlock - 64 KB), 64 KB); + lz4ds->base = lz4ds->bufferStart; + lz4ds->nextBlock = lz4ds->base + 64 KB; + } + else + { + memcpy((void*)(lz4ds->bufferStart), (const void*)(lz4ds->nextBlock - 64 KB), 64 KB); + lz4ds->nextBlock -= delta; + lz4ds->base -= delta; + } + + return (char*)(lz4ds->nextBlock); +} + + +//**************************** +// Decompression functions +//**************************** + +// This generic decompression function cover all use cases. +// It shall be instanciated several times, using different sets of directives +// Note that it is essential this generic function is really inlined, +// in order to remove useless branches during compilation optimisation. +FORCE_INLINE int LZ4_decompress_generic( + const char* source, + char* dest, + int inputSize, // + int outputSize, // If endOnInput==endOnInputSize, this value is the max size of Output Buffer. 
+ + int endOnInput, // endOnOutputSize, endOnInputSize + int prefix64k, // noPrefix, withPrefix + int partialDecoding, // full, partial + int targetOutputSize // only used if partialDecoding==partial + ) +{ + // Local Variables + const BYTE* restrict ip = (const BYTE*) source; + const BYTE* ref; + const BYTE* const iend = ip + inputSize; + + BYTE* op = (BYTE*) dest; + BYTE* const oend = op + outputSize; + BYTE* cpy; + BYTE* oexit = op + targetOutputSize; + + const size_t dec32table[] = {0, 3, 2, 3, 0, 0, 0, 0}; // static reduces speed for LZ4_decompress_safe() on GCC64 + static const size_t dec64table[] = {0, 0, 0, (size_t)-1, 0, 1, 2, 3}; + + + // Special cases + if ((partialDecoding) && (oexit> oend-MFLIMIT)) oexit = oend-MFLIMIT; // targetOutputSize too high => decode everything + if ((endOnInput) && unlikely(outputSize==0)) return ((inputSize==1) && (*ip==0)) ? 0 : -1; // Empty output buffer + if ((!endOnInput) && unlikely(outputSize==0)) return (*ip==0?1:-1); + + + // Main Loop + while (1) + { + unsigned token; + size_t length; + + // get runlength + token = *ip++; + if ((length=(token>>ML_BITS)) == RUN_MASK) + { + unsigned s=255; + while (((endOnInput)?ip(partialDecoding?oexit:oend-MFLIMIT)) || (ip+length>iend-(2+1+LASTLITERALS))) ) + || ((!endOnInput) && (cpy>oend-COPYLENGTH))) + { + if (partialDecoding) + { + if (cpy > oend) goto _output_error; // Error : write attempt beyond end of output buffer + if ((endOnInput) && (ip+length > iend)) goto _output_error; // Error : read attempt beyond end of input buffer + } + else + { + if ((!endOnInput) && (cpy != oend)) goto _output_error; // Error : block decoding must stop exactly there + if ((endOnInput) && ((ip+length != iend) || (cpy > oend))) goto _output_error; // Error : input must be consumed + } + memcpy(op, ip, length); + ip += length; + op += length; + break; // Necessarily EOF, due to parsing restrictions + } + LZ4_WILDCOPY(op, ip, cpy); ip -= (op-cpy); op = cpy; + + // get offset + 
LZ4_READ_LITTLEENDIAN_16(ref,cpy,ip); ip+=2; + if ((prefix64k==noPrefix) && unlikely(ref < (BYTE* const)dest)) goto _output_error; // Error : offset outside destination buffer + + // get matchlength + if ((length=(token&ML_MASK)) == ML_MASK) + { + while ((!endOnInput) || (ipoend-COPYLENGTH-(STEPSIZE-4)) + { + if (cpy > oend-LASTLITERALS) goto _output_error; // Error : last 5 bytes must be literals + LZ4_SECURECOPY(op, ref, (oend-COPYLENGTH)); + while(op (unsigned int)LZ4_MAX_INPUT_SIZE ? 0 : (isize) + ((isize)/255) + 16) +static inline int LZ4_compressBound(int isize) { return LZ4_COMPRESSBOUND(isize); } + +/* +LZ4_compressBound() : + Provides the maximum size that LZ4 may output in a "worst case" scenario (input data not compressible) + primarily useful for memory allocation of output buffer. + inline function is recommended for the general case, + macro is also provided when result needs to be evaluated at compilation (such as stack memory allocation). + + isize : is the input size. Max supported value is LZ4_MAX_INPUT_SIZE + return : maximum output size in a "worst case" scenario + or 0, if input size is too large ( > LZ4_MAX_INPUT_SIZE) +*/ + + +int LZ4_compress_limitedOutput (const char* source, char* dest, int inputSize, int maxOutputSize); + +/* +LZ4_compress_limitedOutput() : + Compress 'inputSize' bytes from 'source' into an output buffer 'dest' of maximum size 'maxOutputSize'. + If it cannot achieve it, compression will stop, and result of the function will be zero. + This function never writes outside of provided output buffer. 
+ + inputSize : Max supported value is LZ4_MAX_INPUT_VALUE + maxOutputSize : is the size of the destination buffer (which must be already allocated) + return : the number of bytes written in buffer 'dest' + or 0 if the compression fails +*/ + + +int LZ4_decompress_fast (const char* source, char* dest, int outputSize); + +/* +LZ4_decompress_fast() : + outputSize : is the original (uncompressed) size + return : the number of bytes read from the source buffer (in other words, the compressed size) + If the source stream is malformed, the function will stop decoding and return a negative result. + note : This function is a bit faster than LZ4_decompress_safe() + This function never writes outside of output buffers, but may read beyond input buffer in case of malicious data packet. + Use this function preferably into a trusted environment (data to decode comes from a trusted source). + Destination buffer must be already allocated. Its size must be a minimum of 'outputSize' bytes. +*/ + +int LZ4_decompress_safe_partial (const char* source, char* dest, int inputSize, int targetOutputSize, int maxOutputSize); + +/* +LZ4_decompress_safe_partial() : + This function decompress a compressed block of size 'inputSize' at position 'source' + into output buffer 'dest' of size 'maxOutputSize'. + The function tries to stop decompressing operation as soon as 'targetOutputSize' has been reached, + reducing decompression time. + return : the number of bytes decoded in the destination buffer (necessarily <= maxOutputSize) + Note : this number can be < 'targetOutputSize' should the compressed block to decode be smaller. + Always control how many bytes were decoded. + If the source stream is detected malformed, the function will stop decoding and return a negative result. + This function never writes outside of output buffer, and never reads outside of input buffer. 
It is therefore protected against malicious data packets +*/ + + +//**************************** +// Stream Functions +//**************************** + +void* LZ4_create (const char* inputBuffer); +int LZ4_compress_continue (void* LZ4_Data, const char* source, char* dest, int inputSize); +int LZ4_compress_limitedOutput_continue (void* LZ4_Data, const char* source, char* dest, int inputSize, int maxOutputSize); +char* LZ4_slideInputBuffer (void* LZ4_Data); +int LZ4_free (void* LZ4_Data); + +/* +These functions allow the compression of dependent blocks, where each block benefits from prior 64 KB within preceding blocks. +In order to achieve this, it is necessary to start creating the LZ4 Data Structure, thanks to the function : + +void* LZ4_create (const char* inputBuffer); +The result of the function is the (void*) pointer on the LZ4 Data Structure. +This pointer will be needed in all other functions. +If the pointer returned is NULL, then the allocation has failed, and compression must be aborted. +The only parameter 'const char* inputBuffer' must, obviously, point at the beginning of input buffer. +The input buffer must be already allocated, and size at least 192KB. +'inputBuffer' will also be the 'const char* source' of the first block. + +All blocks are expected to lay next to each other within the input buffer, starting from 'inputBuffer'. +To compress each block, use either LZ4_compress_continue() or LZ4_compress_limitedOutput_continue(). +Their behavior are identical to LZ4_compress() or LZ4_compress_limitedOutput(), +but require the LZ4 Data Structure as their first argument, and check that each block starts right after the previous one. +If next block does not begin immediately after the previous one, the compression will fail (return 0). + +When it's no longer possible to lay the next block after the previous one (not enough space left into input buffer), a call to : +char* LZ4_slideInputBuffer(void* LZ4_Data); +must be performed. 
It will typically copy the latest 64KB of input to the beginning of the input buffer. +Note that, for this function to work properly, the minimum size of the input buffer must be 192KB. +==> The memory position where the next input data block must start is provided as the result of the function. + +Compression can then resume, using LZ4_compress_continue() or LZ4_compress_limitedOutput_continue(), as usual. + +When compression is completed, a call to LZ4_free() will release the memory used by the LZ4 Data Structure. +*/ + + +int LZ4_decompress_safe_withPrefix64k (const char* source, char* dest, int inputSize, int maxOutputSize); +int LZ4_decompress_fast_withPrefix64k (const char* source, char* dest, int outputSize); + +/* +*_withPrefix64k() : + These decoding functions work the same as their "normal name" versions, + but can use up to 64KB of data in front of 'char* dest'. + These functions are necessary to decode interdependent blocks. +*/ + + +//**************************** +// Obsolete Functions +//**************************** + +static inline int LZ4_uncompress (const char* source, char* dest, int outputSize) { return LZ4_decompress_fast(source, dest, outputSize); } +static inline int LZ4_uncompress_unknownOutputSize (const char* source, char* dest, int isize, int maxOutputSize) { return LZ4_decompress_safe(source, dest, isize, maxOutputSize); } + +/* +These functions are deprecated and should no longer be used. +They are provided here for compatibility with existing user programs. +*/ + + + +#if defined (__cplusplus) +} +#endif diff --git a/storage/innobase/handler/ha_innodb.cc b/storage/innobase/handler/ha_innodb.cc index eda7da81d5c20..d4ce4eb9c4ffb 100644 --- a/storage/innobase/handler/ha_innodb.cc +++ b/storage/innobase/handler/ha_innodb.cc @@ -4,7 +4,7 @@ Copyright (c) 2000, 2012, Oracle and/or its affiliates. All rights reserved. Copyright (c) 2008, 2009 Google Inc. Copyright (c) 2009, Percona Inc. Copyright (c) 2012, Facebook Inc.
-Copyright (c) 2013, SkySQL Ab. +Copyright (c) 2013, 2014, SkySQL Ab. Portions of this file contain modifications contributed and copyrighted by Google, Inc. Those modifications are gratefully acknowledged and are described @@ -15429,29 +15429,6 @@ innodb_reset_all_monitor_update( TRUE); } -/****************************************************************//** -Update the system variable innodb_compression_level using the "saved" -value. This function is registered as a callback with MySQL. */ -static -void -innodb_compression_level_update( -/*============================*/ - THD* thd, /*!< in: thread handle */ - struct st_mysql_sys_var* var, /*!< in: pointer to - system variable */ - void* var_ptr,/*!< out: where the - formal string goes */ - const void* save) /*!< in: immediate result - from check function */ -{ - /* We have this call back just to avoid confusion between - ulong and ulint datatypes. */ - innobase_compression_level = - (*static_cast(save)); - page_compression_level = - (static_cast(innobase_compression_level)); -} - /****************************************************************//** Parse and enable InnoDB monitor counters during server startup. User can list the monitor counters/groups to be enable by specifying @@ -16140,11 +16117,11 @@ static MYSQL_SYSVAR_ULONG(replication_delay, srv_replication_delay, "innodb_thread_concurrency is reached (0 by default)", NULL, NULL, 0, 0, ~0UL, 0); -static MYSQL_SYSVAR_ULONG(compression_level, innobase_compression_level, +static MYSQL_SYSVAR_UINT(compression_level, page_zip_level, PLUGIN_VAR_RQCMDARG, - "Compression level used for compressed row format. 0 is no compression" + "Compression level used for zlib compression. 
0 is no compression" ", 1 is fastest, 9 is best compression and default is 6.", - NULL, innodb_compression_level_update, + NULL, NULL, DEFAULT_COMPRESSION_LEVEL, 0, 9, 0); static MYSQL_SYSVAR_LONG(additional_mem_pool_size, innobase_additional_mem_pool_size, @@ -16620,11 +16597,6 @@ static MYSQL_SYSVAR_LONG(trim_pct, srv_trim_pct, "How many percent of compressed pages should be trimmed", NULL, NULL, 100, 0, 100, 0); -static MYSQL_SYSVAR_LONG(compress_zlib_level, srv_compress_zlib_level, - PLUGIN_VAR_OPCMDARG , - "Default zlib compression level", - NULL, NULL, 6, 0, 9, 0); - static MYSQL_SYSVAR_BOOL(compress_index_pages, srv_page_compress_index_pages, PLUGIN_VAR_OPCMDARG, "Use page compression for only index pages.", @@ -16635,6 +16607,12 @@ static MYSQL_SYSVAR_BOOL(use_trim, srv_use_trim, "Use trim.", NULL, NULL, TRUE); +static MYSQL_SYSVAR_BOOL(use_lz4, srv_use_lz4, + PLUGIN_VAR_OPCMDARG , + "Use LZ4 for page compression", + NULL, NULL, FALSE); + + static struct st_mysql_sys_var* innobase_system_variables[]= { MYSQL_SYSVAR(additional_mem_pool_size), MYSQL_SYSVAR(api_trx_level), @@ -16782,9 +16760,9 @@ static struct st_mysql_sys_var* innobase_system_variables[]= { #endif /* UNIV_DEBUG */ MYSQL_SYSVAR(compress_pages), MYSQL_SYSVAR(trim_pct), - MYSQL_SYSVAR(compress_zlib_level), MYSQL_SYSVAR(compress_index_pages), MYSQL_SYSVAR(use_trim), + MYSQL_SYSVAR(use_lz4), NULL }; diff --git a/storage/innobase/include/fil0fil.h b/storage/innobase/include/fil0fil.h index 01084d52365d1..918a92fa811fc 100644 --- a/storage/innobase/include/fil0fil.h +++ b/storage/innobase/include/fil0fil.h @@ -134,6 +134,7 @@ extern fil_addr_t fil_addr_null; actual payload data size on compressed pages. */ #define FIL_PAGE_COMPRESSION_ZLIB 1 /*!< Compressin algorithm ZLIB. */ +#define FIL_PAGE_COMPRESSION_LZ4 2 /*!< Compressin algorithm LZ4. 
*/ /* @} */ /** File page trailer @{ */ diff --git a/storage/innobase/include/fsp0pagecompress.ic b/storage/innobase/include/fsp0pagecompress.ic index 755d91b3cd9f1..10f9d30d1f896 100644 --- a/storage/innobase/include/fsp0pagecompress.ic +++ b/storage/innobase/include/fsp0pagecompress.ic @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (C) 2013 SkySQL Ab. All Rights Reserved. +Copyright (C) 2013,2014 SkySQL Ab. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -144,6 +144,9 @@ fil_get_compression_alg_name( case FIL_PAGE_COMPRESSION_ZLIB: return ("ZLIB"); break; + case FIL_PAGE_COMPRESSION_LZ4: + return ("LZ4"); + break; default: return("UNKNOWN"); break; diff --git a/storage/innobase/include/page0zip.h b/storage/innobase/include/page0zip.h index 12781bd61b87d..89260d0984e66 100644 --- a/storage/innobase/include/page0zip.h +++ b/storage/innobase/include/page0zip.h @@ -41,7 +41,7 @@ Created June 2005 by Marko Makela #include "mem0mem.h" /* Compression level to be used by zlib. Settable by user. */ -extern ulint page_compression_level; +extern uint page_zip_level; /* Default compression level. */ #define DEFAULT_COMPRESSION_LEVEL 6 diff --git a/storage/innobase/include/srv0srv.h b/storage/innobase/include/srv0srv.h index f4fa8b434fea3..a11c213d534e2 100644 --- a/storage/innobase/include/srv0srv.h +++ b/storage/innobase/include/srv0srv.h @@ -3,7 +3,7 @@ Copyright (c) 1995, 2012, Oracle and/or its affiliates. All rights reserved. Copyright (c) 2008, 2009, Google Inc. Copyright (c) 2009, Percona Inc. -Copyright (c) 2013, SkySQL Ab. All Rights Reserved. +Copyright (c) 2013, 2014, SkySQL Ab. All Rights Reserved. Portions of this file contain modifications contributed and copyrighted by Google, Inc. 
Those modifications are gratefully acknowledged and are described @@ -254,9 +254,8 @@ extern my_bool srv_use_posix_fallocate; /* Use atomic writes i.e disable doublewrite buffer */ extern my_bool srv_use_atomic_writes; -/* Default zlib compression level */ -extern long srv_compress_zlib_level; - +/* If this flag IS TRUE, then we use lz4 to compress/decompress pages */ +extern my_bool srv_use_lz4; #ifdef __WIN__ extern ibool srv_use_native_conditions; diff --git a/storage/innobase/page/page0cur.cc b/storage/innobase/page/page0cur.cc index f416d38cc3581..9d6a62cae8fe9 100644 --- a/storage/innobase/page/page0cur.cc +++ b/storage/innobase/page/page0cur.cc @@ -1180,7 +1180,7 @@ page_cur_insert_rec_zip_reorg( /* Make a local copy as the values can change dynamically. */ bool log_compressed = page_log_compressed_pages; - ulint level = page_compression_level; + ulint level = page_zip_level; /* Recompress or reorganize and recompress the page. */ if (page_zip_compress(page_zip, page, index, level, diff --git a/storage/innobase/page/page0page.cc b/storage/innobase/page/page0page.cc index 6b7b8424856b2..bf73a249f957c 100644 --- a/storage/innobase/page/page0page.cc +++ b/storage/innobase/page/page0page.cc @@ -514,7 +514,7 @@ page_create_zip( mach_write_to_2(page + PAGE_HEADER + PAGE_LEVEL, level); if (!page_zip_compress(page_zip, page, index, - page_compression_level, mtr)) { + page_zip_level, mtr)) { /* The compression of a newly created page should always succeed. */ ut_error; @@ -663,7 +663,7 @@ page_copy_rec_list_end( if (!page_zip_compress(new_page_zip, new_page, index, - page_compression_level, + page_zip_level, mtr)) { /* Before trying to reorganize the page, store the number of preceding records on the page. 
*/ @@ -788,7 +788,7 @@ page_copy_rec_list_start( goto zip_reorganize;); if (!page_zip_compress(new_page_zip, new_page, index, - page_compression_level, mtr)) { + page_zip_level, mtr)) { ulint ret_pos; #ifndef DBUG_OFF diff --git a/storage/innobase/page/page0zip.cc b/storage/innobase/page/page0zip.cc index dee375800027c..3fba62164304c 100644 --- a/storage/innobase/page/page0zip.cc +++ b/storage/innobase/page/page0zip.cc @@ -69,7 +69,7 @@ UNIV_INTERN mysql_pfs_key_t page_zip_stat_per_index_mutex_key; #endif /* !UNIV_HOTBACKUP */ /* Compression level to be used by zlib. Settable by user. */ -UNIV_INTERN ulint page_compression_level = 6; +UNIV_INTERN uint page_zip_level = DEFAULT_COMPRESSION_LEVEL; /* Whether or not to log compressed page images to avoid possible compression algorithm changes in zlib. */ @@ -4631,7 +4631,7 @@ page_zip_reorganize( mtr_set_log_mode(mtr, log_mode); if (!page_zip_compress(page_zip, page, index, - page_compression_level, mtr)) { + page_zip_level, mtr)) { #ifndef UNIV_HOTBACKUP buf_block_free(temp_block); diff --git a/storage/innobase/srv/srv0srv.cc b/storage/innobase/srv/srv0srv.cc index 90864cee9ef03..cffd3f928c349 100644 --- a/storage/innobase/srv/srv0srv.cc +++ b/storage/innobase/srv/srv0srv.cc @@ -3,6 +3,7 @@ Copyright (c) 1995, 2012, Oracle and/or its affiliates. All Rights Reserved. Copyright (c) 2008, 2009 Google Inc. Copyright (c) 2009, Percona Inc. +Copyright (c) 2013, 2014, SkySQL Ab. Portions of this file contain modifications contributed and copyrighted by Google, Inc. 
Those modifications are gratefully acknowledged and are described @@ -147,21 +148,20 @@ UNIV_INTERN my_bool srv_use_native_aio = TRUE; /* If this flag is TRUE, then we will use page compression to the pages */ -UNIV_INTERN my_bool srv_compress_pages = FALSE; +UNIV_INTERN my_bool srv_compress_pages = FALSE; /* If this flag is TRUE, then we will use page compression only for index pages */ -UNIV_INTERN my_bool srv_page_compress_index_pages = FALSE; -UNIV_INTERN long srv_trim_pct = 100; -/* Default compression level if page compression is used and no compression -level is set for the table*/ -UNIV_INTERN long srv_compress_zlib_level = 6; +UNIV_INTERN my_bool srv_page_compress_index_pages = FALSE; +UNIV_INTERN long srv_trim_pct = 100; /* If this flag is TRUE, then we will use fallocate(PUCH_HOLE) to the pages */ -UNIV_INTERN my_bool srv_use_trim = TRUE; +UNIV_INTERN my_bool srv_use_trim = TRUE; /* If this flag is TRUE, then we will use posix fallocate for file extentsion */ -UNIV_INTERN my_bool srv_use_posix_fallocate = FALSE; +UNIV_INTERN my_bool srv_use_posix_fallocate = FALSE; /* If this flag is TRUE, then we disable doublewrite buffer */ -UNIV_INTERN my_bool srv_use_atomic_writes = FALSE; +UNIV_INTERN my_bool srv_use_atomic_writes = FALSE; +/* If this flag IS TRUE, then we use lz4 to compress/decompress pages */ +UNIV_INTERN my_bool srv_use_lz4 = FALSE; #ifdef __WIN__ /* Windows native condition variables. 
We use runtime loading / function diff --git a/storage/xtradb/CMakeLists.txt b/storage/xtradb/CMakeLists.txt index 282db2ddf3153..5050ca34da9da 100644 --- a/storage/xtradb/CMakeLists.txt +++ b/storage/xtradb/CMakeLists.txt @@ -284,6 +284,8 @@ SET(INNOBASE_SOURCES buf/buf0flu.cc buf/buf0lru.cc buf/buf0rea.cc +# TODO: JAN uncomment +# buf/buf0mtflu.cc data/data0data.cc data/data0type.cc dict/dict0boot.cc @@ -297,6 +299,8 @@ SET(INNOBASE_SOURCES eval/eval0eval.cc eval/eval0proc.cc fil/fil0fil.cc + fil/fil0pagecompress.cc + fil/lz4.c fsp/fsp0fsp.cc fut/fut0fut.cc fut/fut0lst.cc diff --git a/storage/xtradb/buf/buf0buf.cc b/storage/xtradb/buf/buf0buf.cc index d4b170028d923..b995e3ee737b9 100644 --- a/storage/xtradb/buf/buf0buf.cc +++ b/storage/xtradb/buf/buf0buf.cc @@ -2,6 +2,7 @@ Copyright (c) 1995, 2013, Oracle and/or its affiliates. All Rights Reserved. Copyright (c) 2008, Google Inc. +Copyright (c) 2013, SkySQL Ab. All Rights Reserved. Portions of this file contain modifications contributed and copyrighted by Google, Inc. 
Those modifications are gratefully acknowledged and are described @@ -3371,6 +3372,7 @@ buf_page_init_low( bpage->access_time = 0; bpage->newest_modification = 0; bpage->oldest_modification = 0; + bpage->write_size = 0; HASH_INVALIDATE(bpage, hash); bpage->is_corrupt = FALSE; #if defined UNIV_DEBUG_FILE_ACCESSES || defined UNIV_DEBUG @@ -5501,3 +5503,24 @@ buf_page_init_for_backup_restore( } } #endif /* !UNIV_HOTBACKUP */ + +/*********************************************************************//** +Acquire LRU list mutex */ +void +buf_pool_mutex_enter( +/*=================*/ + buf_pool_t* buf_pool) /*!< in: buffer pool */ +{ + ut_ad(!mutex_own(&buf_pool->LRU_list_mutex)); + mutex_enter(&buf_pool->LRU_list_mutex); +} +/*********************************************************************//** +Exit LRU list mutex */ +void +buf_pool_mutex_exit( +/*================*/ + buf_pool_t* buf_pool) /*!< in: buffer pool */ +{ + ut_ad(mutex_own(&buf_pool->LRU_list_mutex)); + mutex_exit(&buf_pool->LRU_list_mutex); +} diff --git a/storage/xtradb/buf/buf0dblwr.cc b/storage/xtradb/buf/buf0dblwr.cc index 506a5b177ba26..30b41dc754e43 100644 --- a/storage/xtradb/buf/buf0dblwr.cc +++ b/storage/xtradb/buf/buf0dblwr.cc @@ -1,6 +1,7 @@ /***************************************************************************** Copyright (c) 1995, 2013, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2013, SkySQL Ab. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -382,7 +383,7 @@ buf_dblwr_init_or_restore_pages( buffer */ fil_io(OS_FILE_READ, true, TRX_SYS_SPACE, 0, TRX_SYS_PAGE_NO, 0, - UNIV_PAGE_SIZE, read_buf, NULL); + UNIV_PAGE_SIZE, read_buf, NULL, 0); doublewrite = read_buf + TRX_SYS_DOUBLEWRITE; if (mach_read_from_4(doublewrite + TRX_SYS_DOUBLEWRITE_MAGIC) @@ -418,11 +419,11 @@ buf_dblwr_init_or_restore_pages( fil_io(OS_FILE_READ, true, TRX_SYS_SPACE, 0, block1, 0, TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE, - buf, NULL); + buf, NULL, 0); fil_io(OS_FILE_READ, true, TRX_SYS_SPACE, 0, block2, 0, TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE, buf + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE, - NULL); + NULL, 0); /* Check if any of these pages is half-written in data files, in the intended position */ @@ -450,7 +451,7 @@ buf_dblwr_init_or_restore_pages( } fil_io(OS_FILE_WRITE, true, 0, 0, source_page_no, 0, - UNIV_PAGE_SIZE, page, NULL); + UNIV_PAGE_SIZE, page, NULL, 0); } else { space_id = mach_read_from_4( @@ -492,7 +493,7 @@ buf_dblwr_init_or_restore_pages( fil_io(OS_FILE_READ, true, space_id, zip_size, page_no, 0, zip_size ? zip_size : UNIV_PAGE_SIZE, - read_buf, NULL); + read_buf, NULL, 0); /* Check if the page is corrupt */ @@ -544,7 +545,7 @@ buf_dblwr_init_or_restore_pages( fil_io(OS_FILE_WRITE, true, space_id, zip_size, page_no, 0, zip_size ? 
zip_size : UNIV_PAGE_SIZE, - page, NULL); + page, NULL, 0); ib_logf(IB_LOG_LEVEL_INFO, "Recovered the page from" @@ -763,7 +764,7 @@ buf_dblwr_write_block_to_datafile( buf_page_get_page_no(bpage), 0, buf_page_get_zip_size(bpage), (void*) bpage->zip.data, - (void*) bpage); + (void*) bpage, 0); return; } @@ -775,7 +776,8 @@ buf_dblwr_write_block_to_datafile( fil_io(flags, sync, buf_block_get_space(block), 0, buf_block_get_page_no(block), 0, UNIV_PAGE_SIZE, - (void*) block->frame, (void*) block); + (void*) block->frame, (void*) block, + (ulint *)&bpage->write_size); } /********************************************************************//** @@ -869,7 +871,7 @@ buf_dblwr_flush_buffered_writes(void) fil_io(OS_FILE_WRITE, true, TRX_SYS_SPACE, 0, buf_dblwr->block1, 0, len, - (void*) write_buf, NULL); + (void*) write_buf, NULL, 0); if (buf_dblwr->first_free <= TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) { /* No unwritten pages in the second block. */ @@ -885,7 +887,7 @@ buf_dblwr_flush_buffered_writes(void) fil_io(OS_FILE_WRITE, true, TRX_SYS_SPACE, 0, buf_dblwr->block2, 0, len, - (void*) write_buf, NULL); + (void*) write_buf, NULL, 0); flush: /* increment the doublewrite flushed pages counter */ @@ -1115,14 +1117,14 @@ buf_dblwr_write_single_page( fil_io(OS_FILE_WRITE, true, TRX_SYS_SPACE, 0, offset, 0, UNIV_PAGE_SIZE, (void*) (buf_dblwr->write_buf - + UNIV_PAGE_SIZE * i), NULL); + + UNIV_PAGE_SIZE * i), NULL, 0); } else { /* It is a regular page. 
Write it directly to the doublewrite buffer */ fil_io(OS_FILE_WRITE, true, TRX_SYS_SPACE, 0, offset, 0, UNIV_PAGE_SIZE, (void*) ((buf_block_t*) bpage)->frame, - NULL); + NULL, 0); } /* Now flush the doublewrite buffer data to disk */ diff --git a/storage/xtradb/buf/buf0flu.cc b/storage/xtradb/buf/buf0flu.cc index abcee504d2e2e..3c030eb60eec2 100644 --- a/storage/xtradb/buf/buf0flu.cc +++ b/storage/xtradb/buf/buf0flu.cc @@ -1,6 +1,8 @@ /***************************************************************************** Copyright (c) 1995, 2013, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2013, 2014, SkySQL Ab. All Rights Reserved. +Copyright (c) 2013, 2014, Fusion-io. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -48,6 +50,7 @@ Created 11/11/1995 Heikki Tuuri #include "srv0mon.h" #include "mysql/plugin.h" #include "mysql/service_thd_wait.h" +#include "fil0pagecompress.h" /** Number of pages flushed through non flush_list flushes. */ // static ulint buf_lru_flush_page_count = 0; @@ -71,11 +74,6 @@ in thrashing. 
*/ /* @} */ -/** Handled page counters for a single flush */ -struct flush_counters_t { - ulint flushed; /*!< number of dirty pages flushed */ - ulint evicted; /*!< number of clean pages evicted */ -}; /******************************************************************//** Increases flush_list size in bytes with zip_size for compressed page, @@ -721,8 +719,10 @@ buf_flush_write_complete( buf_pool->n_flush[flush_type]--; - /* fprintf(stderr, "n pending flush %lu\n", - buf_pool->n_flush[flush_type]); */ +#ifdef UNIV_DEBUG + fprintf(stderr, "n pending flush %lu\n", + buf_pool->n_flush[flush_type]); +#endif if (buf_pool->n_flush[flush_type] == 0 && buf_pool->init_flush[flush_type] == FALSE) { @@ -880,6 +880,8 @@ buf_flush_write_block_low( { ulint zip_size = buf_page_get_zip_size(bpage); page_t* frame = NULL; + ulint space_id = buf_page_get_space(bpage); + atomic_writes_t awrites = fil_space_get_atomic_writes(space_id); #ifdef UNIV_DEBUG buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); @@ -955,12 +957,26 @@ buf_flush_write_block_low( sync, buf_page_get_space(bpage), zip_size, buf_page_get_page_no(bpage), 0, zip_size ? zip_size : UNIV_PAGE_SIZE, - frame, bpage); - } else if (flush_type == BUF_FLUSH_SINGLE_PAGE) { - buf_dblwr_write_single_page(bpage, sync); + frame, bpage, &bpage->write_size); } else { - ut_ad(!sync); - buf_dblwr_add_to_batch(bpage); + /* InnoDB uses doublewrite buffer and doublewrite buffer + is initialized. User can define do we use atomic writes + on a file space (table) or not. If atomic writes are + not used we should use doublewrite buffer and if + atomic writes should be used, no doublewrite buffer + is used. */ + + if (awrites == ATOMIC_WRITES_ON) { + fil_io(OS_FILE_WRITE | OS_AIO_SIMULATED_WAKE_LATER, + FALSE, buf_page_get_space(bpage), zip_size, + buf_page_get_page_no(bpage), 0, + zip_size ? 
zip_size : UNIV_PAGE_SIZE, + frame, bpage, &bpage->write_size); + } else if (flush_type == BUF_FLUSH_SINGLE_PAGE) { + buf_dblwr_write_single_page(bpage, sync); + } else { + buf_dblwr_add_to_batch(bpage); + } } /* When doing single page flushing the IO is done synchronously @@ -1747,7 +1763,6 @@ end up waiting for these latches! NOTE 2: in the case of a flush list flush, the calling thread is not allowed to own any latches on pages! @return number of blocks for which the write request was queued */ __attribute__((nonnull)) -static void buf_flush_batch( /*============*/ @@ -1806,7 +1821,6 @@ buf_flush_batch( /******************************************************************//** Gather the aggregated stats for both flush list and LRU list flushing */ -static void buf_flush_common( /*=============*/ @@ -1833,7 +1847,6 @@ buf_flush_common( /******************************************************************//** Start a buffer flush batch for LRU or flush list */ -static ibool buf_flush_start( /*============*/ @@ -1862,7 +1875,6 @@ buf_flush_start( /******************************************************************//** End a buffer flush batch for LRU or flush list */ -static void buf_flush_end( /*==========*/ @@ -1912,11 +1924,55 @@ buf_flush_wait_batch_end( } } else { thd_wait_begin(NULL, THD_WAIT_DISKIO); - os_event_wait(buf_pool->no_flush[type]); + os_event_wait(buf_pool->no_flush[type]); thd_wait_end(NULL); } } +/* JAN: TODO: */ +/*******************************************************************//** +This utility flushes dirty blocks from the end of the LRU list and also +puts replaceable clean pages from the end of the LRU list to the free +list. +NOTE: The calling thread is not allowed to own any latches on pages! +@return true if a batch was queued successfully. false if another batch +of same type was already running. 
*/ +static +bool +pgcomp_buf_flush_LRU( +/*==========*/ + buf_pool_t* buf_pool, /*!< in/out: buffer pool instance */ + ulint min_n, /*!< in: wished minimum mumber of blocks + flushed (it is not guaranteed that the + actual number is that big, though) */ + ulint* n_processed) /*!< out: the number of pages + which were processed is passed + back to caller. Ignored if NULL */ +{ + flush_counters_t n; + + if (n_processed) { + *n_processed = 0; + } + + if (!buf_flush_start(buf_pool, BUF_FLUSH_LRU)) { + return(false); + } + + buf_flush_batch(buf_pool, BUF_FLUSH_LRU, min_n, 0, false, &n); + + buf_flush_end(buf_pool, BUF_FLUSH_LRU); + + buf_flush_common(BUF_FLUSH_LRU, n.flushed); + + if (n_processed) { + *n_processed = n.flushed; + } + + return(true); +} +/* JAN: TODO: END: */ + /*******************************************************************//** This utility flushes dirty blocks from the end of the LRU list and also puts replaceable clean pages from the end of the LRU list to the free @@ -1954,6 +2010,168 @@ buf_flush_LRU( return(true); } +/* JAN: TODO: */ +/*******************************************************************//**/ +extern int is_pgcomp_wrk_init_done(void); +extern int pgcomp_flush_work_items(int buf_pool_inst, int *pages_flushed, + int flush_type, int min_n, unsigned long long lsn_limit); + +#define MT_COMP_WATER_MARK 50 + +#include +int timediff(struct timeval *g_time, struct timeval *s_time, struct timeval *d_time) +{ + if (g_time->tv_usec < s_time->tv_usec) + { + int nsec = (s_time->tv_usec - g_time->tv_usec) / 1000000 + 1; + s_time->tv_usec -= 1000000 * nsec; + s_time->tv_sec += nsec; + } + if (g_time->tv_usec - s_time->tv_usec > 1000000) + { + int nsec = (s_time->tv_usec - g_time->tv_usec) / 1000000; + s_time->tv_usec += 1000000 * nsec; + s_time->tv_sec -= nsec; + } + d_time->tv_sec = g_time->tv_sec - s_time->tv_sec; + d_time->tv_usec = g_time->tv_usec - s_time->tv_usec; + + return 0; +} + +static pthread_mutex_t pgcomp_mtx = 
PTHREAD_MUTEX_INITIALIZER; +/*******************************************************************//** +Multi-threaded version of buf_flush_list +@return false if buf_flush_start() refused some buffer pool instance (that instance is skipped), true otherwise */ +UNIV_INTERN +bool +pgcomp_buf_flush_list( +/*==================*/ + ulint min_n, /*!< in: wished minimum number of blocks + flushed (it is not guaranteed that the + actual number is that big, though) */ + lsn_t lsn_limit, /*!< in the case BUF_FLUSH_LIST all + blocks whose oldest_modification is + smaller than this should be flushed + (if their number does not exceed + min_n), otherwise ignored */ + ulint* n_processed) /*!< out: the number of pages + which were processed is passed + back to caller. Ignored if NULL */ + +{ + ulint i; + bool success = true; + struct timeval p_start_time, p_end_time, d_time; + flush_counters_t n; + + if (n_processed) { + *n_processed = 0; + } + + if (min_n != ULINT_MAX) { + /* Ensure that flushing is spread evenly amongst the + buffer pool instances. When min_n is ULINT_MAX + we need to flush everything up to the lsn limit + so no limit here. 
*/ + min_n = (min_n + srv_buf_pool_instances - 1) + / srv_buf_pool_instances; + } + +#ifdef UNIV_DEBUG + gettimeofday(&p_start_time, 0x0); +#endif + if(is_pgcomp_wrk_init_done() && (min_n > MT_COMP_WATER_MARK)) { + int cnt_flush[MAX_BUFFER_POOLS] = {0}; /* one slot per possible buffer pool instance; indexed up to srv_buf_pool_instances below */ + + //stack_trace(); + pthread_mutex_lock(&pgcomp_mtx); + //gettimeofday(&p_start_time, 0x0); + //fprintf(stderr, "Calling into wrk-pgcomp [min:%lu]", min_n); + pgcomp_flush_work_items(srv_buf_pool_instances, + cnt_flush, BUF_FLUSH_LIST, + min_n, lsn_limit); + + for (i = 0; i < srv_buf_pool_instances; i++) { + if (n_processed) { + *n_processed += cnt_flush[i]; + } + if (cnt_flush[i]) { + MONITOR_INC_VALUE_CUMULATIVE( + MONITOR_FLUSH_BATCH_TOTAL_PAGE, + MONITOR_FLUSH_BATCH_COUNT, + MONITOR_FLUSH_BATCH_PAGES, + cnt_flush[i]); + + } + } + + pthread_mutex_unlock(&pgcomp_mtx); + +#ifdef UNIV_DEBUG + gettimeofday(&p_end_time, 0x0); + timediff(&p_end_time, &p_start_time, &d_time); + fprintf(stderr, "[1] [*n_processed: (min:%lu)%lu %llu usec]\n", ( + min_n * srv_buf_pool_instances), (n_processed ? *n_processed : 0), /* n_processed may be NULL */ + (unsigned long long)(d_time.tv_usec+(d_time.tv_sec*1000000))); +#endif + return(success); + } + /* Flush to lsn_limit in all buffer pool instances */ + for (i = 0; i < srv_buf_pool_instances; i++) { + buf_pool_t* buf_pool; + + buf_pool = buf_pool_from_array(i); + + if (!buf_flush_start(buf_pool, BUF_FLUSH_LIST)) { + /* We have two choices here. If lsn_limit was + specified then skipping an instance of buffer + pool means we cannot guarantee that all pages + up to lsn_limit has been flushed. We can + return right now with failure or we can try + to flush remaining buffer pools up to the + lsn_limit. We attempt to flush other buffer + pools based on the assumption that it will + help in the retry which will follow the + failure. 
*/ + success = false; + + continue; + } + + buf_flush_batch( + buf_pool, BUF_FLUSH_LIST, min_n, lsn_limit, false, &n); + + buf_flush_end(buf_pool, BUF_FLUSH_LIST); + + buf_flush_common(BUF_FLUSH_LIST, n.flushed); + + if (n_processed) { + *n_processed += n.flushed; + } + + if (n.flushed) { + MONITOR_INC_VALUE_CUMULATIVE( + MONITOR_FLUSH_BATCH_TOTAL_PAGE, + MONITOR_FLUSH_BATCH_COUNT, + MONITOR_FLUSH_BATCH_PAGES, + n.flushed); + } + } + +#ifdef UNIV_DEBUG + gettimeofday(&p_end_time, 0x0); + timediff(&p_end_time, &p_start_time, &d_time); + + fprintf(stderr, "[2] [*n_processed: (min:%lu)%lu %llu usec]\n", ( + min_n * srv_buf_pool_instances), (n_processed ? *n_processed : 0), /* n_processed may be NULL */ + (unsigned long long)(d_time.tv_usec+(d_time.tv_sec*1000000))); +#endif + return(success); +} + +/* JAN: TODO: END: */ + /*******************************************************************//** This utility flushes dirty blocks from the end of the flush list of all buffer pool instances. @@ -1986,6 +2204,12 @@ buf_flush_list( bool timeout = false; ulint flush_start_time = 0; + /* JAN: TODO: */ + if (is_pgcomp_wrk_init_done()) { + return(pgcomp_buf_flush_list(min_n, lsn_limit, n_processed)); + } + /* JAN: TODO: END: */ + for (i = 0; i < srv_buf_pool_instances; i++) { requested_pages[i] = 0; active_instance[i] = true; @@ -2179,6 +2403,60 @@ buf_flush_single_page_from_LRU( return(freed); } +/* JAN: TODO: */ +/*********************************************************************//** +pgcomp_Clears up tail of the LRU lists: +* Put replaceable pages at the tail of LRU to the free list +* Flush dirty pages at the tail of LRU to the disk +The depth to which we scan each buffer pool is controlled by dynamic +config parameter innodb_LRU_scan_depth. 
+@return total pages flushed */ +UNIV_INTERN +ulint +pgcomp_buf_flush_LRU_tail(void) +/*====================*/ +{ + struct timeval p_start_time, p_end_time, d_time; + ulint total_flushed=0, i=0; + int cnt_flush[MAX_BUFFER_POOLS] = {0}; /* one slot per possible buffer pool instance; indexed up to srv_buf_pool_instances below */ + +#ifdef UNIV_DEBUG + gettimeofday(&p_start_time, 0x0); +#endif + assert(is_pgcomp_wrk_init_done()); + + pthread_mutex_lock(&pgcomp_mtx); + pgcomp_flush_work_items(srv_buf_pool_instances, + cnt_flush, BUF_FLUSH_LRU, srv_LRU_scan_depth, 0); + + for (i = 0; i < srv_buf_pool_instances; i++) { + if (cnt_flush[i]) { + total_flushed += cnt_flush[i]; + + MONITOR_INC_VALUE_CUMULATIVE( + MONITOR_LRU_BATCH_TOTAL_PAGE, + MONITOR_LRU_BATCH_COUNT, + MONITOR_LRU_BATCH_PAGES, + cnt_flush[i]); + } + } + + pthread_mutex_unlock(&pgcomp_mtx); + +#ifdef UNIV_DEBUG + gettimeofday(&p_end_time, 0x0); + timediff(&p_end_time, &p_start_time, &d_time); + + fprintf(stderr, "[1] [*n_processed: (min:%lu)%lu %llu usec]\n", ( + srv_LRU_scan_depth * srv_buf_pool_instances), total_flushed, + (unsigned long long)(d_time.tv_usec+(d_time.tv_sec*1000000))); +#endif + + return(total_flushed); +} +/* JAN: TODO: END: */ + + /*********************************************************************//** Clears up tail of the LRU lists: * Put replaceable pages at the tail of LRU to the free list @@ -2203,6 +2481,13 @@ buf_flush_LRU_tail(void) ulint free_list_lwm = srv_LRU_scan_depth / 100 * srv_cleaner_free_list_lwm; + /* JAN: TODO: */ + if(is_pgcomp_wrk_init_done()) + { + return(pgcomp_buf_flush_LRU_tail()); + } + /* JAN: TODO: END */ + for (ulint i = 0; i < srv_buf_pool_instances; i++) { const buf_pool_t* buf_pool = buf_pool_from_array(i); @@ -2640,6 +2925,7 @@ DECLARE_THREAD(buf_flush_page_cleaner_thread)( ulint n_flushed = 0; ulint last_activity = srv_get_activity_count(); ulint lru_sleep_time = srv_cleaner_max_lru_time; + ulint n_lru=0, n_pgc_flush=0, n_pgc_batch=0; ut_ad(!srv_read_only_mode); @@ -2684,15 +2970,25 @@ DECLARE_THREAD(buf_flush_page_cleaner_thread)( next_loop_time = ut_time_ms() + 
page_cleaner_sleep_time; /* Flush pages from end of LRU if required */ - n_flushed = buf_flush_LRU_tail(); + n_lru = n_flushed = buf_flush_LRU_tail(); +#ifdef UNIV_DEBUG + if (n_lru) { + fprintf(stderr,"n_lru:%lu ",n_lru); + } +#endif if (srv_check_activity(last_activity)) { last_activity = srv_get_activity_count(); /* Flush pages from flush_list if required */ - n_flushed += page_cleaner_flush_pages_if_needed(); + n_flushed += n_pgc_flush = page_cleaner_flush_pages_if_needed(); +#ifdef UNIV_DEBUG + if (n_pgc_flush) { + fprintf(stderr,"n_pgc_flush:%lu ",n_pgc_flush); + } +#endif } else { - n_flushed = page_cleaner_do_flush_batch( + n_pgc_batch = n_flushed = page_cleaner_do_flush_batch( PCT_IO(100), LSN_MAX); @@ -2703,7 +2999,20 @@ DECLARE_THREAD(buf_flush_page_cleaner_thread)( MONITOR_FLUSH_BACKGROUND_PAGES, n_flushed); } +#ifdef UNIV_DEBUG + if (n_pgc_batch) { + fprintf(stderr,"n_pgc_batch:%lu ",n_pgc_batch); + } +#endif } + +#ifdef UNIV_DEBUG + if (n_lru || n_pgc_flush || n_pgc_batch) { + fprintf(stderr,"\n"); + n_lru = n_pgc_flush = n_pgc_batch = 0; + } +#endif + } ut_ad(srv_shutdown_state > 0); diff --git a/storage/xtradb/buf/buf0rea.cc b/storage/xtradb/buf/buf0rea.cc index 6e348bbf004f6..3dec3df6f2b19 100644 --- a/storage/xtradb/buf/buf0rea.cc +++ b/storage/xtradb/buf/buf0rea.cc @@ -1,6 +1,7 @@ /***************************************************************************** Copyright (c) 1995, 2013, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2013, 2014, SkySQL Ab. All Rights Reserved. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -229,14 +230,14 @@ buf_read_page_low( *err = _fil_io(OS_FILE_READ | wake_later | ignore_nonexistent_pages, sync, space, zip_size, offset, 0, zip_size, - bpage->zip.data, bpage, trx); + bpage->zip.data, bpage, 0, trx); } else { ut_a(buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE); *err = _fil_io(OS_FILE_READ | wake_later | ignore_nonexistent_pages, sync, space, 0, offset, 0, UNIV_PAGE_SIZE, - ((buf_block_t*) bpage)->frame, bpage, trx); + ((buf_block_t*) bpage)->frame, bpage, 0, trx); } if (sync) { diff --git a/storage/xtradb/dict/dict0dict.cc b/storage/xtradb/dict/dict0dict.cc index a20456fe3cf8f..d6a05d2b21431 100644 --- a/storage/xtradb/dict/dict0dict.cc +++ b/storage/xtradb/dict/dict0dict.cc @@ -2,6 +2,7 @@ Copyright (c) 1996, 2013, Oracle and/or its affiliates. All Rights Reserved. Copyright (c) 2012, Facebook Inc. +Copyright (c) 2013, SkySQL Ab. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software diff --git a/storage/xtradb/fil/fil0fil.cc b/storage/xtradb/fil/fil0fil.cc index 9861f85b8141a..f3e952299ffda 100644 --- a/storage/xtradb/fil/fil0fil.cc +++ b/storage/xtradb/fil/fil0fil.cc @@ -1,6 +1,7 @@ /***************************************************************************** Copyright (c) 1995, 2013, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2013 SkySQL Ab. All Rights Reserved. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -54,6 +55,15 @@ Created 10/25/1995 Heikki Tuuri # include "srv0srv.h" static ulint srv_data_read, srv_data_written; #endif /* !UNIV_HOTBACKUP */ +#include "fil0pagecompress.h" +#include "zlib.h" +#ifdef __linux__ +#include +#include +#include +#include +#endif +#include "row0mysql.h" /* IMPLEMENTATION OF THE TABLESPACE MEMORY CACHE @@ -434,11 +444,16 @@ fil_read( block size multiple */ void* buf, /*!< in/out: buffer where to store data read; in aio this must be appropriately aligned */ - void* message) /*!< in: message for aio handler if non-sync - aio used, else ignored */ + void* message, /*!< in: message for aio handler if non-sync + aio used, else ignored */ + ulint* write_size) /*!< in/out: Actual write size initialized + after fist successfull trim + operation for this page and if + initialized we do not trim again if + actual page size does not decrease. */ { return(fil_io(OS_FILE_READ, sync, space_id, zip_size, block_offset, - byte_offset, len, buf, message)); + byte_offset, len, buf, message, write_size)); } /********************************************************************//** @@ -463,18 +478,22 @@ fil_write( be a block size multiple */ void* buf, /*!< in: buffer from which to write; in aio this must be appropriately aligned */ - void* message) /*!< in: message for aio handler if non-sync - aio used, else ignored */ + void* message, /*!< in: message for aio handler if non-sync + aio used, else ignored */ + ulint* write_size) /*!< in/out: Actual write size initialized + after fist successfull trim + operation for this page and if + initialized we do not trim again if + actual page size does not decrease. 
*/ { ut_ad(!srv_read_only_mode); return(fil_io(OS_FILE_WRITE, sync, space_id, zip_size, block_offset, - byte_offset, len, buf, message)); + byte_offset, len, buf, message, write_size)); } /*******************************************************************//** Returns the table space by a given id, NULL if not found. */ -UNIV_INLINE fil_space_t* fil_space_get_by_id( /*================*/ @@ -492,6 +511,19 @@ fil_space_get_by_id( return(space); } +/****************************************************************//** +Get space id from fil node */ +ulint +fil_node_get_space_id( +/*==================*/ + fil_node_t* node) /*!< in: Compressed node*/ +{ + ut_ad(node); + ut_ad(node->space); + + return (node->space->id); +} + /*******************************************************************//** Returns the table space by a given name, NULL if not found. */ UNIV_INLINE @@ -712,8 +744,9 @@ fil_node_open_file( byte* buf2; byte* page; ulint space_id; - ulint flags; + ulint flags=0; ulint page_size; + ibool atomic_writes=FALSE; ut_ad(mutex_own(&(system->mutex))); ut_a(node->n_pending == 0); @@ -730,7 +763,7 @@ fil_node_open_file( node->handle = os_file_create_simple_no_error_handling( innodb_file_data_key, node->name, OS_FILE_OPEN, - OS_FILE_READ_ONLY, &success); + OS_FILE_READ_ONLY, &success, 0); if (!success) { /* The following call prints an error message */ os_file_get_last_error(true); @@ -782,6 +815,7 @@ fil_node_open_file( space_id = fsp_header_get_space_id(page); flags = fsp_header_get_flags(page); page_size = fsp_flags_get_page_size(flags); + atomic_writes = fsp_flags_get_atomic_writes(flags); ut_free(buf2); @@ -832,6 +866,17 @@ fil_node_open_file( ut_error; } + if (UNIV_UNLIKELY(space->flags != flags)) { + if (!dict_tf_verify_flags(space->flags, flags)) { + fprintf(stderr, + "InnoDB: Error: table flags are 0x%lx" + " in the data dictionary\n" + "InnoDB: but the flags in file %s are 0x%lx!\n", + space->flags, node->name, flags); + ut_error; + } + } + if (size_bytes 
>= 1024 * 1024) { /* Truncate the size to whole megabytes. */ size_bytes = ut_2pow_round(size_bytes, 1024 * 1024); @@ -851,6 +896,8 @@ fil_node_open_file( space->size += node->size; } + atomic_writes = fsp_flags_get_atomic_writes(space->flags); + /* printf("Opening file %s\n", node->name); */ /* Open the file for reading and writing, in Windows normally in the @@ -861,18 +908,18 @@ fil_node_open_file( node->handle = os_file_create(innodb_file_log_key, node->name, OS_FILE_OPEN, OS_FILE_AIO, OS_LOG_FILE, - &ret); + &ret, atomic_writes); } else if (node->is_raw_disk) { node->handle = os_file_create(innodb_file_data_key, node->name, OS_FILE_OPEN_RAW, OS_FILE_AIO, OS_DATA_FILE, - &ret); + &ret, atomic_writes); } else { node->handle = os_file_create(innodb_file_data_key, node->name, OS_FILE_OPEN, OS_FILE_AIO, OS_DATA_FILE, - &ret); + &ret, atomic_writes); } ut_a(ret); @@ -1932,12 +1979,12 @@ fil_write_lsn_and_arch_no_to_file( buf = static_cast(ut_align(buf1, UNIV_PAGE_SIZE)); err = fil_read(TRUE, space, 0, sum_of_sizes, 0, - UNIV_PAGE_SIZE, buf, NULL); + UNIV_PAGE_SIZE, buf, NULL, 0); if (err == DB_SUCCESS) { mach_write_to_8(buf + FIL_PAGE_FILE_FLUSH_LSN, lsn); err = fil_write(TRUE, space, 0, sum_of_sizes, 0, - UNIV_PAGE_SIZE, buf, NULL); + UNIV_PAGE_SIZE, buf, NULL, 0); } mem_free(buf1); @@ -3222,7 +3269,7 @@ fil_create_link_file( file = os_file_create_simple_no_error_handling( innodb_file_data_key, link_filepath, - OS_FILE_CREATE, OS_FILE_READ_WRITE, &success); + OS_FILE_CREATE, OS_FILE_READ_WRITE, &success, 0); if (!success) { /* The following call will print an error message */ @@ -3331,8 +3378,9 @@ fil_open_linked_file( /*===============*/ const char* tablename, /*!< in: database/tablename */ char** remote_filepath,/*!< out: remote filepath */ - os_file_t* remote_file) /*!< out: remote file handle */ - + os_file_t* remote_file, /*!< out: remote file handle */ + ulint atomic_writes) /*!< in: atomic writes table option + value */ { ibool success; @@ -3346,7 +3394,7 @@ 
fil_open_linked_file( *remote_file = os_file_create_simple_no_error_handling( innodb_file_data_key, *remote_filepath, OS_FILE_OPEN, OS_FILE_READ_ONLY, - &success); + &success, atomic_writes); if (!success) { char* link_filepath = fil_make_isl_name(tablename); @@ -3401,6 +3449,7 @@ fil_create_new_single_table_tablespace( /* TRUE if a table is created with CREATE TEMPORARY TABLE */ bool is_temp = !!(flags2 & DICT_TF2_TEMPORARY); bool has_data_dir = FSP_FLAGS_HAS_DATA_DIR(flags); + bool atomic_writes = FSP_FLAGS_GET_ATOMIC_WRITES(flags); ut_a(space_id > 0); ut_ad(!srv_read_only_mode); @@ -3433,7 +3482,8 @@ fil_create_new_single_table_tablespace( OS_FILE_CREATE | OS_FILE_ON_ERROR_NO_EXIT, OS_FILE_NORMAL, OS_DATA_FILE, - &ret); + &ret, + atomic_writes); if (ret == FALSE) { /* The following call will print an error message */ @@ -3498,6 +3548,7 @@ fil_create_new_single_table_tablespace( flags = fsp_flags_set_page_size(flags, UNIV_PAGE_SIZE); fsp_header_init_fields(page, space_id, flags); mach_write_to_4(page + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, space_id); + ut_ad(fsp_flags_is_valid(flags)); if (!(fsp_flags_is_compressed(flags))) { buf_flush_init_for_writing(page, NULL, 0); @@ -3685,6 +3736,7 @@ fil_open_single_table_tablespace( fsp_open_info remote; ulint tablespaces_found = 0; ulint valid_tablespaces_found = 0; + ibool atomic_writes = FALSE; #ifdef UNIV_SYNC_DEBUG ut_ad(!fix_dict || rw_lock_own(&dict_operation_lock, RW_LOCK_EX)); @@ -3719,7 +3771,7 @@ fil_open_single_table_tablespace( } link_file_found = fil_open_linked_file( - tablename, &remote.filepath, &remote.file); + tablename, &remote.filepath, &remote.file, atomic_writes); remote.success = link_file_found; if (remote.success) { /* possibility of multiple files. 
*/ @@ -3747,7 +3799,7 @@ fil_open_single_table_tablespace( if (dict.filepath) { dict.file = os_file_create_simple_no_error_handling( innodb_file_data_key, dict.filepath, OS_FILE_OPEN, - OS_FILE_READ_ONLY, &dict.success); + OS_FILE_READ_ONLY, &dict.success, atomic_writes); if (dict.success) { /* possibility of multiple files. */ validate = true; @@ -3759,7 +3811,7 @@ fil_open_single_table_tablespace( ut_a(def.filepath); def.file = os_file_create_simple_no_error_handling( innodb_file_data_key, def.filepath, OS_FILE_OPEN, - OS_FILE_READ_ONLY, &def.success); + OS_FILE_READ_ONLY, &def.success, atomic_writes); if (def.success) { tablespaces_found++; } @@ -4155,7 +4207,7 @@ fil_load_single_table_tablespace( /* Check for a link file which locates a remote tablespace. */ remote.success = fil_open_linked_file( - tablename, &remote.filepath, &remote.file); + tablename, &remote.filepath, &remote.file, FALSE); /* Read the first page of the remote tablespace */ if (remote.success) { @@ -4170,7 +4222,7 @@ fil_load_single_table_tablespace( /* Try to open the tablespace in the datadir. 
*/ def.file = os_file_create_simple_no_error_handling( innodb_file_data_key, def.filepath, OS_FILE_OPEN, - OS_FILE_READ_ONLY, &def.success); + OS_FILE_READ_ONLY, &def.success, FALSE); /* Read the first page of the remote tablespace */ if (def.success) { @@ -4938,7 +4990,6 @@ fil_extend_space_to_desired_size( #ifdef HAVE_POSIX_FALLOCATE if (srv_use_posix_fallocate) { - mutex_exit(&fil_system->mutex); success = os_file_set_size(node->name, node->handle, (size_after_extend - file_start_page_no) * page_size); @@ -4975,7 +5026,7 @@ fil_extend_space_to_desired_size( success = os_aio(OS_FILE_WRITE, OS_AIO_SYNC, node->name, node->handle, buf, offset, page_size * n_pages, - NULL, NULL, space_id, NULL); + NULL, NULL, space_id, NULL, 0, 0, 0); #endif /* UNIV_HOTBACKUP */ if (success) { os_has_said_disk_full = FALSE; @@ -5361,7 +5412,12 @@ _fil_io( or from where to write; in aio this must be appropriately aligned */ void* message, /*!< in: message for aio handler if non-sync - aio used, else ignored */ + aio used, else ignored */ + ulint* write_size, /*!< in/out: Actual write size initialized + after fist successfull trim + operation for this page and if + initialized we do not trim again if + actual page size does not decrease. 
*/ trx_t* trx) { ulint mode; @@ -5372,6 +5428,8 @@ _fil_io( ulint wake_later; os_offset_t offset; ibool ignore_nonexistent_pages; + ibool page_compressed = FALSE; + ulint page_compression_level = 0; is_log = type & OS_FILE_LOG; type = type & ~OS_FILE_LOG; @@ -5425,6 +5483,9 @@ _fil_io( } else if (type == OS_FILE_WRITE) { ut_ad(!srv_read_only_mode); srv_stats.data_written.add(len); + if (fil_page_is_index_page((byte *)buf)) { + srv_stats.index_pages_written.inc(); + } } /* Reserve the fil_system mutex and make sure that we can open at @@ -5434,6 +5495,8 @@ _fil_io( space = fil_space_get_by_id(space_id); + page_compressed = space ? fsp_flags_is_page_compressed(space->flags) : FALSE; /* space may be NULL here; it is only checked below */ + page_compression_level = space ? fsp_flags_get_page_compression_level(space->flags) : 0; /* If we are deleting a tablespace we don't allow any read operations on that. However, we do allow write operations. */ if (space == 0 || (type == OS_FILE_READ && space->stop_new_ops)) { @@ -5579,7 +5642,8 @@ _fil_io( /* Queue the aio request */ ret = os_aio(type, mode | wake_later, node->name, node->handle, buf, - offset, len, node, message, space_id, trx); + offset, len, node, message, space_id, trx, + page_compressed, page_compression_level, write_size); #else /* In ibbackup do normal i/o, not aio */ @@ -6214,7 +6278,7 @@ fil_tablespace_iterate( file = os_file_create_simple_no_error_handling( innodb_file_data_key, filepath, - OS_FILE_OPEN, OS_FILE_READ_WRITE, &success); + OS_FILE_OPEN, OS_FILE_READ_WRITE, &success, FALSE); DBUG_EXECUTE_IF("fil_tablespace_iterate_failure", { @@ -6501,3 +6565,33 @@ fil_space_set_corrupt( mutex_exit(&fil_system->mutex); } + +/****************************************************************//** +Acquire fil_system mutex */ +void +fil_system_enter(void) +/*==================*/ +{ + ut_ad(!mutex_own(&fil_system->mutex)); + mutex_enter(&fil_system->mutex); +} + +/****************************************************************//** +Release fil_system mutex */ +void +fil_system_exit(void) 
+/*=================*/ +{ + ut_ad(mutex_own(&fil_system->mutex)); + mutex_exit(&fil_system->mutex); +} + +/*******************************************************************//** +Return space name */ +char* +fil_space_name( +/*===========*/ + fil_space_t* space) /*!< in: space */ +{ + return (space->name); +} diff --git a/storage/xtradb/fil/fil0pagecompress.cc b/storage/xtradb/fil/fil0pagecompress.cc new file mode 100644 index 0000000000000..10ac273955f02 --- /dev/null +++ b/storage/xtradb/fil/fil0pagecompress.cc @@ -0,0 +1,324 @@ +/***************************************************************************** + +Copyright (C) 2013, 2014, SkySQL Ab. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + +*****************************************************************************/ + +/******************************************************************//** +@file fil/fil0pagecompress.cc +Implementation for page compressed file spaces. 
+ +Created 11/12/2013 Jan Lindström jan.lindstrom@skysql.com +***********************************************************************/ + +#include "fil0fil.h" +#include "fil0pagecompress.h" + +#include +#include + +#include "mem0mem.h" +#include "hash0hash.h" +#include "os0file.h" +#include "mach0data.h" +#include "buf0buf.h" +#include "buf0flu.h" +#include "log0recv.h" +#include "fsp0fsp.h" +#include "srv0srv.h" +#include "srv0start.h" +#include "mtr0mtr.h" +#include "mtr0log.h" +#include "dict0dict.h" +#include "page0page.h" +#include "page0zip.h" +#include "trx0sys.h" +#include "row0mysql.h" +#ifndef UNIV_HOTBACKUP +# include "buf0lru.h" +# include "ibuf0ibuf.h" +# include "sync0sync.h" +# include "os0sync.h" +#else /* !UNIV_HOTBACKUP */ +# include "srv0srv.h" +static ulint srv_data_read, srv_data_written; +#endif /* !UNIV_HOTBACKUP */ +#include "zlib.h" +#ifdef __linux__ +#include +#include +#include +#include +#endif +#include "row0mysql.h" +#include "lz4.h" + +/****************************************************************//** +For page compressed pages compress the page before actual write +operation. +@return compressed page to be written*/ +byte* +fil_compress_page( +/*==============*/ + ulint space_id, /*!< in: tablespace id of the + table. 
*/ + byte* buf, /*!< in: buffer from which to write; in aio + this must be appropriately aligned */ + byte* out_buf, /*!< out: compressed buffer */ + ulint len, /*!< in: length of input buffer.*/ + ulint compression_level, /* in: compression level */ + ulint* out_len) /*!< out: actual length of compressed page */ +{ + int err = Z_OK; + int level = 0; + ulint header_len = FIL_PAGE_DATA + FIL_PAGE_COMPRESSED_SIZE; + ulint write_size=0; + + ut_ad(buf); + ut_ad(out_buf); + ut_ad(len); + ut_ad(out_len); + + level = compression_level; + ut_ad(fil_space_is_page_compressed(space_id)); + + fil_system_enter(); + fil_space_t* space = fil_space_get_by_id(space_id); + fil_system_exit(); + + /* If no compression level was provided to this table, use system + default level */ + if (level == 0) { + level = page_zip_level; + } + +#ifdef UNIV_DEBUG + fprintf(stderr, + "InnoDB: Note: Preparing for compress for space %lu name %s len %lu\n", + space_id, fil_space_name(space), len); +#endif + + write_size = UNIV_PAGE_SIZE - header_len; + + if (srv_use_lz4) { + err = LZ4_compress_limitedOutput((const char *)buf, (char *)out_buf+header_len, len, write_size); + write_size = err; + + if (err == 0) { + /* If error we leave the actual page as it was */ + + fprintf(stderr, + "InnoDB: Warning: Compression failed for space %lu name %s len %lu rt %d write %lu\n", + space_id, fil_space_name(space), len, err, write_size); + + *out_len = len; + return (buf); + } + } else { + err = compress2(out_buf+header_len, &write_size, buf, len, level); + + if (err != Z_OK) { + /* If error we leave the actual page as it was */ + + fprintf(stderr, + "InnoDB: Warning: Compression failed for space %lu name %s len %lu rt %d write %lu\n", + space_id, fil_space_name(space), len, err, write_size); + + *out_len = len; + return (buf); + } + } + + /* Set up the page header */ + memcpy(out_buf, buf, FIL_PAGE_DATA); + /* Set up the checksum */ + mach_write_to_4(out_buf+FIL_PAGE_SPACE_OR_CHKSUM, BUF_NO_CHECKSUM_MAGIC); + /* 
Set up the correct page type */ + mach_write_to_2(out_buf+FIL_PAGE_TYPE, FIL_PAGE_PAGE_COMPRESSED); + /* Set up the flush lsn to be compression algorithm */ + if (srv_use_lz4) { + mach_write_to_8(out_buf+FIL_PAGE_FILE_FLUSH_LSN, FIL_PAGE_COMPRESSION_LZ4); + } else { + mach_write_to_8(out_buf+FIL_PAGE_FILE_FLUSH_LSN, FIL_PAGE_COMPRESSION_ZLIB); + } + /* Set up the actual payload lenght */ + mach_write_to_2(out_buf+FIL_PAGE_DATA, write_size); + +#ifdef UNIV_DEBUG + /* Verify */ + ut_ad(fil_page_is_compressed(out_buf)); + ut_ad(mach_read_from_4(out_buf+FIL_PAGE_SPACE_OR_CHKSUM) == BUF_NO_CHECKSUM_MAGIC); + ut_ad(mach_read_from_2(out_buf+FIL_PAGE_DATA) == write_size); + if (srv_use_lz4) { + ut_ad(mach_read_from_8(out_buf+FIL_PAGE_FILE_FLUSH_LSN) == FIL_PAGE_COMPRESSION_LZ4); + } else { + ut_ad(mach_read_from_8(out_buf+FIL_PAGE_FILE_FLUSH_LSN) == FIL_PAGE_COMPRESSION_ZLIB); + } +#endif + + write_size+=header_len; + /* Actual write needs to be alligned on block size */ + if (write_size % OS_FILE_LOG_BLOCK_SIZE) { + write_size = (write_size + (OS_FILE_LOG_BLOCK_SIZE - (write_size % OS_FILE_LOG_BLOCK_SIZE))); + } + +#ifdef UNIV_DEBUG + fprintf(stderr, + "InnoDB: Note: Compression succeeded for space %lu name %s len %lu out_len %lu\n", + space_id, fil_space_name(space), len, write_size); +#endif + +#define SECT_SIZE 512 + + srv_stats.page_compression_saved.add((len - write_size)); + if ((len - write_size) > 0) { + srv_stats.page_compression_trim_sect512.add(((len - write_size) / SECT_SIZE)); + srv_stats.page_compression_trim_sect4096.add(((len - write_size) / (SECT_SIZE*8))); + } + //srv_stats.page_compressed_trim_op.inc(); + srv_stats.pages_page_compressed.inc(); + *out_len = write_size; + + return(out_buf); + +} + +/****************************************************************//** +For page compressed pages decompress the page after actual read +operation. 
*/ +void +fil_decompress_page( +/*================*/ + byte* page_buf, /*!< in: preallocated buffer or NULL */ + byte* buf, /*!< out: buffer from which to read; in aio + this must be appropriately aligned */ + ulint len) /*!< in: length of output buffer.*/ +{ + int err = 0; + ulint actual_size = 0; + ulint compression_alg = 0; + byte *in_buf; + + ut_ad(buf); + ut_ad(len); + + /* Before actual decompress, make sure that page type is correct */ + + if (mach_read_from_4(buf+FIL_PAGE_SPACE_OR_CHKSUM) != BUF_NO_CHECKSUM_MAGIC || + mach_read_from_2(buf+FIL_PAGE_TYPE) != FIL_PAGE_PAGE_COMPRESSED) { + fprintf(stderr, + "InnoDB: Corruption: We try to uncompress corrupted page\n" + "InnoDB: CRC %lu type %lu.\n" + "InnoDB: len %lu\n", + mach_read_from_4(buf+FIL_PAGE_SPACE_OR_CHKSUM), + mach_read_from_2(buf+FIL_PAGE_TYPE), len); + + fflush(stderr); + ut_error; + } + + /* Get compression algorithm */ + compression_alg = mach_read_from_8(buf+FIL_PAGE_FILE_FLUSH_LSN); + + // If no buffer was given, we need to allocate temporal buffer + if (page_buf == NULL) { +#ifdef UNIV_DEBUG + fprintf(stderr, + "InnoDB: Note: Compression buffer not given, allocating...\n"); +#endif + in_buf = static_cast(ut_malloc(UNIV_PAGE_SIZE)); + } else { + in_buf = page_buf; + } + + /* Get the actual size of compressed page */ + actual_size = mach_read_from_2(buf+FIL_PAGE_DATA); + /* Check if payload size is corrupted */ + if (actual_size == 0 || actual_size > UNIV_PAGE_SIZE) { + fprintf(stderr, + "InnoDB: Corruption: We try to uncompress corrupted page\n" + "InnoDB: actual size %lu compression %s\n", + actual_size, fil_get_compression_alg_name(compression_alg)); + fflush(stderr); + ut_error; + } + + if (compression_alg == FIL_PAGE_COMPRESSION_ZLIB) { + +#ifdef UNIV_DEBUG + fprintf(stderr, + "InnoDB: Note: Preparing for decompress for len %lu\n", + actual_size); +#endif + + err= uncompress(in_buf, &len, buf+FIL_PAGE_DATA+FIL_PAGE_COMPRESSED_SIZE, (unsigned long)actual_size); + + + /* If uncompress fails 
it means that page is corrupted */ + if (err != Z_OK) { + + fprintf(stderr, + "InnoDB: Corruption: Page is marked as compressed\n" + "InnoDB: but uncompress failed with error %d.\n" + "InnoDB: size %lu len %lu\n", + err, actual_size, len); + + fflush(stderr); + + ut_error; + } + +#ifdef UNIV_DEBUG + fprintf(stderr, + "InnoDB: Note: Decompression succeeded for len %lu \n", + len); +#endif + } else if (compression_alg == FIL_PAGE_COMPRESSION_LZ4) { + err = LZ4_decompress_fast((const char *)buf+FIL_PAGE_DATA+FIL_PAGE_COMPRESSED_SIZE, (char *)in_buf, UNIV_PAGE_SIZE); + + if (err != actual_size) { + fprintf(stderr, + "InnoDB: Corruption: Page is marked as compressed\n" + "InnoDB: but decompression read only %d bytes.\n" + "InnoDB: size %lu len %lu\n", + err, actual_size, len); + fflush(stderr); + + ut_error; + } + } else { + fprintf(stderr, + "InnoDB: Corruption: Page is marked as compressed\n" + "InnoDB: but compression algorithm %s\n" + "InnoDB: is not known.\n" + ,fil_get_compression_alg_name(compression_alg)); + + fflush(stderr); + ut_error; + } + + srv_stats.pages_page_decompressed.inc(); + + /* Copy the uncompressed page to the buffer pool, not + really any other options. */ + memcpy(buf, in_buf, len); + + // Need to free temporal buffer if no buffer was given + if (page_buf == NULL) { + ut_free(in_buf); + } +} + + diff --git a/storage/xtradb/fil/lz4.c b/storage/xtradb/fil/lz4.c new file mode 100644 index 0000000000000..4e864de67d32d --- /dev/null +++ b/storage/xtradb/fil/lz4.c @@ -0,0 +1,822 @@ +/* + LZ4 - Fast LZ compression algorithm + Copyright (C) 2011-2013, Yann Collet. + BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following disclaimer + in the documentation and/or other materials provided with the + distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + You can contact the author at : + - LZ4 source repository : http://code.google.com/p/lz4/ + - LZ4 public forum : https://groups.google.com/forum/#!forum/lz4c +*/ + +//************************************** +// Tuning parameters +//************************************** +// MEMORY_USAGE : +// Memory usage formula : N->2^N Bytes (examples : 10 -> 1KB; 12 -> 4KB ; 16 -> 64KB; 20 -> 1MB; etc.) +// Increasing memory usage improves compression ratio +// Reduced memory usage can improve speed, due to cache effect +// Default value is 14, for 16KB, which nicely fits into Intel x86 L1 cache +#define MEMORY_USAGE 14 + +// HEAPMODE : +// Select how default compression functions will allocate memory for their hash table, +// in memory stack (0:default, fastest), or in memory heap (1:requires memory allocation (malloc)). +#define HEAPMODE 0 + + +//************************************** +// CPU Feature Detection +//************************************** +// 32 or 64 bits ? 
+#if (defined(__x86_64__) || defined(_M_X64) || defined(_WIN64) \ + || defined(__powerpc64__) || defined(__ppc64__) || defined(__PPC64__) \ + || defined(__64BIT__) || defined(_LP64) || defined(__LP64__) \ + || defined(__ia64) || defined(__itanium__) || defined(_M_IA64) ) // Detects 64 bits mode +# define LZ4_ARCH64 1 +#else +# define LZ4_ARCH64 0 +#endif + +// Little Endian or Big Endian ? +// Overwrite the #define below if you know your architecture endianess +#if defined (__GLIBC__) +# include +# if (__BYTE_ORDER == __BIG_ENDIAN) +# define LZ4_BIG_ENDIAN 1 +# endif +#elif (defined(__BIG_ENDIAN__) || defined(__BIG_ENDIAN) || defined(_BIG_ENDIAN)) && !(defined(__LITTLE_ENDIAN__) || defined(__LITTLE_ENDIAN) || defined(_LITTLE_ENDIAN)) +# define LZ4_BIG_ENDIAN 1 +#elif defined(__sparc) || defined(__sparc__) \ + || defined(__powerpc__) || defined(__ppc__) || defined(__PPC__) \ + || defined(__hpux) || defined(__hppa) \ + || defined(_MIPSEB) || defined(__s390__) +# define LZ4_BIG_ENDIAN 1 +#else +// Little Endian assumed. PDP Endian and other very rare endian format are unsupported. +#endif + +// Unaligned memory access is automatically enabled for "common" CPU, such as x86. +// For others CPU, such as ARM, the compiler may be more cautious, inserting unnecessary extra code to ensure aligned access property +// If you know your target CPU supports unaligned memory access, you want to force this option manually to improve performance +#if defined(__ARM_FEATURE_UNALIGNED) +# define LZ4_FORCE_UNALIGNED_ACCESS 1 +#endif + +// Define this parameter if your target system or compiler does not support hardware bit count +#if defined(_MSC_VER) && defined(_WIN32_WCE) // Visual Studio for Windows CE does not support Hardware bit count +# define LZ4_FORCE_SW_BITCOUNT +#endif + +// BIG_ENDIAN_NATIVE_BUT_INCOMPATIBLE : +// This option may provide a small boost to performance for some big endian cpu, although probably modest. 
+// You may set this option to 1 if data will remain within closed environment. +// This option is useless on Little_Endian CPU (such as x86) +//#define BIG_ENDIAN_NATIVE_BUT_INCOMPATIBLE 1 + + +//************************************** +// Compiler Options +//************************************** +#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) // C99 +/* "restrict" is a known keyword */ +#else +# define restrict // Disable restrict +#endif + +#ifdef _MSC_VER // Visual Studio +# define FORCE_INLINE static __forceinline +# include // For Visual 2005 +# if LZ4_ARCH64 // 64-bits +# pragma intrinsic(_BitScanForward64) // For Visual 2005 +# pragma intrinsic(_BitScanReverse64) // For Visual 2005 +# else // 32-bits +# pragma intrinsic(_BitScanForward) // For Visual 2005 +# pragma intrinsic(_BitScanReverse) // For Visual 2005 +# endif +# pragma warning(disable : 4127) // disable: C4127: conditional expression is constant +#else +# ifdef __GNUC__ +# define FORCE_INLINE static inline __attribute__((always_inline)) +# else +# define FORCE_INLINE static inline +# endif +#endif + +#ifdef _MSC_VER +# define lz4_bswap16(x) _byteswap_ushort(x) +#else +# define lz4_bswap16(x) ((unsigned short int) ((((x) >> 8) & 0xffu) | (((x) & 0xffu) << 8))) +#endif + +#define GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__) + +#if (GCC_VERSION >= 302) || (__INTEL_COMPILER >= 800) || defined(__clang__) +# define expect(expr,value) (__builtin_expect ((expr),(value)) ) +#else +# define expect(expr,value) (expr) +#endif + +#define likely(expr) expect((expr) != 0, 1) +#define unlikely(expr) expect((expr) != 0, 0) + + +//************************************** +// Memory routines +//************************************** +#include // malloc, calloc, free +#define ALLOCATOR(n,s) calloc(n,s) +#define FREEMEM free +#include // memset, memcpy +#define MEM_INIT memset + + +//************************************** +// Includes +//************************************** +#include "lz4.h" + + 
+//************************************** +// Basic Types +//************************************** +#if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L // C99 +# include + typedef uint8_t BYTE; + typedef uint16_t U16; + typedef uint32_t U32; + typedef int32_t S32; + typedef uint64_t U64; +#else + typedef unsigned char BYTE; + typedef unsigned short U16; + typedef unsigned int U32; + typedef signed int S32; + typedef unsigned long long U64; +#endif + +#if defined(__GNUC__) && !defined(LZ4_FORCE_UNALIGNED_ACCESS) +# define _PACKED __attribute__ ((packed)) +#else +# define _PACKED +#endif + +#if !defined(LZ4_FORCE_UNALIGNED_ACCESS) && !defined(__GNUC__) +# if defined(__IBMC__) || defined(__SUNPRO_C) || defined(__SUNPRO_CC) +# pragma pack(1) +# else +# pragma pack(push, 1) +# endif +#endif + +typedef struct { U16 v; } _PACKED U16_S; +typedef struct { U32 v; } _PACKED U32_S; +typedef struct { U64 v; } _PACKED U64_S; +typedef struct {size_t v;} _PACKED size_t_S; + +#if !defined(LZ4_FORCE_UNALIGNED_ACCESS) && !defined(__GNUC__) +# if defined(__SUNPRO_C) || defined(__SUNPRO_CC) +# pragma pack(0) +# else +# pragma pack(pop) +# endif +#endif + +#define A16(x) (((U16_S *)(x))->v) +#define A32(x) (((U32_S *)(x))->v) +#define A64(x) (((U64_S *)(x))->v) +#define AARCH(x) (((size_t_S *)(x))->v) + + +//************************************** +// Constants +//************************************** +#define LZ4_HASHLOG (MEMORY_USAGE-2) +#define HASHTABLESIZE (1 << MEMORY_USAGE) +#define HASHNBCELLS4 (1 << LZ4_HASHLOG) + +#define MINMATCH 4 + +#define COPYLENGTH 8 +#define LASTLITERALS 5 +#define MFLIMIT (COPYLENGTH+MINMATCH) +const int LZ4_minLength = (MFLIMIT+1); + +#define LZ4_64KLIMIT ((1<<16) + (MFLIMIT-1)) +#define SKIPSTRENGTH 6 // Increasing this value will make the compression run slower on incompressible data + +#define MAXD_LOG 16 +#define MAX_DISTANCE ((1 << MAXD_LOG) - 1) + +#define ML_BITS 4 +#define ML_MASK ((1U<=e; + + +//**************************** +// 
Private functions +//**************************** +#if LZ4_ARCH64 + +FORCE_INLINE int LZ4_NbCommonBytes (register U64 val) +{ +# if defined(LZ4_BIG_ENDIAN) +# if defined(_MSC_VER) && !defined(LZ4_FORCE_SW_BITCOUNT) + unsigned long r = 0; + _BitScanReverse64( &r, val ); + return (int)(r>>3); +# elif defined(__GNUC__) && (GCC_VERSION >= 304) && !defined(LZ4_FORCE_SW_BITCOUNT) + return (__builtin_clzll(val) >> 3); +# else + int r; + if (!(val>>32)) { r=4; } else { r=0; val>>=32; } + if (!(val>>16)) { r+=2; val>>=8; } else { val>>=24; } + r += (!val); + return r; +# endif +# else +# if defined(_MSC_VER) && !defined(LZ4_FORCE_SW_BITCOUNT) + unsigned long r = 0; + _BitScanForward64( &r, val ); + return (int)(r>>3); +# elif defined(__GNUC__) && (GCC_VERSION >= 304) && !defined(LZ4_FORCE_SW_BITCOUNT) + return (__builtin_ctzll(val) >> 3); +# else + static const int DeBruijnBytePos[64] = { 0, 0, 0, 0, 0, 1, 1, 2, 0, 3, 1, 3, 1, 4, 2, 7, 0, 2, 3, 6, 1, 5, 3, 5, 1, 3, 4, 4, 2, 5, 6, 7, 7, 0, 1, 2, 3, 3, 4, 6, 2, 6, 5, 5, 3, 4, 5, 6, 7, 1, 2, 4, 6, 4, 4, 5, 7, 2, 6, 5, 7, 6, 7, 7 }; + return DeBruijnBytePos[((U64)((val & -(long long)val) * 0x0218A392CDABBD3FULL)) >> 58]; +# endif +# endif +} + +#else + +FORCE_INLINE int LZ4_NbCommonBytes (register U32 val) +{ +# if defined(LZ4_BIG_ENDIAN) +# if defined(_MSC_VER) && !defined(LZ4_FORCE_SW_BITCOUNT) + unsigned long r = 0; + _BitScanReverse( &r, val ); + return (int)(r>>3); +# elif defined(__GNUC__) && (GCC_VERSION >= 304) && !defined(LZ4_FORCE_SW_BITCOUNT) + return (__builtin_clz(val) >> 3); +# else + int r; + if (!(val>>16)) { r=2; val>>=8; } else { r=0; val>>=24; } + r += (!val); + return r; +# endif +# else +# if defined(_MSC_VER) && !defined(LZ4_FORCE_SW_BITCOUNT) + unsigned long r; + _BitScanForward( &r, val ); + return (int)(r>>3); +# elif defined(__GNUC__) && (GCC_VERSION >= 304) && !defined(LZ4_FORCE_SW_BITCOUNT) + return (__builtin_ctz(val) >> 3); +# else + static const int DeBruijnBytePos[32] = { 0, 0, 3, 0, 3, 1, 3, 0, 
3, 2, 2, 1, 3, 2, 0, 1, 3, 3, 1, 2, 2, 2, 2, 0, 3, 1, 2, 0, 1, 0, 1, 1 }; + return DeBruijnBytePos[((U32)((val & -(S32)val) * 0x077CB531U)) >> 27]; +# endif +# endif +} + +#endif + + +//**************************** +// Compression functions +//**************************** +FORCE_INLINE int LZ4_hashSequence(U32 sequence, tableType_t tableType) +{ + if (tableType == byU16) + return (((sequence) * 2654435761U) >> ((MINMATCH*8)-(LZ4_HASHLOG+1))); + else + return (((sequence) * 2654435761U) >> ((MINMATCH*8)-LZ4_HASHLOG)); +} + +FORCE_INLINE int LZ4_hashPosition(const BYTE* p, tableType_t tableType) { return LZ4_hashSequence(A32(p), tableType); } + +FORCE_INLINE void LZ4_putPositionOnHash(const BYTE* p, U32 h, void* tableBase, tableType_t tableType, const BYTE* srcBase) +{ + switch (tableType) + { + case byPtr: { const BYTE** hashTable = (const BYTE**) tableBase; hashTable[h] = p; break; } + case byU32: { U32* hashTable = (U32*) tableBase; hashTable[h] = (U32)(p-srcBase); break; } + case byU16: { U16* hashTable = (U16*) tableBase; hashTable[h] = (U16)(p-srcBase); break; } + } +} + +FORCE_INLINE void LZ4_putPosition(const BYTE* p, void* tableBase, tableType_t tableType, const BYTE* srcBase) +{ + U32 h = LZ4_hashPosition(p, tableType); + LZ4_putPositionOnHash(p, h, tableBase, tableType, srcBase); +} + +FORCE_INLINE const BYTE* LZ4_getPositionOnHash(U32 h, void* tableBase, tableType_t tableType, const BYTE* srcBase) +{ + if (tableType == byPtr) { const BYTE** hashTable = (const BYTE**) tableBase; return hashTable[h]; } + if (tableType == byU32) { U32* hashTable = (U32*) tableBase; return hashTable[h] + srcBase; } + { U16* hashTable = (U16*) tableBase; return hashTable[h] + srcBase; } // default, to ensure a return +} + +FORCE_INLINE const BYTE* LZ4_getPosition(const BYTE* p, void* tableBase, tableType_t tableType, const BYTE* srcBase) +{ + U32 h = LZ4_hashPosition(p, tableType); + return LZ4_getPositionOnHash(h, tableBase, tableType, srcBase); +} + + +FORCE_INLINE int 
LZ4_compress_generic( + void* ctx, + const char* source, + char* dest, + int inputSize, + int maxOutputSize, + + limitedOutput_directive limitedOutput, + tableType_t tableType, + prefix64k_directive prefix) +{ + const BYTE* ip = (const BYTE*) source; + const BYTE* const base = (prefix==withPrefix) ? ((LZ4_Data_Structure*)ctx)->base : (const BYTE*) source; + const BYTE* const lowLimit = ((prefix==withPrefix) ? ((LZ4_Data_Structure*)ctx)->bufferStart : (const BYTE*)source); + const BYTE* anchor = (const BYTE*) source; + const BYTE* const iend = ip + inputSize; + const BYTE* const mflimit = iend - MFLIMIT; + const BYTE* const matchlimit = iend - LASTLITERALS; + + BYTE* op = (BYTE*) dest; + BYTE* const oend = op + maxOutputSize; + + int length; + const int skipStrength = SKIPSTRENGTH; + U32 forwardH; + + // Init conditions + if ((U32)inputSize > (U32)LZ4_MAX_INPUT_SIZE) return 0; // Unsupported input size, too large (or negative) + if ((prefix==withPrefix) && (ip != ((LZ4_Data_Structure*)ctx)->nextBlock)) return 0; // must continue from end of previous block + if (prefix==withPrefix) ((LZ4_Data_Structure*)ctx)->nextBlock=iend; // do it now, due to potential early exit + if ((tableType == byU16) && (inputSize>=LZ4_64KLIMIT)) return 0; // Size too large (not within 64K limit) + if (inputSize> skipStrength; + ip = forwardIp; + forwardIp = ip + step; + + if unlikely(forwardIp > mflimit) { goto _last_literals; } + + forwardH = LZ4_hashPosition(forwardIp, tableType); + ref = LZ4_getPositionOnHash(h, ctx, tableType, base); + LZ4_putPositionOnHash(ip, h, ctx, tableType, base); + + } while ((ref + MAX_DISTANCE < ip) || (A32(ref) != A32(ip))); + + // Catch up + while ((ip>anchor) && (ref > lowLimit) && unlikely(ip[-1]==ref[-1])) { ip--; ref--; } + + // Encode Literal length + length = (int)(ip - anchor); + token = op++; + if ((limitedOutput) && unlikely(op + length + (2 + 1 + LASTLITERALS) + (length/255) > oend)) return 0; // Check output limit + if (length>=(int)RUN_MASK) + { + 
int len = length-RUN_MASK; + *token=(RUN_MASK<= 255 ; len-=255) *op++ = 255; + *op++ = (BYTE)len; + } + else *token = (BYTE)(length<>8) > oend)) return 0; // Check output limit + if (length>=(int)ML_MASK) + { + *token += ML_MASK; + length -= ML_MASK; + for (; length > 509 ; length-=510) { *op++ = 255; *op++ = 255; } + if (length >= 255) { length-=255; *op++ = 255; } + *op++ = (BYTE)length; + } + else *token += (BYTE)(length); + + // Test end of chunk + if (ip > mflimit) { anchor = ip; break; } + + // Fill table + LZ4_putPosition(ip-2, ctx, tableType, base); + + // Test next position + ref = LZ4_getPosition(ip, ctx, tableType, base); + LZ4_putPosition(ip, ctx, tableType, base); + if ((ref + MAX_DISTANCE >= ip) && (A32(ref) == A32(ip))) { token = op++; *token=0; goto _next_match; } + + // Prepare next loop + anchor = ip++; + forwardH = LZ4_hashPosition(ip, tableType); + } + +_last_literals: + // Encode Last Literals + { + int lastRun = (int)(iend - anchor); + if ((limitedOutput) && (((char*)op - dest) + lastRun + 1 + ((lastRun+255-RUN_MASK)/255) > (U32)maxOutputSize)) return 0; // Check output limit + if (lastRun>=(int)RUN_MASK) { *op++=(RUN_MASK<= 255 ; lastRun-=255) *op++ = 255; *op++ = (BYTE) lastRun; } + else *op++ = (BYTE)(lastRun<hashTable, 0, sizeof(lz4ds->hashTable)); + lz4ds->bufferStart = base; + lz4ds->base = base; + lz4ds->nextBlock = base; +} + + +void* LZ4_create (const char* inputBuffer) +{ + void* lz4ds = ALLOCATOR(1, sizeof(LZ4_Data_Structure)); + LZ4_init ((LZ4_Data_Structure*)lz4ds, (const BYTE*)inputBuffer); + return lz4ds; +} + + +int LZ4_free (void* LZ4_Data) +{ + FREEMEM(LZ4_Data); + return (0); +} + + +char* LZ4_slideInputBuffer (void* LZ4_Data) +{ + LZ4_Data_Structure* lz4ds = (LZ4_Data_Structure*)LZ4_Data; + size_t delta = lz4ds->nextBlock - (lz4ds->bufferStart + 64 KB); + + if ( (lz4ds->base - delta > lz4ds->base) // underflow control + || ((size_t)(lz4ds->nextBlock - lz4ds->base) > 0xE0000000) ) // close to 32-bits limit + { + size_t 
deltaLimit = (lz4ds->nextBlock - 64 KB) - lz4ds->base; + int nH; + + for (nH=0; nH < HASHNBCELLS4; nH++) + { + if ((size_t)(lz4ds->hashTable[nH]) < deltaLimit) lz4ds->hashTable[nH] = 0; + else lz4ds->hashTable[nH] -= (U32)deltaLimit; + } + memcpy((void*)(lz4ds->bufferStart), (const void*)(lz4ds->nextBlock - 64 KB), 64 KB); + lz4ds->base = lz4ds->bufferStart; + lz4ds->nextBlock = lz4ds->base + 64 KB; + } + else + { + memcpy((void*)(lz4ds->bufferStart), (const void*)(lz4ds->nextBlock - 64 KB), 64 KB); + lz4ds->nextBlock -= delta; + lz4ds->base -= delta; + } + + return (char*)(lz4ds->nextBlock); +} + + +//**************************** +// Decompression functions +//**************************** + +// This generic decompression function cover all use cases. +// It shall be instanciated several times, using different sets of directives +// Note that it is essential this generic function is really inlined, +// in order to remove useless branches during compilation optimisation. +FORCE_INLINE int LZ4_decompress_generic( + const char* source, + char* dest, + int inputSize, // + int outputSize, // If endOnInput==endOnInputSize, this value is the max size of Output Buffer. 
+ + int endOnInput, // endOnOutputSize, endOnInputSize + int prefix64k, // noPrefix, withPrefix + int partialDecoding, // full, partial + int targetOutputSize // only used if partialDecoding==partial + ) +{ + // Local Variables + const BYTE* restrict ip = (const BYTE*) source; + const BYTE* ref; + const BYTE* const iend = ip + inputSize; + + BYTE* op = (BYTE*) dest; + BYTE* const oend = op + outputSize; + BYTE* cpy; + BYTE* oexit = op + targetOutputSize; + + const size_t dec32table[] = {0, 3, 2, 3, 0, 0, 0, 0}; // static reduces speed for LZ4_decompress_safe() on GCC64 + static const size_t dec64table[] = {0, 0, 0, (size_t)-1, 0, 1, 2, 3}; + + + // Special cases + if ((partialDecoding) && (oexit> oend-MFLIMIT)) oexit = oend-MFLIMIT; // targetOutputSize too high => decode everything + if ((endOnInput) && unlikely(outputSize==0)) return ((inputSize==1) && (*ip==0)) ? 0 : -1; // Empty output buffer + if ((!endOnInput) && unlikely(outputSize==0)) return (*ip==0?1:-1); + + + // Main Loop + while (1) + { + unsigned token; + size_t length; + + // get runlength + token = *ip++; + if ((length=(token>>ML_BITS)) == RUN_MASK) + { + unsigned s=255; + while (((endOnInput)?ip(partialDecoding?oexit:oend-MFLIMIT)) || (ip+length>iend-(2+1+LASTLITERALS))) ) + || ((!endOnInput) && (cpy>oend-COPYLENGTH))) + { + if (partialDecoding) + { + if (cpy > oend) goto _output_error; // Error : write attempt beyond end of output buffer + if ((endOnInput) && (ip+length > iend)) goto _output_error; // Error : read attempt beyond end of input buffer + } + else + { + if ((!endOnInput) && (cpy != oend)) goto _output_error; // Error : block decoding must stop exactly there + if ((endOnInput) && ((ip+length != iend) || (cpy > oend))) goto _output_error; // Error : input must be consumed + } + memcpy(op, ip, length); + ip += length; + op += length; + break; // Necessarily EOF, due to parsing restrictions + } + LZ4_WILDCOPY(op, ip, cpy); ip -= (op-cpy); op = cpy; + + // get offset + 
LZ4_READ_LITTLEENDIAN_16(ref,cpy,ip); ip+=2; + if ((prefix64k==noPrefix) && unlikely(ref < (BYTE* const)dest)) goto _output_error; // Error : offset outside destination buffer + + // get matchlength + if ((length=(token&ML_MASK)) == ML_MASK) + { + while ((!endOnInput) || (ipoend-COPYLENGTH-(STEPSIZE-4)) + { + if (cpy > oend-LASTLITERALS) goto _output_error; // Error : last 5 bytes must be literals + LZ4_SECURECOPY(op, ref, (oend-COPYLENGTH)); + while(op (unsigned int)LZ4_MAX_INPUT_SIZE ? 0 : (isize) + ((isize)/255) + 16) +static inline int LZ4_compressBound(int isize) { return LZ4_COMPRESSBOUND(isize); } + +/* +LZ4_compressBound() : + Provides the maximum size that LZ4 may output in a "worst case" scenario (input data not compressible) + primarily useful for memory allocation of output buffer. + inline function is recommended for the general case, + macro is also provided when result needs to be evaluated at compilation (such as stack memory allocation). + + isize : is the input size. Max supported value is LZ4_MAX_INPUT_SIZE + return : maximum output size in a "worst case" scenario + or 0, if input size is too large ( > LZ4_MAX_INPUT_SIZE) +*/ + + +int LZ4_compress_limitedOutput (const char* source, char* dest, int inputSize, int maxOutputSize); + +/* +LZ4_compress_limitedOutput() : + Compress 'inputSize' bytes from 'source' into an output buffer 'dest' of maximum size 'maxOutputSize'. + If it cannot achieve it, compression will stop, and result of the function will be zero. + This function never writes outside of provided output buffer. 
+ + inputSize : Max supported value is LZ4_MAX_INPUT_VALUE + maxOutputSize : is the size of the destination buffer (which must be already allocated) + return : the number of bytes written in buffer 'dest' + or 0 if the compression fails +*/ + + +int LZ4_decompress_fast (const char* source, char* dest, int outputSize); + +/* +LZ4_decompress_fast() : + outputSize : is the original (uncompressed) size + return : the number of bytes read from the source buffer (in other words, the compressed size) + If the source stream is malformed, the function will stop decoding and return a negative result. + note : This function is a bit faster than LZ4_decompress_safe() + This function never writes outside of output buffers, but may read beyond input buffer in case of malicious data packet. + Use this function preferably into a trusted environment (data to decode comes from a trusted source). + Destination buffer must be already allocated. Its size must be a minimum of 'outputSize' bytes. +*/ + +int LZ4_decompress_safe_partial (const char* source, char* dest, int inputSize, int targetOutputSize, int maxOutputSize); + +/* +LZ4_decompress_safe_partial() : + This function decompress a compressed block of size 'inputSize' at position 'source' + into output buffer 'dest' of size 'maxOutputSize'. + The function tries to stop decompressing operation as soon as 'targetOutputSize' has been reached, + reducing decompression time. + return : the number of bytes decoded in the destination buffer (necessarily <= maxOutputSize) + Note : this number can be < 'targetOutputSize' should the compressed block to decode be smaller. + Always control how many bytes were decoded. + If the source stream is detected malformed, the function will stop decoding and return a negative result. + This function never writes outside of output buffer, and never reads outside of input buffer. 
It is therefore protected against malicious data packets +*/ + + +//**************************** +// Stream Functions +//**************************** + +void* LZ4_create (const char* inputBuffer); +int LZ4_compress_continue (void* LZ4_Data, const char* source, char* dest, int inputSize); +int LZ4_compress_limitedOutput_continue (void* LZ4_Data, const char* source, char* dest, int inputSize, int maxOutputSize); +char* LZ4_slideInputBuffer (void* LZ4_Data); +int LZ4_free (void* LZ4_Data); + +/* +These functions allow the compression of dependent blocks, where each block benefits from prior 64 KB within preceding blocks. +In order to achieve this, it is necessary to start creating the LZ4 Data Structure, thanks to the function : + +void* LZ4_create (const char* inputBuffer); +The result of the function is the (void*) pointer on the LZ4 Data Structure. +This pointer will be needed in all other functions. +If the pointer returned is NULL, then the allocation has failed, and compression must be aborted. +The only parameter 'const char* inputBuffer' must, obviously, point at the beginning of input buffer. +The input buffer must be already allocated, and size at least 192KB. +'inputBuffer' will also be the 'const char* source' of the first block. + +All blocks are expected to lay next to each other within the input buffer, starting from 'inputBuffer'. +To compress each block, use either LZ4_compress_continue() or LZ4_compress_limitedOutput_continue(). +Their behavior are identical to LZ4_compress() or LZ4_compress_limitedOutput(), +but require the LZ4 Data Structure as their first argument, and check that each block starts right after the previous one. +If next block does not begin immediately after the previous one, the compression will fail (return 0). + +When it's no longer possible to lay the next block after the previous one (not enough space left into input buffer), a call to : +char* LZ4_slideInputBuffer(void* LZ4_Data); +must be performed. 
It will typically copy the latest 64KB of input at the beginning of input buffer. +Note that, for this function to work properly, minimum size of an input buffer must be 192KB. +==> The memory position where the next input data block must start is provided as the result of the function. + +Compression can then resume, using LZ4_compress_continue() or LZ4_compress_limitedOutput_continue(), as usual. + +When compression is completed, a call to LZ4_free() will release the memory used by the LZ4 Data Structure. +*/ + + +int LZ4_decompress_safe_withPrefix64k (const char* source, char* dest, int inputSize, int maxOutputSize); +int LZ4_decompress_fast_withPrefix64k (const char* source, char* dest, int outputSize); + +/* +*_withPrefix64k() : + These decoding functions work the same as their "normal name" versions, + but can use up to 64KB of data in front of 'char* dest'. + These functions are necessary to decode inter-dependant blocks. +*/ + + +//**************************** +// Obsolete Functions +//**************************** + +static inline int LZ4_uncompress (const char* source, char* dest, int outputSize) { return LZ4_decompress_fast(source, dest, outputSize); } +static inline int LZ4_uncompress_unknownOutputSize (const char* source, char* dest, int isize, int maxOutputSize) { return LZ4_decompress_safe(source, dest, isize, maxOutputSize); } + +/* +These functions are deprecated and should no longer be used. +They are provided here for compatibility with existing user programs. +*/ + + + +#if defined (__cplusplus) +} +#endif diff --git a/storage/xtradb/handler/ha_innodb.cc b/storage/xtradb/handler/ha_innodb.cc index 43cfa23a99f4d..ead0b0fc9021d 100644 --- a/storage/xtradb/handler/ha_innodb.cc +++ b/storage/xtradb/handler/ha_innodb.cc @@ -4,6 +4,7 @@ Copyright (c) 2000, 2013, Oracle and/or its affiliates. All rights reserved. Copyright (c) 2008, 2009 Google Inc. Copyright (c) 2009, Percona Inc. Copyright (c) 2012, Facebook Inc. +Copyright (c) 2013, 2014, SkySQL Ab. 
Portions of this file contain modifications contributed and copyrighted by Google, Inc. Those modifications are gratefully acknowledged and are described @@ -558,6 +559,27 @@ ib_cb_t innodb_api_cb[] = { (ib_cb_t) ib_cfg_bk_commit_interval }; +/** + List of InnoDB specific CREATE TABLE options (table options). + The values are read back through the table share's option_struct, + whose type needs to be called ha_table_option_struct. + + The option values can be specified in the CREATE TABLE at the end: + CREATE TABLE ( ... ) *here* +*/ + +ha_create_table_option innodb_table_option_list[]= +{ + /* With this option the user can enable the page compression + feature for the table */ + HA_TOPTION_BOOL("PAGE_COMPRESSED", page_compressed, 0), + /* With this option the user can set the zip compression level + used for page compression of this table. ULINT_UNDEFINED means + that no level was given. */ + HA_TOPTION_NUMBER("PAGE_COMPRESSION_LEVEL", page_compression_level, ULINT_UNDEFINED, 0, 9, 1), + /* With this option the user can enable the atomic writes feature + for this table */ + HA_TOPTION_ENUM("ATOMIC_WRITES", atomic_writes, "DEFAULT,ON,OFF", 0), + HA_TOPTION_END +}; + /*************************************************************//** Check whether valid argument given to innodb_ft_*_stopword_table. This function is registered as a callback with MySQL.
@@ -873,6 +895,25 @@ static SHOW_VAR innodb_status_variables[]= { (char*) &export_vars.innodb_x_lock_spin_rounds, SHOW_LONGLONG}, {"x_lock_spin_waits", (char*) &export_vars.innodb_x_lock_spin_waits, SHOW_LONGLONG}, + + /* Status variables for page compression */ + {"page_compression_saved", + (char*) &export_vars.innodb_page_compression_saved, SHOW_LONGLONG}, + {"page_compression_trim_sect512", + (char*) &export_vars.innodb_page_compression_trim_sect512, SHOW_LONGLONG}, + {"page_compression_trim_sect4096", + (char*) &export_vars.innodb_page_compression_trim_sect4096, SHOW_LONGLONG}, + {"num_index_pages_written", + (char*) &export_vars.innodb_index_pages_written, SHOW_LONGLONG}, + {"num_pages_page_compressed", + (char*) &export_vars.innodb_pages_page_compressed, SHOW_LONGLONG}, + {"num_page_compressed_trim_op", + (char*) &export_vars.innodb_page_compressed_trim_op, SHOW_LONGLONG}, + {"num_page_compressed_trim_op_saved", + (char*) &export_vars.innodb_page_compressed_trim_op_saved, SHOW_LONGLONG}, + {"num_pages_page_decompressed", + (char*) &export_vars.innodb_pages_page_decompressed, SHOW_LONGLONG}, + {NullS, NullS, SHOW_LONG} }; @@ -3156,6 +3197,8 @@ innobase_init( if (srv_file_per_table) innobase_hton->tablefile_extensions = ha_innobase_exts; + innobase_hton->table_options = innodb_table_option_list; + ut_a(DATA_MYSQL_TRUE_VARCHAR == (ulint)MYSQL_TYPE_VARCHAR); #ifndef DBUG_OFF @@ -10010,11 +10053,16 @@ innobase_table_flags( enum row_type row_format; rec_format_t innodb_row_format = REC_FORMAT_COMPACT; bool use_data_dir; + ha_table_option_struct *options= form->s->option_struct; /* Cache the value of innodb_file_format, in case it is modified by another thread while the table is being created. */ const ulint file_format_allowed = srv_file_format; + /* Cache the value of innobase_compression_level, in case it is + modified by another thread while the table is being created. 
*/ + const ulint default_compression_level = page_zip_level; + *flags = 0; *flags2 = 0; @@ -10063,6 +10111,8 @@ innobase_table_flags( } } + row_format = form->s->row_type; + if (create_info->key_block_size) { /* The requested compressed page size (key_block_size) is given in kilobytes. If it is a valid number, store @@ -10110,8 +10160,6 @@ innobase_table_flags( } } - row_format = form->s->row_type; - if (zip_ssize && zip_allowed) { /* if ROW_FORMAT is set to default, automatically change it to COMPRESSED.*/ @@ -10166,10 +10214,18 @@ innobase_table_flags( " innodb_file_format > Antelope.", get_row_format_name(row_format)); } else { - innodb_row_format = (row_format == ROW_TYPE_DYNAMIC - ? REC_FORMAT_DYNAMIC - : REC_FORMAT_COMPRESSED); - break; + switch(row_format) { + case ROW_TYPE_COMPRESSED: + innodb_row_format = REC_FORMAT_COMPRESSED; + break; + case ROW_TYPE_DYNAMIC: + innodb_row_format = REC_FORMAT_DYNAMIC; + break; + default: + /* Not possible, avoid compiler warning */ + break; + } + break; /* Correct row_format */ } zip_allowed = FALSE; /* fall through to set row_format = COMPACT */ @@ -10196,7 +10252,15 @@ innobase_table_flags( && ((create_info->data_file_name != NULL) && !(create_info->options & HA_LEX_CREATE_TMP_TABLE)); - dict_tf_set(flags, innodb_row_format, zip_ssize, use_data_dir); + /* Set up table dictionary flags */ + dict_tf_set(flags, + innodb_row_format, + zip_ssize, + use_data_dir, + options->page_compressed, + (ulint)options->page_compression_level == ULINT_UNDEFINED ? + default_compression_level : options->page_compression_level, + options->atomic_writes); if (create_info->options & HA_LEX_CREATE_TMP_TABLE) { *flags2 |= DICT_TF2_TEMPORARY; @@ -10209,6 +10273,112 @@ innobase_table_flags( DBUG_RETURN(true); } +/*****************************************************************//** +Check engine specific table options not handled by SQL-parser. 
+@return NULL if valid, string if not */ +UNIV_INTERN +const char* +ha_innobase::check_table_options( + THD *thd, /*!< in: thread handle */ + TABLE* table, /*!< in: information on table + columns and indexes */ + HA_CREATE_INFO* create_info, /*!< in: more information of the + created table, contains also the + create statement string */ + const bool use_tablespace, /*!< in: use file par table */ + const ulint file_format) +{ + enum row_type row_format = table->s->row_type;; + ha_table_option_struct *options= table->s->option_struct; + atomic_writes_t awrites = (atomic_writes_t)options->atomic_writes; + + /* Check page compression requirements */ + if (options->page_compressed) { + if (!srv_compress_pages) { + push_warning( + thd, Sql_condition::WARN_LEVEL_WARN, + HA_WRONG_CREATE_OPTION, + "InnoDB: PAGE_COMPRESSED requires" + "innodb_compress_pages not enabled"); + return "PAGE_COMPRESSED"; + } + + if (row_format == ROW_TYPE_COMPRESSED) { + push_warning( + thd, Sql_condition::WARN_LEVEL_WARN, + HA_WRONG_CREATE_OPTION, + "InnoDB: PAGE_COMPRESSED table can't have" + " ROW_TYPE=COMPRESSED"); + return "PAGE_COMPRESSED"; + } + + if (!use_tablespace) { + push_warning( + thd, Sql_condition::WARN_LEVEL_WARN, + HA_WRONG_CREATE_OPTION, + "InnoDB: PAGE_COMPRESSED requires" + " innodb_file_per_table."); + return "PAGE_COMPRESSED"; + } + + if (file_format < UNIV_FORMAT_B) { + push_warning( + thd, Sql_condition::WARN_LEVEL_WARN, + HA_WRONG_CREATE_OPTION, + "InnoDB: PAGE_COMPRESSED requires" + " innodb_file_format > Antelope."); + return "PAGE_COMPRESSED"; + } + + if (create_info->key_block_size) { + push_warning( + thd, Sql_condition::WARN_LEVEL_WARN, + HA_WRONG_CREATE_OPTION, + "InnoDB: PAGE_COMPRESSED table can't have" + " key_block_size"); + return "PAGE_COMPRESSED"; + } + } + + /* Check page compression level requirements, some of them are + already checked above */ + if ((ulint)options->page_compression_level != ULINT_UNDEFINED) { + if (options->page_compressed == false) { + 
push_warning( + thd, Sql_condition::WARN_LEVEL_WARN, + HA_WRONG_CREATE_OPTION, + "InnoDB: PAGE_COMPRESSION_LEVEL requires" + " PAGE_COMPRESSED"); + return "PAGE_COMPRESSION_LEVEL"; + } + + if (options->page_compression_level < 0 || options->page_compression_level > 9) { + push_warning_printf( + thd, Sql_condition::WARN_LEVEL_WARN, + HA_WRONG_CREATE_OPTION, + "InnoDB: invalid PAGE_COMPRESSION_LEVEL = %lu." + " Valid values are [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]", + create_info->key_block_size); + return "PAGE_COMPRESSION_LEVEL"; + } + } + + /* Check atomic writes requirements */ + if (awrites == ATOMIC_WRITES_ON || + (awrites == ATOMIC_WRITES_DEFAULT && srv_use_atomic_writes)) { + if (!use_tablespace) { + push_warning( + thd, Sql_condition::WARN_LEVEL_WARN, + HA_WRONG_CREATE_OPTION, + "InnoDB: ATOMIC_WRITES requires" + " innodb_file_per_table."); + return "ATOMIC_WRITES"; + } + } + + return 0; +} + /*****************************************************************//** Creates a new table to an InnoDB database. @return error number */ @@ -10240,6 +10410,7 @@ ha_innobase::create( while creating the table. So we read the current value here and make all further decisions based on this. */ bool use_tablespace = srv_file_per_table; + const ulint file_format = srv_file_format; /* Zip Shift Size - log2 - 9 of compressed page size, zero for uncompressed */ @@ -10263,6 +10434,12 @@ ha_innobase::create( /* Create the table definition in InnoDB */ + /* Validate table options not handled by the SQL-parser */ + if(check_table_options(thd, form, create_info, use_tablespace, + file_format)) { + DBUG_RETURN(HA_WRONG_CREATE_OPTION); + } + /* Validate create options if innodb_strict_mode is set. 
*/ if (create_options_are_invalid( thd, form, create_info, use_tablespace)) { @@ -14578,6 +14755,12 @@ ha_innobase::check_if_incompatible_data( HA_CREATE_INFO* info, uint table_changes) { + ha_table_option_struct *param_old, *param_new; + + /* Cache engine specific options */ + param_new = info->option_struct; + param_old = table->s->option_struct; + innobase_copy_frm_flags_from_create_info(prebuilt->table, info); if (table_changes != IS_EQUAL_YES) { @@ -14604,6 +14787,13 @@ ha_innobase::check_if_incompatible_data( return(COMPATIBLE_DATA_NO); } + /* Changes on engine specific table options requests a rebuild of the table. */ + if (param_new->page_compressed != param_old->page_compressed || + param_new->page_compression_level != param_old->page_compression_level || + param_new->atomic_writes != param_old->atomic_writes) { + return(COMPATIBLE_DATA_NO); + } + return(COMPATIBLE_DATA_YES); } @@ -17079,12 +17269,6 @@ static MYSQL_SYSVAR_ULONG(replication_delay, srv_replication_delay, "innodb_thread_concurrency is reached (0 by default)", NULL, NULL, 0, 0, ~0UL, 0); -static MYSQL_SYSVAR_UINT(compression_level, page_zip_level, - PLUGIN_VAR_RQCMDARG, - "Compression level used for compressed row format. 0 is no compression" - ", 1 is fastest, 9 is best compression and default is 6.", - NULL, NULL, DEFAULT_COMPRESSION_LEVEL, 0, 9, 0); - static MYSQL_SYSVAR_BOOL(log_compressed_pages, page_zip_log_pages, PLUGIN_VAR_OPCMDARG, "Enables/disables the logging of entire compressed page images." 
@@ -17758,6 +17942,37 @@ static MYSQL_SYSVAR_BOOL(use_stacktrace, srv_use_stacktrace, "Print stacktrace on long semaphore wait (off by default supported only on linux)", NULL, NULL, FALSE); +static MYSQL_SYSVAR_BOOL(compress_pages, srv_compress_pages, + PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY, + "Use page compression.", + NULL, NULL, FALSE); + +static MYSQL_SYSVAR_LONG(trim_pct, srv_trim_pct, + PLUGIN_VAR_OPCMDARG , + "How many percent of compressed pages should be trimmed", + NULL, NULL, 100, 0, 100, 0); + +static MYSQL_SYSVAR_UINT(compression_level, page_zip_level, + PLUGIN_VAR_RQCMDARG, + "Compression level used for zlib compression. 0 is no compression" + ", 1 is fastest, 9 is best compression and default is 6.", + NULL, NULL, DEFAULT_COMPRESSION_LEVEL, 0, 9, 0); + +static MYSQL_SYSVAR_BOOL(compress_index_pages, srv_page_compress_index_pages, + PLUGIN_VAR_OPCMDARG, + "Use page compression for only index pages.", + NULL, NULL, FALSE); + +static MYSQL_SYSVAR_BOOL(use_trim, srv_use_trim, + PLUGIN_VAR_OPCMDARG, + "Use trim.", + NULL, NULL, TRUE); + +static MYSQL_SYSVAR_BOOL(use_lz4, srv_use_lz4, + PLUGIN_VAR_OPCMDARG , + "Use LZ4 for page compression", + NULL, NULL, FALSE); + static struct st_mysql_sys_var* innobase_system_variables[]= { MYSQL_SYSVAR(log_block_size), MYSQL_SYSVAR(additional_mem_pool_size), @@ -17948,6 +18163,11 @@ static struct st_mysql_sys_var* innobase_system_variables[]= { MYSQL_SYSVAR(fake_changes), MYSQL_SYSVAR(locking_fake_changes), MYSQL_SYSVAR(use_stacktrace), + MYSQL_SYSVAR(compress_pages), + MYSQL_SYSVAR(trim_pct), + MYSQL_SYSVAR(compress_index_pages), + MYSQL_SYSVAR(use_trim), + MYSQL_SYSVAR(use_lz4), NULL }; diff --git a/storage/xtradb/handler/ha_innodb.h b/storage/xtradb/handler/ha_innodb.h index 773a9b6b04d22..b4df711356c58 100644 --- a/storage/xtradb/handler/ha_innodb.h +++ b/storage/xtradb/handler/ha_innodb.h @@ -1,6 +1,7 @@ /***************************************************************************** Copyright (c) 2000, 2012, 
Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2013, SkySQL Ab. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -57,6 +58,21 @@ typedef struct st_innobase_share { /** Prebuilt structures in an InnoDB table handle used within MySQL */ struct row_prebuilt_t; +/** Engine specific table options are defined using this struct */ +struct ha_table_option_struct +{ + bool page_compressed; /*!< Table is using page compression + if this option is true. */ + int page_compression_level; /*!< Table page compression level; a value + equal to ULINT_UNDEFINED after a cast to ulint is treated as not given. */ + uint atomic_writes; /*!< Use atomic writes for this + table if this option is ON, or + if it is DEFAULT and + srv_use_atomic_writes=1. + Atomic writes are not used if + the value is OFF. */ +}; + /** The class defining a handle to an Innodb table */ class ha_innobase: public handler { @@ -184,6 +200,8 @@ class ha_innobase: public handler char* norm_name, char* temp_path, char* remote_path); + const char* check_table_options(THD *thd, TABLE* table, + HA_CREATE_INFO* create_info, const bool use_tablespace, const ulint file_format); int create(const char *name, register TABLE *form, HA_CREATE_INFO *create_info); int truncate(); diff --git a/storage/xtradb/handler/handler0alter.cc b/storage/xtradb/handler/handler0alter.cc index 9c535285d1ed2..24dc1086cc5e7 100644 --- a/storage/xtradb/handler/handler0alter.cc +++ b/storage/xtradb/handler/handler0alter.cc @@ -1,6 +1,7 @@ /***************************************************************************** Copyright (c) 2005, 2013, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2013, SkySQL Ab. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -252,6 +253,22 @@ ha_innobase::check_if_supported_inplace_alter( update_thd(); trx_search_latch_release_if_reserved(prebuilt->trx); + /* A change of engine specific table options requires a rebuild of the + table, also when the ALTER sets other handler flags (hence the bit test) */ + if (ha_alter_info->handler_flags + & Alter_inplace_info::CHANGE_CREATE_OPTION) { + ha_table_option_struct *new_options= ha_alter_info->create_info->option_struct; + ha_table_option_struct *old_options= table->s->option_struct; + + if (new_options->page_compressed != old_options->page_compressed || + new_options->page_compression_level != old_options->page_compression_level || + new_options->atomic_writes != old_options->atomic_writes) { + ha_alter_info->unsupported_reason = innobase_get_err_msg( + ER_ALTER_OPERATION_NOT_SUPPORTED_REASON); + DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED); + } + } + if (ha_alter_info->handler_flags & ~(INNOBASE_INPLACE_IGNORE | INNOBASE_ALTER_NOREBUILD @@ -3372,6 +3389,17 @@ ha_innobase::prepare_inplace_alter_table( if (ha_alter_info->handler_flags & Alter_inplace_info::CHANGE_CREATE_OPTION) { + /* Check engine specific table options */ + if (const char* invalid_tbopt = check_table_options( + user_thd, altered_table, + ha_alter_info->create_info, + prebuilt->table->space != 0, + srv_file_format)) { + my_error(ER_ILLEGAL_HA_CREATE_OPTION, MYF(0), + table_type(), invalid_tbopt); + goto err_exit_no_heap; + } + if (const char* invalid_opt = create_options_are_invalid( user_thd, altered_table, ha_alter_info->create_info, diff --git a/storage/xtradb/include/buf0buf.h b/storage/xtradb/include/buf0buf.h index ba2f413429cbc..8fedeeaa8321c 100644 --- a/storage/xtradb/include/buf0buf.h +++ b/storage/xtradb/include/buf0buf.h @@ -1,6 +1,7 @@ /***************************************************************************** Copyright (c) 1995, 2013, Oracle and/or its affiliates.
All Rights Reserved. +Copyright (c) 2013, SkySQL Ab. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -1489,6 +1490,12 @@ struct buf_page_t{ state == BUF_BLOCK_ZIP_PAGE and zip.data == NULL means an active buf_pool->watch */ + + ulint write_size; /* Write size is set when this + page is first time written and then + if written again we check is TRIM + operation needed. */ + #ifndef UNIV_HOTBACKUP buf_page_t* hash; /*!< node used in chaining to buf_pool->page_hash or @@ -2118,6 +2125,20 @@ struct CheckUnzipLRUAndLRUList { }; #endif /* UNIV_DEBUG || defined UNIV_BUF_DEBUG */ +/*********************************************************************//** +Aquire LRU list mutex */ +void +buf_pool_mutex_enter( +/*=================*/ + buf_pool_t* buf_pool); /*!< in: buffer pool */ +/*********************************************************************//** +Exit LRU list mutex */ +void +buf_pool_mutex_exit( +/*================*/ + buf_pool_t* buf_pool); /*!< in: buffer pool */ + + #ifndef UNIV_NONINL #include "buf0buf.ic" #endif diff --git a/storage/xtradb/include/buf0flu.h b/storage/xtradb/include/buf0flu.h index f4542e7c2060a..6b2827e77a763 100644 --- a/storage/xtradb/include/buf0flu.h +++ b/storage/xtradb/include/buf0flu.h @@ -36,6 +36,13 @@ Created 11/5/1995 Heikki Tuuri /** Flag indicating if the page_cleaner is in active state. */ extern ibool buf_page_cleaner_is_active; +/** Handled page counters for a single flush */ +struct flush_counters_t { + ulint flushed; /*!< number of dirty pages flushed */ + ulint evicted; /*!< number of clean pages evicted */ +}; + + /********************************************************************//** Remove a block from the flush list of modified blocks. 
*/ UNIV_INTERN diff --git a/storage/xtradb/include/dict0dict.h b/storage/xtradb/include/dict0dict.h index 6669f60b95af0..8ab05c50dbd88 100644 --- a/storage/xtradb/include/dict0dict.h +++ b/storage/xtradb/include/dict0dict.h @@ -2,6 +2,7 @@ Copyright (c) 1996, 2013, Oracle and/or its affiliates. All Rights Reserved. Copyright (c) 2012, Facebook Inc. +Copyright (c) 2013, SkySQL Ab. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -42,6 +43,8 @@ Created 1/8/1996 Heikki Tuuri #include "ut0byte.h" #include "trx0types.h" #include "row0types.h" +#include "fsp0fsp.h" +#include "dict0pagecompress.h" #ifndef UNIV_HOTBACKUP # include "sync0sync.h" @@ -904,7 +907,14 @@ dict_tf_set( ulint* flags, /*!< in/out: table */ rec_format_t format, /*!< in: file format */ ulint zip_ssize, /*!< in: zip shift size */ - bool remote_path) /*!< in: table uses DATA DIRECTORY */ + bool remote_path, /*!< in: table uses DATA DIRECTORY + */ + bool page_compressed,/*!< in: table uses page compressed + pages */ + ulint page_compression_level, /*!< in: table page compression + level */ + ulint atomic_writes) /*!< in: table atomic + writes option value*/ __attribute__((nonnull)); /********************************************************************//** Convert a 32 bit integer table flags to the 32 bit integer that is diff --git a/storage/xtradb/include/dict0dict.ic b/storage/xtradb/include/dict0dict.ic index c261d6a3aeed8..502b1d028d821 100644 --- a/storage/xtradb/include/dict0dict.ic +++ b/storage/xtradb/include/dict0dict.ic @@ -1,6 +1,7 @@ /***************************************************************************** Copyright (c) 1996, 2012, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2013, SkySQL Ab. All Rights Reserved. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -537,10 +538,27 @@ dict_tf_is_valid( ulint zip_ssize = DICT_TF_GET_ZIP_SSIZE(flags); ulint atomic_blobs = DICT_TF_HAS_ATOMIC_BLOBS(flags); ulint unused = DICT_TF_GET_UNUSED(flags); + ulint page_compression = DICT_TF_GET_PAGE_COMPRESSION(flags); + ulint page_compression_level = DICT_TF_GET_PAGE_COMPRESSION_LEVEL(flags); + ulint data_dir = DICT_TF_HAS_DATA_DIR(flags); + ulint atomic_writes = DICT_TF_GET_ATOMIC_WRITES(flags); /* Make sure there are no bits that we do not know about. */ if (unused != 0) { + fprintf(stderr, + "InnoDB: Error: table unused flags are %ld" + " in the data dictionary and are corrupted\n" + "InnoDB: Error: data dictionary flags are\n" + "InnoDB: compact %ld atomic_blobs %ld\n" + "InnoDB: unused %ld data_dir %ld zip_ssize %ld\n" + "InnoDB: page_compression %ld page_compression_level %ld\n" + "InnoDB: atomic_writes %ld\n", + unused, + compact, atomic_blobs, unused, data_dir, zip_ssize, + page_compression, page_compression_level, atomic_writes + ); + return(false); } else if (atomic_blobs) { @@ -550,12 +568,36 @@ dict_tf_is_valid( data stored off-page in the clustered index. */ if (!compact) { + fprintf(stderr, + "InnoDB: Error: table compact flags are %ld" + " in the data dictionary and are corrupted\n" + "InnoDB: Error: data dictionary flags are\n" + "InnoDB: compact %ld atomic_blobs %ld\n" + "InnoDB: unused %ld data_dir %ld zip_ssize %ld\n" + "InnoDB: page_compression %ld page_compression_level %ld\n" + "InnoDB: atomic_writes %ld\n", + compact, compact, atomic_blobs, unused, data_dir, zip_ssize, + page_compression, page_compression_level, atomic_writes + ); + return(false); } } else if (zip_ssize) { /* Antelope does not support COMPRESSED row format. 
*/ + fprintf(stderr, + "InnoDB: Error: table flags are %ld" + " in the data dictionary and are corrupted\n" + "InnoDB: Error: data dictionary flags are\n" + "InnoDB: compact %ld atomic_blobs %ld\n" + "InnoDB: unused %ld data_dir %ld zip_ssize %ld\n" + "InnoDB: page_compression %ld page_compression_level %ld\n" + "InnoDB: atomic_writes %ld\n", + flags, compact, atomic_blobs, unused, data_dir, zip_ssize, + page_compression, page_compression_level, atomic_writes + ); + return(false); } @@ -568,6 +610,41 @@ dict_tf_is_valid( || !atomic_blobs || zip_ssize > PAGE_ZIP_SSIZE_MAX) { + fprintf(stderr, + "InnoDB: Error: table compact flags are %ld in the data dictionary and are corrupted\n" + "InnoDB: Error: data dictionary flags are\n" + "InnoDB: compact %ld atomic_blobs %ld\n" + "InnoDB: unused %ld data_dir %ld zip_ssize %ld\n" + "InnoDB: page_compression %ld page_compression_level %ld\n" + "InnoDB: atomic_writes %ld\n", + flags, + compact, atomic_blobs, unused, data_dir, zip_ssize, + page_compression, page_compression_level, atomic_writes + + ); + return(false); + } + } + + if (page_compression || page_compression_level) { + /* Page compression format must have compact and + atomic_blobs and page_compression_level requires + page_compression */ + if (!compact + || !page_compression + || !atomic_blobs) { + + fprintf(stderr, + "InnoDB: Error: table flags are %ld in the data dictionary and are corrupted\n" + "InnoDB: Error: data dictionary flags are\n" + "InnoDB: compact %ld atomic_blobs %ld\n" + "InnoDB: unused %ld data_dir %ld zip_ssize %ld\n" + "InnoDB: page_compression %ld page_compression_level %ld\n" + "InnoDB: atomic_writes %ld\n", + flags, compact, atomic_blobs, unused, data_dir, zip_ssize, + page_compression, page_compression_level, atomic_writes + ); + return(false); } } @@ -594,6 +671,10 @@ dict_sys_tables_type_validate( ulint zip_ssize = DICT_TF_GET_ZIP_SSIZE(type); ulint atomic_blobs = DICT_TF_HAS_ATOMIC_BLOBS(type); ulint unused = DICT_TF_GET_UNUSED(type); + 
ulint page_compression = DICT_TF_GET_PAGE_COMPRESSION(type); + ulint page_compression_level = DICT_TF_GET_PAGE_COMPRESSION_LEVEL(type); + ulint atomic_writes = DICT_TF_GET_ATOMIC_WRITES(type); + atomic_writes_t awrites = (atomic_writes_t)atomic_writes; /* The low order bit of SYS_TABLES.TYPE is always set to 1. If the format is UNIV_FORMAT_B or higher, this field is the same @@ -647,6 +728,24 @@ dict_sys_tables_type_validate( format, so the DATA_DIR flag is compatible with any other table flags. However, it is not used with TEMPORARY tables.*/ + if (page_compression || page_compression_level) { + /* page compressed row format must have low_order_bit and + atomic_blobs bits set and the DICT_N_COLS_COMPACT flag + should be in N_COLS, but we already know about the + low_order_bit and DICT_N_COLS_COMPACT flags. */ + + if (!atomic_blobs || !page_compression) { + return(ULINT_UNDEFINED); + } + } + + if (awrites == ATOMIC_WRITES_ON || + (awrites == ATOMIC_WRITES_DEFAULT && srv_use_atomic_writes)) { + if (!atomic_blobs) { + return(ULINT_UNDEFINED); + } + } + /* Return the validated SYS_TABLES.TYPE. 
*/ return(type); } @@ -719,8 +818,16 @@ dict_tf_set( ulint* flags, /*!< in/out: table flags */ rec_format_t format, /*!< in: file format */ ulint zip_ssize, /*!< in: zip shift size */ - bool use_data_dir) /*!< in: table uses DATA DIRECTORY */ + bool use_data_dir, /*!< in: table uses DATA DIRECTORY + */ + bool page_compressed,/*!< in: table uses page compressed + pages */ + ulint page_compression_level, /*!< in: table page compression + level */ + ulint atomic_writes) /*!< in: table atomic writes setup */ { + atomic_writes_t awrites = (atomic_writes_t)atomic_writes; + switch (format) { case REC_FORMAT_REDUNDANT: *flags = 0; @@ -745,6 +852,28 @@ dict_tf_set( if (use_data_dir) { *flags |= (1 << DICT_TF_POS_DATA_DIR); } + /* OR the bits in: a plain assignment would drop the DATA_DIR bit set above */ + if (page_compressed) { + *flags |= DICT_TF_COMPACT + | (1 << DICT_TF_POS_ATOMIC_BLOBS) + | (1 << DICT_TF_POS_PAGE_COMPRESSION) + | (page_compression_level << DICT_TF_POS_PAGE_COMPRESSION_LEVEL); + + ut_ad(zip_ssize == 0); + ut_ad(dict_tf_get_page_compression(*flags) == TRUE); + ut_ad(dict_tf_get_page_compression_level(*flags) == page_compression_level); + } + + if (awrites != ATOMIC_WRITES_DEFAULT) { + *flags |= (atomic_writes << DICT_TF_POS_ATOMIC_WRITES); + ut_ad(dict_tf_get_atomic_writes(*flags) == awrites); + } + + if (awrites == ATOMIC_WRITES_ON || + (awrites == ATOMIC_WRITES_DEFAULT && srv_use_atomic_writes )) { + *flags |= (1 << DICT_TF_POS_ATOMIC_BLOBS); + } + } /********************************************************************//** @@ -765,6 +894,9 @@ dict_tf_to_fsp_flags( ulint table_flags) /*!< in: dict_table_t::flags */ { ulint fsp_flags; + ulint page_compression = DICT_TF_GET_PAGE_COMPRESSION(table_flags); + ulint page_compression_level = DICT_TF_GET_PAGE_COMPRESSION_LEVEL(table_flags); + ulint atomic_writes = DICT_TF_GET_ATOMIC_WRITES(table_flags); DBUG_EXECUTE_IF("dict_tf_to_fsp_flags_failure", return(ULINT_UNDEFINED);); @@ -783,7 +915,20 @@ dict_tf_to_fsp_flags( fsp_flags |= DICT_TF_HAS_DATA_DIR(table_flags) ?
FSP_FLAGS_MASK_DATA_DIR : 0; + /* In addition, tablespace flags also contain if the page + compression is used for this table. */ + fsp_flags |= FSP_FLAGS_SET_PAGE_COMPRESSION(fsp_flags, page_compression); + + /* In addition, tablespace flags also contain page compression level + if page compression is used for this table. */ + fsp_flags |= FSP_FLAGS_SET_PAGE_COMPRESSION_LEVEL(fsp_flags, page_compression_level); + + /* In addition, tablespace flags also contain flag if atomic writes + is used for this table */ + fsp_flags |= FSP_FLAGS_SET_ATOMIC_WRITES(fsp_flags, atomic_writes); + ut_a(fsp_flags_is_valid(fsp_flags)); + ut_a(dict_tf_verify_flags(table_flags, fsp_flags)); return(fsp_flags); } @@ -811,10 +956,15 @@ dict_sys_tables_type_to_tf( /* Adjust bit zero. */ flags = redundant ? 0 : 1; - /* ZIP_SSIZE, ATOMIC_BLOBS & DATA_DIR are the same. */ + /* ZIP_SSIZE, ATOMIC_BLOBS, DATA_DIR, PAGE_COMPRESSION, + PAGE_COMPRESSION_LEVEL, ATOMIC_WRITES are the same. */ flags |= type & (DICT_TF_MASK_ZIP_SSIZE | DICT_TF_MASK_ATOMIC_BLOBS - | DICT_TF_MASK_DATA_DIR); + | DICT_TF_MASK_DATA_DIR + | DICT_TF_MASK_PAGE_COMPRESSION + | DICT_TF_MASK_PAGE_COMPRESSION_LEVEL + | DICT_TF_MASK_ATOMIC_WRITES + ); return(flags); } @@ -842,10 +992,14 @@ dict_tf_to_sys_tables_type( /* Adjust bit zero. It is always 1 in SYS_TABLES.TYPE */ type = 1; - /* ZIP_SSIZE, ATOMIC_BLOBS & DATA_DIR are the same. */ + /* ZIP_SSIZE, ATOMIC_BLOBS, DATA_DIR, PAGE_COMPRESSION, + PAGE_COMPRESSION_LEVEL, ATOMIC_WRITES are the same. 
*/ type |= flags & (DICT_TF_MASK_ZIP_SSIZE | DICT_TF_MASK_ATOMIC_BLOBS - | DICT_TF_MASK_DATA_DIR); + | DICT_TF_MASK_DATA_DIR + | DICT_TF_MASK_PAGE_COMPRESSION + | DICT_TF_MASK_PAGE_COMPRESSION_LEVEL + | DICT_TF_MASK_ATOMIC_WRITES); return(type); } diff --git a/storage/xtradb/include/dict0mem.h b/storage/xtradb/include/dict0mem.h index bde0ce16094f3..087fde0ccb707 100644 --- a/storage/xtradb/include/dict0mem.h +++ b/storage/xtradb/include/dict0mem.h @@ -2,6 +2,7 @@ Copyright (c) 1996, 2013, Oracle and/or its affiliates. All Rights Reserved. Copyright (c) 2012, Facebook Inc. +Copyright (c) 2013, SkySQL Ab. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -125,11 +126,26 @@ This flag prevents older engines from attempting to open the table and allows InnoDB to update_create_info() accordingly. */ #define DICT_TF_WIDTH_DATA_DIR 1 +/** +Width of the page compression flag +*/ +#define DICT_TF_WIDTH_PAGE_COMPRESSION 1 +#define DICT_TF_WIDTH_PAGE_COMPRESSION_LEVEL 4 + +/** +Width of atomic writes flag +DEFAULT=0, ON = 1, OFF = 2 +*/ +#define DICT_TF_WIDTH_ATOMIC_WRITES 2 + /** Width of all the currently known table flags */ #define DICT_TF_BITS (DICT_TF_WIDTH_COMPACT \ + DICT_TF_WIDTH_ZIP_SSIZE \ + DICT_TF_WIDTH_ATOMIC_BLOBS \ - + DICT_TF_WIDTH_DATA_DIR) + + DICT_TF_WIDTH_DATA_DIR \ + + DICT_TF_WIDTH_PAGE_COMPRESSION \ + + DICT_TF_WIDTH_PAGE_COMPRESSION_LEVEL \ + + DICT_TF_WIDTH_ATOMIC_WRITES) /** A mask of all the known/used bits in table flags */ #define DICT_TF_BIT_MASK (~(~0 << DICT_TF_BITS)) @@ -145,9 +161,18 @@ allows InnoDB to update_create_info() accordingly. 
*/ /** Zero relative shift position of the DATA_DIR field */ #define DICT_TF_POS_DATA_DIR (DICT_TF_POS_ATOMIC_BLOBS \ + DICT_TF_WIDTH_ATOMIC_BLOBS) +/** Zero relative shift position of the PAGE_COMPRESSION field */ +#define DICT_TF_POS_PAGE_COMPRESSION (DICT_TF_POS_DATA_DIR \ + + DICT_TF_WIDTH_DATA_DIR) +/** Zero relative shift position of the PAGE_COMPRESSION_LEVEL field */ +#define DICT_TF_POS_PAGE_COMPRESSION_LEVEL (DICT_TF_POS_PAGE_COMPRESSION \ + + DICT_TF_WIDTH_PAGE_COMPRESSION) +/** Zero relative shift position of the ATOMIC_WRITES field */ +#define DICT_TF_POS_ATOMIC_WRITES (DICT_TF_POS_PAGE_COMPRESSION_LEVEL \ + + DICT_TF_WIDTH_PAGE_COMPRESSION_LEVEL) /** Zero relative shift position of the start of the UNUSED bits */ -#define DICT_TF_POS_UNUSED (DICT_TF_POS_DATA_DIR \ - + DICT_TF_WIDTH_DATA_DIR) +#define DICT_TF_POS_UNUSED (DICT_TF_POS_ATOMIC_WRITES \ + + DICT_TF_WIDTH_ATOMIC_WRITES) /** Bit mask of the COMPACT field */ #define DICT_TF_MASK_COMPACT \ @@ -165,6 +190,18 @@ allows InnoDB to update_create_info() accordingly. */ #define DICT_TF_MASK_DATA_DIR \ ((~(~0 << DICT_TF_WIDTH_DATA_DIR)) \ << DICT_TF_POS_DATA_DIR) +/** Bit mask of the PAGE_COMPRESSION field */ +#define DICT_TF_MASK_PAGE_COMPRESSION \ + ((~(~0 << DICT_TF_WIDTH_PAGE_COMPRESSION)) \ + << DICT_TF_POS_PAGE_COMPRESSION) +/** Bit mask of the PAGE_COMPRESSION_LEVEL field */ +#define DICT_TF_MASK_PAGE_COMPRESSION_LEVEL \ + ((~(~0 << DICT_TF_WIDTH_PAGE_COMPRESSION_LEVEL)) \ + << DICT_TF_POS_PAGE_COMPRESSION_LEVEL) +/** Bit mask of the ATOMIC_WRITES field */ +#define DICT_TF_MASK_ATOMIC_WRITES \ + ((~(~0 << DICT_TF_WIDTH_ATOMIC_WRITES)) \ + << DICT_TF_POS_ATOMIC_WRITES) /** Return the value of the COMPACT field */ #define DICT_TF_GET_COMPACT(flags) \ @@ -185,6 +222,19 @@ allows InnoDB to update_create_info() accordingly. 
*/ /** Return the contents of the UNUSED bits */ #define DICT_TF_GET_UNUSED(flags) \ (flags >> DICT_TF_POS_UNUSED) + +/** Return the value of the PAGE_COMPRESSION field */ +#define DICT_TF_GET_PAGE_COMPRESSION(flags) \ + ((flags & DICT_TF_MASK_PAGE_COMPRESSION) \ + >> DICT_TF_POS_PAGE_COMPRESSION) +/** Return the value of the PAGE_COMPRESSION_LEVEL field */ +#define DICT_TF_GET_PAGE_COMPRESSION_LEVEL(flags) \ + ((flags & DICT_TF_MASK_PAGE_COMPRESSION_LEVEL) \ + >> DICT_TF_POS_PAGE_COMPRESSION_LEVEL) +/** Return the value of the ATOMIC_WRITES field */ +#define DICT_TF_GET_ATOMIC_WRITES(flags) \ + ((flags & DICT_TF_MASK_ATOMIC_WRITES) \ + >> DICT_TF_POS_ATOMIC_WRITES) /* @} */ #ifndef UNIV_INNOCHECKSUM diff --git a/storage/xtradb/include/dict0pagecompress.h b/storage/xtradb/include/dict0pagecompress.h new file mode 100644 index 0000000000000..19a2a6c52f3d8 --- /dev/null +++ b/storage/xtradb/include/dict0pagecompress.h @@ -0,0 +1,94 @@ +/***************************************************************************** + +Copyright (C) 2013 SkySQL Ab. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + +*****************************************************************************/ + +/******************************************************************//** +@file include/dict0pagecompress.h +Helper functions for extracting/storing page compression information +to dictionary. 
+ +Created 11/12/2013 Jan Lindström jan.lindstrom@skysql.com +***********************************************************************/ + +#ifndef dict0pagecompress_h +#define dict0pagecompress_h + +/********************************************************************//** +Extract the page compression level from table flags. +@return page compression level, or 0 if not compressed */ +UNIV_INLINE +ulint +dict_tf_get_page_compression_level( +/*===============================*/ + ulint flags) /*!< in: flags */ + __attribute__((const)); +/********************************************************************//** +Extract the page compression flag from table flags +@return page compression flag, or false if not compressed */ +UNIV_INLINE +ibool +dict_tf_get_page_compression( +/*==========================*/ + ulint flags) /*!< in: flags */ + __attribute__((const)); + +/********************************************************************//** +Get the page compression level of a table from its flags. +@return page compression level stored in table->flags */ +UNIV_INLINE +ulint +dict_table_page_compression_level( +/*==============================*/ + const dict_table_t* table) /*!< in: table */ + __attribute__((const)); + +/********************************************************************//** +Verify that dictionary flags match tablespace flags +@return true if flags match, false if not */ +UNIV_INLINE +ibool +dict_tf_verify_flags( +/*=================*/ + ulint table_flags, /*!< in: dict_table_t::flags */ + ulint fsp_flags) /*!< in: fil_space_t::flags */ + __attribute__((const)); + +/********************************************************************//** +Extract the atomic writes flag from table flags.
+@return atomic writes setting stored in the flags, as an atomic_writes_t */ +UNIV_INLINE +atomic_writes_t +dict_tf_get_atomic_writes( +/*======================*/ + ulint flags) /*!< in: flags */ + __attribute__((const)); + +/********************************************************************//** +Get the atomic writes setting of the table. +@return atomic writes setting of the table, as an atomic_writes_t */ +UNIV_INLINE +atomic_writes_t +dict_table_get_atomic_writes( +/*=========================*/ + const dict_table_t* table); /*!< in: table */ + + +#ifndef UNIV_NONINL +#include "dict0pagecompress.ic" +#endif + +#endif diff --git a/storage/xtradb/include/dict0pagecompress.ic b/storage/xtradb/include/dict0pagecompress.ic new file mode 100644 index 0000000000000..fb9581fc6579d --- /dev/null +++ b/storage/xtradb/include/dict0pagecompress.ic @@ -0,0 +1,191 @@ +/***************************************************************************** + +Copyright (C) 2013 SkySQL Ab. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + +*****************************************************************************/ + +/******************************************************************//** +@file include/dict0pagecompress.ic +Inline implementation for helper functions for extracting/storing +page compression and atomic writes information to dictionary.
+
+Created 11/12/2013 Jan Lindström jan.lindstrom@skysql.com
+***********************************************************************/
+
+/********************************************************************//**
+Verify that dictionary flags match tablespace flags; the first mismatching field is reported on stderr.
+@return TRUE if all compared fields match, FALSE on the first mismatch */
+UNIV_INLINE
+ibool
+dict_tf_verify_flags(
+/*=================*/
+	ulint	table_flags,	/*!< in: dict_table_t::flags */
+	ulint	fsp_flags)	/*!< in: fil_space_t::flags */
+{
+	ulint	table_unused = DICT_TF_GET_UNUSED(table_flags);
+	ulint	compact = DICT_TF_GET_COMPACT(table_flags);
+	ulint	ssize = DICT_TF_GET_ZIP_SSIZE(table_flags);
+	ulint	atomic_blobs = DICT_TF_HAS_ATOMIC_BLOBS(table_flags);
+	ulint	data_dir = DICT_TF_HAS_DATA_DIR(table_flags);
+	ulint	page_compression = DICT_TF_GET_PAGE_COMPRESSION(table_flags);
+	ulint	page_compression_level = DICT_TF_GET_PAGE_COMPRESSION_LEVEL(table_flags);
+	ulint	atomic_writes = DICT_TF_GET_ATOMIC_WRITES(table_flags);
+	ulint	post_antelope = FSP_FLAGS_GET_POST_ANTELOPE(fsp_flags);
+	ulint	zip_ssize = FSP_FLAGS_GET_ZIP_SSIZE(fsp_flags);
+	ulint	fsp_atomic_blobs = FSP_FLAGS_HAS_ATOMIC_BLOBS(fsp_flags);
+	ulint	page_ssize = FSP_FLAGS_GET_PAGE_SSIZE(fsp_flags);
+	ulint	fsp_unused = FSP_FLAGS_GET_UNUSED(fsp_flags);
+	ulint	fsp_page_compression = FSP_FLAGS_GET_PAGE_COMPRESSION(fsp_flags);
+	ulint	fsp_page_compression_level = FSP_FLAGS_GET_PAGE_COMPRESSION_LEVEL(fsp_flags);
+	ulint	fsp_atomic_writes = FSP_FLAGS_GET_ATOMIC_WRITES(fsp_flags);
+
+	DBUG_EXECUTE_IF("dict_tf_verify_flags_failure",
+			return(FALSE);); /* an injected failure must read as "no match" */
+
+	ut_ad(!table_unused);
+	ut_ad(!fsp_unused);
+	ut_ad(page_ssize == 0 || page_ssize != 0); /* silence compiler */
+	ut_ad(compact == 0 || compact == 1); /* silence compiler */
+	ut_ad(data_dir == 0 || data_dir == 1); /* silence compiler */
+	ut_ad(post_antelope == 0 || post_antelope == 1); /* silence compiler */
+
+	if (ssize != zip_ssize) {
+		fprintf(stderr,
+			"InnoDB: Error: table flags has zip_ssize " ULINTPF
+			" in the data dictionary\n"
+			"InnoDB: but the flags in file has zip_ssize " ULINTPF "\n",
+			ssize, zip_ssize);
+		return (FALSE);
+	}
+	if (atomic_blobs != fsp_atomic_blobs) {
+		fprintf(stderr,
+			"InnoDB: Error: table flags has atomic_blobs " ULINTPF
+			" in the data dictionary\n"
+			"InnoDB: but the flags in file has atomic_blobs " ULINTPF "\n",
+			atomic_blobs, fsp_atomic_blobs);
+
+		return (FALSE);
+	}
+	if (page_compression != fsp_page_compression) {
+		fprintf(stderr,
+			"InnoDB: Error: table flags has page_compression " ULINTPF
+			" in the data dictionary\n"
+			"InnoDB: but the flags in file has page_compression " ULINTPF "\n",
+			page_compression, fsp_page_compression);
+
+		return (FALSE);
+	}
+	if (page_compression_level != fsp_page_compression_level) {
+		fprintf(stderr,
+			"InnoDB: Error: table flags has page_compression_level " ULINTPF
+			" in the data dictionary\n"
+			"InnoDB: but the flags in file has page_compression_level " ULINTPF "\n",
+			page_compression_level, fsp_page_compression_level);
+
+		return (FALSE);
+	}
+
+	if (atomic_writes != fsp_atomic_writes) {
+		fprintf(stderr,
+			"InnoDB: Error: table flags has atomic writes " ULINTPF
+			" in the data dictionary\n"
+			"InnoDB: but the flags in file has atomic_writes " ULINTPF "\n",
+			atomic_writes, fsp_atomic_writes);
+
+		return (FALSE);
+	}
+
+	return(TRUE);
+}
+
+/********************************************************************//**
+Extract the page compression level from dict_table_t::flags.
+These flags are in memory, so assert that they are valid.
+@return page compression level, or 0 if not compressed */
+UNIV_INLINE
+ulint
+dict_tf_get_page_compression_level(
+/*===============================*/
+	ulint	flags)	/*!< in: flags */
+{
+	ulint page_compression_level = DICT_TF_GET_PAGE_COMPRESSION_LEVEL(flags);
+
+	ut_ad(page_compression_level <= 9); /* unsigned value: only the upper bound can fail */
+
+	return(page_compression_level);
+}
+
+/********************************************************************//**
+Extract the page compression level of a table from its flags.
+@return page compression level, or 0 if not compressed */
+UNIV_INLINE
+ulint
+dict_table_page_compression_level(
+/*==============================*/
+	const dict_table_t*	table)	/*!< in: table; its flags must have page compression set (checked with ut_ad) */
+{
+	ut_ad(table);
+	ut_ad(dict_tf_get_page_compression(table->flags));
+
+	return(dict_tf_get_page_compression_level(table->flags));
+}
+
+/********************************************************************//**
+Extract the page compression flag from table flags.
+@return true if page compressed, false if not */
+UNIV_INLINE
+ibool
+dict_tf_get_page_compression(
+/*=========================*/
+	ulint	flags)	/*!< in: flags */
+{
+	return(DICT_TF_GET_PAGE_COMPRESSION(flags));
+}
+
+/********************************************************************//**
+Check whether the table uses the page compression page format; reads table->flags.
+@return true if page compressed, false if not */
+UNIV_INLINE
+ibool
+dict_table_is_page_compressed(
+/*==========================*/
+	const dict_table_t*	table)	/*!< in: table */
+{
+	return (dict_tf_get_page_compression(table->flags));
+}
+
+/********************************************************************//**
+Extract the atomic writes flag from table flags.
+@return enumerated value of atomic writes */
+UNIV_INLINE
+atomic_writes_t
+dict_tf_get_atomic_writes(
+/*======================*/
+	ulint	flags)	/*!< in: flags */
+{
+	return((atomic_writes_t)DICT_TF_GET_ATOMIC_WRITES(flags)); /* field value cast to the enum; no range check here */
+}
+
+/********************************************************************//**
+Extract the atomic writes setting from the flags of a table.
+@return enumerated value of atomic writes */ +UNIV_INLINE +atomic_writes_t +dict_table_get_atomic_writes( +/*=========================*/ + const dict_table_t* table) /*!< in: table */ +{ + return ((atomic_writes_t)dict_tf_get_atomic_writes(table->flags)); +} diff --git a/storage/xtradb/include/dict0types.h b/storage/xtradb/include/dict0types.h index 6acb6a2dcbe06..9e2101175808f 100644 --- a/storage/xtradb/include/dict0types.h +++ b/storage/xtradb/include/dict0types.h @@ -1,6 +1,7 @@ /***************************************************************************** Copyright (c) 1996, 2013, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2013, SkySQL Ab. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -82,4 +83,12 @@ enum ib_quiesce_t { #define TEMP_TABLE_PREFIX "#sql" #define TEMP_TABLE_PATH_PREFIX "/" TEMP_TABLE_PREFIX +/** Enum values for atomic_writes table option */ +typedef enum { + ATOMIC_WRITES_DEFAULT = 0, + ATOMIC_WRITES_ON = 1, + ATOMIC_WRITES_OFF = 2 +} atomic_writes_t; + + #endif diff --git a/storage/xtradb/include/fil0fil.h b/storage/xtradb/include/fil0fil.h index 472c57fcbfc44..6b69a89969063 100644 --- a/storage/xtradb/include/fil0fil.h +++ b/storage/xtradb/include/fil0fil.h @@ -1,6 +1,7 @@ /***************************************************************************** Copyright (c) 1995, 2013, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2013, 2014, SkySQL Ab. All Rights Reserved. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -129,6 +130,13 @@ extern fil_addr_t fil_addr_null; #define FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID 34 /*!< starting from 4.1.x this contains the space id of the page */ #define FIL_PAGE_DATA 38 /*!< start of the data on the page */ +/* Following are used when page compression is used */ +#define FIL_PAGE_COMPRESSED_SIZE 2 /*!< Number of bytes used to store + actual payload data size on + compressed pages. */ +#define FIL_PAGE_COMPRESSION_ZLIB 1 /*!< Compressin algorithm ZLIB. */ +#define FIL_PAGE_COMPRESSION_LZ4 2 /*!< Compressin algorithm LZ4. */ + /* @} */ /** File page trailer @{ */ #define FIL_PAGE_END_LSN_OLD_CHKSUM 8 /*!< the low 4 bytes of this are used @@ -139,6 +147,7 @@ extern fil_addr_t fil_addr_null; /* @} */ /** File page types (values of FIL_PAGE_TYPE) @{ */ +#define FIL_PAGE_PAGE_COMPRESSED 34354 /*!< Page compressed page */ #define FIL_PAGE_INDEX 17855 /*!< B-tree node */ #define FIL_PAGE_UNDO_LOG 2 /*!< Undo log page */ #define FIL_PAGE_INODE 3 /*!< Index node */ @@ -721,8 +730,8 @@ fil_space_get_n_reserved_extents( Reads or writes data. This operation is asynchronous (aio). 
@return DB_SUCCESS, or DB_TABLESPACE_DELETED if we are trying to do i/o on a tablespace which does not exist */ -#define fil_io(type, sync, space_id, zip_size, block_offset, byte_offset, len, buf, message) \ - _fil_io(type, sync, space_id, zip_size, block_offset, byte_offset, len, buf, message, NULL) +#define fil_io(type, sync, space_id, zip_size, block_offset, byte_offset, len, buf, message, write_size) \ + _fil_io(type, sync, space_id, zip_size, block_offset, byte_offset, len, buf, message, write_size, NULL) UNIV_INTERN dberr_t @@ -752,7 +761,12 @@ _fil_io( or from where to write; in aio this must be appropriately aligned */ void* message, /*!< in: message for aio handler if non-sync - aio used, else ignored */ + aio used, else ignored */ + ulint* write_size, /*!< in/out: Actual write size initialized + after fist successfull trim + operation for this page and if + initialized we do not trim again if + actual page size does not decrease. */ trx_t* trx) __attribute__((nonnull(8))); /**********************************************************************//** @@ -1018,4 +1032,27 @@ fil_space_set_corrupt( /*==================*/ ulint space_id); +/****************************************************************//** +Acquire fil_system mutex */ +void +fil_system_enter(void); +/*==================*/ +/****************************************************************//** +Release fil_system mutex */ +void +fil_system_exit(void); +/*==================*/ +/*******************************************************************//** +Returns the table space by a given id, NULL if not found. 
*/ +fil_space_t* +fil_space_get_by_id( +/*================*/ + ulint id); /*!< in: space id */ +/*******************************************************************//** +Return space name */ +char* +fil_space_name( +/*===========*/ + fil_space_t* space); /*!< in: space */ + #endif /* fil0fil_h */ diff --git a/storage/xtradb/include/fil0pagecompress.h b/storage/xtradb/include/fil0pagecompress.h new file mode 100644 index 0000000000000..342b105401c07 --- /dev/null +++ b/storage/xtradb/include/fil0pagecompress.h @@ -0,0 +1,118 @@ +/***************************************************************************** + +Copyright (C) 2013 SkySQL Ab. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + +*****************************************************************************/ + +#ifndef fil0pagecompress_h +#define fil0pagecompress_h + +#include "fsp0fsp.h" +#include "fsp0pagecompress.h" + +/******************************************************************//** +@file include/fil0pagecompress.h +Helper functions for extracting/storing page compression and +atomic writes information to table space. 
+ +Created 11/12/2013 Jan Lindström jan.lindstrom@skysql.com +***********************************************************************/ + +/*******************************************************************//** +Returns the page compression level flag of the space, or 0 if the space +is not compressed. The tablespace must be cached in the memory cache. +@return page compression level if page compressed, ULINT_UNDEFINED if space not found */ +ulint +fil_space_get_page_compression_level( +/*=================================*/ + ulint id); /*!< in: space id */ +/*******************************************************************//** +Returns the page compression flag of the space, or false if the space +is not compressed. The tablespace must be cached in the memory cache. +@return true if page compressed, false if not or space not found */ +ibool +fil_space_is_page_compressed( +/*=========================*/ + ulint id); /*!< in: space id */ +/*******************************************************************//** +Returns the atomic writes flag of the space, or false if the space +is not using atomic writes. The tablespace must be cached in the memory cache. +@return atomic write table option value */ +atomic_writes_t +fil_space_get_atomic_writes( +/*=========================*/ + ulint id); /*!< in: space id */ +/*******************************************************************//** +Find out wheather the page is index page or not +@return true if page type index page, false if not */ +ibool +fil_page_is_index_page( +/*===================*/ + byte *buf); /*!< in: page */ + +/****************************************************************//** +Get the name of the compression algorithm used for page +compression. 
+@return compression algorithm name or "UNKNOWN" if not known*/ +const char* +fil_get_compression_alg_name( +/*=========================*/ + ulint comp_alg); /*!> FSP_FLAGS_POS_UNUSED) +/** Return the value of the PAGE_COMPRESSION field */ +#define FSP_FLAGS_GET_PAGE_COMPRESSION(flags) \ + ((flags & FSP_FLAGS_MASK_PAGE_COMPRESSION) \ + >> FSP_FLAGS_POS_PAGE_COMPRESSION) +/** Return the value of the PAGE_COMPRESSION_LEVEL field */ +#define FSP_FLAGS_GET_PAGE_COMPRESSION_LEVEL(flags) \ + ((flags & FSP_FLAGS_MASK_PAGE_COMPRESSION_LEVEL) \ + >> FSP_FLAGS_POS_PAGE_COMPRESSION_LEVEL) +/** Return the value of the ATOMIC_WRITES field */ +#define FSP_FLAGS_GET_ATOMIC_WRITES(flags) \ + ((flags & FSP_FLAGS_MASK_ATOMIC_WRITES) \ + >> FSP_FLAGS_POS_ATOMIC_WRITES) /** Set a PAGE_SSIZE into the correct bits in a given tablespace flags. */ #define FSP_FLAGS_SET_PAGE_SSIZE(flags, ssize) \ (flags | (ssize << FSP_FLAGS_POS_PAGE_SSIZE)) +/** Set a PAGE_COMPRESSION into the correct bits in a given +tablespace flags. */ +#define FSP_FLAGS_SET_PAGE_COMPRESSION(flags, compression) \ + (flags | (compression << FSP_FLAGS_POS_PAGE_COMPRESSION)) + +/** Set a PAGE_COMPRESSION_LEVEL into the correct bits in a given +tablespace flags. */ +#define FSP_FLAGS_SET_PAGE_COMPRESSION_LEVEL(flags, level) \ + (flags | (level << FSP_FLAGS_POS_PAGE_COMPRESSION_LEVEL)) +/** Set a ATOMIC_WRITES into the correct bits in a given +tablespace flags. */ +#define FSP_FLAGS_SET_ATOMIC_WRITES(flags, atomics) \ + (flags | (atomics << FSP_FLAGS_POS_ATOMIC_WRITES)) + /* @} */ /* @defgroup Tablespace Header Constants (moved from fsp0fsp.c) @{ */ diff --git a/storage/xtradb/include/fsp0fsp.ic b/storage/xtradb/include/fsp0fsp.ic index 0d81e817cc921..bc46967fab046 100644 --- a/storage/xtradb/include/fsp0fsp.ic +++ b/storage/xtradb/include/fsp0fsp.ic @@ -1,6 +1,7 @@ /***************************************************************************** Copyright (c) 1995, 2012, Oracle and/or its affiliates. All Rights Reserved. 
+Copyright (c) 2013, SkySQL Ab. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -63,6 +64,10 @@ fsp_flags_is_valid( ulint atomic_blobs = FSP_FLAGS_HAS_ATOMIC_BLOBS(flags); ulint page_ssize = FSP_FLAGS_GET_PAGE_SSIZE(flags); ulint unused = FSP_FLAGS_GET_UNUSED(flags); + ulint page_compression = FSP_FLAGS_GET_PAGE_COMPRESSION(flags); + ulint page_compression_level = FSP_FLAGS_GET_PAGE_COMPRESSION_LEVEL(flags); + ulint atomic_writes = FSP_FLAGS_GET_ATOMIC_WRITES(flags); + atomic_writes_t awrites = (atomic_writes_t)atomic_writes; DBUG_EXECUTE_IF("fsp_flags_is_valid_failure", return(false);); @@ -108,6 +113,20 @@ fsp_flags_is_valid( # error "UNIV_FORMAT_MAX != UNIV_FORMAT_B, Add more validations." #endif + /* Page compression level requires page compression and atomic blobs + to be set */ + if (page_compression_level || page_compression) { + if (!page_compression || !atomic_blobs) { + return(false); + } + } + + if ((awrites == ATOMIC_WRITES_ON || + (awrites == ATOMIC_WRITES_DEFAULT && srv_use_atomic_writes)) + && !atomic_blobs) { + return (false); + } + /* The DATA_DIR field can be used for any row type so there is nothing here to validate. */ diff --git a/storage/xtradb/include/fsp0pagecompress.h b/storage/xtradb/include/fsp0pagecompress.h new file mode 100644 index 0000000000000..4913f1d6b2913 --- /dev/null +++ b/storage/xtradb/include/fsp0pagecompress.h @@ -0,0 +1,73 @@ +/***************************************************************************** + +Copyright (C) 2013 SkySQL Ab. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. 
+ +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + +*****************************************************************************/ + +/******************************************************************//** +@file include/fsp0pagecompress.h +Helper functions for extracting/storing page compression and +atomic writes information to file space. + +Created 11/12/2013 Jan Lindström jan.lindstrom@skysql.com +***********************************************************************/ + +#ifndef fsp0pagecompress_h +#define fsp0pagecompress_h + +/**********************************************************************//** +Reads the page compression level from the first page of a tablespace. +@return page compression level, or 0 if uncompressed */ +UNIV_INTERN +ulint +fsp_header_get_compression_level( +/*=============================*/ + const page_t* page); /*!< in: first page of a tablespace */ + +/********************************************************************//** +Determine if the tablespace is page compressed from dict_table_t::flags. +@return TRUE if page compressed, FALSE if not compressed */ +UNIV_INLINE +ibool +fsp_flags_is_page_compressed( +/*=========================*/ + ulint flags); /*!< in: tablespace flags */ + +/********************************************************************//** +Extract the page compression level from tablespace flags. +A tablespace has only one physical page compression level +whether that page is compressed or not. +@return page compression level of the file-per-table tablespace, +or zero if the table is not compressed. 
*/ +UNIV_INLINE +ulint +fsp_flags_get_page_compression_level( +/*=================================*/ + ulint flags); /*!< in: tablespace flags */ + +/********************************************************************//** +Determine the tablespace is using atomic writes from dict_table_t::flags. +@return true if atomic writes is used, false if not */ +UNIV_INLINE +atomic_writes_t +fsp_flags_get_atomic_writes( +/*========================*/ + ulint flags); /*!< in: tablespace flags */ + +#ifndef UNIV_NONINL +#include "fsp0pagecompress.ic" +#endif + +#endif diff --git a/storage/xtradb/include/fsp0pagecompress.ic b/storage/xtradb/include/fsp0pagecompress.ic new file mode 100644 index 0000000000000..873f6cd401d2e --- /dev/null +++ b/storage/xtradb/include/fsp0pagecompress.ic @@ -0,0 +1,177 @@ +/***************************************************************************** + +Copyright (C) 2013, 2014, SkySQL Ab. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + +*****************************************************************************/ + +/******************************************************************//** +@file include/fsp0pagecompress.ic +Implementation for helper functions for extracting/storing page +compression and atomic writes information to file space. 
+ +Created 11/12/2013 Jan Lindström jan.lindstrom@skysql.com +***********************************************************************/ + +#include "fsp0fsp.h" + + +/********************************************************************//** +Determine if the tablespace is page compressed from dict_table_t::flags. +@return TRUE if page compressed, FALSE if not page compressed */ +UNIV_INLINE +ibool +fsp_flags_is_page_compressed( +/*=========================*/ + ulint flags) /*!< in: tablespace flags */ +{ + return(FSP_FLAGS_GET_PAGE_COMPRESSION(flags)); +} + +/********************************************************************//** +Determine the tablespace is page compression level from dict_table_t::flags. +@return page compression level or 0 if not compressed*/ +UNIV_INLINE +ulint +fsp_flags_get_page_compression_level( +/*=================================*/ + ulint flags) /*!< in: tablespace flags */ +{ + return(FSP_FLAGS_GET_PAGE_COMPRESSION_LEVEL(flags)); +} + +/********************************************************************//** +Determine the tablespace is using atomic writes from dict_table_t::flags. 
+@return atomic writes table option value (an atomic_writes_t, not a boolean) */
+UNIV_INLINE
+atomic_writes_t
+fsp_flags_get_atomic_writes(
+/*========================*/
+	ulint	flags)	/*!< in: tablespace flags */
+{
+	return((atomic_writes_t)FSP_FLAGS_GET_ATOMIC_WRITES(flags));
+}
+
+/*******************************************************************//**
+Find out whether the page is an index page or not
+@return true if page type index page, false if not */
+UNIV_INLINE
+ibool
+fil_page_is_index_page(
+/*===================*/
+	byte	*buf)	/*!< in: page; the 2-byte field at FIL_PAGE_TYPE is read */
+{
+	return(mach_read_from_2(buf+FIL_PAGE_TYPE) == FIL_PAGE_INDEX);
+}
+
+/*******************************************************************//**
+Find out whether the page is page compressed
+@return true if page is page compressed, false if not */
+UNIV_INLINE
+ibool
+fil_page_is_compressed(
+/*===================*/
+	byte	*buf)	/*!< in: page; the 2-byte field at FIL_PAGE_TYPE is read */
+{
+	return(mach_read_from_2(buf+FIL_PAGE_TYPE) == FIL_PAGE_PAGE_COMPRESSED);
+}
+
+/*******************************************************************//**
+Returns the page compression level of the space, or 0 if the space
+is not compressed. The tablespace must be cached in the memory cache.
+@return page compression level, ULINT_UNDEFINED if space not found */
+UNIV_INLINE
+ulint
+fil_space_get_page_compression_level(
+/*=================================*/
+	ulint	id)	/*!< in: space id */
+{
+	ulint	flags;
+
+	flags = fil_space_get_flags(id);
+
+	if (flags && flags != ULINT_UNDEFINED) {
+
+		return(fsp_flags_get_page_compression_level(flags));
+	}
+
+	return(flags); /* here flags is either 0 or ULINT_UNDEFINED and is passed through unchanged */
+}
+
+/*******************************************************************//**
+Extract the page compression from space.
+@return true if space is page compressed, false if space is not found
+or space is not page compressed.
*/
+UNIV_INLINE
+ibool
+fil_space_is_page_compressed(
+/*=========================*/
+	ulint	id)	/*!< in: space id */
+{
+	ulint	flags;
+
+	flags = fil_space_get_flags(id);
+
+	if (flags && flags != ULINT_UNDEFINED) {
+
+		return(fsp_flags_is_page_compressed(flags));
+	}
+
+	return(FALSE); /* flags is 0 or ULINT_UNDEFINED here; returning it would make a missing space read as TRUE */
+}
+
+/****************************************************************//**
+Get the name of the compression algorithm used for page
+compression.
+@return compression algorithm name or "UNKNOWN" if not known*/
+UNIV_INLINE
+const char*
+fil_get_compression_alg_name(
+/*=========================*/
+ ulint comp_alg) /*!space_id, 0, (ulint) (next_offset / UNIV_PAGE_SIZE), (ulint) (next_offset % UNIV_PAGE_SIZE), write_len, buf, - group); + group, 0); srv_stats.os_log_pending_writes.dec(); @@ -1975,7 +1975,7 @@ log_group_checkpoint( write_offset / UNIV_PAGE_SIZE, write_offset % UNIV_PAGE_SIZE, OS_FILE_LOG_BLOCK_SIZE, - buf, ((byte*) group + 1)); + buf, ((byte*) group + 1), 0); ut_ad(((ulint) group & 0x1UL) == 0); } @@ -2055,7 +2055,7 @@ log_group_read_checkpoint_info( fil_io(OS_FILE_READ | OS_FILE_LOG, true, group->space_id, 0, field / UNIV_PAGE_SIZE, field % UNIV_PAGE_SIZE, - OS_FILE_LOG_BLOCK_SIZE, log_sys->checkpoint_buf, NULL); + OS_FILE_LOG_BLOCK_SIZE, log_sys->checkpoint_buf, NULL, 0); } /******************************************************//** @@ -2438,7 +2438,7 @@ log_group_read_log_seg( fil_io(OS_FILE_READ | OS_FILE_LOG, sync, group->space_id, 0, (ulint) (source_offset / UNIV_PAGE_SIZE), (ulint) (source_offset % UNIV_PAGE_SIZE), - len, buf, (type == LOG_ARCHIVE) ? &log_archive_io : NULL); + len, buf, (type == LOG_ARCHIVE) ? 
&log_archive_io : NULL, 0); start_lsn += len; buf += len; @@ -2563,7 +2563,7 @@ log_group_archive_file_header_write( dest_offset / UNIV_PAGE_SIZE, dest_offset % UNIV_PAGE_SIZE, 2 * OS_FILE_LOG_BLOCK_SIZE, - buf, &log_archive_io); + buf, &log_archive_io, 0); } /******************************************************//** @@ -2600,7 +2600,7 @@ log_group_archive_completed_header_write( dest_offset % UNIV_PAGE_SIZE, OS_FILE_LOG_BLOCK_SIZE, buf + LOG_FILE_ARCH_COMPLETED, - &log_archive_io); + &log_archive_io, 0); } /******************************************************//** @@ -2663,12 +2663,12 @@ log_group_archive( file_handle = os_file_create(innodb_file_log_key, name, open_mode, OS_FILE_AIO, - OS_DATA_FILE, &ret); + OS_DATA_FILE, &ret, FALSE); if (!ret && (open_mode == OS_FILE_CREATE)) { file_handle = os_file_create( innodb_file_log_key, name, OS_FILE_OPEN, - OS_FILE_AIO, OS_DATA_FILE, &ret); + OS_FILE_AIO, OS_DATA_FILE, &ret, FALSE); } if (!ret) { @@ -2737,7 +2737,7 @@ log_group_archive( (ulint) (next_offset / UNIV_PAGE_SIZE), (ulint) (next_offset % UNIV_PAGE_SIZE), ut_calc_align(len, OS_FILE_LOG_BLOCK_SIZE), buf, - &log_archive_io); + &log_archive_io, 0); start_lsn += len; next_offset += len; diff --git a/storage/xtradb/log/log0online.cc b/storage/xtradb/log/log0online.cc index 8c2bc5602a97c..2438303043ca6 100644 --- a/storage/xtradb/log/log0online.cc +++ b/storage/xtradb/log/log0online.cc @@ -547,7 +547,7 @@ log_online_start_bitmap_file(void) log_bmp_sys->out.name, OS_FILE_CREATE, OS_FILE_READ_WRITE, - &success); + &success, FALSE); } if (UNIV_UNLIKELY(!success)) { @@ -707,7 +707,7 @@ log_online_read_init(void) log_bmp_sys->out.file = os_file_create_simple_no_error_handling (innodb_file_bmp_key, log_bmp_sys->out.name, OS_FILE_OPEN, - OS_FILE_READ_WRITE, &success); + OS_FILE_READ_WRITE, &success, FALSE); if (!success) { @@ -1491,7 +1491,7 @@ log_online_open_bitmap_file_read_only( bitmap_file->name, OS_FILE_OPEN, OS_FILE_READ_ONLY, - &success); + &success, FALSE); if 
(UNIV_UNLIKELY(!success)) { /* Here and below assume that bitmap file names do not diff --git a/storage/xtradb/log/log0recv.cc b/storage/xtradb/log/log0recv.cc index d0b833f2bba37..1772def9f9bae 100644 --- a/storage/xtradb/log/log0recv.cc +++ b/storage/xtradb/log/log0recv.cc @@ -2,6 +2,7 @@ Copyright (c) 1997, 2013, Oracle and/or its affiliates. All Rights Reserved. Copyright (c) 2012, Facebook Inc. +Copyright (c) 2013, SkySQL Ab. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -2131,7 +2132,7 @@ recv_apply_log_recs_for_backup(void) error = fil_io(OS_FILE_READ, true, recv_addr->space, zip_size, recv_addr->page_no, 0, zip_size, - block->page.zip.data, NULL); + block->page.zip.data, NULL, 0); if (error == DB_SUCCESS && !buf_zip_decompress(block, TRUE)) { exit(1); @@ -2141,7 +2142,7 @@ recv_apply_log_recs_for_backup(void) recv_addr->space, 0, recv_addr->page_no, 0, UNIV_PAGE_SIZE, - block->frame, NULL); + block->frame, NULL, 0); } if (error != DB_SUCCESS) { @@ -2170,13 +2171,13 @@ recv_apply_log_recs_for_backup(void) recv_addr->space, zip_size, recv_addr->page_no, 0, zip_size, - block->page.zip.data, NULL); + block->page.zip.data, NULL, 0); } else { error = fil_io(OS_FILE_WRITE, true, recv_addr->space, 0, recv_addr->page_no, 0, UNIV_PAGE_SIZE, - block->frame, NULL); + block->frame, NULL, 0); } skip_this_recv_addr: recv_addr = HASH_GET_NEXT(addr_hash, recv_addr); @@ -3144,7 +3145,7 @@ recv_recovery_from_checkpoint_start_func( fil_io(OS_FILE_READ | OS_FILE_LOG, true, max_cp_group->space_id, 0, 0, 0, LOG_FILE_HDR_SIZE, - log_hdr_buf, max_cp_group); + log_hdr_buf, max_cp_group, 0); if (0 == ut_memcmp(log_hdr_buf + LOG_FILE_WAS_CREATED_BY_HOT_BACKUP, (byte*)"ibbackup", (sizeof "ibbackup") - 1)) { @@ -3175,7 +3176,7 @@ recv_recovery_from_checkpoint_start_func( fil_io(OS_FILE_WRITE | OS_FILE_LOG, true, max_cp_group->space_id, 0, 0, 0, 
OS_FILE_LOG_BLOCK_SIZE, - log_hdr_buf, max_cp_group); + log_hdr_buf, max_cp_group, 0); } log_hdr_log_block_size @@ -3775,7 +3776,7 @@ log_group_recover_from_archive_file( file_handle = os_file_create(innodb_file_log_key, name, OS_FILE_OPEN, - OS_FILE_LOG, OS_FILE_AIO, &ret); + OS_FILE_LOG, OS_FILE_AIO, &ret, FALSE); if (ret == FALSE) { ask_again: @@ -3827,7 +3828,7 @@ log_group_recover_from_archive_file( /* Read the archive file header */ fil_io(OS_FILE_READ | OS_FILE_LOG, true, group->archive_space_id, 0, 0, 0, - LOG_FILE_HDR_SIZE, buf, NULL); + LOG_FILE_HDR_SIZE, buf, NULL, 0); /* Check if the archive file header is consistent */ @@ -3901,7 +3902,7 @@ log_group_recover_from_archive_file( fil_io(OS_FILE_READ | OS_FILE_LOG, true, group->archive_space_id, 0, read_offset / UNIV_PAGE_SIZE, - read_offset % UNIV_PAGE_SIZE, len, buf, NULL); + read_offset % UNIV_PAGE_SIZE, len, buf, NULL, 0); ret = recv_scan_log_recs( (buf_pool_get_n_pages() diff --git a/storage/xtradb/os/os0file.cc b/storage/xtradb/os/os0file.cc index 38eb5241da11c..43adf78c63c69 100644 --- a/storage/xtradb/os/os0file.cc +++ b/storage/xtradb/os/os0file.cc @@ -2,6 +2,7 @@ Copyright (c) 1995, 2013, Oracle and/or its affiliates. All Rights Reserved. Copyright (c) 2009, Percona Inc. +Copyright (c) 2013, SkySQL Ab. All Rights Reserved. Portions of this file contain modifications contributed and copyrighted by Percona Inc.. 
Those modifications are @@ -42,10 +43,16 @@ Created 10/21/1995 Heikki Tuuri #include "srv0srv.h" #include "srv0start.h" #include "fil0fil.h" +#include "fil0pagecompress.h" #include "buf0buf.h" #include "btr0types.h" #include "trx0trx.h" #include "srv0mon.h" +#include "srv0srv.h" +#ifdef HAVE_POSIX_FALLOCATE +#include "fcntl.h" +#include "linux/falloc.h" +#endif #ifndef UNIV_HOTBACKUP # include "os0sync.h" # include "os0thread.h" @@ -196,6 +203,28 @@ struct os_aio_slot_t{ and which can be used to identify which pending aio operation was completed */ + ulint bitmap; + + byte* page_compression_page; /*!< Memory allocated for + page compressed page and + freed after the write + has been completed */ + + ibool page_compression; + ulint page_compression_level; + + ulint* write_size; /*!< Actual write size initialized + after fist successfull trim + operation for this page and if + initialized we do not trim again if + actual page size does not decrease. */ + + byte* page_buf; /*!< Actual page buffer for + page compressed pages, do not + free this */ + + ibool page_compress_success; + #ifdef LINUX_NATIVE_AIO struct iocb control; /* Linux control block for aio */ int n_bytes; /* bytes written/read. */ @@ -301,6 +330,58 @@ UNIV_INTERN ulint os_n_pending_writes = 0; /** Number of pending read operations */ UNIV_INTERN ulint os_n_pending_reads = 0; +/** After first fallocate failure we will disable os_file_trim */ +UNIV_INTERN ibool os_fallocate_failed = FALSE; + +/**********************************************************************//** +Directly manipulate the allocated disk space by deallocating for the file referred to +by fd for the byte range starting at offset and continuing for len bytes. +Within the specified range, partial file system blocks are zeroed, and whole +file system blocks are removed from the file. After a successful call, +subsequent reads from this range will return zeroes. 
+@return true if success, false if error */ +UNIV_INTERN +ibool +os_file_trim( +/*=========*/ + os_file_t file, /*!< in: file to be trimmed */ + os_aio_slot_t* slot, /*!< in: slot structure */ + ulint len); /*!< in: length of area */ + +/**********************************************************************//** +Allocate memory for temporal buffer used for page compression. This +buffer is freed later. */ +UNIV_INTERN +void +os_slot_alloc_page_buf( +/*===================*/ + os_aio_slot_t* slot); /*!< in: slot structure */ + +/****************************************************************//** +Does error handling when a file operation fails. +@return TRUE if we should retry the operation */ +static +ibool +os_file_handle_error_no_exit( +/*=========================*/ + const char* name, /*!< in: name of a file or NULL */ + const char* operation, /*!< in: operation */ + ibool on_error_silent,/*!< in: if TRUE then don't print + any message to the log. */ + const char* file, /*!< in: file name */ + const ulint line); /*!< in: line */ + +/****************************************************************//** +Tries to enable the atomic write feature, if available, for the specified file +handle. 
+@return TRUE if success */ +static __attribute__((warn_unused_result)) +ibool +os_file_set_atomic_writes( +/*======================*/ + const char* name, /*!< in: name of the file */ + os_file_t file); /*!< in: handle to the file */ + #ifdef UNIV_DEBUG # ifndef UNIV_HOTBACKUP /**********************************************************************//** @@ -537,6 +618,16 @@ os_file_get_last_error_low( "InnoDB: because of either a thread exit" " or an application request.\n" "InnoDB: Retry attempt is made.\n"); + } else if (err == ECANCELED) { + fprintf(stderr, + "InnoDB: Operation canceled (%d):%s\n", + err, strerror(err)); + + if(srv_use_atomic_writes) { + fprintf(stderr, + "InnoDB: Error trying to enable atomic writes on " + "non-supported destination!\n"); + } } else { fprintf(stderr, "InnoDB: Some operating system error numbers" @@ -633,6 +724,8 @@ os_file_get_last_error_low( return(OS_FILE_AIO_RESOURCES_RESERVED); } break; + case ECANCELED: + return(OS_FILE_OPERATION_NOT_SUPPORTED); case EINTR: if (srv_use_native_aio) { return(OS_FILE_AIO_INTERRUPTED); @@ -672,9 +765,11 @@ os_file_handle_error_cond_exit( const char* operation, /*!< in: operation */ ibool should_exit, /*!< in: call exit(3) if unknown error and this parameter is TRUE */ - ibool on_error_silent)/*!< in: if TRUE then don't print + ibool on_error_silent,/*!< in: if TRUE then don't print any message to the log iff it is an unknown non-fatal error */ + const char* file, /*!< in: file name */ + const ulint line) /*!< in: line */ { ulint err; @@ -706,6 +801,9 @@ os_file_handle_error_cond_exit( os_has_said_disk_full = TRUE; + fprintf(stderr, + " InnoDB: at file %s and at line %ld\n", file, line); + fflush(stderr); return(FALSE); @@ -737,6 +835,9 @@ os_file_handle_error_cond_exit( is better to ignore on_error_silent and print an error message to the log. 
*/ + fprintf(stderr, + " InnoDB: at file %s and at line %ld\n", file, line); + if (should_exit || !on_error_silent) { ib_logf(IB_LOG_LEVEL_ERROR, "File %s: '%s' returned OS " "error " ULINTPF ".%s", name ? name : "(unknown)", @@ -760,10 +861,12 @@ ibool os_file_handle_error( /*=================*/ const char* name, /*!< in: name of a file or NULL */ - const char* operation) /*!< in: operation */ + const char* operation, /*!< in: operation */ + const char* file, /*!< in: file name */ + const ulint line) /*!< in: line */ { /* exit in case of unknown error */ - return(os_file_handle_error_cond_exit(name, operation, TRUE, FALSE)); + return(os_file_handle_error_cond_exit(name, operation, TRUE, FALSE, file, line)); } /****************************************************************//** @@ -775,12 +878,14 @@ os_file_handle_error_no_exit( /*=========================*/ const char* name, /*!< in: name of a file or NULL */ const char* operation, /*!< in: operation */ - ibool on_error_silent)/*!< in: if TRUE then don't print + ibool on_error_silent,/*!< in: if TRUE then don't print any message to the log. 
*/ + const char* file, /*!< in: file name */ + const ulint line) /*!< in: line */ { /* don't exit in case of unknown error */ return(os_file_handle_error_cond_exit( - name, operation, FALSE, on_error_silent)); + name, operation, FALSE, on_error_silent, file, line)); } #undef USE_FILE_LOCK @@ -923,7 +1028,7 @@ os_file_opendir( if (dir == INVALID_HANDLE_VALUE) { if (error_is_fatal) { - os_file_handle_error(dirname, "opendir"); + os_file_handle_error(dirname, "opendir", __FILE__, __LINE__); } return(NULL); @@ -934,7 +1039,7 @@ os_file_opendir( dir = opendir(dirname); if (dir == NULL && error_is_fatal) { - os_file_handle_error(dirname, "opendir"); + os_file_handle_error(dirname, "opendir", __FILE__, __LINE__); } return(dir); @@ -956,7 +1061,7 @@ os_file_closedir( ret = FindClose(dir); if (!ret) { - os_file_handle_error_no_exit(NULL, "closedir", FALSE); + os_file_handle_error_no_exit(NULL, "closedir", FALSE, __FILE__, __LINE__); return(-1); } @@ -968,7 +1073,7 @@ os_file_closedir( ret = closedir(dir); if (ret) { - os_file_handle_error_no_exit(NULL, "closedir", FALSE); + os_file_handle_error_no_exit(NULL, "closedir", FALSE, __FILE__, __LINE__); } return(ret); @@ -1040,7 +1145,7 @@ os_file_readdir_next_file( return(1); } else { - os_file_handle_error_no_exit(NULL, "readdir_next_file", FALSE); + os_file_handle_error_no_exit(NULL, "readdir_next_file", FALSE, __FILE__, __LINE__); return(-1); } #else @@ -1126,7 +1231,7 @@ os_file_readdir_next_file( goto next_file; } - os_file_handle_error_no_exit(full_path, "stat", FALSE); + os_file_handle_error_no_exit(full_path, "stat", FALSE, __FILE__, __LINE__); ut_free(full_path); @@ -1177,7 +1282,7 @@ os_file_create_directory( && !fail_if_exists))) { os_file_handle_error_no_exit( - pathname, "CreateDirectory", FALSE); + pathname, "CreateDirectory", FALSE, __FILE__, __LINE__); return(FALSE); } @@ -1190,7 +1295,7 @@ os_file_create_directory( if (!(rcode == 0 || (errno == EEXIST && !fail_if_exists))) { /* failure */ - 
os_file_handle_error_no_exit(pathname, "mkdir", FALSE); + os_file_handle_error_no_exit(pathname, "mkdir", FALSE, __FILE__, __LINE__); return(FALSE); } @@ -1300,7 +1405,7 @@ os_file_create_simple_func( retry = os_file_handle_error( name, create_mode == OS_FILE_OPEN ? - "open" : "create"); + "open" : "create", __FILE__, __LINE__); } else { *success = TRUE; @@ -1368,7 +1473,7 @@ os_file_create_simple_func( retry = os_file_handle_error( name, create_mode == OS_FILE_OPEN - ? "open" : "create"); + ? "open" : "create", __FILE__, __LINE__); } else { *success = TRUE; retry = false; @@ -1410,9 +1515,12 @@ os_file_create_simple_no_error_handling_func( OS_FILE_READ_WRITE, or OS_FILE_READ_ALLOW_DELETE; the last option is used by a backup program reading the file */ - ibool* success)/*!< out: TRUE if succeed, FALSE if error */ + ibool* success,/*!< out: TRUE if succeed, FALSE if error */ + ulint atomic_writes) /*! in: atomic writes table option + value */ { os_file_t file; + atomic_writes_t awrites = (atomic_writes_t) atomic_writes; *success = FALSE; #ifdef __WIN__ @@ -1473,6 +1581,15 @@ os_file_create_simple_no_error_handling_func( attributes, NULL); // No template file + if (file != INVALID_HANDLE_VALUE + && (awrites == ATOMIC_WRITES_ON || + (srv_use_atomic_writes && awrites == ATOMIC_WRITES_DEFAULT)) + && !os_file_set_atomic_writes(name, file)) { + CloseHandle(file); + *success = FALSE; + file = INVALID_HANDLE_VALUE; + } + *success = (file != INVALID_HANDLE_VALUE); #else /* __WIN__ */ int create_flag; @@ -1533,6 +1650,15 @@ os_file_create_simple_no_error_handling_func( } #endif /* USE_FILE_LOCK */ + if (file != -1 + && (awrites == ATOMIC_WRITES_ON || + (srv_use_atomic_writes && awrites == ATOMIC_WRITES_DEFAULT)) + && !os_file_set_atomic_writes(name, file)) { + *success = FALSE; + close(file); + file = -1; + } + #endif /* __WIN__ */ return(file); @@ -1602,7 +1728,7 @@ os_file_set_atomic_writes( if (ioctl(file, DFS_IOCTL_ATOMIC_WRITE_SET, &atomic_option)) { - 
os_file_handle_error_no_exit(name, "ioctl", FALSE); + os_file_handle_error_no_exit(name, "ioctl(DFS_IOCTL_ATOMIC_WRITE_SET)", FALSE, __FILE__, __LINE__); return(FALSE); } @@ -1636,12 +1762,15 @@ os_file_create_func( async i/o or unbuffered i/o: look in the function source code for the exact rules */ ulint type, /*!< in: OS_DATA_FILE or OS_LOG_FILE */ - ibool* success)/*!< out: TRUE if succeed, FALSE if error */ + ibool* success,/*!< out: TRUE if succeed, FALSE if error */ + ulint atomic_writes) /*! in: atomic writes table option + value */ { os_file_t file; ibool retry; ibool on_error_no_exit; ibool on_error_silent; + atomic_writes_t awrites = (atomic_writes_t) atomic_writes; #ifdef __WIN__ DBUG_EXECUTE_IF( @@ -1784,9 +1913,9 @@ os_file_create_func( if (on_error_no_exit) { retry = os_file_handle_error_no_exit( - name, operation, on_error_silent); + name, operation, on_error_silent, __FILE__, __LINE__); } else { - retry = os_file_handle_error(name, operation); + retry = os_file_handle_error(name, operation, __FILE__, __LINE__); } } else { *success = TRUE; @@ -1795,8 +1924,10 @@ os_file_create_func( } while (retry); - if (srv_use_atomic_writes && type == OS_DATA_FILE && - !os_file_set_atomic_writes(name, file)) { + if (file != INVALID_HANDLE_VALUE + && (awrites == ATOMIC_WRITES_ON || + (srv_use_atomic_writes && awrites == ATOMIC_WRITES_DEFAULT)) + && !os_file_set_atomic_writes(name, file)) { CloseHandle(file); *success = FALSE; file = INVALID_HANDLE_VALUE; @@ -1876,9 +2007,9 @@ os_file_create_func( if (on_error_no_exit) { retry = os_file_handle_error_no_exit( - name, operation, on_error_silent); + name, operation, on_error_silent, __FILE__, __LINE__); } else { - retry = os_file_handle_error(name, operation); + retry = os_file_handle_error(name, operation, __FILE__, __LINE__); } } else { *success = TRUE; @@ -1932,14 +2063,16 @@ os_file_create_func( } #endif /* USE_FILE_LOCK */ - if (srv_use_atomic_writes && type == OS_DATA_FILE + if (file != -1 + && (awrites == 
ATOMIC_WRITES_ON || + (srv_use_atomic_writes && awrites == ATOMIC_WRITES_DEFAULT)) && !os_file_set_atomic_writes(name, file)) { - *success = FALSE; close(file); file = -1; } + #endif /* __WIN__ */ return(file); @@ -1998,7 +2131,7 @@ os_file_delete_if_exists_func( ret = unlink(name); if (ret != 0 && errno != ENOENT) { - os_file_handle_error_no_exit(name, "delete", FALSE); + os_file_handle_error_no_exit(name, "delete", FALSE, __FILE__, __LINE__); return(false); } @@ -2062,7 +2195,7 @@ os_file_delete_func( ret = unlink(name); if (ret != 0) { - os_file_handle_error_no_exit(name, "delete", FALSE); + os_file_handle_error_no_exit(name, "delete", FALSE, __FILE__, __LINE__); return(false); } @@ -2106,7 +2239,7 @@ os_file_rename_func( return(TRUE); } - os_file_handle_error_no_exit(oldpath, "rename", FALSE); + os_file_handle_error_no_exit(oldpath, "rename", FALSE, __FILE__, __LINE__); return(FALSE); #else @@ -2115,7 +2248,7 @@ os_file_rename_func( ret = rename(oldpath, newpath); if (ret != 0) { - os_file_handle_error_no_exit(oldpath, "rename", FALSE); + os_file_handle_error_no_exit(oldpath, "rename", FALSE, __FILE__, __LINE__); return(FALSE); } @@ -2146,7 +2279,7 @@ os_file_close_func( return(TRUE); } - os_file_handle_error(NULL, "close"); + os_file_handle_error(NULL, "close", __FILE__, __LINE__); return(FALSE); #else @@ -2155,7 +2288,7 @@ os_file_close_func( ret = close(file); if (ret == -1) { - os_file_handle_error(NULL, "close"); + os_file_handle_error(NULL, "close", __FILE__, __LINE__); return(FALSE); } @@ -2247,6 +2380,12 @@ os_file_set_size( current_size = 0; +#ifdef UNIV_DEBUG + fprintf(stderr, "InnoDB: Note: File %s current_size %lu extended_size %lu\n", + name, os_file_get_size(file), size); +#endif + + #ifdef HAVE_POSIX_FALLOCATE if (srv_use_posix_fallocate) { @@ -2257,7 +2396,7 @@ os_file_set_size( INT64PF ", desired size " INT64PF "\n", name, current_size, size); os_file_handle_error_no_exit (name, "posix_fallocate", - FALSE); + FALSE, __FILE__, __LINE__); 
return(FALSE); } return(TRUE); @@ -2446,7 +2585,7 @@ os_file_flush_func( return(TRUE); } - os_file_handle_error(NULL, "flush"); + os_file_handle_error(NULL, "flush", __FILE__, __LINE__); /* It is a fatal error if a file flush does not succeed, because then the database can get corrupt on disk */ @@ -2500,7 +2639,7 @@ os_file_flush_func( ib_logf(IB_LOG_LEVEL_ERROR, "The OS said file flush did not succeed"); - os_file_handle_error(NULL, "flush"); + os_file_handle_error(NULL, "flush", __FILE__, __LINE__); /* It is a fatal error if a file flush does not succeed, because then the database can get corrupt on disk */ @@ -2855,6 +2994,9 @@ os_file_read_func( os_mutex_exit(os_file_count_mutex); if (ret && len == n) { + if (fil_page_is_compressed((byte *)buf)) { + fil_decompress_page(NULL, (byte *)buf, len); + } return(TRUE); } #else /* __WIN__ */ @@ -2868,6 +3010,10 @@ os_file_read_func( if ((ulint) ret == n) { + if (fil_page_is_compressed((byte *)buf)) { + fil_decompress_page(NULL, (byte *)buf, n); + } + return(TRUE); } @@ -2875,7 +3021,7 @@ os_file_read_func( "Tried to read "ULINTPF" bytes at offset " UINT64PF". 
" "Was only able to read %ld.", n, offset, (lint) ret); #endif /* __WIN__ */ - retry = os_file_handle_error(NULL, "read"); + retry = os_file_handle_error(NULL, "read", __FILE__, __LINE__); if (retry) { goto try_again; @@ -2968,10 +3114,14 @@ os_file_read_no_error_handling_func( if ((ulint) ret == n) { + if (fil_page_is_compressed((byte *)buf)) { + fil_decompress_page(NULL, (byte *)buf, n); + } + return(TRUE); } #endif /* __WIN__ */ - retry = os_file_handle_error_no_exit(NULL, "read", FALSE); + retry = os_file_handle_error_no_exit(NULL, "read", FALSE, __FILE__, __LINE__); if (retry) { goto try_again; @@ -3183,7 +3333,7 @@ os_file_status( } else if (ret) { /* file exists, but stat call failed */ - os_file_handle_error_no_exit(path, "stat", FALSE); + os_file_handle_error_no_exit(path, "stat", FALSE, __FILE__, __LINE__); return(FALSE); } @@ -3211,7 +3361,7 @@ os_file_status( } else if (ret) { /* file exists, but stat call failed */ - os_file_handle_error_no_exit(path, "stat", FALSE); + os_file_handle_error_no_exit(path, "stat", FALSE, __FILE__, __LINE__); return(FALSE); } @@ -3260,7 +3410,7 @@ os_file_get_status( } else if (ret) { /* file exists, but stat call failed */ - os_file_handle_error_no_exit(path, "stat", FALSE); + os_file_handle_error_no_exit(path, "stat", FALSE, __FILE__, __LINE__); return(DB_FAIL); @@ -3313,7 +3463,7 @@ os_file_get_status( } else if (ret) { /* file exists, but stat call failed */ - os_file_handle_error_no_exit(path, "stat", FALSE); + os_file_handle_error_no_exit(path, "stat", FALSE, __FILE__, __LINE__); return(DB_FAIL); @@ -3866,7 +4016,7 @@ os_aio_array_create( array->slots = static_cast( ut_malloc(n * sizeof(*array->slots))); - memset(array->slots, 0x0, sizeof(n * sizeof(*array->slots))); + memset(array->slots, 0x0, n * sizeof(*array->slots)); #if defined(LINUX_NATIVE_AIO) array->aio_ctx = NULL; @@ -3941,6 +4091,8 @@ os_aio_array_free( /*==============*/ os_aio_array_t*& array) /*!< in, own: array to free */ { + ulint i; + 
os_mutex_free(array->mutex); os_event_free(array->not_full); os_event_free(array->is_empty); @@ -3952,6 +4104,14 @@ os_aio_array_free( } #endif /* LINUX_NATIVE_AIO */ + for (i = 0; i < array->n_slots; i++) { + os_aio_slot_t* slot = os_aio_array_get_nth_slot(array, i); + if (slot->page_compression_page) { + ut_free(slot->page_compression_page); + slot->page_compression_page = NULL; + } + } + ut_free(array->slots); ut_free(array); @@ -4296,7 +4456,16 @@ os_aio_array_reserve_slot( to write */ os_offset_t offset, /*!< in: file offset */ ulint len, /*!< in: length of the block to read or write */ - ulint space_id) + ulint space_id, + ibool page_compression, /*!< in: is page compression used + on this file space */ + ulint page_compression_level, /*!< page compression + level to be used */ + ulint* write_size)/*!< in/out: Actual write size initialized + after fist successfull trim + operation for this page and if + initialized we do not trim again if + actual page size does not decrease. */ { os_aio_slot_t* slot = NULL; #ifdef WIN_ASYNC_IO @@ -4388,6 +4557,55 @@ os_aio_array_reserve_slot( slot->io_already_done = FALSE; slot->space_id = space_id; + slot->page_compress_success = FALSE; + slot->write_size = write_size; + slot->page_compression_level = page_compression_level; + slot->page_compression = page_compression; + + /* If the space is page compressed and this is write operation + and if either only index pages compression is disabled or + page is index page and only index pages compression is enabled then + we compress the page */ + if (message1 && + type == OS_FILE_WRITE && + page_compression && + (srv_page_compress_index_pages == false || + (srv_page_compress_index_pages == true && fil_page_is_index_page(slot->buf)))) { + ulint real_len = len; + byte* tmp = NULL; + + /* Release the array mutex while compressing */ + os_mutex_exit(array->mutex); + + // We allocate memory for page compressed buffer if and only + // if it is not yet allocated. 
+ if (slot->page_buf == NULL) { + os_slot_alloc_page_buf(slot); + } + + ut_ad(slot->page_buf); + + /* Write buffer full of zeros, this is needed for trim, + can't really avoid this now. */ + memset(slot->page_buf, 0, len); + + tmp = fil_compress_page(fil_node_get_space_id(slot->message1), (byte *)buf, slot->page_buf, len, page_compression_level, &real_len); + + /* If compression succeeded, set up the length and buffer */ + if (tmp != buf) { + len = real_len; + buf = slot->page_buf; + slot->len = real_len; + slot->page_compress_success = TRUE; + } else { + slot->page_compress_success = FALSE; + } + + /* Take array mutex back */ + os_mutex_enter(array->mutex); + + } + #ifdef WIN_ASYNC_IO control = &slot->control; control->Offset = (DWORD) offset & 0xFFFFFFFF; @@ -4663,7 +4881,16 @@ os_aio_func( aio operation); ignored if mode is OS_AIO_SYNC */ ulint space_id, - trx_t* trx) + trx_t* trx, + ibool page_compression, /*!< in: is page compression used + on this file space */ + ulint page_compression_level, /*!< page compression + level to be used */ + ulint* write_size)/*!< in/out: Actual write size initialized + after fist successfull trim + operation for this page and if + initialized we do not trim again if + actual page size does not decrease. 
*/ { os_aio_array_t* array; os_aio_slot_t* slot; @@ -4686,7 +4913,7 @@ os_aio_func( wake_later = mode & OS_AIO_SIMULATED_WAKE_LATER; mode = mode & (~OS_AIO_SIMULATED_WAKE_LATER); - if (mode == OS_AIO_SYNC) + if (mode == OS_AIO_SYNC) { ibool ret; /* This is actually an ordinary synchronous read or write: @@ -4753,7 +4980,8 @@ os_aio_func( trx->io_read += n; } slot = os_aio_array_reserve_slot(type, array, message1, message2, file, - name, buf, offset, n, space_id); + name, buf, offset, n, space_id, + page_compression, page_compression_level, write_size); if (type == OS_FILE_READ) { if (srv_use_native_aio) { os_n_file_reads++; @@ -4811,7 +5039,7 @@ os_aio_func( os_aio_array_free_slot(array, slot); if (os_file_handle_error( - name,type == OS_FILE_READ ? "aio read" : "aio write")) { + name,type == OS_FILE_READ ? "aio read" : "aio write", __FILE__, __LINE__)) { goto try_again; } @@ -4911,7 +5139,7 @@ os_aio_windows_handle( if (ret && len == slot->len) { ret_val = TRUE; - } else if (os_file_handle_error(slot->name, "Windows aio")) { + } else if (os_file_handle_error(slot->name, "Windows aio", __FILE__, __LINE__)) { retry = TRUE; } else { @@ -4939,11 +5167,17 @@ os_aio_windows_handle( switch (slot->type) { case OS_FILE_WRITE: - ret_val = os_file_write(slot->name, slot->file, slot->buf, - slot->control.Offset, slot->control.OffsetHigh, slot->len); + if (slot->message1 && page_compression && slot->page_buf) { + ret_val = os_file_write(slot->name, slot->file, slot->page_buf, + slot->control.Offset, slot->control.OffsetHigh, slot->len); + } else { + + ret_val = os_file_write(slot->name, slot->file, slot->buf, + slot->control.Offset, slot->control.OffsetHigh, slot->len); + } break; case OS_FILE_READ: - ret_val = os_file_read(slot->file, slot->buf, + ret_val = os_file_read(slot->file, slot->buf, slot->control.Offset, slot->control.OffsetHigh, slot->len); break; default: @@ -4969,6 +5203,28 @@ os_aio_windows_handle( ret_val = ret && len == slot->len; } + if (slot->message1 && 
page_compression) { + // We allocate memory for page compressed buffer if and only + // if it is not yet allocated. + if (slot->page_buf == NULL) { + os_slot_alloc_page_buf(slot); + } + ut_ad(slot->page_buf); + + if (slot->type == OS_FILE_READ) { + if (fil_page_is_compressed(slot->buf)) { + fil_decompress_page(slot->page_buf, slot->buf, slot->len); + } + } else { + if (slot->page_compress_success && fil_page_is_compressed(slot->page_buf)) { + if (srv_use_trim && os_fallocate_failed == FALSE) { + // Deallocate unused blocks from file system + os_file_trim(slot->file, slot, slot->len); + } + } + } + } + os_aio_array_free_slot((os_aio_array_t *)slot->arr, slot); return(ret_val); @@ -5058,6 +5314,33 @@ os_aio_linux_collect( /* We have not overstepped to next segment. */ ut_a(slot->pos < end_pos); + /* If the table is page compressed and this is read, + we decompress before we annouce the read is + complete. For writes, we free the compressed page. */ + if (slot->message1 && slot->page_compression) { + // We allocate memory for page compressed buffer if and only + // if it is not yet allocated. + if (slot->page_buf == NULL) { + os_slot_alloc_page_buf(slot); + } + ut_ad(slot->page_buf); + + if (slot->type == OS_FILE_READ) { + if (fil_page_is_compressed(slot->buf)) { + fil_decompress_page(slot->page_buf, slot->buf, slot->len); + } + } else { + if (slot->page_compress_success && + fil_page_is_compressed(slot->page_buf)) { + ut_ad(slot->page_compression_page); + if (srv_use_trim && os_fallocate_failed == FALSE) { + // Deallocate unused blocks from file system + os_file_trim(slot->file, slot, slot->len); + } + } + } + } + /* Mark this request as completed. The error handling will be done in the calling function. 
*/ os_mutex_enter(array->mutex); @@ -5203,6 +5486,13 @@ os_aio_linux_handle( } else { errno = -slot->ret; + if (slot->ret == 0) { + fprintf(stderr, + "InnoDB: Number of bytes after aio %d requested %lu\n" + "InnoDB: from file %s\n", + slot->n_bytes, slot->len, slot->name); + } + /* os_file_handle_error does tell us if we should retry this IO. As it stands now, we don't do this retry when reaping requests from a different context than @@ -5210,7 +5500,7 @@ os_aio_linux_handle( windows and linux native AIO. We should probably look into this to transparently re-submit the IO. */ - os_file_handle_error(slot->name, "Linux aio"); + os_file_handle_error(slot->name, "Linux aio", __FILE__, __LINE__); ret = FALSE; } @@ -5884,3 +6174,162 @@ os_aio_all_slots_free(void) #endif /* UNIV_DEBUG */ #endif /* !UNIV_HOTBACKUP */
+
+#ifdef _WIN32
+/* CTL_CODE, FILE_DEVICE_FILE_SYSTEM and (on recent SDKs)
+FSCTL_FILE_LEVEL_TRIM are declared here. */
+#include <winioctl.h>
+/* Fallback definitions, compiled only when the SDK headers included above
+do not already provide FSCTL_FILE_LEVEL_TRIM. */
+#ifndef FSCTL_FILE_LEVEL_TRIM
+#define FSCTL_FILE_LEVEL_TRIM CTL_CODE(FILE_DEVICE_FILE_SYSTEM, 130, METHOD_BUFFERED, FILE_WRITE_DATA)
+/* One byte range to be trimmed. */
+typedef struct _FILE_LEVEL_TRIM_RANGE {
+	DWORDLONG Offset;
+	DWORDLONG Length;
+} FILE_LEVEL_TRIM_RANGE, *PFILE_LEVEL_TRIM_RANGE;
+
+/* Input buffer for FSCTL_FILE_LEVEL_TRIM: NumRanges entries in Ranges. */
+typedef struct _FILE_LEVEL_TRIM {
+	DWORD Key;
+	DWORD NumRanges;
+	FILE_LEVEL_TRIM_RANGE Ranges[1];
+} FILE_LEVEL_TRIM, *PFILE_LEVEL_TRIM;
+#endif
+#endif
+
+/**********************************************************************//**
+Directly manipulate the allocated disk space by deallocating for the file referred to
+by fd for the byte range starting at offset and continuing for len bytes.
+Within the specified range, partial file system blocks are zeroed, and whole
+file system blocks are removed from the file. After a successful call,
+subsequent reads from this range will return zeroes.
+@return true if success, false if error */ +UNIV_INTERN +ibool +os_file_trim( +/*=========*/ + os_file_t file, /*!< in: file to be trimmed */ + os_aio_slot_t* slot, /*!< in: slot structure */ + ulint len) /*!< in: length of area */ +{ + + size_t trim_len = UNIV_PAGE_SIZE - len; + os_offset_t off = slot->offset + len; + + // Nothing to do if trim length is zero or if actual write + // size is initialized and it is smaller than current write size. + // In first write if we trim we set write_size to actual bytes + // written and rest of the page is trimmed. In following writes + // there is no need to trim again if write_size only increases + // because rest of the page is already trimmed. If actual write + // size decreases we need to trim again. + if (trim_len == 0 || + (slot->write_size && + *slot->write_size > 0 && + len >= *slot->write_size)) { + +#ifdef UNIV_DEBUG + fprintf(stderr, "Note: TRIM: write_size %lu trim_len %lu len %lu\n", + *slot->write_size, trim_len, len); +#endif + + if (*slot->write_size > 0 && len >= *slot->write_size) { + srv_stats.page_compressed_trim_op_saved.inc(); + } + + *slot->write_size = len; + + return (TRUE); + } + +#ifdef __linux__ +#if defined(FALLOC_FL_PUNCH_HOLE) && defined (FALLOC_FL_KEEP_SIZE) + int ret = fallocate(file, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, off, trim_len); + + if (ret) { + /* After first failure do not try to trim again */ + os_fallocate_failed = TRUE; + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: [Warning] fallocate call failed with error code %d.\n" + " InnoDB: start: %lx len: %lu payload: %lu\n" + " InnoDB: Disabling fallocate for now.\n", ret, (slot->offset+len), trim_len, len); + + os_file_handle_error_no_exit(slot->name, + " fallocate(FALLOC_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE) ", + FALSE, __FILE__, __LINE__); + + if (slot->write_size) { + *slot->write_size = 0; + } + + return (FALSE); + } else { + if (slot->write_size) { + *slot->write_size = len; + } + } +#else + 
ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: [Warning] fallocate not supported on this installation." + " InnoDB: Disabling fallocate for now."); + os_fallocate_failed = TRUE; + slot->write_size = NULL; + +#endif /* HAVE_FALLOCATE ... */ + +#elif defined(_WIN32) + FILE_LEVEL_TRIM flt; + flt.Key = 0; + flt.NumRanges = 1; + flt.Ranges[0].Offset = off; + flt.Ranges[0].Length = trim_len; + + BOOL ret = DeviceIoControl(file,FSCTL_FILE_LEVEL_TRIM,&flt, sizeof(flt), NULL, NULL, NULL, NULL); + + if (!ret) { + /* After first failure do not try to trim again */ + os_fallocate_failed = TRUE; + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: [Warning] fallocate call failed with error.\n" + " InnoDB: start: %lx len: %du payload: %lu\n" + " InnoDB: Disabling fallocate for now.\n", (slot->offset+len), trim_len, len); + + os_file_handle_error_no_exit(slot->name, + " DeviceIOControl(FSCTL_FILE_LEVEL_TRIM) ", + FALSE, __FILE__, __LINE__); + + if (slot->write_size) { + slot->write_size = 0; + } + return (FALSE); + } else { + if (slot->write_size) { + slot->write_size = len; + } + } +#endif + +#define SECT_SIZE 512 + srv_stats.page_compression_trim_sect512.add((trim_len / SECT_SIZE)); + srv_stats.page_compression_trim_sect4096.add((trim_len / (SECT_SIZE*8))); + srv_stats.page_compressed_trim_op.inc(); + + return (TRUE); + +} + +/**********************************************************************//** +Allocate memory for temporal buffer used for page compression. This +buffer is freed later. 
*/
+UNIV_INTERN
+void
+os_slot_alloc_page_buf(
+/*===================*/
+	os_aio_slot_t*	slot)	/*!< in: slot structure; its
+				page_compression_page and page_buf
+				fields are overwritten */
+{
+	/* Allocate two pages so that a UNIV_PAGE_SIZE-aligned address can
+	be picked inside the allocation (assumes ut_align() rounds up to
+	the next multiple of its second argument -- TODO confirm). */
+	byte*	raw = static_cast<byte*>(ut_malloc(UNIV_PAGE_SIZE * 2));
+
+	/* The raw pointer is the one that os_aio_array_free() passes to
+	ut_free(), so it is kept separately from the aligned view. */
+	slot->page_compression_page = raw;
+
+	/* Aligned view into the same allocation; must not be freed. */
+	slot->page_buf = static_cast<byte*>(ut_align(raw, UNIV_PAGE_SIZE));
+}
diff --git a/storage/xtradb/srv/srv0mon.cc b/storage/xtradb/srv/srv0mon.cc index d98315ae9a2ab..0b5556ab61ae9 100644 --- a/storage/xtradb/srv/srv0mon.cc +++ b/storage/xtradb/srv/srv0mon.cc @@ -290,6 +290,12 @@ static monitor_info_t innodb_counter_info[] = MONITOR_EXISTING | MONITOR_DEFAULT_ON), MONITOR_DEFAULT_START, MONITOR_OVLD_PAGES_WRITTEN}, + {"buffer_index_pages_written", "buffer", + "Number of index pages written (innodb_index_pages_written)", + static_cast( + MONITOR_EXISTING | MONITOR_DEFAULT_ON), + MONITOR_DEFAULT_START, MONITOR_OVLD_INDEX_PAGES_WRITTEN}, + {"buffer_pages_read", "buffer", "Number of pages read (innodb_pages_read)", static_cast( @@ -879,6 +885,41 @@ static monitor_info_t innodb_counter_info[] = MONITOR_NONE, MONITOR_DEFAULT_START, MONITOR_PAD_DECREMENTS}, + {"compress_saved", "compression", + "Number of bytes saved by page compression", + MONITOR_NONE, + MONITOR_DEFAULT_START, MONITOR_OVLD_PAGE_COMPRESS_SAVED}, + + {"compress_trim_sect512", "compression", + "Number of sect-512 TRIMed by page compression", + MONITOR_NONE, + MONITOR_DEFAULT_START, MONITOR_OVLD_PAGE_COMPRESS_TRIM_SECT512}, + + {"compress_trim_sect4096", "compression", + "Number of sect-4K TRIMed by page compression", + MONITOR_NONE, + MONITOR_DEFAULT_START, MONITOR_OVLD_PAGE_COMPRESS_TRIM_SECT4096}, + + {"compress_pages_page_compressed", "compression", + "Number of pages compressed by page compression", + MONITOR_NONE, + MONITOR_DEFAULT_START, MONITOR_OVLD_PAGES_PAGE_COMPRESSED}, + + {"compress_page_compressed_trim_op", "compression", + "Number of TRIM operation performed by page compression", + MONITOR_NONE, + 
MONITOR_DEFAULT_START, MONITOR_OVLD_PAGE_COMPRESSED_TRIM_OP}, + + {"compress_page_compressed_trim_op_saved", "compression", + "Number of TRIM operation saved by page compression", + MONITOR_NONE, + MONITOR_DEFAULT_START, MONITOR_OVLD_PAGE_COMPRESSED_TRIM_OP_SAVED}, + + {"compress_pages_page_decompressed", "compression", + "Number of pages decompressed by page compression", + MONITOR_NONE, + MONITOR_DEFAULT_START, MONITOR_OVLD_PAGES_PAGE_DECOMPRESSED}, + /* ========== Counters for Index ========== */ {"module_index", "index", "Index Manager", MONITOR_MODULE, @@ -1532,6 +1573,11 @@ srv_mon_process_existing_counter( value = stat.n_pages_written; break; + /* innodb_index_pages_written, the number of page written */ + case MONITOR_OVLD_INDEX_PAGES_WRITTEN: + value = srv_stats.index_pages_written; + break; + /* innodb_pages_read */ case MONITOR_OVLD_PAGES_READ: buf_get_total_stat(&stat); @@ -1773,6 +1819,28 @@ srv_mon_process_existing_counter( value = btr_cur_n_non_sea; break; + case MONITOR_OVLD_PAGE_COMPRESS_SAVED: + value = srv_stats.page_compression_saved; + break; + case MONITOR_OVLD_PAGE_COMPRESS_TRIM_SECT512: + value = srv_stats.page_compression_trim_sect512; + break; + case MONITOR_OVLD_PAGE_COMPRESS_TRIM_SECT4096: + value = srv_stats.page_compression_trim_sect4096; + break; + case MONITOR_OVLD_PAGES_PAGE_COMPRESSED: + value = srv_stats.pages_page_compressed; + break; + case MONITOR_OVLD_PAGE_COMPRESSED_TRIM_OP: + value = srv_stats.page_compressed_trim_op; + break; + case MONITOR_OVLD_PAGE_COMPRESSED_TRIM_OP_SAVED: + value = srv_stats.page_compressed_trim_op_saved; + break; + case MONITOR_OVLD_PAGES_PAGE_DECOMPRESSED: + value = srv_stats.pages_page_decompressed; + break; + default: ut_error; } diff --git a/storage/xtradb/srv/srv0srv.cc b/storage/xtradb/srv/srv0srv.cc index 953bbba11f792..92acf847ca160 100644 --- a/storage/xtradb/srv/srv0srv.cc +++ b/storage/xtradb/srv/srv0srv.cc @@ -3,6 +3,7 @@ Copyright (c) 1995, 2013, Oracle and/or its affiliates. 
All Rights Reserved. Copyright (c) 2008, 2009 Google Inc. Copyright (c) 2009, Percona Inc. +Copyright (c) 2013, 2014, SkySQL Ab. Portions of this file contain modifications contributed and copyrighted by Google, Inc. Those modifications are gratefully acknowledged and are described @@ -160,6 +161,26 @@ use simulated aio we build below with threads. Currently we support native aio on windows and linux */ UNIV_INTERN my_bool srv_use_native_aio = TRUE; +/* If this flag is TRUE, then we will use page compression +to the pages */ +UNIV_INTERN my_bool srv_compress_pages = FALSE; +/* If this flag is TRUE, then we will use page compression +only for index pages */ +UNIV_INTERN my_bool srv_page_compress_index_pages = FALSE; +UNIV_INTERN long srv_trim_pct = 100; +/* Default compression level if page compression is used and no compression +level is set for the table*/ +UNIV_INTERN long srv_compress_zlib_level = 6; +/* If this flag is TRUE, then we will use fallocate(PUCH_HOLE) +to the pages */ +UNIV_INTERN my_bool srv_use_trim = TRUE; +/* If this flag is TRUE, then we will use posix fallocate for file extentsion */ +UNIV_INTERN my_bool srv_use_posix_fallocate = FALSE; +/* If this flag is TRUE, then we disable doublewrite buffer */ +UNIV_INTERN my_bool srv_use_atomic_writes = FALSE; +/* If this flag IS TRUE, then we use lz4 to compress/decompress pages */ +UNIV_INTERN my_bool srv_use_lz4 = FALSE; + #ifdef __WIN__ /* Windows native condition variables. We use runtime loading / function pointers, because they are not available on Windows Server 2003 and @@ -454,10 +475,6 @@ UNIV_INTERN unsigned long long srv_stats_persistent_sample_pages = 20; UNIV_INTERN my_bool srv_stats_auto_recalc = TRUE; UNIV_INTERN ibool srv_use_doublewrite_buf = TRUE; -UNIV_INTERN ibool srv_use_atomic_writes = FALSE; -#ifdef HAVE_POSIX_FALLOCATE -UNIV_INTERN ibool srv_use_posix_fallocate = FALSE; -#endif /** doublewrite buffer is 1MB is size i.e.: it can hold 128 16K pages. 
The following parameter is the size of the buffer that is used for @@ -493,6 +510,15 @@ static ulint srv_n_rows_read_old = 0; UNIV_INTERN ulint srv_truncated_status_writes = 0; UNIV_INTERN ulint srv_available_undo_logs = 0; +UNIV_INTERN ib_uint64_t srv_page_compression_saved = 0; +UNIV_INTERN ib_uint64_t srv_page_compression_trim_sect512 = 0; +UNIV_INTERN ib_uint64_t srv_page_compression_trim_sect4096 = 0; +UNIV_INTERN ib_uint64_t srv_index_pages_written = 0; +UNIV_INTERN ib_uint64_t srv_pages_page_compressed = 0; +UNIV_INTERN ib_uint64_t srv_page_compressed_trim_op = 0; +UNIV_INTERN ib_uint64_t srv_page_compressed_trim_op_saved = 0; +UNIV_INTERN ib_uint64_t srv_index_page_decompressed = 0; + /* Ensure status variables are on separate cache lines */ #define CACHE_LINE_SIZE 64 @@ -1835,6 +1861,15 @@ srv_export_innodb_status(void) export_vars.innodb_descriptors_memory = os_atomic_increment_ulint(&srv_descriptors_memory, 0); + export_vars.innodb_page_compression_saved = srv_stats.page_compression_saved; + export_vars.innodb_page_compression_trim_sect512 = srv_stats.page_compression_trim_sect512; + export_vars.innodb_page_compression_trim_sect4096 = srv_stats.page_compression_trim_sect4096; + export_vars.innodb_index_pages_written = srv_stats.index_pages_written; + export_vars.innodb_pages_page_compressed = srv_stats.pages_page_compressed; + export_vars.innodb_page_compressed_trim_op = srv_stats.page_compressed_trim_op; + export_vars.innodb_page_compressed_trim_op_saved = srv_stats.page_compressed_trim_op_saved; + export_vars.innodb_pages_page_decompressed = srv_stats.pages_page_decompressed; /* NOTE(review): the eight values exported here are read from srv_stats, not from the ib_uint64_t globals (srv_page_compression_saved and friends) this patch adds earlier in the file - confirm those globals are still needed */ + #ifdef UNIV_DEBUG rw_lock_s_lock(&purge_sys->latch); trx_id_t done_trx_no = purge_sys->done.trx_no; diff --git a/storage/xtradb/srv/srv0start.cc b/storage/xtradb/srv/srv0start.cc index 3ddfd9ab3a440..faad8c3c1339c 100644 --- a/storage/xtradb/srv/srv0start.cc +++ b/storage/xtradb/srv/srv0start.cc @@ -3,6 +3,7 @@ Copyright (c) 1996, 2013, Oracle and/or its affiliates.
All rights reserved. Copyright (c) 2008, Google Inc. Copyright (c) 2009, Percona Inc. +Copyright (c) 2013, SkySQL Ab. All Rights Reserved. Portions of this file contain modifications contributed and copyrighted by Google, Inc. Those modifications are gratefully acknowledged and are described @@ -64,6 +65,8 @@ Created 2/16/1996 Heikki Tuuri #include "ibuf0ibuf.h" #include "srv0start.h" #include "srv0srv.h" +#include "buf0flu.h" + #ifndef UNIV_HOTBACKUP # include "trx0rseg.h" # include "os0proc.h" @@ -128,8 +131,14 @@ static os_file_t files[1000]; /** io_handler_thread parameters for thread identification */ static ulint n[SRV_MAX_N_IO_THREADS + 6]; /** io_handler_thread identifiers, 32 is the maximum number of purge threads */ -static os_thread_id_t thread_ids[SRV_MAX_N_IO_THREADS + 6 - + SRV_MAX_N_PURGE_THREADS]; +/* + static os_thread_id_t thread_ids[SRV_MAX_N_IO_THREADS + 6 +/ + SRV_MAX_N_PURGE_THREADS]; +*/ +/** thread_ids reserves PGCOMP_MAX_WORKER (16) extra slots, starting at index START_PGCOMP_CNT, for the page-compression flush threads */ +#define START_PGCOMP_CNT (SRV_MAX_N_IO_THREADS + 6 + SRV_MAX_N_PURGE_THREADS) +#define PGCOMP_MAX_WORKER 16 +static os_thread_id_t thread_ids[SRV_MAX_N_IO_THREADS + 6 + SRV_MAX_N_PURGE_THREADS + PGCOMP_MAX_WORKER]; /** We use this mutex to test the return value of pthread_mutex_trylock on successful locking. HP-UX does NOT return 0, though Linux et al do.
*/ @@ -537,7 +546,7 @@ create_log_file( *file = os_file_create( innodb_file_log_key, name, OS_FILE_CREATE|OS_FILE_ON_ERROR_NO_EXIT, OS_FILE_NORMAL, - OS_LOG_FILE, &ret); + OS_LOG_FILE, &ret, FALSE); if (!ret) { ib_logf(IB_LOG_LEVEL_ERROR, "Cannot create %s", name); @@ -754,7 +763,7 @@ open_log_file( *file = os_file_create(innodb_file_log_key, name, OS_FILE_OPEN, OS_FILE_AIO, - OS_LOG_FILE, &ret); + OS_LOG_FILE, &ret, FALSE); if (!ret) { ib_logf(IB_LOG_LEVEL_ERROR, "Unable to open '%s'", name); return(DB_ERROR); @@ -845,7 +854,7 @@ open_or_create_data_files( files[i] = os_file_create( innodb_file_data_key, name, OS_FILE_CREATE, - OS_FILE_NORMAL, OS_DATA_FILE, &ret); + OS_FILE_NORMAL, OS_DATA_FILE, &ret, FALSE); if (srv_read_only_mode) { @@ -888,7 +897,7 @@ open_or_create_data_files( files[i] = os_file_create( innodb_file_data_key, name, OS_FILE_OPEN_RAW, - OS_FILE_NORMAL, OS_DATA_FILE, &ret); + OS_FILE_NORMAL, OS_DATA_FILE, &ret, FALSE); if (!ret) { ib_logf(IB_LOG_LEVEL_ERROR, @@ -921,17 +930,17 @@ open_or_create_data_files( files[i] = os_file_create( innodb_file_data_key, name, OS_FILE_OPEN_RAW, - OS_FILE_NORMAL, OS_DATA_FILE, &ret); + OS_FILE_NORMAL, OS_DATA_FILE, &ret, FALSE); } else if (i == 0) { files[i] = os_file_create( innodb_file_data_key, name, OS_FILE_OPEN_RETRY, - OS_FILE_NORMAL, OS_DATA_FILE, &ret); + OS_FILE_NORMAL, OS_DATA_FILE, &ret, FALSE); } else { files[i] = os_file_create( innodb_file_data_key, name, OS_FILE_OPEN, OS_FILE_NORMAL, - OS_DATA_FILE, &ret); + OS_DATA_FILE, &ret, FALSE); } if (!ret) { @@ -1122,7 +1131,7 @@ srv_undo_tablespace_create( innodb_file_data_key, name, srv_read_only_mode ? 
OS_FILE_OPEN : OS_FILE_CREATE, - OS_FILE_NORMAL, OS_DATA_FILE, &ret); + OS_FILE_NORMAL, OS_DATA_FILE, &ret, FALSE); if (srv_read_only_mode && ret) { ib_logf(IB_LOG_LEVEL_INFO, @@ -1209,7 +1218,8 @@ srv_undo_tablespace_open( | OS_FILE_ON_ERROR_SILENT, OS_FILE_NORMAL, OS_DATA_FILE, - &ret); + &ret, + FALSE); /* If the file open was successful then load the tablespace. */ @@ -1503,6 +1513,694 @@ init_log_online(void) } } +/* JAN: TODO: (no description given). The declarations below implement worker threads that take buffer pool flush work items from a shared queue. */ +/**********************************************************************************/ +extern int timediff(struct timeval *g_time, struct timeval *s_time, struct timeval *d_time); +extern ibool buf_flush_start(buf_pool_t* buf_pool, buf_flush_t flush_type); +extern void buf_flush_end(buf_pool_t* buf_pool, buf_flush_t flush_type); +extern void buf_flush_common(buf_flush_t flush_type, ulint page_count); +extern ulint buf_flush_batch(buf_pool_t* buf_pool, buf_flush_t flush_type, ulint min_n, lsn_t lsn_limit, bool limited_lru_scan, flush_counters_t*); + +typedef enum wrk_status { + WRK_ITEM_SET=0, + WRK_ITEM_START=1, + WRK_ITEM_DONE=2, + WRK_ITEM_SUCCESS=2, /* NOTE(review): same value as WRK_ITEM_DONE */ + WRK_ITEM_FAILED=3, + WRK_ITEM_STATUS_UNDEFINED +} wrk_status_t; + +typedef enum wthr_status { + WTHR_NOT_INIT=0, + WTHR_INITIALIZED=1, + WTHR_SIG_WAITING=2, + WTHR_RUNNING=3, + WTHR_NO_WORK=4, + WTHR_KILL_IT=5, + WTHR_STATUS_UNDEFINED +} wthr_status_t; + +typedef struct wrk_itm +{ + /****************************/ + /* Need to group into struct*/ + buf_pool_t* buf_pool; //buffer-pool instance + int flush_type; //flush-type for buffer-pool flush operation + int min; //minimum number of pages requested to be flushed + unsigned long long lsn_limit; //lsn limit for the buffer-pool flush operation + /****************************/ + + unsigned long result; //flush pages count + unsigned long t_usec; //time-taken in usec + long id_usr; //thread-id currently working, -1 when the item is free + wrk_status_t wi_status; //flag + struct wrk_itm *next; +} wrk_t; + +typedef enum op_q_status { + Q_NOT_INIT=0, + Q_EMPTY=1, +
Q_INITIALIZED=2, + Q_PROCESS=3, + Q_DONE=4, + Q_ERROR=5, + Q_STATUS_UNDEFINED +} q_status_t; + +typedef struct op_queue +{ + pthread_mutex_t mtx; /* held by q_remove_wrk() and q_insert_wrk_list() while they modify head/tail */ + pthread_cond_t cv; /* workers wait on wq's cv; pgcomp_handler() waits on cq's cv */ + q_status_t flag; + wrk_t *head; + wrk_t *tail; +} opq_t; + +opq_t wq, cq; /* wq: items waiting to be processed; cq: items the workers have finished */ + +typedef struct thread_sync +{ + int wthread_id; + pthread_t wthread; /* NOTE(review): pgcomp_handler_init() stores the thread_ids[] slot index here, not a thread handle */ + opq_t *wq; + opq_t *cq; + wthr_status_t wt_status; + unsigned long stat_universal_num_processed; + unsigned long stat_cycle_num_processed; +} thread_sync_t; + +/* Globals. XXX:DD these need to be cleaned up */ +int exit_flag; +ulint check_wrk_done_count; +static ulint done_cnt_flag; +static int pgc_n_threads = 8; + +thread_sync_t pc_sync[PGCOMP_MAX_WORKER]; +static wrk_t work_items[PGCOMP_MAX_WORKER]; +static int pgcomp_wrk_initialized = -1; + +int set_check_done_flag_count(int cnt) /* stores cnt in check_wrk_done_count and returns the stored value */ +{ + return(check_wrk_done_count = cnt); +} + +int set_pgcomp_wrk_init_done(void) /* marks the worker machinery as initialized; always returns 0 */ +{ + pgcomp_wrk_initialized = 1; + return 0; +} + +int is_pgcomp_wrk_init_done(void) /* returns non-zero once set_pgcomp_wrk_init_done() has run */ +{ + return(pgcomp_wrk_initialized == 1); +} + +ulint set_done_cnt_flag(ulint val) +{ + /* + * Assumption: The thread calling into set_done_cnt_flag + * needs to have "cq.mtx" acquired, else not safe.
+ */ + done_cnt_flag = val; + return done_cnt_flag; +} + + +ulint cv_done_inc_flag_sig(thread_sync_t * ppc) /* counts one finished work item and signals cq->cv once done_cnt_flag reaches check_wrk_done_count */ +{ + pthread_mutex_lock(&ppc->cq->mtx); + ppc->stat_universal_num_processed++; + ppc->stat_cycle_num_processed++; + done_cnt_flag++; + if(!(done_cnt_flag <= check_wrk_done_count)) { + fprintf(stderr, "ERROR: done_cnt:%lu check_wrk_done_count:%lu\n", + done_cnt_flag, check_wrk_done_count); + } + assert(done_cnt_flag <= check_wrk_done_count); + pthread_mutex_unlock(&ppc->cq->mtx); /* NOTE(review): from here on done_cnt_flag is read without cq->mtx held, and wq->flag is written without wq->mtx */ + if(done_cnt_flag == check_wrk_done_count) { + ppc->wq->flag = Q_DONE; + pthread_mutex_lock(&ppc->cq->mtx); + ppc->cq->flag = Q_DONE; + pthread_cond_signal(&ppc->cq->cv); + pthread_mutex_unlock(&ppc->cq->mtx); + } + return(done_cnt_flag); +} + +int q_remove_wrk(opq_t *q, wrk_t **wi) /* returns 0 and an item in *wi, 1 if the queue is empty, -1 on NULL arguments */ +{ + int ret = 0; + + if(!wi || !q) { + return -1; + } + + pthread_mutex_lock(&q->mtx); + assert(!((q->tail == NULL) && (q->head != NULL))); + assert(!((q->tail != NULL) && (q->head == NULL))); + + /* detach the first item in the list and hand it to the caller */ + *wi = q->head; + if(q->head) { + ret = 0; + q->head = q->head->next; + (*wi)->next = NULL; + if(!q->head) { + q->tail = NULL; + } + } else { + q->tail = NULL; + ret = 1; /* indicating remove from queue failed */ + } + pthread_mutex_unlock(&q->mtx); + return (ret); +} + +int is_busy_wrk_itm(wrk_t *wi) /* returns -1 for NULL, 0 when id_usr is -1, otherwise 1 */ +{ + if(!wi) { + return -1; + } + return(!(wi->id_usr == -1)); +} + +int setup_wrk_itm(int items) +{ + int i; /* NOTE(review): in this copy of the patch the rest of setup_wrk_itm() and the head of what appears to be init_queue() are missing from the next statement */ + for(i=0; imtx, NULL); + pthread_cond_init(&q->cv, NULL); + q->flag = Q_INITIALIZED; + q->head = q->tail = NULL; + + return 0; +} + +#if 0 +int drain_cq(opq_t *cq, int items) +{ + int i=0; + + if(!cq) { + return -1; + } + pthread_mutex_lock(&cq->mtx); + for(i=0; ihead = cq->tail = NULL; + pthread_mutex_unlock(&cq->mtx); + return 0; +} +#endif + +int q_insert_wrk_list(opq_t *q, wrk_t *w_list) +{ + if((!q) || (!w_list)) { + fprintf(stderr, "insert failed q:%p w:%p\n", q, w_list); + return -1; + } + + pthread_mutex_lock(&q->mtx); + + assert(!((q->tail == NULL) && (q->head != 
NULL))); + assert(!((q->tail != NULL) && (q->head == NULL))); + + /* list is empty */ + if(!q->tail) { + q->head = q->tail = w_list; + } else { + /* link the first node of w_list after the current tail */ + assert(q->head != NULL); + q->tail->next = w_list; + } + + /* move tail to the last node */ + while(q->tail->next) { + q->tail = q->tail->next; + } + pthread_mutex_unlock(&q->mtx); + + return 0; +} + +int flush_pool_instance(wrk_t *wi) /* runs one flush batch for wi->buf_pool; returns 0, or -1 if wi is NULL or buf_flush_start() fails */ +{ + struct timeval p_start_time, p_end_time, d_time; + flush_counters_t n; + + if(!wi) { + fprintf(stderr, "work item invalid wi:%p\n", wi); + return -1; + } + + wi->t_usec = 0; + if (!buf_flush_start(wi->buf_pool, (buf_flush_t)wi->flush_type)) { + /* We have two choices here. If lsn_limit was + specified then skipping an instance of buffer + pool means we cannot guarantee that all pages + up to lsn_limit have been flushed. We can + return right now with failure or we can try + to flush remaining buffer pools up to the + lsn_limit. We attempt to flush other buffer + pools based on the assumption that it will + help in the retry which will follow the + failure. */ + fprintf(stderr, "flush_start Failed, flush_type:%d\n", + (buf_flush_t)wi->flush_type); + return -1; + } + +#ifdef UNIV_DEBUG + /* Record time taken for the OP in usec */ + gettimeofday(&p_start_time, 0x0); +#endif + + if((buf_flush_t)wi->flush_type == BUF_FLUSH_LRU) { + /* srv_LRU_scan_depth can be arbitrarily large value. + * We cap it with current LRU size.
+ */ + buf_pool_mutex_enter(wi->buf_pool); + wi->min = UT_LIST_GET_LEN(wi->buf_pool->LRU); + buf_pool_mutex_exit(wi->buf_pool); + wi->min = ut_min(srv_LRU_scan_depth,wi->min); + } + + buf_flush_batch(wi->buf_pool, + (buf_flush_t)wi->flush_type, + wi->min, wi->lsn_limit, false, &n); + + wi->result = n.flushed; + + buf_flush_end(wi->buf_pool, (buf_flush_t)wi->flush_type); + buf_flush_common((buf_flush_t)wi->flush_type, wi->result); + +#ifdef UNIV_DEBUG + gettimeofday(&p_end_time, 0x0); + timediff(&p_end_time, &p_start_time, &d_time); + + wi->t_usec = (unsigned long)(d_time.tv_usec+(d_time.tv_sec*1000000)); +#endif + + return 0; +} + +int service_page_comp_io(thread_sync_t * ppc) /* waits on wq->cv, then moves work items from wq to cq, flushing each one; returns 0, or -1 when wq is not in Q_PROCESS or has no item */ +{ + wrk_t *wi = NULL; + int ret=0; + + pthread_mutex_lock(&ppc->wq->mtx); + do{ + ppc->wt_status = WTHR_SIG_WAITING; + ret = pthread_cond_wait(&ppc->wq->cv, &ppc->wq->mtx); + ppc->wt_status = WTHR_RUNNING; + if(ret == ETIMEDOUT) { + fprintf(stderr, "ERROR ETIMEDOUT cnt_flag:[%lu] ret:%d\n", + done_cnt_flag, ret); + } else if(ret == EINVAL || ret == EPERM) { + fprintf(stderr, "ERROR EINVAL/EPERM cnt_flag:[%lu] ret:%d\n", + done_cnt_flag, ret); + } + if(ppc->wq->flag == Q_PROCESS) { /* NOTE(review): both branches of this if leave the loop, so the body runs once and the while condition is never evaluated */ + break; + } else { + pthread_mutex_unlock(&ppc->wq->mtx); + return -1; + } + } while (ppc->wq->flag == Q_PROCESS && ret == 0); + + pthread_mutex_unlock(&ppc->wq->mtx); + + while (ppc->cq->flag == Q_PROCESS) { + wi = NULL; + /* Get the work item */ + if (0 != (ret = q_remove_wrk(ppc->wq, &wi))) { + ppc->wt_status = WTHR_NO_WORK; + return -1; + } + + assert(ret==0); + assert(wi != NULL); + assert(0 == is_busy_wrk_itm(wi)); + assert(wi->id_usr == -1); + + wi->id_usr = ppc->wthread; + wi->wi_status = WRK_ITEM_START; + + /* Process work item */ + if(0 != (ret = flush_pool_instance(wi))) { + fprintf(stderr, "FLUSH op failed ret:%d\n", ret); + wi->wi_status = WRK_ITEM_FAILED; /* NOTE(review): overwritten with WRK_ITEM_SUCCESS below once the item is on cq */ + } + + ret = q_insert_wrk_list(ppc->cq, wi); + + assert(0==ret); + assert(check_wrk_done_count >= done_cnt_flag); + wi->wi_status = 
WRK_ITEM_SUCCESS; + if(check_wrk_done_count == cv_done_inc_flag_sig(ppc)) { + break; + } + } + return(0); +} + +/******************************************************************//** +Worker thread body: calls service_page_comp_io() in a loop until srv_shutdown_state is SRV_SHUTDOWN_EXIT_THREADS. @return a dummy parameter*/ +extern "C" UNIV_INTERN +os_thread_ret_t +DECLARE_THREAD(page_comp_io_thread)( +/*==========================================*/ + void * arg) +{ + thread_sync_t *ppc_io = ((thread_sync_t *)arg); + + while (srv_shutdown_state != SRV_SHUTDOWN_EXIT_THREADS) { + service_page_comp_io(ppc_io); + ppc_io->stat_cycle_num_processed = 0; + } + os_thread_exit(NULL); + OS_THREAD_DUMMY_RETURN; +} + +int print_queue_wrk_itm(opq_t *q) /* debug dump of a queue to stderr; the body is compiled only under UNIV_DEBUG */ +{ +#if UNIV_DEBUG + wrk_t *wi = NULL; + + if(!q) { + fprintf(stderr, "queue NULL\n"); + return -1; + } + + if(!q->head || !q->tail) { + assert(!(((q->tail==NULL) && (q->head!=NULL)) && ((q->tail != NULL) && (q->head == NULL)))); /* NOTE(review): the two inner conditions are mutually exclusive, so this assert can never fire; "||" was probably intended */ + fprintf(stderr, "queue empty (h:%p t:%p)\n", q->head, q->tail); + return 0; + } + + pthread_mutex_lock(&q->mtx); + for(wi = q->head; (wi != NULL) ; wi = wi->next) { + //fprintf(stderr, "- [%p] %p %lu %luus [%ld] >%p\n", + // wi, wi->buf_pool, wi->result, wi->t_usec, wi->id_usr, wi->next); + fprintf(stderr, "- [%p] [%s] >%p\n", + wi, (wi->id_usr == -1)?"free":"Busy", wi->next); + } + pthread_mutex_unlock(&q->mtx); +#endif + return(0); +} + +int print_wrk_list(wrk_t *wi_list) +{ + wrk_t *wi = wi_list; + int i=0; + + if(!wi_list) { + fprintf(stderr, "list NULL\n"); + } + + while(wi) { + fprintf(stderr, "-\t[%p]\t[%s]\t[%lu]\t[%luus] > %p\n", + wi, (wi->id_usr == -1)?"free":"Busy", wi->result, wi->t_usec, wi->next); + wi = wi->next; + i++; + } + fprintf(stderr, "list len: %d\n", i); + return 0; +} + +int pgcomp_handler(wrk_t *w_list) +{ + int ret=0; + opq_t *wrk_q=NULL, *comp_q=NULL; + + wrk_q=&wq; + comp_q=&cq; + + pthread_mutex_lock(&wrk_q->mtx); + /* setup work queue here.. 
*/ + wrk_q->flag = Q_EMPTY; + pthread_mutex_unlock(&wrk_q->mtx); + + ret = q_insert_wrk_list(wrk_q, w_list); + if(ret != 0) { + fprintf(stderr, "%s():work-queue setup FAILED wq:%p w_list:%p \n", + __FUNCTION__, &wq, w_list); + return -1; + } + +retry_submit: + pthread_mutex_lock(&wrk_q->mtx); + /* mark the work queue initialized before each submit or retry */ + wrk_q->flag = Q_INITIALIZED; + pthread_mutex_unlock(&wrk_q->mtx); + + + pthread_mutex_lock(&comp_q->mtx); + if(0 != set_done_cnt_flag(0)) { + fprintf(stderr, "FAILED %s:%d\n", __FILE__, __LINE__); + pthread_mutex_unlock(&comp_q->mtx); + return -1; + } + comp_q->flag = Q_PROCESS; + pthread_mutex_unlock(&comp_q->mtx); + + /* set the work queue to Q_PROCESS and wake every thread waiting on its cv */ + pthread_mutex_lock(&wrk_q->mtx); + wrk_q->flag = Q_PROCESS; + pthread_cond_broadcast(&wrk_q->cv); + pthread_mutex_unlock(&wrk_q->mtx); + + /* Wait on all worker-threads to complete */ + pthread_mutex_lock(&comp_q->mtx); + if (comp_q->flag != Q_DONE) { + do { + pthread_cond_wait(&comp_q->cv, &comp_q->mtx); + if(comp_q->flag != Q_DONE) { + fprintf(stderr, "[1] cv wait on CQ failed flag:%d cnt:%lu\n", + comp_q->flag, done_cnt_flag); + if (done_cnt_flag != srv_buf_pool_instances) { + fprintf(stderr, "[2] cv wait on CQ failed flag:%d cnt:%lu\n", + comp_q->flag, done_cnt_flag); + fprintf(stderr, "============\n"); + print_wrk_list(w_list); + fprintf(stderr, "============\n"); + } + continue; + } else if (done_cnt_flag != srv_buf_pool_instances) { + fprintf(stderr, "[3]cv wait on CQ failed flag:%d cnt:%lu\n", + comp_q->flag, done_cnt_flag); + fprintf(stderr, "============\n"); + print_wrk_list(w_list); + fprintf(stderr, "============\n"); + comp_q->flag = Q_INITIALIZED; + pthread_mutex_unlock(&comp_q->mtx); + goto retry_submit; /* NOTE(review): the assert and continue that follow are unreachable */ + + assert(!done_cnt_flag); + continue; + } + assert(done_cnt_flag == srv_buf_pool_instances); + + if ((comp_q->flag == Q_DONE) && + (done_cnt_flag == srv_buf_pool_instances)) { + break; + } + } while((comp_q->flag == Q_INITIALIZED) && + (done_cnt_flag != 
srv_buf_pool_instances)); + } else { + fprintf(stderr, "[4] cv wait on CQ failed flag:%d cnt:%lu\n", + comp_q->flag, done_cnt_flag); + if (!done_cnt_flag) { + fprintf(stderr, "============\n"); + print_wrk_list(w_list); + fprintf(stderr, "============\n"); + comp_q->flag = Q_INITIALIZED; + pthread_mutex_unlock(&comp_q->mtx); + goto retry_submit; + assert(!done_cnt_flag); + } + assert(done_cnt_flag == srv_buf_pool_instances); + } + + pthread_mutex_unlock(&comp_q->mtx); + pthread_mutex_lock(&wrk_q->mtx); + wrk_q->flag = Q_DONE; + pthread_mutex_unlock(&wrk_q->mtx); + + return 0; +} + +/******************************************************************//** +Sets up the work items and both queues, then starts num_threads worker threads. @return 0 on success, -1 if already initialized or if wq or cq is NULL*/ +int pgcomp_handler_init(int num_threads, int wrk_cnt, opq_t *wq, opq_t *cq) +{ + int i=0; + + if(is_pgcomp_wrk_init_done()) { + fprintf(stderr, "pgcomp_handler_init(): ERROR already initialized\n"); + return -1; + } + + if(!wq || !cq) { + fprintf(stderr, "%s() FAILED wq:%p cq:%p\n", __FUNCTION__, wq, cq); + return -1; + } + + /* work-item setup */ + setup_wrk_itm(wrk_cnt); + + /* wq & cq setup */ + init_queue(wq); + init_queue(cq); + + /* Mark each of the thread sync entries */ + for(i=0; i < PGCOMP_MAX_WORKER; i++) { + pc_sync[i].wthread_id = i; + } + + /* Create threads for page-compression-flush. NOTE(review): num_threads is not checked against PGCOMP_MAX_WORKER, the size of pc_sync[] */ + for(i=0; i < num_threads; i++) { + pc_sync[i].wthread_id = i; + pc_sync[i].wq = wq; + pc_sync[i].cq = cq; + os_thread_create(page_comp_io_thread, ((void *)(pc_sync + i)), + thread_ids + START_PGCOMP_CNT + i); + //pc_sync[i].wthread = thread_ids[START_PGCOMP_CNT + i]; + pc_sync[i].wthread = (START_PGCOMP_CNT + i); + pc_sync[i].wt_status = WTHR_INITIALIZED; + } + + set_check_done_flag_count(wrk_cnt); + set_pgcomp_wrk_init_done(); + + return 0; +} + + +int wrk_thread_stat(thread_sync_t *wthr, unsigned int num_threads) +{ + long stat_tot=0; + unsigned int i=0; + for(i=0; i< num_threads;i++) { + stat_tot+=wthr[i].stat_universal_num_processed; + fprintf(stderr, "[%d] stat [%lu]\n", 
wthr[i].wthread_id, + wthr[i].stat_universal_num_processed); + } + fprintf(stderr, "Stat-Total:%lu\n", stat_tot); + return (0); +} + +int reset_wrk_itm(int items) +{ + int i; + + pthread_mutex_lock(&wq.mtx); + wq.head = wq.tail = NULL; + pthread_mutex_unlock(&wq.mtx); + + pthread_mutex_lock(&cq.mtx); + for(i=0;i