diff --git a/storage/xtradb/CMakeLists.txt b/storage/xtradb/CMakeLists.txt index 452f5c8e35d48..927d060d6c3cb 100644 --- a/storage/xtradb/CMakeLists.txt +++ b/storage/xtradb/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2006, 2011, Oracle and/or its affiliates. All rights reserved. +# Copyright (c) 2006, 2015, Oracle and/or its affiliates. All rights reserved. # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by @@ -29,6 +29,9 @@ IF(UNIX) ADD_DEFINITIONS(-DLINUX_NATIVE_AIO=1) LINK_LIBRARIES(aio) ENDIF() + IF(HAVE_LIBNUMA) + LINK_LIBRARIES(numa) + ENDIF() ELSEIF(CMAKE_SYSTEM_NAME MATCHES "HP*") ADD_DEFINITIONS("-DUNIV_HPUX") ELSEIF(CMAKE_SYSTEM_NAME STREQUAL "AIX") @@ -145,6 +148,18 @@ IF(NOT CMAKE_CROSSCOMPILING) }" HAVE_IB_GCC_ATOMIC_THREAD_FENCE ) + CHECK_C_SOURCE_RUNS( + "#include + int main() + { + unsigned char c; + + __atomic_test_and_set(&c, __ATOMIC_ACQUIRE); + __atomic_clear(&c, __ATOMIC_RELEASE); + return(0); + }" + HAVE_IB_GCC_ATOMIC_TEST_AND_SET + ) ENDIF() IF(HAVE_IB_GCC_ATOMIC_BUILTINS) @@ -167,6 +182,10 @@ IF(HAVE_IB_GCC_ATOMIC_THREAD_FENCE) ADD_DEFINITIONS(-DHAVE_IB_GCC_ATOMIC_THREAD_FENCE=1) ENDIF() +IF(HAVE_IB_GCC_ATOMIC_TEST_AND_SET) + ADD_DEFINITIONS(-DHAVE_IB_GCC_ATOMIC_TEST_AND_SET=1) +ENDIF() + # either define HAVE_IB_ATOMIC_PTHREAD_T_GCC or not IF(NOT CMAKE_CROSSCOMPILING) CHECK_C_SOURCE_RUNS( diff --git a/storage/xtradb/btr/btr0cur.cc b/storage/xtradb/btr/btr0cur.cc index c68d54c818c29..4c6fdd67c9526 100644 --- a/storage/xtradb/btr/btr0cur.cc +++ b/storage/xtradb/btr/btr0cur.cc @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1994, 2014, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 1994, 2015, Oracle and/or its affiliates. All Rights Reserved. Copyright (c) 2008, Google Inc. Copyright (c) 2012, Facebook Inc. @@ -284,8 +284,13 @@ btr_cur_latch_leaves( #ifdef UNIV_BTR_DEBUG ut_a(page_is_comp(get_block->frame) == page_is_comp(page)); - ut_a(btr_page_get_next(get_block->frame, mtr) - == page_get_page_no(page)); + + /* For fake_change mode we avoid a detailed validation + as it operate in tweaked format where-in validation + may fail. */ + ut_a(sibling_mode == RW_NO_LATCH + || btr_page_get_next(get_block->frame, mtr) + == page_get_page_no(page)); #endif /* UNIV_BTR_DEBUG */ if (sibling_mode == RW_NO_LATCH) { /* btr_block_get() called with RW_NO_LATCH will @@ -1383,9 +1388,6 @@ btr_cur_optimistic_insert( } #endif /* UNIV_DEBUG */ - ut_ad((thr && thr_get_trx(thr)->fake_changes) - || mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX)); - leaf = page_is_leaf(page); /* Calculate the record size when entry is converted to a record */ @@ -2265,6 +2267,7 @@ btr_cur_optimistic_update( ulint max_size; ulint new_rec_size; ulint old_rec_size; + ulint max_ins_size = 0; dtuple_t* new_entry; roll_ptr_t roll_ptr; ulint i; @@ -2394,6 +2397,10 @@ btr_cur_optimistic_update( : (old_rec_size + page_get_max_insert_size_after_reorganize(page, 1)); + if (!page_zip) { + max_ins_size = page_get_max_insert_size_after_reorganize(page, 1); + } + if (!(((max_size >= BTR_CUR_PAGE_REORGANIZE_LIMIT) && (max_size >= new_rec_size)) || (page_get_n_recs(page) <= 1))) { @@ -2459,12 +2466,15 @@ btr_cur_optimistic_update( ut_ad(err == DB_SUCCESS); func_exit: - if (page_zip - && !(flags & BTR_KEEP_IBUF_BITMAP) + if (!(flags & BTR_KEEP_IBUF_BITMAP) && !dict_index_is_clust(index) && page_is_leaf(page)) { - /* Update the free bits in the insert buffer. */ - ibuf_update_free_bits_zip(block, mtr); + + if (page_zip) { + ibuf_update_free_bits_zip(block, mtr); + } else { + ibuf_update_free_bits_low(block, max_ins_size, mtr); + } } return(err); @@ -2600,6 +2610,7 @@ btr_cur_pessimistic_update( ulint n_reserved = 0; ulint n_ext; trx_t* trx; + ulint max_ins_size = 0; *offsets = NULL; *big_rec = NULL; @@ -2800,6 +2811,10 @@ btr_cur_pessimistic_update( } } + if (!page_zip) { + max_ins_size = page_get_max_insert_size_after_reorganize(page, 1); + } + /* Store state of explicit locks on rec on the page infimum record, before deleting rec. The page infimum acts as a dummy carrier of the locks, taking care also of lock releases, before we can move the locks @@ -2845,13 +2860,18 @@ btr_cur_pessimistic_update( rec_offs_make_valid( page_cursor->rec, index, *offsets); } - } else if (page_zip && - !dict_index_is_clust(index) + } else if (!dict_index_is_clust(index) && page_is_leaf(page)) { + /* Update the free bits in the insert buffer. This is the same block which was skipped by BTR_KEEP_IBUF_BITMAP. */ - ibuf_update_free_bits_zip(block, mtr); + if (page_zip) { + ibuf_update_free_bits_zip(block, mtr); + } else { + ibuf_update_free_bits_low(block, max_ins_size, + mtr); + } } err = DB_SUCCESS; diff --git a/storage/xtradb/buf/buf0buf.cc b/storage/xtradb/buf/buf0buf.cc index 1add55dc3907c..1786ba51429ad 100644 --- a/storage/xtradb/buf/buf0buf.cc +++ b/storage/xtradb/buf/buf0buf.cc @@ -53,6 +53,10 @@ Created 11/5/1995 Heikki Tuuri #include "page0zip.h" #include "srv0mon.h" #include "buf0checksum.h" +#ifdef HAVE_LIBNUMA +#include +#include +#endif // HAVE_LIBNUMA #include "trx0trx.h" #include "srv0start.h" @@ -1125,8 +1129,7 @@ buf_chunk_init( /*===========*/ buf_pool_t* buf_pool, /*!< in: buffer pool instance */ buf_chunk_t* chunk, /*!< out: chunk of buffers */ - ulint mem_size, /*!< in: requested size in bytes */ - ibool populate) /*!< in: virtual page preallocation */ + ulint mem_size) /*!< in: requested size in bytes */ { buf_block_t* block; byte* frame; @@ -1142,13 +1145,29 @@ buf_chunk_init( + (UNIV_PAGE_SIZE - 1), UNIV_PAGE_SIZE); chunk->mem_size = mem_size; - chunk->mem = os_mem_alloc_large(&chunk->mem_size, populate); + chunk->mem = os_mem_alloc_large(&chunk->mem_size); if (UNIV_UNLIKELY(chunk->mem == NULL)) { return(NULL); } +#ifdef HAVE_LIBNUMA + if (srv_numa_interleave) { + int st = mbind(chunk->mem, chunk->mem_size, + MPOL_INTERLEAVE, + numa_all_nodes_ptr->maskp, + numa_all_nodes_ptr->size, + MPOL_MF_MOVE); + if (st != 0) { + ib_logf(IB_LOG_LEVEL_WARN, + "Failed to set NUMA memory policy of buffer" + " pool page frames to MPOL_INTERLEAVE" + " (error: %s).", strerror(errno)); + } + } +#endif // HAVE_LIBNUMA + /* Allocate the block descriptors from the start of the memory block. */ chunk->blocks = (buf_block_t*) chunk->mem; @@ -1345,7 +1364,6 @@ buf_pool_init_instance( /*===================*/ buf_pool_t* buf_pool, /*!< in: buffer pool instance */ ulint buf_pool_size, /*!< in: size in bytes */ - ibool populate, /*!< in: virtual page preallocation */ ulint instance_no) /*!< in: id of the instance */ { ulint i; @@ -1374,7 +1392,7 @@ buf_pool_init_instance( UT_LIST_INIT(buf_pool->free); - if (!buf_chunk_init(buf_pool, chunk, buf_pool_size, populate)) { + if (!buf_chunk_init(buf_pool, chunk, buf_pool_size)) { mem_free(chunk); mem_free(buf_pool); @@ -1480,7 +1498,6 @@ dberr_t buf_pool_init( /*==========*/ ulint total_size, /*!< in: size of the total pool in bytes */ - ibool populate, /*!< in: virtual page preallocation */ ulint n_instances) /*!< in: number of instances */ { ulint i; @@ -1490,13 +1507,28 @@ buf_pool_init( ut_ad(n_instances <= MAX_BUFFER_POOLS); ut_ad(n_instances == srv_buf_pool_instances); +#ifdef HAVE_LIBNUMA + if (srv_numa_interleave) { + ib_logf(IB_LOG_LEVEL_INFO, + "Setting NUMA memory policy to MPOL_INTERLEAVE"); + if (set_mempolicy(MPOL_INTERLEAVE, + numa_all_nodes_ptr->maskp, + numa_all_nodes_ptr->size) != 0) { + ib_logf(IB_LOG_LEVEL_WARN, + "Failed to set NUMA memory policy to" + " MPOL_INTERLEAVE (error: %s).", + strerror(errno)); + } + } +#endif // HAVE_LIBNUMA + buf_pool_ptr = (buf_pool_t*) mem_zalloc( n_instances * sizeof *buf_pool_ptr); for (i = 0; i < n_instances; i++) { buf_pool_t* ptr = &buf_pool_ptr[i]; - if (buf_pool_init_instance(ptr, size, populate, i) != DB_SUCCESS) { + if (buf_pool_init_instance(ptr, size, i) != DB_SUCCESS) { /* Free all the instances created so far. */ buf_pool_free(i); @@ -1510,6 +1542,18 @@ buf_pool_init( btr_search_sys_create(buf_pool_get_curr_size() / sizeof(void*) / 64); +#ifdef HAVE_LIBNUMA + if (srv_numa_interleave) { + ib_logf(IB_LOG_LEVEL_INFO, + "Setting NUMA memory policy to MPOL_DEFAULT"); + if (set_mempolicy(MPOL_DEFAULT, NULL, 0) != 0) { + ib_logf(IB_LOG_LEVEL_WARN, + "Failed to set NUMA memory policy to" + " MPOL_DEFAULT (error: %s).", strerror(errno)); + } + } +#endif // HAVE_LIBNUMA + return(DB_SUCCESS); } diff --git a/storage/xtradb/dict/dict0crea.cc b/storage/xtradb/dict/dict0crea.cc index 30523ff2af4c7..95643eef50c5e 100644 --- a/storage/xtradb/dict/dict0crea.cc +++ b/storage/xtradb/dict/dict0crea.cc @@ -1183,7 +1183,7 @@ dict_create_index_step( >= UNIV_FORMAT_B); node->index = dict_index_get_if_in_cache_low(index_id); - ut_a(!node->index == (err != DB_SUCCESS)); + ut_a((node->index == 0) == (err != DB_SUCCESS)); if (err != DB_SUCCESS) { diff --git a/storage/xtradb/dict/dict0dict.cc b/storage/xtradb/dict/dict0dict.cc index 65dfd20d6c957..6406a784348ae 100644 --- a/storage/xtradb/dict/dict0dict.cc +++ b/storage/xtradb/dict/dict0dict.cc @@ -207,14 +207,6 @@ dict_index_remove_from_cache_low( dict_index_t* index, /*!< in, own: index */ ibool lru_evict); /*!< in: TRUE if page being evicted to make room in the table LRU list */ -/**********************************************************************//** -Removes a table object from the dictionary cache. */ -static -void -dict_table_remove_from_cache_low( -/*=============================*/ - dict_table_t* table, /*!< in, own: table */ - ibool lru_evict); /*!< in: TRUE if evicting from LRU */ #ifdef UNIV_DEBUG /**********************************************************************//** Validate the dictionary table LRU list. @@ -748,6 +740,45 @@ dict_table_get_all_fts_indexes( return(ib_vector_size(indexes)); } +/** Store autoinc value when the table is evicted. +@param[in] table table evicted */ +UNIV_INTERN +void +dict_table_autoinc_store( + const dict_table_t* table) +{ + ut_ad(mutex_own(&dict_sys->mutex)); + + if (table->autoinc != 0) { + ut_ad(dict_sys->autoinc_map->find(table->id) + == dict_sys->autoinc_map->end()); + + dict_sys->autoinc_map->insert( + std::pair( + table->id, table->autoinc)); + } +} + +/** Restore autoinc value when the table is loaded. +@param[in] table table loaded */ +UNIV_INTERN +void +dict_table_autoinc_restore( + dict_table_t* table) +{ + ut_ad(mutex_own(&dict_sys->mutex)); + + autoinc_map_t::iterator it; + it = dict_sys->autoinc_map->find(table->id); + + if (it != dict_sys->autoinc_map->end()) { + table->autoinc = it->second; + ut_ad(table->autoinc != 0); + + dict_sys->autoinc_map->erase(it); + } +} + /********************************************************************//** Reads the next autoinc value (== autoinc counter value), 0 if not yet initialized. @@ -1041,6 +1072,8 @@ dict_init(void) mutex_create(dict_foreign_err_mutex_key, &dict_foreign_err_mutex, SYNC_NO_ORDER_CHECK); } + + dict_sys->autoinc_map = new autoinc_map_t(); } /**********************************************************************//** @@ -1288,6 +1321,8 @@ dict_table_add_to_cache( UT_LIST_ADD_FIRST(table_LRU, dict_sys->table_non_LRU, table); } + dict_table_autoinc_restore(table); + ut_ad(dict_lru_validate()); dict_sys->size += mem_heap_get_size(table->heap) @@ -1978,7 +2013,6 @@ dict_table_change_id_in_cache( /**********************************************************************//** Removes a table object from the dictionary cache. */ -static void dict_table_remove_from_cache_low( /*=============================*/ @@ -2040,6 +2074,10 @@ dict_table_remove_from_cache_low( ut_ad(dict_lru_validate()); + if (lru_evict) { + dict_table_autoinc_store(table); + } + if (lru_evict && table->drop_aborted) { /* Do as dict_table_try_drop_aborted() does. */ @@ -6372,6 +6410,8 @@ dict_close(void) mutex_free(&dict_foreign_err_mutex); } + delete dict_sys->autoinc_map; + mem_free(dict_sys); dict_sys = NULL; } diff --git a/storage/xtradb/handler/ha_innodb.cc b/storage/xtradb/handler/ha_innodb.cc index be47416d45f07..4daf40df10c86 100644 --- a/storage/xtradb/handler/ha_innodb.cc +++ b/storage/xtradb/handler/ha_innodb.cc @@ -1520,6 +1520,19 @@ thd_supports_xa( return(THDVAR(thd, support_xa)); } +/******************************************************************//** +Check the status of fake changes mode (innodb_fake_changes) +@return true if fake change mode is enabled. */ +UNIV_INTERN +ibool +thd_fake_changes( +/*=============*/ + THD* thd) /*!< in: thread handle, or NULL to query + the global innodb_supports_xa */ +{ + return(THDVAR((THD*) thd, fake_changes)); +} + /******************************************************************//** Returns the lock wait timeout for the current connection. @return the lock wait timeout, in seconds */ @@ -2329,7 +2342,15 @@ innobase_trx_init( trx->check_unique_secondary = !thd_test_options( thd, OPTION_RELAXED_UNIQUE_CHECKS); - trx->fake_changes = THDVAR(thd, fake_changes); + /* Transaction on start caches the fake_changes state and uses it for + complete transaction lifetime. + There are some APIs that doesn't need an active transaction object + but transaction object are just use as a cache object/data carrier. + Before using transaction object for such APIs refresh the state of + fake_changes. */ + if (trx->state == TRX_STATE_NOT_STARTED) { + trx->fake_changes = thd_fake_changes(thd); + } #ifdef EXTENDED_SLOWLOG if (thd_log_slow_verbosity(thd) & (1ULL << SLOG_V_INNODB)) { @@ -4125,15 +4146,26 @@ innobase_commit( /* No-op in XtraDB */ trx_search_latch_release_if_reserved(trx); - if (UNIV_UNLIKELY(trx->fake_changes && - (commit_trx || - (!thd_test_options(thd, + /* If fake-changes mode = ON then allow + SELECT (they are read-only) and + CREATE ... SELECT * from table (Well this doesn't open up DDL for InnoDB + as ha_innobase::create will return appropriate error if fake-change = ON + but if create is trying to use other SE and SELECT is executing on + InnoDB table then we allow SELECT to proceed. + Ideally, statement like this should be marked CREATE_SELECT like + INSERT_SELECT but unfortunately it doesn't). */ + if (UNIV_UNLIKELY(trx->fake_changes + && (thd_sql_command(thd) != SQLCOM_SELECT + && thd_sql_command(thd) != SQLCOM_CREATE_TABLE) + && (commit_trx || (!thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN))))) { /* rollback implicitly */ innobase_rollback(hton, thd, commit_trx); + /* because debug assertion code complains, if something left */ thd->get_stmt_da()->reset_diagnostics_area(); + DBUG_RETURN(HA_ERR_WRONG_COMMAND); } /* Transaction is deregistered only in a commit or a rollback. If @@ -7191,7 +7223,7 @@ ha_innobase::write_row( DBUG_ENTER("ha_innobase::write_row"); - if (srv_read_only_mode) { + if (high_level_read_only) { ib_senderrf(ha_thd(), IB_LOG_LEVEL_WARN, ER_READ_ONLY_MODE); DBUG_RETURN(HA_ERR_TABLE_READONLY); } else if (prebuilt->trx != trx) { @@ -7749,7 +7781,7 @@ ha_innobase::update_row( ut_a(prebuilt->trx == trx); - if (srv_read_only_mode) { + if (high_level_read_only) { ib_senderrf(ha_thd(), IB_LOG_LEVEL_WARN, ER_READ_ONLY_MODE); DBUG_RETURN(HA_ERR_TABLE_READONLY); } else if (!trx_is_started(trx)) { @@ -7895,7 +7927,7 @@ ha_innobase::delete_row( ut_a(prebuilt->trx == trx); - if (srv_read_only_mode) { + if (high_level_read_only) { ib_senderrf(ha_thd(), IB_LOG_LEVEL_WARN, ER_READ_ONLY_MODE); DBUG_RETURN(HA_ERR_TABLE_READONLY); } else if (!trx_is_started(trx)) { @@ -10277,7 +10309,7 @@ ha_innobase::create( if (form->s->fields > REC_MAX_N_USER_FIELDS) { DBUG_RETURN(HA_ERR_TOO_MANY_FIELDS); - } else if (srv_read_only_mode) { + } else if (high_level_read_only) { DBUG_RETURN(HA_ERR_INNODB_READ_ONLY); } @@ -10613,7 +10645,7 @@ ha_innobase::discard_or_import_tablespace( ut_a(prebuilt->trx->magic_n == TRX_MAGIC_N); ut_a(prebuilt->trx == thd_to_trx(ha_thd())); - if (srv_read_only_mode) { + if (high_level_read_only) { DBUG_RETURN(HA_ERR_TABLE_READONLY); } @@ -10711,7 +10743,7 @@ ha_innobase::truncate() DBUG_ENTER("ha_innobase::truncate"); - if (srv_read_only_mode) { + if (high_level_read_only) { DBUG_RETURN(HA_ERR_TABLE_READONLY); } @@ -11087,7 +11119,7 @@ ha_innobase::rename_table( DBUG_ENTER("ha_innobase::rename_table"); - if (srv_read_only_mode) { + if (high_level_read_only) { ib_senderrf(thd, IB_LOG_LEVEL_WARN, ER_READ_ONLY_MODE); DBUG_RETURN(HA_ERR_TABLE_READONLY); } @@ -15195,6 +15227,12 @@ innodb_internal_table_validate( } dict_table_close(user_table, FALSE, TRUE); + + DBUG_EXECUTE_IF("innodb_evict_autoinc_table", + mutex_enter(&dict_sys->mutex); + dict_table_remove_from_cache_low(user_table, TRUE); + mutex_exit(&dict_sys->mutex); + ); } return(ret); @@ -16337,6 +16375,46 @@ innodb_sched_priority_master_update( #endif /* UNIV_LINUX */ +#ifdef UNIV_DEBUG +/*************************************************************//** +Check if it is a valid value of innodb_track_changed_pages. +Changed pages tracking is not working correctly without initialization +procedure on server startup. The function allows to temporary +disable tracking, but only if the feature was enabled on startup. +This function is registered as a callback with MySQL. +@return 0 for valid innodb_track_changed_pages */ +static +int +innodb_track_changed_pages_validate( + THD* thd, /*!< in: thread handle */ + struct st_mysql_sys_var* var, /*!< in: pointer to system + variable */ + void* save, /*!< out: immediate result + for update function */ + struct st_mysql_value* value) /*!< in: incoming bool */ +{ + static bool enabled_on_startup = false; + long long intbuf = 0; + + if (value->val_int(value, &intbuf)) { + /* The value is NULL. That is invalid. */ + return 1; + } + + if (srv_track_changed_pages || enabled_on_startup) { + enabled_on_startup = true; + *reinterpret_cast(save) + = static_cast(intbuf); + return 0; + } + + if (intbuf == srv_track_changed_pages) + return 0; + + return 1; +} +#endif + /****************************************************************//** Callback function for accessing the InnoDB variables from MySQL: SHOW VARIABLES. */ @@ -17232,10 +17310,9 @@ static MYSQL_SYSVAR_LONGLONG(buffer_pool_size, innobase_buffer_pool_size, "The size of the memory buffer InnoDB uses to cache data and indexes of its tables.", NULL, NULL, 128*1024*1024L, 5*1024*1024L, LONGLONG_MAX, 1024*1024L); -static MYSQL_SYSVAR_BOOL(buffer_pool_populate, srv_buf_pool_populate, +static MYSQL_SYSVAR_BOOL(buffer_pool_populate, srv_numa_interleave, PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY, - "Preallocate (pre-fault) the page frames required for the mapping " - "established by the buffer pool memory region. Disabled by default.", + "Depricated. This option is temporary alias of --innodb-numa-interleave.", NULL, NULL, FALSE); static MYSQL_SYSVAR_ENUM(foreground_preflush, srv_foreground_preflush, @@ -17675,6 +17752,13 @@ static MYSQL_SYSVAR_BOOL(use_native_aio, srv_use_native_aio, "Use native AIO if supported on this platform.", NULL, NULL, TRUE); +#ifdef HAVE_LIBNUMA +static MYSQL_SYSVAR_BOOL(numa_interleave, srv_numa_interleave, + PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY, + "Use NUMA interleave memory policy to allocate InnoDB buffer pool.", + NULL, NULL, FALSE); +#endif // HAVE_LIBNUMA + static MYSQL_SYSVAR_BOOL(api_enable_binlog, ib_binlog_enabled, PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY, "Enable binlog for applications direct access InnoDB through InnoDB APIs", @@ -17737,7 +17821,12 @@ static MYSQL_SYSVAR_BOOL(track_changed_pages, srv_track_changed_pages, #endif , "Track the redo log for changed pages and output a changed page bitmap", - NULL, NULL, FALSE); +#ifdef UNIV_DEBUG + innodb_track_changed_pages_validate, +#else + NULL, +#endif + NULL, FALSE); static MYSQL_SYSVAR_ULONGLONG(max_bitmap_file_size, srv_max_bitmap_file_size, PLUGIN_VAR_RQCMDARG, @@ -18013,6 +18102,9 @@ static struct st_mysql_sys_var* innobase_system_variables[]= { MYSQL_SYSVAR(version), MYSQL_SYSVAR(use_sys_malloc), MYSQL_SYSVAR(use_native_aio), +#ifdef HAVE_LIBNUMA + MYSQL_SYSVAR(numa_interleave), +#endif // HAVE_LIBNUMA MYSQL_SYSVAR(change_buffering), MYSQL_SYSVAR(change_buffer_max_size), MYSQL_SYSVAR(track_changed_pages), diff --git a/storage/xtradb/handler/handler0alter.cc b/storage/xtradb/handler/handler0alter.cc index 65a01f1cbab06..764c690485376 100644 --- a/storage/xtradb/handler/handler0alter.cc +++ b/storage/xtradb/handler/handler0alter.cc @@ -234,7 +234,7 @@ ha_innobase::check_if_supported_inplace_alter( { DBUG_ENTER("check_if_supported_inplace_alter"); - if (srv_read_only_mode) { + if (high_level_read_only) { ha_alter_info->unsupported_reason = innobase_get_err_msg(ER_READ_ONLY_MODE); DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED); @@ -2600,15 +2600,10 @@ prepare_inplace_alter_table_dict( /* Create a background transaction for the operations on the data dictionary tables. */ ctx->trx = innobase_trx_allocate(ctx->prebuilt->trx->mysql_thd); - - if (UNIV_UNLIKELY(ctx->trx->fake_changes)) { - trx_rollback_to_savepoint(ctx->trx, NULL); - trx_free_for_mysql(ctx->trx); - DBUG_RETURN(HA_ERR_WRONG_COMMAND); - } - trx_start_for_ddl(ctx->trx, TRX_DICT_OP_INDEX); + DBUG_ASSERT(!ctx->trx->fake_changes); + /* Create table containing all indexes to be built in this ALTER TABLE ADD INDEX so that they are in the correct order in the table. */ diff --git a/storage/xtradb/include/buf0buf.h b/storage/xtradb/include/buf0buf.h index 2d41639b065f2..158fc4234039a 100644 --- a/storage/xtradb/include/buf0buf.h +++ b/storage/xtradb/include/buf0buf.h @@ -214,7 +214,6 @@ dberr_t buf_pool_init( /*=========*/ ulint size, /*!< in: Size of the total pool in bytes */ - ibool populate, /*!< in: Force virtual page preallocation */ ulint n_instances); /*!< in: Number of instances */ /********************************************************************//** Frees the buffer pool at shutdown. This must not be invoked before diff --git a/storage/xtradb/include/dict0dict.h b/storage/xtradb/include/dict0dict.h index 78abbcc5398ab..7c3e51b3c81bb 100644 --- a/storage/xtradb/include/dict0dict.h +++ b/storage/xtradb/include/dict0dict.h @@ -309,6 +309,21 @@ dict_table_autoinc_initialize( dict_table_t* table, /*!< in/out: table */ ib_uint64_t value) /*!< in: next value to assign to a row */ __attribute__((nonnull)); + +/** Store autoinc value when the table is evicted. +@param[in] table table evicted */ +UNIV_INTERN +void +dict_table_autoinc_store( + const dict_table_t* table); + +/** Restore autoinc value when the table is loaded. +@param[in] table table loaded */ +UNIV_INTERN +void +dict_table_autoinc_restore( + dict_table_t* table); + /********************************************************************//** Reads the next autoinc value (== autoinc counter value), 0 if not yet initialized. @@ -368,6 +383,15 @@ dict_table_remove_from_cache( dict_table_t* table) /*!< in, own: table */ __attribute__((nonnull)); /**********************************************************************//** +Removes a table object from the dictionary cache. */ +UNIV_INTERN +void +dict_table_remove_from_cache_low( +/*=============================*/ + dict_table_t* table, /*!< in, own: table */ + ibool lru_evict); /*!< in: TRUE if table being evicted + to make room in the table LRU list */ +/**********************************************************************//** Renames a table object. @return TRUE if success */ UNIV_INTERN @@ -1543,6 +1567,8 @@ extern dict_sys_t* dict_sys; /** the data dictionary rw-latch protecting dict_sys */ extern rw_lock_t dict_operation_lock; +typedef std::map autoinc_map_t; + /* Dictionary system struct */ struct dict_sys_t{ ib_prio_mutex_t mutex; /*!< mutex protecting the data @@ -1577,6 +1603,8 @@ struct dict_sys_t{ UT_LIST_BASE_NODE_T(dict_table_t) table_non_LRU; /*!< List of tables that can't be evicted from the cache */ + autoinc_map_t* autoinc_map; /*!< Map to store table id and autoinc + when table is evicted */ }; #endif /* !UNIV_HOTBACKUP */ diff --git a/storage/xtradb/include/ha_prototypes.h b/storage/xtradb/include/ha_prototypes.h index 3ba3df66b7376..74832b59b7f7d 100644 --- a/storage/xtradb/include/ha_prototypes.h +++ b/storage/xtradb/include/ha_prototypes.h @@ -335,6 +335,16 @@ thd_supports_xa( THD* thd); /*!< in: thread handle, or NULL to query the global innodb_supports_xa */ +/******************************************************************//** +Check the status of fake changes mode (innodb_fake_changes) +@return true if fake change mode is enabled. */ +UNIV_INTERN +ibool +thd_fake_changes( +/*=============*/ + THD* thd); /*!< in: thread handle, or NULL to query + the global innodb_supports_xa */ + /******************************************************************//** Returns the lock wait timeout for the current connection. @return the lock wait timeout, in seconds */ diff --git a/storage/xtradb/include/ibuf0ibuf.ic b/storage/xtradb/include/ibuf0ibuf.ic index 21747fdceac52..a5df9f7b6b433 100644 --- a/storage/xtradb/include/ibuf0ibuf.ic +++ b/storage/xtradb/include/ibuf0ibuf.ic @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1997, 2013, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 1997, 2015, Oracle and/or its affiliates. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -128,7 +128,8 @@ ibuf_should_try( && ibuf->max_size != 0 && !dict_index_is_clust(index) && index->table->quiesce == QUIESCE_NONE - && (ignore_sec_unique || !dict_index_is_unique(index))); + && (ignore_sec_unique || !dict_index_is_unique(index)) + && srv_force_recovery < SRV_FORCE_NO_IBUF_MERGE); } /******************************************************************//** diff --git a/storage/xtradb/include/os0file.h b/storage/xtradb/include/os0file.h index 05dc59d613378..de5ef1b59fb06 100644 --- a/storage/xtradb/include/os0file.h +++ b/storage/xtradb/include/os0file.h @@ -132,6 +132,10 @@ enum os_file_create_t { #define OS_FILE_READ_ONLY 333 #define OS_FILE_READ_WRITE 444 #define OS_FILE_READ_ALLOW_DELETE 555 /* for mysqlbackup */ +#define OS_FILE_READ_WRITE_CACHED 666 /* OS_FILE_READ_WRITE but never + O_DIRECT. Only for + os_file_create_simple_no_error_handling + currently. */ /* Options for file_create */ #define OS_FILE_AIO 61 @@ -540,9 +544,11 @@ os_file_create_simple_no_error_handling_func( null-terminated string */ ulint create_mode,/*!< in: create mode */ ulint access_type,/*!< in: OS_FILE_READ_ONLY, - OS_FILE_READ_WRITE, or - OS_FILE_READ_ALLOW_DELETE; the last option is - used by a backup program reading the file */ + OS_FILE_READ_WRITE, + OS_FILE_READ_ALLOW_DELETE (used by a backup + program reading the file), or + OS_FILE_READ_WRITE_CACHED (disable O_DIRECT + if it would be enabled otherwise) */ ibool* success)/*!< out: TRUE if succeed, FALSE if error */ __attribute__((nonnull, warn_unused_result)); /****************************************************************//** diff --git a/storage/xtradb/include/os0proc.h b/storage/xtradb/include/os0proc.h index f9e88ff1a287d..613e3bd6947d9 100644 --- a/storage/xtradb/include/os0proc.h +++ b/storage/xtradb/include/os0proc.h @@ -58,8 +58,7 @@ UNIV_INTERN void* os_mem_alloc_large( /*===============*/ - ulint* n, /*!< in/out: number of bytes */ - ibool populate); /*!< in: virtual page preallocation */ + ulint* n); /*!< in/out: number of bytes */ /****************************************************************//** Frees large pages memory. */ UNIV_INTERN diff --git a/storage/xtradb/include/os0sync.h b/storage/xtradb/include/os0sync.h index 57b29fff663c4..f0bcd493d505d 100644 --- a/storage/xtradb/include/os0sync.h +++ b/storage/xtradb/include/os0sync.h @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1995, 2014, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 1995, 2015, Oracle and/or its affiliates. All Rights Reserved. Copyright (c) 2008, Google Inc. Portions of this file contain modifications contributed and copyrighted by @@ -38,6 +38,27 @@ Created 9/6/1995 Heikki Tuuri #include "ut0lst.h" #include "sync0types.h" +#if defined __i386__ || defined __x86_64__ || defined _M_IX86 \ + || defined _M_X64 || defined __WIN__ + +#define IB_STRONG_MEMORY_MODEL +#undef HAVE_IB_GCC_ATOMIC_TEST_AND_SET // Quick-and-dirty fix for bug 1519094 + +#endif /* __i386__ || __x86_64__ || _M_IX86 || _M_X64 || __WIN__ */ + +#ifdef HAVE_WINDOWS_ATOMICS +typedef LONG lock_word_t; /*!< On Windows, InterlockedExchange operates + on LONG variable */ +#elif defined(HAVE_ATOMIC_BUILTINS) && !defined(HAVE_ATOMIC_BUILTINS_BYTE) +typedef ulint lock_word_t; +#else + +#define IB_LOCK_WORD_IS_BYTE + +typedef byte lock_word_t; + +#endif /* HAVE_WINDOWS_ATOMICS */ + #ifdef __WIN__ /** Native event (slow)*/ typedef HANDLE os_native_event_t; @@ -429,14 +450,61 @@ amount to decrement. */ # define os_atomic_decrement_uint64(ptr, amount) \ os_atomic_decrement(ptr, amount) -/**********************************************************//** -Returns the old value of *ptr, atomically sets *ptr to new_val */ - -# define os_atomic_test_and_set_byte(ptr, new_val) \ - __sync_lock_test_and_set(ptr, (byte) new_val) - -# define os_atomic_test_and_set_ulint(ptr, new_val) \ - __sync_lock_test_and_set(ptr, new_val) +# if defined(HAVE_IB_GCC_ATOMIC_TEST_AND_SET) + +/** Do an atomic test-and-set. +@param[in,out] ptr Memory location to set to non-zero +@return the previous value */ +inline +lock_word_t +os_atomic_test_and_set(volatile lock_word_t* ptr) +{ + return(__atomic_test_and_set(ptr, __ATOMIC_ACQUIRE)); +} + +/** Do an atomic clear. +@param[in,out] ptr Memory location to set to zero */ +inline +void +os_atomic_clear(volatile lock_word_t* ptr) +{ + __atomic_clear(ptr, __ATOMIC_RELEASE); +} + +# elif defined(IB_STRONG_MEMORY_MODEL) + +/** Do an atomic test and set. +@param[in,out] ptr Memory location to set to non-zero +@return the previous value */ +inline +lock_word_t +os_atomic_test_and_set(volatile lock_word_t* ptr) +{ + return(__sync_lock_test_and_set(ptr, 1)); +} + +/** Do an atomic release. + +In theory __sync_lock_release should be used to release the lock. +Unfortunately, it does not work properly alone. The workaround is +that more conservative __sync_lock_test_and_set is used instead. + +Performance regression was observed at some conditions for Intel +architecture. Disable release barrier on Intel architecture for now. +@param[in,out] ptr Memory location to write to +@return the previous value */ +inline +lock_word_t +os_atomic_clear(volatile lock_word_t* ptr) +{ + return(__sync_lock_test_and_set(ptr, 0)); +} + +# else + +# error "Unsupported platform" + +# endif /* HAVE_IB_GCC_ATOMIC_TEST_AND_SET */ #elif defined(HAVE_IB_SOLARIS_ATOMICS) @@ -511,14 +579,51 @@ amount to decrement. */ # define os_atomic_decrement_uint64(ptr, amount) \ os_atomic_increment_uint64(ptr, -(amount)) -/**********************************************************//** -Returns the old value of *ptr, atomically sets *ptr to new_val */ - -# define os_atomic_test_and_set_byte(ptr, new_val) \ - atomic_swap_uchar(ptr, new_val) - -# define os_atomic_test_and_set_ulint(ptr, new_val) \ - atomic_swap_ulong(ptr, new_val) +# ifdef IB_LOCK_WORD_IS_BYTE + +/** Do an atomic xchg and set to non-zero. +@param[in,out] ptr Memory location to set to non-zero +@return the previous value */ +inline +lock_word_t +os_atomic_test_and_set(volatile lock_word_t* ptr) +{ + return(atomic_swap_uchar(ptr, 1)); +} + +/** Do an atomic xchg and set to zero. +@param[in,out] ptr Memory location to set to zero +@return the previous value */ +inline +lock_word_t +os_atomic_clear(volatile lock_word_t* ptr) +{ + return(atomic_swap_uchar(ptr, 0)); +} + +# else + +/** Do an atomic xchg and set to non-zero. +@param[in,out] ptr Memory location to set to non-zero +@return the previous value */ +inline +lock_word_t +os_atomic_test_and_set(volatile lock_word_t* ptr) +{ + return(atomic_swap_ulong(ptr, 1)); +} + +/** Do an atomic xchg and set to zero. +@param[in,out] ptr Memory location to set to zero +@return the previous value */ +inline +lock_word_t +os_atomic_clear(volatile lock_word_t* ptr) +{ + return(atomic_swap_ulong(ptr, 0)); +} + +# endif /* IB_LOCK_WORD_IS_BYTE */ #elif defined(HAVE_WINDOWS_ATOMICS) @@ -633,16 +738,27 @@ amount to decrement. There is no atomic substract function on Windows */ (ib_int64_t*) ptr, \ -(ib_int64_t) amount) - amount)) -/**********************************************************//** -Returns the old value of *ptr, atomically sets *ptr to new_val. -InterlockedExchange() operates on LONG, and the LONG will be -clobbered */ - -# define os_atomic_test_and_set_byte(ptr, new_val) \ - ((byte) InterlockedExchange(ptr, new_val)) - -# define os_atomic_test_and_set_ulong(ptr, new_val) \ - InterlockedExchange(ptr, new_val) +/** Do an atomic test and set. +InterlockedExchange() operates on LONG, and the LONG will be clobbered +@param[in,out] ptr Memory location to set to non-zero +@return the previous value */ +inline +lock_word_t +os_atomic_test_and_set(volatile lock_word_t* ptr) +{ + return(InterlockedExchange(ptr, 1)); +} + +/** Do an atomic release. +InterlockedExchange() operates on LONG, and the LONG will be clobbered +@param[in,out] ptr Memory location to set to zero +@return the previous value */ +inline +lock_word_t +os_atomic_clear(volatile lock_word_t* ptr) +{ + return(InterlockedExchange(ptr, 0)); +} #else # define IB_ATOMICS_STARTUP_MSG \ @@ -692,7 +808,7 @@ for synchronization */ } while (0); /** barrier definitions for memory ordering */ -#if defined __i386__ || defined __x86_64__ || defined _M_IX86 || defined _M_X64 || defined __WIN__ +#ifdef IB_STRONG_MEMORY_MODEL /* Performance regression was observed at some conditions for Intel architecture. Disable memory barrier for Intel architecture for now. */ # define os_rmb diff --git a/storage/xtradb/include/srv0srv.h b/storage/xtradb/include/srv0srv.h index 39a39c29b558a..400c3b546ccec 100644 --- a/storage/xtradb/include/srv0srv.h +++ b/storage/xtradb/include/srv0srv.h @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1995, 2013, Oracle and/or its affiliates. All rights reserved. +Copyright (c) 1995, 2015, Oracle and/or its affiliates. All rights reserved. Copyright (c) 2008, 2009, Google Inc. Copyright (c) 2009, Percona Inc. @@ -206,6 +206,9 @@ extern char* srv_arch_dir; recovery and open all tables in RO mode instead of RW mode. We don't sync the max trx id to disk either. */ extern my_bool srv_read_only_mode; +/** Set if InnoDB operates in read-only mode or innodb-force-recovery +is greater than SRV_FORCE_NO_TRX_UNDO. */ +extern my_bool high_level_read_only; /** store to its own file each table created by an user; data dictionary tables are in the system tablespace 0 */ extern my_bool srv_file_per_table; @@ -236,6 +239,7 @@ OS (provided we compiled Innobase with it in), otherwise we will use simulated aio we build below with threads. Currently we support native aio on windows and linux */ extern my_bool srv_use_native_aio; +extern my_bool srv_numa_interleave; #ifdef __WIN__ extern ibool srv_use_native_conditions; #endif /* __WIN__ */ @@ -299,7 +303,6 @@ extern my_bool srv_use_sys_malloc; extern ibool srv_use_sys_malloc; #endif /* UNIV_HOTBACKUP */ extern ulint srv_buf_pool_size; /*!< requested size in bytes */ -extern my_bool srv_buf_pool_populate; /*!< virtual page preallocation */ extern ulint srv_buf_pool_instances; /*!< requested number of buffer pool instances */ extern ulong srv_n_page_hash_locks; /*!< number of locks to protect buf_pool->page_hash */ @@ -1092,6 +1095,7 @@ struct srv_slot_t{ #else /* !UNIV_HOTBACKUP */ # define srv_use_adaptive_hash_indexes FALSE # define srv_use_native_aio FALSE +# define srv_numa_interleave FALSE # define srv_force_recovery 0UL # define srv_set_io_thread_op_info(t,info) ((void) 0) # define srv_reset_io_thread_op_info() ((void) 0) diff --git a/storage/xtradb/include/sync0sync.h b/storage/xtradb/include/sync0sync.h index e3fe3028ed106..33bf1305e383d 100644 --- a/storage/xtradb/include/sync0sync.h +++ b/storage/xtradb/include/sync0sync.h @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1995, 2012, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 1995, 2015, Oracle and/or its affiliates. All Rights Reserved. Copyright (c) 2008, Google Inc. Copyright (c) 2012, Facebook Inc. @@ -47,15 +47,6 @@ Created 9/5/1995 Heikki Tuuri extern "C" my_bool timed_mutexes; #endif /* UNIV_DEBUG && !UNIV_HOTBACKUP */ -#ifdef HAVE_WINDOWS_ATOMICS -typedef LONG lock_word_t; /*!< On Windows, InterlockedExchange operates - on LONG variable */ -#elif defined(HAVE_ATOMIC_BUILTINS) && !defined(HAVE_ATOMIC_BUILTINS_BYTE) -typedef ulint lock_word_t; -#else -typedef byte lock_word_t; -#endif - #if defined UNIV_PFS_MUTEX || defined UNIV_PFS_RWLOCK /* By default, buffer mutexes and rwlocks will be excluded from diff --git a/storage/xtradb/include/sync0sync.ic b/storage/xtradb/include/sync0sync.ic index a252d73e43284..d6561a76cdb65 100644 --- a/storage/xtradb/include/sync0sync.ic +++ b/storage/xtradb/include/sync0sync.ic @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1995, 2009, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 1995, 2015, Oracle and/or its affiliates. All Rights Reserved. Copyright (c) 2008, Google Inc. Portions of this file contain modifications contributed and copyrighted by @@ -77,17 +77,13 @@ Performs an atomic test-and-set instruction to the lock_word field of a mutex. @return the previous value of lock_word: 0 or 1 */ UNIV_INLINE -byte +lock_word_t ib_mutex_test_and_set( -/*===============*/ +/*==================*/ ib_mutex_t* mutex) /*!< in: mutex */ { #if defined(HAVE_ATOMIC_BUILTINS) -# if defined(HAVE_ATOMIC_BUILTINS_BYTE) - return(os_atomic_test_and_set_byte(&mutex->lock_word, 1)); -# else - return(os_atomic_test_and_set_ulint(&mutex->lock_word, 1)); -# endif + return(os_atomic_test_and_set(&mutex->lock_word)); #else ibool ret; @@ -103,7 +99,7 @@ ib_mutex_test_and_set( } return((byte) ret); -#endif +#endif /* HAVE_ATOMIC_BUILTINS */ } /******************************************************************//** @@ -116,19 +112,12 @@ mutex_reset_lock_word( ib_mutex_t* mutex) /*!< in: mutex */ { #if defined(HAVE_ATOMIC_BUILTINS) - /* In theory __sync_lock_release should be used to release the lock. - Unfortunately, it does not work properly alone. The workaround is - that more conservative __sync_lock_test_and_set is used instead. */ -# if defined(HAVE_ATOMIC_BUILTINS_BYTE) - os_atomic_test_and_set_byte(&mutex->lock_word, 0); -# else - os_atomic_test_and_set_ulint(&mutex->lock_word, 0); -# endif + os_atomic_clear(&mutex->lock_word); #else mutex->lock_word = 0; os_fast_mutex_unlock(&(mutex->os_fast_mutex)); -#endif +#endif /* HAVE_ATOMIC_BUILTINS */ } /******************************************************************//** diff --git a/storage/xtradb/include/univ.i b/storage/xtradb/include/univ.i index 96570854a244d..11900631a4530 100644 --- a/storage/xtradb/include/univ.i +++ b/storage/xtradb/include/univ.i @@ -47,7 +47,7 @@ Created 1/20/1994 Heikki Tuuri #define INNODB_VERSION_BUGFIX MYSQL_VERSION_PATCH #ifndef PERCONA_INNODB_VERSION -#define PERCONA_INNODB_VERSION 74.0 +#define PERCONA_INNODB_VERSION 76.0 #endif /* Enable UNIV_LOG_ARCHIVE in XtraDB */ diff --git a/storage/xtradb/lock/lock0lock.cc b/storage/xtradb/lock/lock0lock.cc index cf14eed8b2bc2..cc23b2b25df9d 100644 --- a/storage/xtradb/lock/lock0lock.cc +++ b/storage/xtradb/lock/lock0lock.cc @@ -2657,8 +2657,8 @@ lock_rec_inherit_to_gap( /* If srv_locks_unsafe_for_binlog is TRUE or session is using READ COMMITTED isolation level, we do not want locks set by an UPDATE or a DELETE to be inherited as gap type locks. But we - DO want S-locks set by a consistency constraint to be inherited also - then. */ + DO want S-locks/X-locks(taken for replace) set by a consistency + constraint to be inherited also then */ for (lock = lock_rec_get_first(block, heap_no); lock != NULL; diff --git a/storage/xtradb/log/log0log.cc b/storage/xtradb/log/log0log.cc index 8815e0ff9df46..5a23a083ab31c 100644 --- a/storage/xtradb/log/log0log.cc +++ b/storage/xtradb/log/log0log.cc @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1995, 2014, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 1995, 2015, Oracle and/or its affiliates. All Rights Reserved. Copyright (c) 2009, Google Inc. Portions of this file contain modifications contributed and copyrighted by @@ -3606,12 +3606,7 @@ logs_empty_and_mark_files_at_shutdown(void) lsn = log_sys->lsn; - ut_ad(srv_force_recovery != SRV_FORCE_NO_LOG_REDO - || lsn == log_sys->last_checkpoint_lsn + LOG_BLOCK_HDR_SIZE); - - - if ((srv_force_recovery != SRV_FORCE_NO_LOG_REDO - && lsn != log_sys->last_checkpoint_lsn) + if (lsn != log_sys->last_checkpoint_lsn || (srv_track_changed_pages && (tracked_lsn != log_sys->last_checkpoint_lsn)) #ifdef UNIV_LOG_ARCHIVE diff --git a/storage/xtradb/log/log0online.cc b/storage/xtradb/log/log0online.cc index 00ff3bf07ed35..a6ecd57ef694f 100644 --- a/storage/xtradb/log/log0online.cc +++ b/storage/xtradb/log/log0online.cc @@ -546,7 +546,7 @@ log_online_start_bitmap_file(void) innodb_file_bmp_key, log_bmp_sys->out.name, OS_FILE_CREATE, - OS_FILE_READ_WRITE, + OS_FILE_READ_WRITE_CACHED, &success); } if (UNIV_UNLIKELY(!success)) { @@ -707,7 +707,7 @@ log_online_read_init(void) log_bmp_sys->out.file = os_file_create_simple_no_error_handling (innodb_file_bmp_key, log_bmp_sys->out.name, OS_FILE_OPEN, - OS_FILE_READ_WRITE, &success); + OS_FILE_READ_WRITE_CACHED, &success); if (!success) { @@ -1490,10 +1490,20 @@ log_online_open_bitmap_file_read_only( file */ { ibool success = FALSE; + size_t srv_data_home_len; ut_ad(name[0] != '\0'); - ut_snprintf(bitmap_file->name, FN_REFLEN, "%s%s", srv_data_home, name); + srv_data_home_len = strlen(srv_data_home); + if (srv_data_home_len + && srv_data_home[srv_data_home_len-1] + != SRV_PATH_SEPARATOR) { + ut_snprintf(bitmap_file->name, FN_REFLEN, "%s%c%s", + srv_data_home, SRV_PATH_SEPARATOR, name); + } else { + ut_snprintf(bitmap_file->name, FN_REFLEN, "%s%s", + srv_data_home, name); + } bitmap_file->file = os_file_create_simple_no_error_handling(innodb_file_bmp_key, bitmap_file->name, diff --git a/storage/xtradb/log/log0recv.cc b/storage/xtradb/log/log0recv.cc index 38d9215ce393e..82b0af828b051 100644 --- a/storage/xtradb/log/log0recv.cc +++ b/storage/xtradb/log/log0recv.cc @@ -1926,7 +1926,7 @@ recv_apply_hashed_log_recs( goto loop; } - ut_ad((!allow_ibuf) == mutex_own(&log_sys->mutex)); + ut_ad((allow_ibuf == 0) == (mutex_own(&log_sys->mutex) != 0)); if (!allow_ibuf) { recv_no_ibuf_operations = TRUE; diff --git a/storage/xtradb/os/os0file.cc b/storage/xtradb/os/os0file.cc index 5976d79faff39..4dbd3ba943345 100644 --- a/storage/xtradb/os/os0file.cc +++ b/storage/xtradb/os/os0file.cc @@ -1308,6 +1308,31 @@ os_file_create_simple_func( return(file); } +/** Disable OS I/O caching on the file if the file type and server +configuration requires it. +@param file handle to the file +@param name name of the file, for diagnostics +@param mode_str operation on the file, for diagnostics +@param type OS_LOG_FILE or OS_DATA_FILE +@param access_type if OS_FILE_READ_WRITE_CACHED, then caching will be disabled +unconditionally, ignored otherwise */ +static +void +os_file_set_nocache_if_needed(os_file_t file, const char* name, + const char *mode_str, ulint type, + ulint access_type) +{ + if (srv_read_only_mode || access_type == OS_FILE_READ_WRITE_CACHED) + return; + + if (srv_unix_file_flush_method == SRV_UNIX_ALL_O_DIRECT + || (type != OS_LOG_FILE + && (srv_unix_file_flush_method == SRV_UNIX_O_DIRECT + || (srv_unix_file_flush_method + == SRV_UNIX_O_DIRECT_NO_FSYNC)))) + os_file_set_nocache(file, name, mode_str); +} + /****************************************************************//** NOTE! Use the corresponding macro os_file_create_simple_no_error_handling(), not directly this function! @@ -1322,9 +1347,11 @@ os_file_create_simple_no_error_handling_func( null-terminated string */ ulint create_mode,/*!< in: create mode */ ulint access_type,/*!< in: OS_FILE_READ_ONLY, - OS_FILE_READ_WRITE, or - OS_FILE_READ_ALLOW_DELETE; the last option is - used by a backup program reading the file */ + OS_FILE_READ_WRITE, + OS_FILE_READ_ALLOW_DELETE (used by a backup + program reading the file), or + OS_FILE_READ_WRITE_CACHED (disable O_DIRECT + if it would be enabled otherwise) */ ibool* success)/*!< out: TRUE if succeed, FALSE if error */ { os_file_t file; @@ -1360,7 +1387,8 @@ os_file_create_simple_no_error_handling_func( access = GENERIC_READ; } else if (srv_read_only_mode) { access = GENERIC_READ; - } else if (access_type == OS_FILE_READ_WRITE) { + } else if (access_type == OS_FILE_READ_WRITE + || access_type == OS_FILE_READ_WRITE_CACHED) { access = GENERIC_READ | GENERIC_WRITE; } else if (access_type == OS_FILE_READ_ALLOW_DELETE) { @@ -1413,7 +1441,8 @@ os_file_create_simple_no_error_handling_func( } else { ut_a(access_type == OS_FILE_READ_WRITE - || access_type == OS_FILE_READ_ALLOW_DELETE); + || access_type == OS_FILE_READ_ALLOW_DELETE + || access_type == OS_FILE_READ_WRITE_CACHED); create_flag = O_RDWR; } @@ -1445,18 +1474,16 @@ os_file_create_simple_no_error_handling_func( /* This function is always called for data files, we should disable OS caching (O_DIRECT) here as we do in os_file_create_func(), so we open the same file in the same mode, see man page of open(2). */ - if (!srv_read_only_mode - && *success - && (srv_unix_file_flush_method == SRV_UNIX_O_DIRECT - || srv_unix_file_flush_method == SRV_UNIX_O_DIRECT_NO_FSYNC)) { - - os_file_set_nocache(file, name, mode_str); + if (*success) { + os_file_set_nocache_if_needed(file, name, mode_str, + OS_DATA_FILE, access_type); } #ifdef USE_FILE_LOCK if (!srv_read_only_mode && *success - && access_type == OS_FILE_READ_WRITE + && (access_type == OS_FILE_READ_WRITE + || access_type == OS_FILE_READ_WRITE_CACHED) && os_file_lock(file, name)) { *success = FALSE; @@ -1816,17 +1843,9 @@ os_file_create_func( } while (retry); - if (!srv_read_only_mode - && *success - && type != OS_LOG_FILE - && (srv_unix_file_flush_method == SRV_UNIX_O_DIRECT - || srv_unix_file_flush_method == SRV_UNIX_O_DIRECT_NO_FSYNC)) { + if (*success) { - os_file_set_nocache(file, name, mode_str); - } else if (!srv_read_only_mode - && *success - && srv_unix_file_flush_method == SRV_UNIX_ALL_O_DIRECT) { - os_file_set_nocache(file, name, mode_str); + os_file_set_nocache_if_needed(file, name, mode_str, type, 0); } #ifdef USE_FILE_LOCK diff --git a/storage/xtradb/os/os0proc.cc b/storage/xtradb/os/os0proc.cc index ec629430baf5f..ff6d65e4ae671 100644 --- a/storage/xtradb/os/os0proc.cc +++ b/storage/xtradb/os/os0proc.cc @@ -32,12 +32,6 @@ Created 9/30/1995 Heikki Tuuri #include "ut0mem.h" #include "ut0byte.h" -/* Linux release version */ -#if defined(UNIV_LINUX) && defined(_GNU_SOURCE) -#include /* strverscmp() */ -#include /* uname() */ -#endif - /* FreeBSD for example has only MAP_ANON, Linux has MAP_ANONYMOUS and MAP_ANON but MAP_ANON is marked as deprecated */ #if defined(MAP_ANONYMOUS) @@ -46,13 +40,6 @@ MAP_ANON but MAP_ANON is marked as deprecated */ #define OS_MAP_ANON MAP_ANON #endif -/* Linux's MAP_POPULATE */ -#if defined(MAP_POPULATE) -#define OS_MAP_POPULATE MAP_POPULATE -#else -#define OS_MAP_POPULATE 0 -#endif - UNIV_INTERN ibool os_use_large_pages; /* Large page size. This may be a boot-time option on some platforms */ UNIV_INTERN ulint os_large_page_size; @@ -75,24 +62,6 @@ os_proc_get_number(void) #endif } -/****************************************************************//** -Retrieve and compare operating system release. -@return TRUE if the OS release is equal to, or later than release. */ -UNIV_INTERN -ibool -os_compare_release( -/*===============*/ - const char* release /*!< in: OS release */ - __attribute__((unused))) -{ -#if defined(UNIV_LINUX) && defined(_GNU_SOURCE) - struct utsname name; - return uname(&name) == 0 && strverscmp(name.release, release) >= 0; -#else - return 0; -#endif -} - /****************************************************************//** Allocates large pages memory. @return allocated memory */ @@ -100,8 +69,7 @@ UNIV_INTERN void* os_mem_alloc_large( /*===============*/ - ulint* n, /*!< in/out: number of bytes */ - ibool populate) /*!< in: virtual page preallocation */ + ulint* n) /*!< in/out: number of bytes */ { void* ptr; ulint size; @@ -187,13 +155,12 @@ os_mem_alloc_large( ut_ad(ut_is_2pow(size)); size = *n = ut_2pow_round(*n + (size - 1), size); ptr = mmap(NULL, size, PROT_READ | PROT_WRITE, - MAP_PRIVATE | OS_MAP_ANON | - (populate ? OS_MAP_POPULATE : 0), -1, 0); + MAP_PRIVATE | OS_MAP_ANON, -1, 0); if (UNIV_UNLIKELY(ptr == (void*) -1)) { fprintf(stderr, "InnoDB: mmap(%lu bytes) failed;" " errno %lu\n", (ulong) size, (ulong) errno); - return(NULL); + ptr = NULL; } else { os_fast_mutex_lock(&ut_list_mutex); ut_total_allocated_memory += size; @@ -201,25 +168,6 @@ os_mem_alloc_large( UNIV_MEM_ALLOC(ptr, size); } #endif - -#if OS_MAP_ANON && OS_MAP_POPULATE - /* MAP_POPULATE is only supported for private mappings - since Linux 2.6.23. */ - populate = populate && !os_compare_release("2.6.23"); - - if (populate) { - fprintf(stderr, "InnoDB: Warning: mmap(MAP_POPULATE) " - "is not supported for private mappings. " - "Forcing preallocation by faulting in pages.\n"); - } -#endif - - /* Initialize the entire buffer to force the allocation - of physical memory page frames. */ - if (populate) { - memset(ptr, '\0', size); - } - return(ptr); } diff --git a/storage/xtradb/row/row0ins.cc b/storage/xtradb/row/row0ins.cc index 6f41f0c6d1121..113aacce39ad4 100644 --- a/storage/xtradb/row/row0ins.cc +++ b/storage/xtradb/row/row0ins.cc @@ -261,7 +261,13 @@ row_ins_sec_index_entry_by_modify( update = row_upd_build_sec_rec_difference_binary( rec, cursor->index, *offsets, entry, heap); - if (!rec_get_deleted_flag(rec, rec_offs_comp(*offsets))) { + /* If operating in fake_change mode then flow will not mark the record + deleted but will still assume it and take delete-mark path. Condition + below has a different path if record is not marked deleted but we need + to still by-pass it given that original flow has taken this path for + fake_change mode execution assuming record is delete-marked. */ + if (!rec_get_deleted_flag(rec, rec_offs_comp(*offsets)) + && UNIV_UNLIKELY(!thr_get_trx(thr)->fake_changes)) { /* We should never insert in place of a record that has not been delete-marked. The only exception is when online CREATE INDEX copied the changes that we already @@ -2482,9 +2488,14 @@ row_ins_clust_index_entry_low( effectively "roll back" the operation. */ ut_a(err == DB_SUCCESS); dtuple_big_rec_free(big_rec); + } else if (big_rec != NULL + && UNIV_UNLIKELY(thr_get_trx(thr)->fake_changes)) { + dtuple_big_rec_free(big_rec); } - if (err == DB_SUCCESS && dict_index_is_online_ddl(index)) { + if (err == DB_SUCCESS + && dict_index_is_online_ddl(index) + && UNIV_LIKELY(!thr_get_trx(thr)->fake_changes)) { row_log_table_insert(rec, index, offsets); } @@ -2528,8 +2539,8 @@ row_ins_clust_index_entry_low( if (UNIV_UNLIKELY(thr_get_trx(thr)->fake_changes)) { - /* skip store extern */ - mem_heap_free(big_rec->heap); + dtuple_convert_back_big_rec( + index, entry, big_rec); goto func_exit; } @@ -2549,7 +2560,8 @@ row_ins_clust_index_entry_low( dtuple_convert_back_big_rec(index, entry, big_rec); } else { if (err == DB_SUCCESS - && dict_index_is_online_ddl(index)) { + && dict_index_is_online_ddl(index) + && !UNIV_UNLIKELY(thr_get_trx(thr)->fake_changes)) { row_log_table_insert( insert_rec, index, offsets); } diff --git a/storage/xtradb/row/row0log.cc b/storage/xtradb/row/row0log.cc index 609c166c77b81..f88baa42d5879 100644 --- a/storage/xtradb/row/row0log.cc +++ b/storage/xtradb/row/row0log.cc @@ -208,8 +208,7 @@ row_log_block_allocate( DBUG_ENTER("row_log_block_allocate"); if (log_buf.block == NULL) { log_buf.size = srv_sort_buf_size; - log_buf.block = (byte*) os_mem_alloc_large(&log_buf.size, - FALSE); + log_buf.block = (byte*) os_mem_alloc_large(&log_buf.size); DBUG_EXECUTE_IF("simulate_row_log_allocation_failure", if (log_buf.block) os_mem_free_large(log_buf.block, log_buf.size); diff --git a/storage/xtradb/row/row0merge.cc b/storage/xtradb/row/row0merge.cc index ab97a514282fd..6304f0f7266c4 100644 --- a/storage/xtradb/row/row0merge.cc +++ b/storage/xtradb/row/row0merge.cc @@ -3584,7 +3584,7 @@ row_merge_build_indexes( block_size = 3 * srv_sort_buf_size; block = static_cast( - os_mem_alloc_large(&block_size, FALSE)); + os_mem_alloc_large(&block_size)); if (block == NULL) { DBUG_RETURN(DB_OUT_OF_MEMORY); diff --git a/storage/xtradb/row/row0mysql.cc b/storage/xtradb/row/row0mysql.cc index 9f7b69acbdaa9..a1bdc732fa401 100644 --- a/storage/xtradb/row/row0mysql.cc +++ b/storage/xtradb/row/row0mysql.cc @@ -1388,7 +1388,8 @@ row_insert_for_mysql( return(err); } - if (dict_table_has_fts_index(table)) { + if (dict_table_has_fts_index(table) + && UNIV_LIKELY(!thr_get_trx(thr)->fake_changes)) { doc_id_t doc_id; /* Extract the doc id from the hidden FTS column */ diff --git a/storage/xtradb/row/row0upd.cc b/storage/xtradb/row/row0upd.cc index 0f189a5278950..b1002a1db035b 100644 --- a/storage/xtradb/row/row0upd.cc +++ b/storage/xtradb/row/row0upd.cc @@ -2343,6 +2343,7 @@ row_upd_clust_step( ut_a(pcur->rel_pos == BTR_PCUR_ON); ulint mode; + ulint search_mode; #ifdef UNIV_DEBUG /* Work around Bug#14626800 ASSERTION FAILURE IN DEBUG_SYNC(). @@ -2354,17 +2355,29 @@ row_upd_clust_step( } #endif /* UNIV_DEBUG */ + /* If running with fake_changes mode on then switch from modify to + search so that code takes only s-latch and not x-latch. + For dry-run (fake-changes) s-latch is acceptable. Taking x-latch will + make it more restrictive and will block real changes/workflow. */ if (UNIV_UNLIKELY(thr_get_trx(thr)->fake_changes)) { - mode = BTR_SEARCH_LEAF; - } else if (dict_index_is_online_ddl(index)) { - ut_ad(node->table->id != DICT_INDEXES_ID); - mode = BTR_MODIFY_LEAF | BTR_ALREADY_S_LATCHED; - mtr_s_lock(dict_index_get_lock(index), &mtr); + mode = BTR_MODIFY_LEAF; + search_mode = BTR_SEARCH_LEAF; } else { mode = BTR_MODIFY_LEAF; + search_mode = BTR_MODIFY_LEAF; + } + + if (dict_index_is_online_ddl(index)) { + + ut_ad(node->table->id != DICT_INDEXES_ID); + + mode |= BTR_ALREADY_S_LATCHED; + search_mode |= BTR_ALREADY_S_LATCHED; + + mtr_s_lock(dict_index_get_lock(index), &mtr); } - success = btr_pcur_restore_position(mode, pcur, &mtr); + success = btr_pcur_restore_position(search_mode, pcur, &mtr); if (!success) { err = DB_RECORD_NOT_FOUND; @@ -2382,6 +2395,10 @@ row_upd_clust_step( ut_ad(!dict_index_is_online_ddl(index)); + /* Action in fake change mode shouldn't cause changes + in system tables. */ + ut_ad(UNIV_LIKELY(!thr_get_trx(thr)->fake_changes)); + dict_drop_index_tree(btr_pcur_get_rec(pcur), &mtr); mtr_commit(&mtr); @@ -2413,6 +2430,8 @@ row_upd_clust_step( } } + /* This check passes as the function manipulates x-lock to s-lock + if operating in fake-change mode. */ ut_ad(lock_trx_has_rec_x_lock(thr_get_trx(thr), index->table, btr_pcur_get_block(pcur), page_rec_get_heap_no(rec))); diff --git a/storage/xtradb/srv/srv0srv.cc b/storage/xtradb/srv/srv0srv.cc index d10a566b061ec..968275588768d 100644 --- a/storage/xtradb/srv/srv0srv.cc +++ b/storage/xtradb/srv/srv0srv.cc @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1995, 2014, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 1995, 2015, Oracle and/or its affiliates. All Rights Reserved. Copyright (c) 2008, 2009 Google Inc. Copyright (c) 2009, Percona Inc. @@ -141,6 +141,9 @@ UNIV_INTERN ulint srv_file_format = 0; UNIV_FORMAT_MAX + 1 means no checking ie. FALSE. The default is to set it to the highest format we support. */ UNIV_INTERN ulint srv_max_file_format_at_startup = UNIV_FORMAT_MAX; +/** Set if InnoDB operates in read-only mode or innodb-force-recovery +is greater than SRV_FORCE_NO_TRX_UNDO. */ +UNIV_INTERN my_bool high_level_read_only; #if UNIV_FORMAT_A # error "UNIV_FORMAT_A must be 0!" @@ -159,6 +162,7 @@ OS (provided we compiled Innobase with it in), otherwise we will use simulated aio we build below with threads. Currently we support native aio on windows and linux */ UNIV_INTERN my_bool srv_use_native_aio = TRUE; +UNIV_INTERN my_bool srv_numa_interleave = FALSE; #ifdef __WIN__ /* Windows native condition variables. We use runtime loading / function @@ -243,8 +247,6 @@ UNIV_INTERN const byte* srv_latin1_ordering; UNIV_INTERN my_bool srv_use_sys_malloc = TRUE; /* requested size in kilobytes */ UNIV_INTERN ulint srv_buf_pool_size = ULINT_MAX; -/* force virtual page preallocation (prefault) */ -UNIV_INTERN my_bool srv_buf_pool_populate = FALSE; /* requested number of buffer pool instances */ UNIV_INTERN ulint srv_buf_pool_instances = 1; /* number of locks to protect buf_pool->page_hash */ @@ -3206,13 +3208,8 @@ srv_do_purge( } n_pages_purged = trx_purge( - n_use_threads, srv_purge_batch_size, false); - - if (!(count++ % TRX_SYS_N_RSEGS)) { - /* Force a truncate of the history list. */ - n_pages_purged += trx_purge( - 1, srv_purge_batch_size, true); - } + n_use_threads, srv_purge_batch_size, + (++count % TRX_SYS_N_RSEGS) == 0); *n_total_purged += n_pages_purged; @@ -3412,8 +3409,17 @@ DECLARE_THREAD(srv_purge_coordinator_thread)( n_pages_purged = trx_purge(1, srv_purge_batch_size, false); } - /* Force a truncate of the history list. */ - n_pages_purged = trx_purge(1, srv_purge_batch_size, true); + /* This trx_purge is called to remove any undo records (added by + background threads) after completion of the above loop. When + srv_fast_shutdown != 0, a large batch size can cause significant + delay in shutdown ,so reducing the batch size to magic number 20 + (which was default in 5.5), which we hope will be sufficient to + remove all the undo records */ + const uint temp_batch_size = 20; + + n_pages_purged = trx_purge(1, srv_purge_batch_size <= temp_batch_size + ? srv_purge_batch_size : temp_batch_size, + true); ut_a(n_pages_purged == 0 || srv_fast_shutdown != 0); /* The task queue should always be empty, independent of fast diff --git a/storage/xtradb/srv/srv0start.cc b/storage/xtradb/srv/srv0start.cc index 545034f1dca34..bbd095dacf7d4 100644 --- a/storage/xtradb/srv/srv0start.cc +++ b/storage/xtradb/srv/srv0start.cc @@ -1595,9 +1595,8 @@ innobase_start_or_create_for_mysql(void) char* logfile0 = NULL; size_t dirnamelen; - if (srv_force_recovery > SRV_FORCE_NO_TRX_UNDO) { - srv_read_only_mode = true; - } + high_level_read_only = srv_read_only_mode + || srv_force_recovery > SRV_FORCE_NO_TRX_UNDO; if (srv_read_only_mode) { ib_logf(IB_LOG_LEVEL_INFO, "Started in read only mode"); @@ -2021,8 +2020,7 @@ innobase_start_or_create_for_mysql(void) ib_logf(IB_LOG_LEVEL_INFO, "Initializing buffer pool, size = %.1f%c", size, unit); - err = buf_pool_init(srv_buf_pool_size, (ibool) srv_buf_pool_populate, - srv_buf_pool_instances); + err = buf_pool_init(srv_buf_pool_size, srv_buf_pool_instances); if (err != DB_SUCCESS) { ib_logf(IB_LOG_LEVEL_ERROR, diff --git a/storage/xtradb/trx/trx0sys.cc b/storage/xtradb/trx/trx0sys.cc index a1736dbb79fe5..5baea052b7bbe 100644 --- a/storage/xtradb/trx/trx0sys.cc +++ b/storage/xtradb/trx/trx0sys.cc @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 1996, 2013, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 1996, 2015, Oracle and/or its affiliates. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -29,7 +29,10 @@ Created 3/26/1996 Heikki Tuuri #include "trx0sys.ic" #endif -#ifndef UNIV_HOTBACKUP +#ifdef UNIV_HOTBACKUP +#include "fsp0types.h" + +#else /* !UNIV_HOTBACKUP */ #include "fsp0fsp.h" #include "mtr0log.h" #include "mtr0log.h" @@ -1124,18 +1127,15 @@ trx_sys_read_pertable_file_format_id( /* get the file format from the page */ ptr = page + 54; flags = mach_read_from_4(ptr); - if (flags == 0) { - /* file format is Antelope */ - *format_id = 0; - return(TRUE); - } else if (flags & 1) { - /* tablespace flags are ok */ - *format_id = (flags / 32) % 128; - return(TRUE); - } else { + + if (!fsp_flags_is_valid(flags) { /* bad tablespace flags */ return(FALSE); } + + *format_id = FSP_FLAGS_GET_POST_ANTELOPE(flags); + + return(TRUE); } diff --git a/storage/xtradb/trx/trx0trx.cc b/storage/xtradb/trx/trx0trx.cc index c4676fe6974b2..a5e578c547a30 100644 --- a/storage/xtradb/trx/trx0trx.cc +++ b/storage/xtradb/trx/trx0trx.cc @@ -1060,6 +1060,12 @@ trx_start_low( trx->id = trx_sys_get_new_trx_id(); + /* Cache the state of fake_changes that transaction will use for + lifetime. Any change in session/global fake_changes configuration during + lifetime of transaction will not be honored by already started + transaction. */ + trx->fake_changes = thd_fake_changes(trx->mysql_thd); + ut_ad(!trx->in_rw_trx_list); ut_ad(!trx->in_ro_trx_list);