Skip to content

Commit cd7fa0f

Browse files
committed
MDEV-11934: MariaRocks: Group Commit with binlog
Port the previous patch to a newer MariaDB (which includes a newer MyRocks). The previous patch: implement MariaDB's Group Commit API. This is a first attempt which lacks the expected performance.
1 parent f11eaaa commit cd7fa0f

File tree

1 file changed

+99
-7
lines changed

1 file changed

+99
-7
lines changed

storage/rocksdb/ha_rocksdb.cc

Lines changed: 99 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1806,6 +1806,12 @@ class Rdb_transaction {
18061806
*/
18071807
int64_t m_n_mysql_tables_in_use = 0;
18081808

1809+
/*
1810+
MariaDB's group commit:
1811+
*/
1812+
bool commit_ordered_done;
1813+
bool commit_ordered_res;
1814+
18091815
/*
18101816
for distinction between rdb_transaction_impl and rdb_writebatch_impl
18111817
when using walk tx list
@@ -2431,6 +2437,8 @@ class Rdb_transaction_impl : public Rdb_transaction {
24312437
THDVAR(m_thd, write_ignore_missing_column_families);
24322438
m_is_two_phase = rocksdb_enable_2pc;
24332439

2440+
commit_ordered_done= false;
2441+
24342442
/*
24352443
If m_rocksdb_reuse_tx is null this will create a new transaction object.
24362444
Otherwise it will reuse the existing one.
@@ -2643,6 +2651,7 @@ class Rdb_writebatch_impl : public Rdb_transaction {
26432651
bool is_tx_started() const override { return (m_batch != nullptr); }
26442652

26452653
void start_tx() override {
2654+
commit_ordered_done= false; // Do we need this here?
26462655
reset();
26472656
write_opts.sync = (rocksdb_flush_log_at_trx_commit == FLUSH_LOG_SYNC);
26482657
write_opts.disableWAL = THDVAR(m_thd, write_disable_wal);
@@ -2831,8 +2840,7 @@ static bool rocksdb_flush_wal(handlerton* hton __attribute__((__unused__)))
28312840
*/
28322841
static int rocksdb_prepare(handlerton* hton, THD* thd, bool prepare_tx)
28332842
{
2834-
// This is "ASYNC_COMMIT" feature which is only in webscalesql
2835-
bool async=false;
2843+
bool async=false; // This is "ASYNC_COMMIT" feature which is only present in webscalesql
28362844

28372845
Rdb_transaction *&tx = get_tx_from_thd(thd);
28382846
if (!tx->can_prepare()) {
@@ -2842,7 +2850,8 @@ static int rocksdb_prepare(handlerton* hton, THD* thd, bool prepare_tx)
28422850
(!my_core::thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN))) {
28432851
/* We were instructed to prepare the whole transaction, or
28442852
this is an SQL statement end and autocommit is on */
2845-
#ifdef MARIAROCKS_NOT_YET // disable prepare/commit
2853+
2854+
#ifdef MARIAROCKS_NOT_YET // Crash-safe slave does not work yet
28462855
std::vector<st_slave_gtid_info> slave_gtid_info;
28472856
my_core::thd_slave_gtid_info(thd, &slave_gtid_info);
28482857
for (const auto &it : slave_gtid_info) {
@@ -2852,31 +2861,50 @@ static int rocksdb_prepare(handlerton* hton, THD* thd, bool prepare_tx)
28522861
#endif
28532862

28542863
if (tx->is_two_phase()) {
2864+
2865+
/*
2866+
MariaDB: the following branch is never taken.
2867+
We always flush at Prepare and rely on RocksDB's internal Group Commit
2868+
to do some grouping.
2869+
*/
28552870
if (thd->durability_property == HA_IGNORE_DURABILITY || async) {
28562871
tx->set_sync(false);
28572872
}
2873+
2874+
/*
2875+
MariaDB: do not flush logs if we are running in a non-crash-safe mode.
2876+
*/
2877+
if (!rocksdb_flush_log_at_trx_commit)
2878+
tx->set_sync(false);
2879+
28582880
XID xid;
28592881
thd_get_xid(thd, reinterpret_cast<MYSQL_XID *>(&xid));
28602882
if (!tx->prepare(rdb_xid_to_string(xid))) {
28612883
return HA_EXIT_FAILURE;
28622884
}
2863-
if (thd->durability_property == HA_IGNORE_DURABILITY )
2885+
2886+
/*
2887+
MariaDB: our Group Commit implementation does not use the
2888+
hton->flush_logs call (at least currently) so the following is not
2889+
needed (TODO: will we need this for binlog rotation?)
2890+
*/
28642891
#ifdef MARIAROCKS_NOT_YET
2865-
(rocksdb_flush_log_at_trx_commit != FLUSH_LOG_NEVER)) {
2892+
if (thd->durability_property == HA_IGNORE_DURABILITY )
2893+
(rocksdb_flush_log_at_trx_commit != FLUSH_LOG_NEVER))
28662894
&&
28672895
THDVAR(thd, flush_log_at_trx_commit))
28682896
#endif
2869-
{
28702897
#ifdef MARIAROCKS_NOT_YET
2898+
{
28712899
// MariaRocks: disable the
28722900
// "write/sync redo log before flushing binlog cache to file"
28732901
// feature. See a869c56d361bb44f46c0efeb11a8f03561676247
28742902
/**
28752903
we set the log sequence as '1' just to trigger hton->flush_logs
28762904
*/
28772905
thd_store_lsn(thd, 1, DB_TYPE_ROCKSDB);
2878-
#endif
28792906
}
2907+
#endif
28802908
}
28812909

28822910
DEBUG_SYNC(thd, "rocksdb.prepared");
@@ -3026,6 +3054,50 @@ static int rocksdb_recover(handlerton* hton, XID* xid_list, uint len)
30263054
return count;
30273055
}
30283056

3057+
3058+
/*
3059+
Handle a commit checkpoint request from server layer.
3060+
3061+
InnoDB does this:
3062+
We put the request in a queue, so that we can notify upper layer about
3063+
checkpoint complete when we have flushed the redo log.
3064+
If we have already flushed all relevant redo log, we notify immediately.
3065+
3066+
MariaRocks just flushes everything right away ATM
3067+
*/
3068+
3069+
static void rocksdb_checkpoint_request(handlerton *hton,
3070+
void *cookie)
3071+
{
3072+
const rocksdb::Status s= rdb->SyncWAL();
3073+
//TODO: what to do on error?
3074+
if (s.ok())
3075+
{
3076+
rocksdb_wal_group_syncs++;
3077+
commit_checkpoint_notify_ha(hton, cookie);
3078+
}
3079+
}
3080+
3081+
/*
3082+
@param all: TRUE - commit the transaction
3083+
FALSE - SQL statement ended
3084+
*/
3085+
static void rocksdb_commit_ordered(handlerton *hton, THD* thd, bool all)
3086+
{
3087+
// Same assert as InnoDB has
3088+
DBUG_ASSERT(all || (!thd_test_options(thd, OPTION_NOT_AUTOCOMMIT |
3089+
OPTION_BEGIN)));
3090+
Rdb_transaction *&tx = get_tx_from_thd(thd);
3091+
3092+
tx->set_sync(false);
3093+
3094+
/* This will note the master position also */
3095+
tx->commit_ordered_res= tx->commit();
3096+
tx->commit_ordered_done= true;
3097+
3098+
}
3099+
3100+
30293101
static int rocksdb_commit(handlerton* hton, THD* thd, bool commit_tx)
30303102
{
30313103
DBUG_ENTER_FUNC();
@@ -3045,6 +3117,16 @@ static int rocksdb_commit(handlerton* hton, THD* thd, bool commit_tx)
30453117
if (tx != nullptr) {
30463118
if (commit_tx || (!my_core::thd_test_options(thd, OPTION_NOT_AUTOCOMMIT |
30473119
OPTION_BEGIN))) {
3120+
/*
3121+
This will not add anything to commit_latency_stats, and this is correct
3122+
right?
3123+
*/
3124+
if (tx->commit_ordered_done)
3125+
{
3126+
thd_wakeup_subsequent_commits(thd, 0);
3127+
DBUG_RETURN((tx->commit_ordered_res? HA_ERR_INTERNAL_ERROR: 0));
3128+
}
3129+
30483130
/*
30493131
We get here
30503132
- For a COMMIT statement that finishes a multi-statement transaction
@@ -3053,6 +3135,7 @@ static int rocksdb_commit(handlerton* hton, THD* thd, bool commit_tx)
30533135
if (tx->commit()) {
30543136
DBUG_RETURN(HA_ERR_ROCKSDB_COMMIT_FAILED);
30553137
}
3138+
thd_wakeup_subsequent_commits(thd, 0);
30563139
} else {
30573140
/*
30583141
We get here when committing a statement within a transaction.
@@ -3076,6 +3159,7 @@ static int rocksdb_commit(handlerton* hton, THD* thd, bool commit_tx)
30763159
DBUG_RETURN(HA_EXIT_SUCCESS);
30773160
}
30783161

3162+
30793163
static int rocksdb_rollback(handlerton *const hton, THD *const thd,
30803164
bool rollback_tx) {
30813165
Rdb_transaction *&tx = get_tx_from_thd(thd);
@@ -3882,11 +3966,19 @@ static int rocksdb_init_func(void *const p) {
38823966
rocksdb_hton->state = SHOW_OPTION_YES;
38833967
rocksdb_hton->create = rocksdb_create_handler;
38843968
rocksdb_hton->close_connection = rocksdb_close_connection;
3969+
38853970
rocksdb_hton->prepare = rocksdb_prepare;
3971+
rocksdb_hton->prepare_ordered = NULL; // Do not need it
3972+
38863973
rocksdb_hton->commit_by_xid = rocksdb_commit_by_xid;
38873974
rocksdb_hton->rollback_by_xid = rocksdb_rollback_by_xid;
38883975
rocksdb_hton->recover = rocksdb_recover;
3976+
3977+
rocksdb_hton->commit_ordered= rocksdb_commit_ordered;
38893978
rocksdb_hton->commit = rocksdb_commit;
3979+
3980+
rocksdb_hton->commit_checkpoint_request= rocksdb_checkpoint_request;
3981+
38903982
rocksdb_hton->rollback = rocksdb_rollback;
38913983
rocksdb_hton->show_status = rocksdb_show_status;
38923984
rocksdb_hton->start_consistent_snapshot =

0 commit comments

Comments
 (0)