Skip to content

Commit 660a292

Browse files
committed
Fix optimistic parallel replication for TokuDB.
Make TokuDB report row lock waits with thd_rpl_deadlock_check(). This allows parallel replication to properly detect conflicts, and kill and retry the offending transaction.
1 parent d145d1b commit 660a292

File tree

9 files changed

+163
-20
lines changed

9 files changed

+163
-20
lines changed

storage/tokudb/PerconaFT/buildheader/make_tdb.cc

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -405,6 +405,7 @@ static void print_db_env_struct (void) {
405405
"int (*set_lock_timeout) (DB_ENV *env, uint64_t default_lock_wait_time_msec, uint64_t (*get_lock_wait_time_cb)(uint64_t default_lock_wait_time))",
406406
"int (*get_lock_timeout) (DB_ENV *env, uint64_t *lock_wait_time_msec)",
407407
"int (*set_lock_timeout_callback) (DB_ENV *env, lock_timeout_callback callback)",
408+
"int (*set_lock_wait_callback) (DB_ENV *env, lock_wait_callback callback)",
408409
"int (*txn_xa_recover) (DB_ENV*, TOKU_XA_XID list[/*count*/], long count, /*out*/ long *retp, uint32_t flags)",
409410
"int (*get_txn_from_xid) (DB_ENV*, /*in*/ TOKU_XA_XID *, /*out*/ DB_TXN **)",
410411
"DB* (*get_db_for_directory) (DB_ENV*)",
@@ -751,6 +752,7 @@ int main (int argc, char *const argv[] __attribute__((__unused__))) {
751752
printf("void toku_dbt_array_resize(DBT_ARRAY *dbts, uint32_t size) %s;\n", VISIBLE);
752753

753754
printf("typedef void (*lock_timeout_callback)(DB *db, uint64_t requesting_txnid, const DBT *left_key, const DBT *right_key, uint64_t blocking_txnid);\n");
755+
printf("typedef void (*lock_wait_callback)(void *arg, uint64_t requesting_txnid, uint64_t blocking_txnid);\n");
754756
printf("typedef int (*iterate_row_locks_callback)(DB **db, DBT *left_key, DBT *right_key, void *extra);\n");
755757
printf("typedef int (*iterate_transactions_callback)(DB_TXN *dbtxn, iterate_row_locks_callback cb, void *locks_extra, void *extra);\n");
756758
printf("typedef int (*iterate_requests_callback)(DB *db, uint64_t requesting_txnid, const DBT *left_key, const DBT *right_key, uint64_t blocking_txnid, uint64_t start_time, void *extra);\n");

storage/tokudb/PerconaFT/ftcxx/db_env.hpp

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -202,6 +202,7 @@ namespace ftcxx {
202202
typedef uint64_t (*get_lock_wait_time_cb_func)(uint64_t);
203203
get_lock_wait_time_cb_func _get_lock_wait_time_cb;
204204
lock_timeout_callback _lock_timeout_callback;
205+
lock_wait_callback _lock_wait_needed_callback;
205206
uint64_t (*_loader_memory_size_callback)(void);
206207

207208
uint32_t _cachesize_gbytes;
@@ -231,6 +232,7 @@ namespace ftcxx {
231232
_lock_wait_time_msec(0),
232233
_get_lock_wait_time_cb(nullptr),
233234
_lock_timeout_callback(nullptr),
235+
_lock_wait_needed_callback(nullptr),
234236
_loader_memory_size_callback(nullptr),
235237
_cachesize_gbytes(0),
236238
_cachesize_bytes(0),
@@ -296,6 +298,11 @@ namespace ftcxx {
296298
handle_ft_retval(r);
297299
}
298300

301+
if (_lock_wait_needed_callback) {
302+
r = env->set_lock_wait_callback(env, _lock_wait_needed_callback);
303+
handle_ft_retval(r);
304+
}
305+
299306
if (_loader_memory_size_callback) {
300307
env->set_loader_memory_size(env, _loader_memory_size_callback);
301308
}
@@ -419,6 +426,11 @@ namespace ftcxx {
419426
return *this;
420427
}
421428

429+
DBEnvBuilder& set_lock_wait_callback(lock_wait_callback callback) {
430+
_lock_wait_needed_callback = callback;
431+
return *this;
432+
}
433+
422434
DBEnvBuilder& set_loader_memory_size(uint64_t (*callback)(void)) {
423435
_loader_memory_size_callback = callback;
424436
return *this;

storage/tokudb/PerconaFT/locktree/lock_request.cc

Lines changed: 40 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -199,7 +199,8 @@ int lock_request::wait(uint64_t wait_time_ms) {
199199
return wait(wait_time_ms, 0, nullptr);
200200
}
201201

202-
int lock_request::wait(uint64_t wait_time_ms, uint64_t killed_time_ms, int (*killed_callback)(void)) {
202+
int lock_request::wait(uint64_t wait_time_ms, uint64_t killed_time_ms, int (*killed_callback)(void),
203+
void (*lock_wait_callback)(void *, TXNID, TXNID)) {
203204
uint64_t t_now = toku_current_time_microsec();
204205
uint64_t t_start = t_now;
205206
uint64_t t_end = t_start + wait_time_ms * 1000;
@@ -208,7 +209,13 @@ int lock_request::wait(uint64_t wait_time_ms, uint64_t killed_time_ms, int (*kil
208209

209210
// check again, this time locking out other retry calls
210211
if (m_state == state::PENDING) {
211-
retry();
212+
GrowableArray<TXNID> conflicts_collector;
213+
conflicts_collector.init();
214+
retry(&conflicts_collector);
215+
if (m_state == state::PENDING) {
216+
report_waits(&conflicts_collector, lock_wait_callback);
217+
}
218+
conflicts_collector.deinit();
212219
}
213220

214221
while (m_state == state::PENDING) {
@@ -287,7 +294,7 @@ TXNID lock_request::get_conflicting_txnid(void) const {
287294
return m_conflicting_txnid;
288295
}
289296

290-
int lock_request::retry(void) {
297+
int lock_request::retry(GrowableArray<TXNID> *conflicts_collector) {
291298
invariant(m_state == state::PENDING);
292299
int r;
293300

@@ -308,13 +315,14 @@ int lock_request::retry(void) {
308315
toku_cond_broadcast(&m_wait_cond);
309316
} else {
310317
m_conflicting_txnid = conflicts.get(0);
318+
add_conflicts_to_waits(&conflicts, conflicts_collector);
311319
}
312320
conflicts.destroy();
313321

314322
return r;
315323
}
316324

317-
void lock_request::retry_all_lock_requests(locktree *lt, void (*after_retry_all_test_callback)(void)) {
325+
void lock_request::retry_all_lock_requests(locktree *lt, void (*lock_wait_callback)(void *, TXNID, TXNID), void (*after_retry_all_test_callback)(void)) {
318326
lt_lock_request_info *info = lt->get_lock_request_info();
319327

320328
info->retry_want++;
@@ -327,6 +335,9 @@ void lock_request::retry_all_lock_requests(locktree *lt, void (*after_retry_all_
327335

328336
toku_mutex_lock(&info->mutex);
329337

338+
GrowableArray<TXNID> conflicts_collector;
339+
conflicts_collector.init();
340+
330341
// here is the group retry algorithm.
331342
// get the latest retry_want count and use it as the generation number of this retry operation.
332343
// if this retry generation is > the last retry generation, then do the lock retries. otherwise,
@@ -344,7 +355,7 @@ void lock_request::retry_all_lock_requests(locktree *lt, void (*after_retry_all_
344355
// move on to the next lock request. otherwise
345356
// the request is gone from the list so we may
346357
// read the i-th entry for the next one.
347-
r = request->retry();
358+
r = request->retry(&conflicts_collector);
348359
if (r != 0) {
349360
i++;
350361
}
@@ -354,6 +365,30 @@ void lock_request::retry_all_lock_requests(locktree *lt, void (*after_retry_all_
354365
}
355366

356367
toku_mutex_unlock(&info->mutex);
368+
369+
report_waits(&conflicts_collector, lock_wait_callback);
370+
conflicts_collector.deinit();
371+
}
372+
373+
void lock_request::add_conflicts_to_waits(txnid_set *conflicts,
374+
GrowableArray<TXNID> *wait_conflicts) {
375+
size_t num_conflicts = conflicts->size();
376+
for (size_t i = 0; i < num_conflicts; i++) {
377+
wait_conflicts->push(m_txnid);
378+
wait_conflicts->push(conflicts->get(i));
379+
}
380+
}
381+
382+
void lock_request::report_waits(GrowableArray<TXNID> *wait_conflicts,
383+
void (*lock_wait_callback)(void *, TXNID, TXNID)) {
384+
if (!lock_wait_callback)
385+
return;
386+
size_t num_conflicts = wait_conflicts->get_size();
387+
for (size_t i = 0; i < num_conflicts; i += 2) {
388+
TXNID blocked_txnid = wait_conflicts->fetch_unchecked(i);
389+
TXNID blocking_txnid = wait_conflicts->fetch_unchecked(i+1);
390+
(*lock_wait_callback)(nullptr, blocked_txnid, blocking_txnid);
391+
}
357392
}
358393

359394
void *lock_request::get_extra(void) const {

storage/tokudb/PerconaFT/locktree/lock_request.h

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -89,7 +89,8 @@ class lock_request {
8989
// returns: The return code of locktree::acquire_[write,read]_lock()
9090
// or simply DB_LOCK_NOTGRANTED if the wait time expired.
9191
int wait(uint64_t wait_time_ms);
92-
int wait(uint64_t wait_time_ms, uint64_t killed_time_ms, int (*killed_callback)(void));
92+
int wait(uint64_t wait_time_ms, uint64_t killed_time_ms, int (*killed_callback)(void),
93+
void (*lock_wait_callback)(void *, TXNID, TXNID) = nullptr);
9394

9495
// return: left end-point of the lock range
9596
const DBT *get_left_key(void) const;
@@ -109,7 +110,7 @@ class lock_request {
109110
// effect: Retries all of the lock requests for the given locktree.
110111
// Any lock requests successfully restarted are completed and woken up.
111112
// The rest remain pending.
112-
static void retry_all_lock_requests(locktree *lt, void (*after_retry_test_callback)(void) = nullptr);
113+
static void retry_all_lock_requests(locktree *lt, void (*lock_wait_callback)(void *, TXNID, TXNID) = nullptr, void (*after_retry_test_callback)(void) = nullptr);
113114

114115
void set_start_test_callback(void (*f)(void));
115116
void set_start_before_pending_test_callback(void (*f)(void));
@@ -162,7 +163,7 @@ class lock_request {
162163

163164
// effect: tries again to acquire the lock described by this lock request
164165
// returns: 0 if retrying the request succeeded and is now complete
165-
int retry(void);
166+
int retry(GrowableArray<TXNID> *conflict_collector);
166167

167168
void complete(int complete_r);
168169

@@ -194,6 +195,11 @@ class lock_request {
194195

195196
static int find_by_txnid(lock_request * const &request, const TXNID &txnid);
196197

198+
// Report list of conflicts to lock wait callback.
199+
static void report_waits(GrowableArray<TXNID> *wait_conflicts,
200+
void (*lock_wait_callback)(void *, TXNID, TXNID));
201+
void add_conflicts_to_waits(txnid_set *conflicts, GrowableArray<TXNID> *wait_conflicts);
202+
197203
void (*m_start_test_callback)(void);
198204
void (*m_start_before_pending_test_callback)(void);
199205
void (*m_retry_test_callback)(void);

storage/tokudb/PerconaFT/locktree/tests/lock_request_start_retry_race_3.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -87,7 +87,7 @@ static void run_locker(locktree *lt, TXNID txnid, const DBT *key, pthread_barrie
8787
buffer.destroy();
8888

8989
// retry pending lock requests
90-
lock_request::retry_all_lock_requests(lt, after_retry_all);
90+
lock_request::retry_all_lock_requests(lt, nullptr, after_retry_all);
9191
}
9292

9393
request.destroy();

storage/tokudb/PerconaFT/src/ydb-internal.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,7 @@ struct __toku_db_env_internal {
105105
TOKULOGGER logger;
106106
toku::locktree_manager ltm;
107107
lock_timeout_callback lock_wait_timeout_callback; // Called when a lock request times out waiting for a lock.
108+
lock_wait_callback lock_wait_needed_callback; // Called when a lock request requires a wait.
108109

109110
DB *directory; // Maps dnames to inames
110111
DB *persistent_environment; // Stores environment settings, can be used for upgrade

storage/tokudb/PerconaFT/src/ydb.cc

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1804,6 +1804,12 @@ env_set_lock_timeout_callback(DB_ENV *env, lock_timeout_callback callback) {
18041804
return 0;
18051805
}
18061806

1807+
static int
1808+
env_set_lock_wait_callback(DB_ENV *env, lock_wait_callback callback) {
1809+
env->i->lock_wait_needed_callback = callback;
1810+
return 0;
1811+
}
1812+
18071813
static void
18081814
format_time(const time_t *timer, char *buf) {
18091815
ctime_r(timer, buf);
@@ -2704,6 +2710,7 @@ toku_env_create(DB_ENV ** envp, uint32_t flags) {
27042710
USENV(get_lock_timeout);
27052711
USENV(set_lock_timeout);
27062712
USENV(set_lock_timeout_callback);
2713+
USENV(set_lock_wait_callback);
27072714
USENV(set_redzone);
27082715
USENV(log_flush);
27092716
USENV(log_archive);

storage/tokudb/PerconaFT/src/ydb_row_lock.cc

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -193,7 +193,10 @@ int toku_db_start_range_lock(DB *db, DB_TXN *txn, const DBT *left_key, const DBT
193193
toku::lock_request::type lock_type, toku::lock_request *request) {
194194
DB_TXN *txn_anc = txn_oldest_ancester(txn);
195195
TXNID txn_anc_id = txn_anc->id64(txn_anc);
196-
request->set(db->i->lt, txn_anc_id, left_key, right_key, lock_type, toku_is_big_txn(txn_anc));
196+
uint64_t client_id;
197+
void *client_extra;
198+
txn->get_client_id(txn, &client_id, &client_extra);
199+
request->set(db->i->lt, txn_anc_id, left_key, right_key, lock_type, toku_is_big_txn(txn_anc), client_extra);
197200

198201
const int r = request->start();
199202
if (r == 0) {
@@ -221,7 +224,8 @@ int toku_db_wait_range_lock(DB *db, DB_TXN *txn, toku::lock_request *request) {
221224
uint64_t killed_time_msec = env->i->default_killed_time_msec;
222225
if (env->i->get_killed_time_callback)
223226
killed_time_msec = env->i->get_killed_time_callback(killed_time_msec);
224-
const int r = request->wait(wait_time_msec, killed_time_msec, env->i->killed_callback);
227+
const int r = request->wait(wait_time_msec, killed_time_msec, env->i->killed_callback,
228+
env->i->lock_wait_needed_callback);
225229
if (r == 0) {
226230
db_txn_note_row_lock(db, txn_anc, left_key, right_key);
227231
} else if (r == DB_LOCK_NOTGRANTED) {
@@ -248,7 +252,10 @@ void toku_db_grab_write_lock (DB *db, DBT *key, TOKUTXN tokutxn) {
248252
// This lock request must succeed, so we do not want to wait
249253
toku::lock_request request;
250254
request.create();
251-
request.set(db->i->lt, txn_anc_id, key, key, toku::lock_request::type::WRITE, toku_is_big_txn(txn_anc));
255+
uint64_t client_id;
256+
void *client_extra;
257+
txn->get_client_id(txn, &client_id, &client_extra);
258+
request.set(db->i->lt, txn_anc_id, key, key, toku::lock_request::type::WRITE, toku_is_big_txn(txn_anc), client_extra);
252259
int r = request.start();
253260
invariant_zero(r);
254261
db_txn_note_row_lock(db, txn_anc, key, key);
@@ -268,7 +275,7 @@ void toku_db_release_lt_key_ranges(DB_TXN *txn, txn_lt_key_ranges *ranges) {
268275

269276
// all of our locks have been released, so first try to wake up
270277
// pending lock requests, then release our reference on the lt
271-
toku::lock_request::retry_all_lock_requests(lt);
278+
toku::lock_request::retry_all_lock_requests(lt, txn->mgrp->i->lock_wait_needed_callback);
272279

273280
// Release our reference on this locktree
274281
toku::locktree_manager *ltm = &txn->mgrp->i->ltm;

0 commit comments

Comments
 (0)