Skip to content

Commit 184f718

Browse files
committed
MDEV-7249: Performance problem in parallel replication with multi-level slaves
Parallel replication (in 10.0 / "conservative" mode) relies on binlog group commits to group transactions that can be safely run in parallel on the slave. The --binlog-commit-wait-count and --binlog-commit-wait-usec options exist to increase the number of commits per group. But in case of conflicts between transactions, this can cause unnecessary delay and reduced througput, especially on a slave where commit order is fixed. This patch adds a heuristics to reduce this problem. When transaction T1 goes to commit, it will first wait for N transactions to queue up for a group commit. However, if we detect that another transaction T2 is waiting for a row lock held by T1, then we will skip the wait and let T1 commit immediately, releasing locks and let T2 continue. On a slave, this avoids the unfortunate situation where T1 is waiting for T2 to join the group commit, but T2 is waiting for T1 to release locks, causing no work to be done for the duration of the --binlog-commit-wait-usec timeout. (The heuristic seems reasonable on the master as well, so it is enabled for all transactions, not just replication transactions).
1 parent bc902a2 commit 184f718

File tree

10 files changed

+267
-5
lines changed

10 files changed

+267
-5
lines changed
Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
ALTER TABLE mysql.gtid_slave_pos ENGINE=InnoDB;
2+
CREATE TABLE t1 (a INT PRIMARY KEY, b INT) ENGINE=InnoDB;
3+
SET @old_count= @@GLOBAL.binlog_commit_wait_count;
4+
SET GLOBAL binlog_commit_wait_count= 3;
5+
SET @old_usec= @@GLOBAL.binlog_commit_wait_usec;
6+
SET GLOBAL binlog_commit_wait_usec= 20000000;
7+
SET @a= current_timestamp();
8+
BEGIN;
9+
INSERT INTO t1 VALUES (1,0);
10+
COMMIT;
11+
INSERT INTO t1 VALUES (1,1);
12+
SET @b= unix_timestamp(current_timestamp()) - unix_timestamp(@a);
13+
SELECT IF(@b < 20, "Ok", CONCAT("Error: too much time elapsed: ", @b, " seconds >= 20"));
14+
IF(@b < 20, "Ok", CONCAT("Error: too much time elapsed: ", @b, " seconds >= 20"))
15+
Ok
16+
ERROR 23000: Duplicate entry '1' for key 'PRIMARY'
17+
SET @a= current_timestamp();
18+
INSERT INTO t1 VALUES (2,0);
19+
INSERT INTO t1 VALUES (3,0);
20+
INSERT INTO t1 VALUES (4,0);
21+
SET @b= unix_timestamp(current_timestamp()) - unix_timestamp(@a);
22+
SELECT IF(@b < 20, "Ok", CONCAT("Error: too much time elapsed: ", @b, " seconds >= 20"));
23+
IF(@b < 20, "Ok", CONCAT("Error: too much time elapsed: ", @b, " seconds >= 20"))
24+
Ok
25+
SET @a= current_timestamp();
26+
INSERT INTO t1 VALUES (6,0);
27+
BEGIN;
28+
UPDATE t1 SET b=b+1 WHERE a=1;
29+
UPDATE t1 SET b=b+10 WHERE a=1;
30+
SELECT SLEEP(0.25);
31+
SLEEP(0.25)
32+
0
33+
UPDATE t1 SET b=b+1 WHERE a=3;
34+
COMMIT;
35+
SET @b= unix_timestamp(current_timestamp()) - unix_timestamp(@a);
36+
SELECT IF(@b < 20, "Ok", CONCAT("Error: too much time elapsed: ", @b, " seconds >= 20"));
37+
IF(@b < 20, "Ok", CONCAT("Error: too much time elapsed: ", @b, " seconds >= 20"))
38+
Ok
39+
SET @a= current_timestamp();
40+
INSERT INTO t1 VALUES (7,0);
41+
INSERT INTO t1 VALUES (8,0);
42+
SET @b= unix_timestamp(current_timestamp()) - unix_timestamp(@a);
43+
SELECT IF(@b < 20, "Ok", CONCAT("Error: too much time elapsed: ", @b, " seconds >= 20"));
44+
IF(@b < 20, "Ok", CONCAT("Error: too much time elapsed: ", @b, " seconds >= 20"))
45+
Ok
46+
SELECT * FROM t1 ORDER BY a;
47+
a b
48+
1 11
49+
2 0
50+
3 1
51+
4 0
52+
6 0
53+
7 0
54+
8 0
55+
DROP TABLE t1;
56+
SET GLOBAL binlog_commit_wait_count= @old_count;
57+
SET GLOBAL binlog_commit_wait_usec= @old_usec;
Lines changed: 123 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,123 @@
1+
--source include/have_innodb.inc
2+
--source include/have_log_bin.inc
3+
4+
ALTER TABLE mysql.gtid_slave_pos ENGINE=InnoDB;
5+
CREATE TABLE t1 (a INT PRIMARY KEY, b INT) ENGINE=InnoDB;
6+
7+
SET @old_count= @@GLOBAL.binlog_commit_wait_count;
8+
SET GLOBAL binlog_commit_wait_count= 3;
9+
SET @old_usec= @@GLOBAL.binlog_commit_wait_usec;
10+
SET GLOBAL binlog_commit_wait_usec= 20000000;
11+
12+
connect(con1,localhost,root,,test);
13+
connect(con2,localhost,root,,test);
14+
connect(con3,localhost,root,,test);
15+
16+
# Check that if T2 goes to wait for a row lock of T1 while T1 is waiting for
17+
# more transactions to arrive for group commit, the commit of T1 will complete
18+
# immediately.
19+
# We test this by setting a very high timeout (20 seconds), and testing that
20+
# that much time does not elapse.
21+
22+
--connection default
23+
SET @a= current_timestamp();
24+
25+
--connection con1
26+
BEGIN;
27+
INSERT INTO t1 VALUES (1,0);
28+
send COMMIT;
29+
30+
--connection con2
31+
send INSERT INTO t1 VALUES (1,1);
32+
33+
--connection con1
34+
reap;
35+
36+
--connection default
37+
SET @b= unix_timestamp(current_timestamp()) - unix_timestamp(@a);
38+
SELECT IF(@b < 20, "Ok", CONCAT("Error: too much time elapsed: ", @b, " seconds >= 20"));
39+
40+
--connection con2
41+
--error ER_DUP_ENTRY
42+
reap;
43+
44+
45+
# Test that the commit triggers when sufficient commits have queued up.
46+
--connection default
47+
SET @a= current_timestamp();
48+
49+
--connection con1
50+
send INSERT INTO t1 VALUES (2,0);
51+
52+
--connection con2
53+
send INSERT INTO t1 VALUES (3,0);
54+
55+
--connection con3
56+
INSERT INTO t1 VALUES (4,0);
57+
58+
--connection con1
59+
reap;
60+
--connection con2
61+
reap;
62+
63+
--connection default
64+
SET @b= unix_timestamp(current_timestamp()) - unix_timestamp(@a);
65+
SELECT IF(@b < 20, "Ok", CONCAT("Error: too much time elapsed: ", @b, " seconds >= 20"));
66+
67+
68+
# Test that commit triggers immediately if there is already a transaction
69+
# waiting on another transaction that reaches its commit.
70+
71+
--connection default
72+
SET @a= current_timestamp();
73+
74+
--connection con1
75+
send INSERT INTO t1 VALUES (6,0);
76+
77+
--connection con2
78+
BEGIN;
79+
UPDATE t1 SET b=b+1 WHERE a=1;
80+
81+
--connection con3
82+
send UPDATE t1 SET b=b+10 WHERE a=1;
83+
84+
--connection con2
85+
# A small sleep to let con3 have time to wait on con2.
86+
# The sleep might be too small on loaded host, but that is not a big problem;
87+
# it only means we will trigger a different code path (con3 waits after con2
88+
# is ready to commit rather than before); and either path should work the same.
89+
# So we will not get false positive in case of different timing; at worst false
90+
# negative.
91+
SELECT SLEEP(0.25);
92+
UPDATE t1 SET b=b+1 WHERE a=3;
93+
COMMIT;
94+
95+
--connection con1
96+
reap;
97+
98+
--connection default
99+
SET @b= unix_timestamp(current_timestamp()) - unix_timestamp(@a);
100+
SELECT IF(@b < 20, "Ok", CONCAT("Error: too much time elapsed: ", @b, " seconds >= 20"));
101+
102+
--connection default
103+
SET @a= current_timestamp();
104+
105+
# Now con3 will be waiting for a following group commit to trigger.
106+
--connection con1
107+
send INSERT INTO t1 VALUES (7,0);
108+
--connection con2
109+
INSERT INTO t1 VALUES (8,0);
110+
--connection con3
111+
reap;
112+
113+
--connection default
114+
SET @b= unix_timestamp(current_timestamp()) - unix_timestamp(@a);
115+
SELECT IF(@b < 20, "Ok", CONCAT("Error: too much time elapsed: ", @b, " seconds >= 20"));
116+
117+
--connection default
118+
SELECT * FROM t1 ORDER BY a;
119+
120+
--connection default
121+
DROP TABLE t1;
122+
SET GLOBAL binlog_commit_wait_count= @old_count;
123+
SET GLOBAL binlog_commit_wait_usec= @old_usec;

sql/handler.cc

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1545,7 +1545,10 @@ commit_one_phase_2(THD *thd, bool all, THD_TRANS *trans, bool is_real_trans)
15451545
}
15461546
/* Free resources and perform other cleanup even for 'empty' transactions. */
15471547
if (is_real_trans)
1548+
{
1549+
thd->has_waiter= false;
15481550
thd->transaction.cleanup();
1551+
}
15491552

15501553
DBUG_RETURN(error);
15511554
}
@@ -1626,7 +1629,10 @@ int ha_rollback_trans(THD *thd, bool all)
16261629

16271630
/* Always cleanup. Even if nht==0. There may be savepoints. */
16281631
if (is_real_trans)
1632+
{
1633+
thd->has_waiter= false;
16291634
thd->transaction.cleanup();
1635+
}
16301636
if (all)
16311637
thd->transaction_rollback_request= FALSE;
16321638

sql/log.cc

Lines changed: 58 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7029,6 +7029,14 @@ MYSQL_BIN_LOG::queue_for_group_commit(group_commit_entry *orig_entry)
70297029
}
70307030
}
70317031

7032+
/*
7033+
Handle the heuristics that if another transaction is waiting for this
7034+
transaction (or if it does so later), then we want to trigger group
7035+
commit immediately, without waiting for the binlog_commit_wait_usec
7036+
timeout to expire.
7037+
*/
7038+
entry->thd->waiting_on_group_commit= true;
7039+
70327040
/* Add the entry to the group commit queue. */
70337041
next_entry= entry->next;
70347042
entry->next= group_commit_queue;
@@ -7044,7 +7052,7 @@ MYSQL_BIN_LOG::queue_for_group_commit(group_commit_entry *orig_entry)
70447052
cur= entry->thd->wait_for_commit_ptr;
70457053
}
70467054

7047-
if (opt_binlog_commit_wait_count > 0)
7055+
if (opt_binlog_commit_wait_count > 0 && orig_queue != NULL)
70487056
mysql_cond_signal(&COND_prepare_ordered);
70497057
mysql_mutex_unlock(&LOCK_prepare_ordered);
70507058
DEBUG_SYNC(orig_entry->thd, "commit_after_release_LOCK_prepare_ordered");
@@ -7218,6 +7226,11 @@ MYSQL_BIN_LOG::trx_group_commit_leader(group_commit_entry *leader)
72187226
while (current)
72197227
{
72207228
group_commit_entry *next= current->next;
7229+
/*
7230+
Now that group commit is started, we can clear the flag; there is no
7231+
longer any use in waiters on this commit trying to trigger it early.
7232+
*/
7233+
current->thd->waiting_on_group_commit= false;
72217234
current->next= queue;
72227235
queue= current;
72237236
current= next;
@@ -7530,7 +7543,7 @@ MYSQL_BIN_LOG::wait_for_sufficient_commits()
75307543
mysql_mutex_assert_owner(&LOCK_prepare_ordered);
75317544

75327545
for (e= last_head= group_commit_queue, count= 0; e; e= e->next)
7533-
if (++count >= opt_binlog_commit_wait_count)
7546+
if (++count >= opt_binlog_commit_wait_count || unlikely(e->thd->has_waiter))
75347547
return;
75357548

75367549
mysql_mutex_unlock(&LOCK_log);
@@ -7545,13 +7558,20 @@ MYSQL_BIN_LOG::wait_for_sufficient_commits()
75457558
&wait_until);
75467559
if (err == ETIMEDOUT)
75477560
break;
7561+
if (unlikely(last_head->thd->has_waiter))
7562+
break;
75487563
head= group_commit_queue;
75497564
for (e= head; e && e != last_head; e= e->next)
7565+
{
75507566
++count;
7567+
if (unlikely(e->thd->has_waiter))
7568+
goto after_loop;
7569+
}
75517570
if (count >= opt_binlog_commit_wait_count)
75527571
break;
75537572
last_head= head;
75547573
}
7574+
after_loop:
75557575

75567576
/*
75577577
We must not wait for LOCK_log while holding LOCK_prepare_ordered.
@@ -7575,6 +7595,42 @@ MYSQL_BIN_LOG::wait_for_sufficient_commits()
75757595
}
75767596

75777597

7598+
void
7599+
MYSQL_BIN_LOG::binlog_trigger_immediate_group_commit()
7600+
{
7601+
group_commit_entry *head;
7602+
mysql_mutex_lock(&LOCK_prepare_ordered);
7603+
head= group_commit_queue;
7604+
if (head)
7605+
{
7606+
head->thd->has_waiter= true;
7607+
mysql_cond_signal(&COND_prepare_ordered);
7608+
}
7609+
mysql_mutex_unlock(&LOCK_prepare_ordered);
7610+
}
7611+
7612+
7613+
/*
7614+
This function is called when a transaction T1 goes to wait for another
7615+
transaction T2. It is used to cut short any binlog group commit delay from
7616+
--binlog-commit-wait-count in the case where another transaction is stalled
7617+
on the wait due to conflicting row locks.
7618+
7619+
If T2 is already ready to group commit, any waiting group commit will be
7620+
signalled to proceed immediately. Otherwise, a flag will be set in T2, and
7621+
when T2 later becomes ready, immediate group commit will be triggered.
7622+
*/
7623+
void
7624+
binlog_report_wait_for(THD *thd1, THD *thd2)
7625+
{
7626+
if (opt_binlog_commit_wait_count == 0)
7627+
return;
7628+
thd2->has_waiter= true;
7629+
if (thd2->waiting_on_group_commit)
7630+
mysql_bin_log.binlog_trigger_immediate_group_commit();
7631+
}
7632+
7633+
75787634
/**
75797635
Wait until we get a signal that the relay log has been updated.
75807636

sql/log.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -690,6 +690,7 @@ class MYSQL_BIN_LOG: public TC_LOG, private MYSQL_LOG
690690
void set_max_size(ulong max_size_arg);
691691
void signal_update();
692692
void wait_for_sufficient_commits();
693+
void binlog_trigger_immediate_group_commit();
693694
void wait_for_update_relay_log(THD* thd);
694695
int wait_for_update_bin_log(THD* thd, const struct timespec * timeout);
695696
void init(ulong max_size);
@@ -1005,6 +1006,7 @@ bool general_log_print(THD *thd, enum enum_server_command command,
10051006
bool general_log_write(THD *thd, enum enum_server_command command,
10061007
const char *query, uint query_length);
10071008

1009+
void binlog_report_wait_for(THD *thd, THD *other_thd);
10081010
void sql_perror(const char *message);
10091011
bool flush_error_log();
10101012

sql/sql_class.cc

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -887,6 +887,7 @@ THD::THD()
887887
in_lock_tables(0),
888888
bootstrap(0),
889889
derived_tables_processing(FALSE),
890+
waiting_on_group_commit(FALSE), has_waiter(FALSE),
890891
spcont(NULL),
891892
m_parser_state(NULL),
892893
#if defined(ENABLED_DEBUG_SYNC)
@@ -4233,6 +4234,8 @@ thd_need_wait_for(const MYSQL_THD thd)
42334234
{
42344235
rpl_group_info *rgi;
42354236

4237+
if (mysql_bin_log.is_open() && opt_binlog_commit_wait_count > 0)
4238+
return true;
42364239
if (!thd)
42374240
return false;
42384241
rgi= thd->rgi_slave;
@@ -4267,13 +4270,14 @@ thd_need_wait_for(const MYSQL_THD thd)
42674270
not harmful, but could lead to unnecessary kill and retry, so best avoided).
42684271
*/
42694272
extern "C" void
4270-
thd_report_wait_for(const MYSQL_THD thd, MYSQL_THD other_thd)
4273+
thd_report_wait_for(MYSQL_THD thd, MYSQL_THD other_thd)
42714274
{
42724275
rpl_group_info *rgi;
42734276
rpl_group_info *other_rgi;
42744277

42754278
if (!thd || !other_thd)
42764279
return;
4280+
binlog_report_wait_for(thd, other_thd);
42774281
rgi= thd->rgi_slave;
42784282
other_rgi= other_thd->rgi_slave;
42794283
if (!rgi || !other_rgi)

sql/sql_class.h

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2674,6 +2674,18 @@ class THD :public Statement,
26742674
it returned an error on master, and this is OK on the slave.
26752675
*/
26762676
bool is_slave_error;
2677+
/*
2678+
True when a transaction is queued up for binlog group commit.
2679+
Used so that if another transaction needs to wait for a row lock held by
2680+
this transaction, it can signal to trigger the group commit immediately,
2681+
skipping the normal --binlog-commit-wait-count wait.
2682+
*/
2683+
bool waiting_on_group_commit;
2684+
/*
2685+
Set true when another transaction goes to wait on a row lock held by this
2686+
transaction. Used together with waiting_on_group_commit.
2687+
*/
2688+
bool has_waiter;
26772689
/*
26782690
In case of a slave, set to the error code the master got when executing
26792691
the query. 0 if no error on the master.

sql/transaction.cc

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -149,6 +149,8 @@ bool trans_begin(THD *thd, uint flags)
149149
when we come here. We should at some point change this to an assert.
150150
*/
151151
thd->transaction.all.modified_non_trans_table= FALSE;
152+
thd->has_waiter= false;
153+
thd->waiting_on_group_commit= false;
152154

153155
if (res)
154156
DBUG_RETURN(TRUE);

storage/innobase/lock/lock0lock.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -375,7 +375,7 @@ struct lock_stack_t {
375375
ulint heap_no; /*!< heap number if rec lock */
376376
};
377377

378-
extern "C" void thd_report_wait_for(const MYSQL_THD thd, MYSQL_THD other_thd);
378+
extern "C" void thd_report_wait_for(MYSQL_THD thd, MYSQL_THD other_thd);
379379
extern "C" int thd_need_wait_for(const MYSQL_THD thd);
380380
extern "C"
381381
int thd_need_ordering_with(const MYSQL_THD thd, const MYSQL_THD other_thd);

0 commit comments

Comments
 (0)