Skip to content

Commit ba02550

Browse files
committed
MDEV-7818: Deadlock occurring with parallel replication and FTWRL
The problem is that FLUSH TABLES WITH READ LOCK (FTWRL) first blocks threads from starting new commits, then waits for running commits to complete. But in-order parallel replication needs commits to happen in a particular order, so this can easily deadlock. To fix this problem, this patch introduces a way to temporarily pause the parallel replication worker threads. Before starting FTWRL, we let all worker threads complete their in-progress transactions and then wait. Then we proceed to take the global read lock. Once the lock is obtained, we unpause the worker threads. Commits are now blocked from starting by the global read lock, so the deadlock can no longer occur.
1 parent 6d96fab commit ba02550

File tree

8 files changed

+537
-25
lines changed

8 files changed

+537
-25
lines changed

mysql-test/suite/perfschema/r/stage_mdl_global.result

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ user1 statement/sql/flush flush tables with read lock
66
username event_name nesting_event_type
77
username event_name nesting_event_type
88
user1 stage/sql/init STATEMENT
9+
user1 stage/sql/init STATEMENT
910
user1 stage/sql/query end STATEMENT
1011
user1 stage/sql/closing tables STATEMENT
1112
user1 stage/sql/freeing items STATEMENT

mysql-test/suite/rpl/r/rpl_parallel2.result

Lines changed: 91 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,8 +29,98 @@ include/start_slave.inc
2929
SELECT * FROM t1 WHERE a >= 10 ORDER BY a;
3030
a b
3131
10 0
32+
*** MDEV-7818: Deadlock occurring with parallel replication and FTWRL ***
33+
CREATE TABLE t2 (a INT PRIMARY KEY, b INT) ENGINE=InnoDB;
34+
INSERT INTO t2 VALUES (1,0), (2,0), (3,0);
35+
include/stop_slave.inc
36+
SET @old_dbug= @@SESSION.debug_dbug;
37+
SET @commit_id= 4242;
38+
SET SESSION debug_dbug="+d,binlog_force_commit_id";
39+
BEGIN;
40+
UPDATE t2 SET b=b+1 WHERE a=2;
41+
COMMIT;
42+
BEGIN;
43+
INSERT INTO t2 VALUES (4,10);
44+
COMMIT;
45+
SET SESSION debug_dbug= @old_dbug;
46+
INSERT INTO t2 VALUES (5,0);
47+
INSERT INTO t2 VALUES (6,0);
48+
INSERT INTO t2 VALUES (7,0);
49+
INSERT INTO t2 VALUES (8,0);
50+
INSERT INTO t2 VALUES (9,0);
51+
INSERT INTO t2 VALUES (10,0);
52+
INSERT INTO t2 VALUES (11,0);
53+
INSERT INTO t2 VALUES (12,0);
54+
INSERT INTO t2 VALUES (13,0);
55+
INSERT INTO t2 VALUES (14,0);
56+
INSERT INTO t2 VALUES (15,0);
57+
INSERT INTO t2 VALUES (16,0);
58+
INSERT INTO t2 VALUES (17,0);
59+
INSERT INTO t2 VALUES (18,0);
60+
INSERT INTO t2 VALUES (19,0);
61+
BEGIN;
62+
SELECT * FROM t2 WHERE a=2 FOR UPDATE;
63+
a b
64+
2 0
65+
include/start_slave.inc
66+
FLUSH TABLES WITH READ LOCK;
67+
COMMIT;
68+
STOP SLAVE;
69+
SELECT * FROM t2 ORDER BY a;
70+
a b
71+
1 0
72+
2 1
73+
3 0
74+
4 10
75+
5 0
76+
6 0
77+
7 0
78+
8 0
79+
9 0
80+
10 0
81+
11 0
82+
12 0
83+
13 0
84+
14 0
85+
15 0
86+
16 0
87+
17 0
88+
18 0
89+
19 0
90+
UNLOCK TABLES;
91+
include/wait_for_slave_to_stop.inc
92+
include/start_slave.inc
93+
SELECT * FROM t2 ORDER BY a;
94+
a b
95+
1 0
96+
2 1
97+
3 0
98+
4 10
99+
5 0
100+
6 0
101+
7 0
102+
8 0
103+
9 0
104+
10 0
105+
11 0
106+
12 0
107+
13 0
108+
14 0
109+
15 0
110+
16 0
111+
17 0
112+
18 0
113+
19 0
114+
*** MDEV-8318: Assertion `!pool->busy' failed in pool_mark_busy(rpl_parallel_thread_pool*) on concurrent FTWRL ***
115+
LOCK TABLE t2 WRITE;
116+
FLUSH TABLES WITH READ LOCK;
117+
FLUSH TABLES WITH READ LOCK;
118+
KILL QUERY CID;
119+
ERROR 70100: Query execution was interrupted
120+
UNLOCK TABLES;
121+
UNLOCK TABLES;
32122
include/stop_slave.inc
33123
SET GLOBAL slave_parallel_threads=@old_parallel_threads;
34124
include/start_slave.inc
35-
DROP TABLE t1;
125+
DROP TABLE t1, t2;
36126
include/rpl_end.inc

mysql-test/suite/rpl/t/rpl_parallel2.test

Lines changed: 135 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
--source include/have_debug.inc
2+
--source include/have_innodb.inc
13
--source include/have_binlog_format_statement.inc
24
--let $rpl_topology=1->2
35
--source include/rpl_init.inc
@@ -78,13 +80,144 @@ SET GLOBAL sql_slave_skip_counter= 1;
7880
SELECT * FROM t1 WHERE a >= 10 ORDER BY a;
7981

8082

81-
# Clean up
83+
--echo *** MDEV-7818: Deadlock occurring with parallel replication and FTWRL ***
84+
85+
--connection server_1
86+
CREATE TABLE t2 (a INT PRIMARY KEY, b INT) ENGINE=InnoDB;
87+
INSERT INTO t2 VALUES (1,0), (2,0), (3,0);
88+
--save_master_pos
89+
90+
--connection server_2
91+
--sync_with_master
92+
--source include/stop_slave.inc
93+
94+
--connection server_1
95+
# Create a group commit with two transactions, will be used to provoke the
96+
# problematic thread interaction with FTWRL on the slave.
97+
SET @old_dbug= @@SESSION.debug_dbug;
98+
SET @commit_id= 4242;
99+
SET SESSION debug_dbug="+d,binlog_force_commit_id";
100+
101+
BEGIN;
102+
UPDATE t2 SET b=b+1 WHERE a=2;
103+
COMMIT;
104+
105+
BEGIN;
106+
INSERT INTO t2 VALUES (4,10);
107+
COMMIT;
108+
109+
SET SESSION debug_dbug= @old_dbug;
110+
111+
INSERT INTO t2 VALUES (5,0);
112+
INSERT INTO t2 VALUES (6,0);
113+
INSERT INTO t2 VALUES (7,0);
114+
INSERT INTO t2 VALUES (8,0);
115+
INSERT INTO t2 VALUES (9,0);
116+
INSERT INTO t2 VALUES (10,0);
117+
INSERT INTO t2 VALUES (11,0);
118+
INSERT INTO t2 VALUES (12,0);
119+
INSERT INTO t2 VALUES (13,0);
120+
INSERT INTO t2 VALUES (14,0);
121+
INSERT INTO t2 VALUES (15,0);
122+
INSERT INTO t2 VALUES (16,0);
123+
INSERT INTO t2 VALUES (17,0);
124+
INSERT INTO t2 VALUES (18,0);
125+
INSERT INTO t2 VALUES (19,0);
126+
--save_master_pos
127+
128+
--connection server_2
129+
130+
--connect (s1, 127.0.0.1, root,, test, $SLAVE_MYPORT,)
131+
# Block one transaction on a row lock.
132+
BEGIN;
133+
SELECT * FROM t2 WHERE a=2 FOR UPDATE;
134+
135+
--connection server_2
136+
137+
# Wait for slave thread of the other transaction to have the commit lock.
138+
--source include/start_slave.inc
139+
--let $wait_condition= SELECT COUNT(*) > 0 FROM information_schema.processlist WHERE state = "Waiting for prior transaction to commit"
140+
--source include/wait_condition.inc
141+
142+
--connect (s2, 127.0.0.1, root,, test, $SLAVE_MYPORT,)
143+
send FLUSH TABLES WITH READ LOCK;
144+
# The bug was that at this point we were deadlocked.
145+
# The FTWRL command would wait forever for T2 to commit.
146+
# T2 would wait for T1 to commit first, but T1 is waiting for
147+
# the global read lock to be released.
148+
149+
--connection s1
150+
# Release the lock that blocks T1 from replicating.
151+
COMMIT;
152+
153+
--connection s1
154+
send STOP SLAVE;
155+
156+
--connection s2
157+
reap;
158+
159+
--connection server_1
160+
SELECT * FROM t2 ORDER BY a;
161+
162+
--connection s2
163+
UNLOCK TABLES;
164+
165+
--connection s1
166+
reap;
167+
168+
--connection server_2
169+
--source include/wait_for_slave_to_stop.inc
170+
--source include/start_slave.inc
171+
--sync_with_master
172+
173+
SELECT * FROM t2 ORDER BY a;
174+
175+
176+
177+
--echo *** MDEV-8318: Assertion `!pool->busy' failed in pool_mark_busy(rpl_parallel_thread_pool*) on concurrent FTWRL ***
178+
179+
--connection server_1
180+
LOCK TABLE t2 WRITE;
181+
182+
183+
--connect (m1,localhost,root,,test)
184+
--connection m1
185+
--let $cid=`SELECT CONNECTION_ID()`
186+
send FLUSH TABLES WITH READ LOCK;
187+
188+
--connect (m2,localhost,root,,test)
189+
# We cannot force the race with DEBUG_SYNC, because the race does not
190+
# exist after fixing the bug. At best we could force a debug sync to
191+
# time out, which is effectively just a sleep.
192+
# So just put a small sleep here; it is enough to trigger the bug in
193+
# most runs before the bug fix, and the code should work correctly
194+
# however the thread scheduling happens.
195+
--sleep 0.1
196+
send FLUSH TABLES WITH READ LOCK;
197+
198+
--connection server_1
199+
--replace_result $cid CID
200+
eval KILL QUERY $cid;
201+
202+
--connection m1
203+
--error ER_QUERY_INTERRUPTED
204+
reap;
205+
206+
--connection server_1
207+
UNLOCK TABLES;
208+
209+
--connection m2
210+
reap;
211+
UNLOCK TABLES;
212+
213+
214+
# Clean up.
82215
--connection server_2
83216
--source include/stop_slave.inc
84217
SET GLOBAL slave_parallel_threads=@old_parallel_threads;
85218
--source include/start_slave.inc
86219

87220
--connection server_1
88-
DROP TABLE t1;
221+
DROP TABLE t1, t2;
89222

90223
--source include/rpl_end.inc

sql/mysqld.cc

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9525,6 +9525,9 @@ PSI_stage_info stage_waiting_for_prior_transaction_to_commit= { 0, "Waiting for
95259525
PSI_stage_info stage_waiting_for_prior_transaction_to_start_commit= { 0, "Waiting for prior transaction to start commit before starting next transaction", 0};
95269526
PSI_stage_info stage_waiting_for_room_in_worker_thread= { 0, "Waiting for room in worker thread event queue", 0};
95279527
PSI_stage_info stage_waiting_for_workers_idle= { 0, "Waiting for worker threads to be idle", 0};
9528+
PSI_stage_info stage_waiting_for_ftwrl= { 0, "Waiting due to global read lock", 0};
9529+
PSI_stage_info stage_waiting_for_ftwrl_threads_to_pause= { 0, "Waiting for worker threads to pause for global read lock", 0};
9530+
PSI_stage_info stage_waiting_for_rpl_thread_pool= { 0, "Waiting while replication worker thread pool is busy", 0};
95289531
PSI_stage_info stage_master_gtid_wait_primary= { 0, "Waiting in MASTER_GTID_WAIT() (primary waiter)", 0};
95299532
PSI_stage_info stage_master_gtid_wait= { 0, "Waiting in MASTER_GTID_WAIT()", 0};
95309533
PSI_stage_info stage_gtid_wait_other_connection= { 0, "Waiting for other master connection to process GTID received on multiple master connections", 0};

sql/mysqld.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -454,6 +454,9 @@ extern PSI_stage_info stage_waiting_for_prior_transaction_to_commit;
454454
extern PSI_stage_info stage_waiting_for_prior_transaction_to_start_commit;
455455
extern PSI_stage_info stage_waiting_for_room_in_worker_thread;
456456
extern PSI_stage_info stage_waiting_for_workers_idle;
457+
extern PSI_stage_info stage_waiting_for_ftwrl;
458+
extern PSI_stage_info stage_waiting_for_ftwrl_threads_to_pause;
459+
extern PSI_stage_info stage_waiting_for_rpl_thread_pool;
457460
extern PSI_stage_info stage_master_gtid_wait_primary;
458461
extern PSI_stage_info stage_master_gtid_wait;
459462
extern PSI_stage_info stage_gtid_wait_other_connection;

0 commit comments

Comments
 (0)