Skip to content

Commit 0c1f97b

Browse files
committed
MDEV-15152 Optimistic parallel slave doesn't cope well with START SLAVE UNTIL
The immediate bug was caused by a failure to recognize the correct position at which to stop the slave applier when it runs in optimistic parallel mode. The analysis unveiled the following set of issues: (1) an incorrect estimate of the event binlog position passed to is_until_satisfied(); (2) the driver thread's wait for workers to complete did not account for non-group events that could be left unprocessed, which mixed up the last executed binlog group's file and position: the file remained the old one while the position related to the new, rotated file; (3) an incorrect 'slave reached file:pos' report by the parallel slave in the error log; (4) the relay-log UNTIL check missed out the parallel slave branch in is_until_satisfied(). The patch addresses all of them and simplifies the logic of log change notification in both the master-log and relay-log UNTIL cases. P.1 is addressed by passing the event into is_until_satisfied() for proper analysis by the function. P.2 is fixed by changes in handle_queued_pos_update(). P.4 required removing the relay-log change notification by workers. Instead, the driver thread now updates the notion of the current relay log entirely by itself, with the aid of the newly introduced bool Relay_log_info::until_relay_log_names_defer. An extra printout of the requested UNTIL file:pos is produced with --log-warnings=3.
1 parent 451bfcd commit 0c1f97b

File tree

6 files changed

+859
-20
lines changed

6 files changed

+859
-20
lines changed
Lines changed: 291 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,291 @@
1+
include/master-slave.inc
2+
[connection master]
3+
connection slave;
4+
include/stop_slave.inc
5+
RESET MASTER;
6+
RESET SLAVE;
7+
connection master;
8+
RESET MASTER;
9+
CREATE TABLE t1 (a int primary key, b text) ENGINE=InnoDB;
10+
INSERT INTO t1 SET a=25, b='trx0';
11+
connection slave;
12+
include/start_slave.inc
13+
connection master;
14+
connection slave;
15+
include/stop_slave.inc
16+
ALTER TABLE mysql.gtid_slave_pos ENGINE=InnoDB;
17+
SET @old_parallel_threads=@@GLOBAL.slave_parallel_threads;
18+
SET GLOBAL slave_parallel_threads=2;
19+
SET @old_parallel_mode=@@GLOBAL.slave_parallel_mode;
20+
SET GLOBAL slave_parallel_mode='optimistic';
21+
connection slave;
22+
SET @old_max_relay_log_size = @@global.max_relay_log_size;
23+
SET @@global.max_relay_log_size=4096;
24+
connection master;
25+
BEGIN;
26+
INSERT INTO t1 SET a=1, b='trx1';
27+
INSERT INTO t1 SET a=2, b='trx1';
28+
INSERT INTO t1 SET a=3, b='trx1';
29+
INSERT INTO t1 SET a=4, b='trx1';
30+
INSERT INTO t1 SET a=5, b='trx1';
31+
INSERT INTO t1 SET a=6, b='trx1';
32+
INSERT INTO t1 SET a=7, b='trx1';
33+
INSERT INTO t1 SET a=8, b='trx1';
34+
INSERT INTO t1 SET a=9, b='trx1';
35+
INSERT INTO t1 SET a=10, b='trx1';
36+
INSERT INTO t1 SET a=11, b='trx1';
37+
INSERT INTO t1 SET a=12, b='trx1';
38+
INSERT INTO t1 SET a=13, b='trx1';
39+
INSERT INTO t1 SET a=14, b='trx1';
40+
INSERT INTO t1 SET a=15, b='trx1';
41+
INSERT INTO t1 SET a=16, b='trx1';
42+
INSERT INTO t1 SET a=17, b='trx1';
43+
INSERT INTO t1 SET a=18, b='trx1';
44+
INSERT INTO t1 SET a=19, b='trx1';
45+
INSERT INTO t1 SET a=20, b='trx1';
46+
INSERT INTO t1 SET a=21, b='trx1';
47+
INSERT INTO t1 SET a=22, b='trx1';
48+
INSERT INTO t1 SET a=23, b='trx1';
49+
INSERT INTO t1 SET a=24, b='trx1';
50+
COMMIT;
51+
FLUSH LOGS;
52+
BEGIN;
53+
UPDATE t1 SET b='trx2_0' WHERE a = 25;
54+
UPDATE t1 SET b='trx2' WHERE a = 25;
55+
COMMIT;
56+
INSERT INTO t1 SET a=26,b='trx3';
57+
*** case 1 UNTIL inside trx2
58+
connection slave1;
59+
BEGIN;
60+
INSERT INTO t1 SET a= 1;
61+
connection slave;
62+
SELECT <pos_0> <= <pos_until> AND <pos_until> < <pos_trx2> as "pos_until < trx0 and is within trx2";
63+
pos_until < trx0 and is within trx2
64+
1
65+
CHANGE MASTER TO MASTER_USE_GTID=no;
66+
START SLAVE UNTIL MASTER_LOG_FILE = 'file_2', MASTER_LOG_POS = <pos_until>;
67+
connection slave1;
68+
ROLLBACK;
69+
Proof 1: Correct stop
70+
connection slave;
71+
include/wait_for_slave_sql_to_stop.inc
72+
SELECT count(*) = 1 as 'trx2 is committed' FROM t1 WHERE b = 'trx2';
73+
trx2 is committed
74+
1
75+
SELECT count(*) = 0 as 'trx3 is not committed' FROM t1 WHERE b = 'trx3';
76+
trx3 is not committed
77+
1
78+
Proof 2: Resume works out
79+
include/start_slave.inc
80+
connection master;
81+
connection slave;
82+
*** case 2 UNTIL inside trx2
83+
connection slave;
84+
DELETE FROM t1 WHERE a <> 25;
85+
UPDATE t1 SET b='trx0' WHERE a = 25;
86+
connection slave1;
87+
BEGIN;
88+
INSERT INTO t1 SET a= 1;
89+
connection slave;
90+
include/stop_slave.inc
91+
SELECT <pos_0> <= <pos_until> AND <pos_until> < <pos_trx2> as "pos_until >= trx0 and is within trx2";
92+
pos_until >= trx0 and is within trx2
93+
1
94+
CHANGE MASTER TO MASTER_LOG_FILE = 'file_1', MASTER_LOG_POS = <pos_trx0>, MASTER_USE_GTID=no;
95+
START SLAVE UNTIL MASTER_LOG_FILE = 'file_2', MASTER_LOG_POS = <pos_until>;
96+
connection slave1;
97+
ROLLBACK;
98+
Proof 1: Correct stop
99+
connection slave;
100+
include/wait_for_slave_sql_to_stop.inc
101+
SELECT count(*) = 1 as 'trx2 is committed' FROM t1 WHERE b = 'trx2';
102+
trx2 is committed
103+
1
104+
SELECT count(*) = 0 as 'trx3 is not committed' FROM t1 WHERE b = 'trx3';
105+
trx3 is not committed
106+
1
107+
Proof 2: Resume works out
108+
include/start_slave.inc
109+
connection master;
110+
connection slave;
111+
*** case 3 UNTIL inside trx1
112+
connection slave;
113+
DELETE FROM t1 WHERE a <> 25;
114+
UPDATE t1 SET b='trx0' WHERE a = 25;
115+
connection slave1;
116+
BEGIN;
117+
INSERT INTO t1 SET a= 1; # block trx1;
118+
connection slave;
119+
include/stop_slave.inc
120+
SELECT <pos_until> < <pos_0> as "pos_until before trx2 start position";
121+
pos_until before trx2 start position
122+
1
123+
CHANGE MASTER TO MASTER_LOG_FILE = 'file_1', MASTER_LOG_POS = <pos_trx0>, MASTER_USE_GTID=no;
124+
START SLAVE UNTIL MASTER_LOG_FILE = 'file_2', MASTER_LOG_POS = <pos_until>;
125+
connection slave1;
126+
ROLLBACK;
127+
Proof 1: Correct stop
128+
connection slave;
129+
include/wait_for_slave_sql_to_stop.inc
130+
SELECT count(*) = 25-1 as 'trx1 is committed' FROM t1 WHERE b = 'trx1';
131+
trx1 is committed
132+
1
133+
SELECT count(*) = 0 as 'trx2 is not committed' FROM t1 WHERE b = 'trx2';
134+
trx2 is not committed
135+
1
136+
Proof 2: Resume works out
137+
include/start_slave.inc
138+
connection master;
139+
connection slave;
140+
*** case 4 Relay-log UNTIL inside trx1
141+
connection slave;
142+
DELETE FROM t1 WHERE a <> 25;
143+
UPDATE t1 SET b='trx0' WHERE a = 25;
144+
connection slave1;
145+
BEGIN;
146+
INSERT INTO t1 SET a= 1; # block trx1;
147+
connection slave;
148+
include/stop_slave.inc
149+
CHANGE MASTER TO MASTER_LOG_FILE = 'file_1', MASTER_LOG_POS = <pos_trx0>, MASTER_USE_GTID=no;
150+
START SLAVE IO_THREAD;
151+
include/wait_for_slave_io_to_start.inc
152+
START SLAVE UNTIL RELAY_LOG_FILE = 'file_2', RELAY_LOG_POS = <pos_until>;
153+
connection slave1;
154+
ROLLBACK;
155+
Proof 1: Correct stop
156+
connection slave;
157+
include/wait_for_slave_sql_to_stop.inc
158+
SELECT count(*) = 25-1 as 'trx1 is committed' FROM t1 WHERE b = 'trx1';
159+
trx1 is committed
160+
1
161+
SELECT count(*) = 0 as 'trx2 is not committed' FROM t1 WHERE b = 'trx2';
162+
trx2 is not committed
163+
1
164+
Proof 2: Resume works out
165+
include/start_slave.inc
166+
connection master;
167+
connection slave;
168+
*** case 5 Relay-log UNTIL inside a "big" trx that spawns few relay logs
169+
connection master;
170+
CREATE TABLE t2 (a TEXT) ENGINE=InnoDB;
171+
FLUSH LOGS;
172+
connection slave;
173+
connection slave;
174+
include/stop_slave.inc
175+
connection master;
176+
BEGIN;
177+
INSERT INTO t2 SET a=repeat('a',1024);
178+
INSERT INTO t2 SET a=repeat('a',1024);
179+
INSERT INTO t2 SET a=repeat('a',1024);
180+
INSERT INTO t2 SET a=repeat('a',1024);
181+
INSERT INTO t2 SET a=repeat('a',1024);
182+
INSERT INTO t2 SET a=repeat('a',1024);
183+
INSERT INTO t2 SET a=repeat('a',1024);
184+
INSERT INTO t2 SET a=repeat('a',1024);
185+
INSERT INTO t2 SET a=repeat('a',1024);
186+
INSERT INTO t2 SET a=repeat('a',1024);
187+
INSERT INTO t2 SET a=repeat('a',1024);
188+
INSERT INTO t2 SET a=repeat('a',1024);
189+
INSERT INTO t2 SET a=repeat('a',1024);
190+
INSERT INTO t2 SET a=repeat('a',1024);
191+
INSERT INTO t2 SET a=repeat('a',1024);
192+
INSERT INTO t2 SET a=repeat('a',1024);
193+
INSERT INTO t2 SET a=repeat('a',1024);
194+
COMMIT;
195+
INSERT INTO t2 SET a='a';
196+
connection slave;
197+
START SLAVE IO_THREAD;
198+
include/wait_for_slave_io_to_start.inc
199+
START SLAVE UNTIL RELAY_LOG_FILE = 'file_2', RELAY_LOG_POS = <pos_until>;
200+
Proof 1: Correct stop
201+
connection slave;
202+
include/wait_for_slave_sql_to_stop.inc
203+
Proof 2: Resume works out
204+
include/start_slave.inc
205+
connection master;
206+
connection slave;
207+
include/diff_tables.inc [master:t2,slave:t2]
208+
*** case 6 Relay-log UNTIL inside a small trx inside a sequence of relay logs
209+
connection slave;
210+
include/stop_slave.inc
211+
connection master;
212+
BEGIN;
213+
DELETE FROM t2 LIMIT 1;
214+
COMMIT;
215+
BEGIN;
216+
DELETE FROM t2 LIMIT 1;
217+
COMMIT;
218+
BEGIN;
219+
DELETE FROM t2 LIMIT 1;
220+
COMMIT;
221+
BEGIN;
222+
DELETE FROM t2 LIMIT 1;
223+
COMMIT;
224+
BEGIN;
225+
DELETE FROM t2 LIMIT 1;
226+
COMMIT;
227+
BEGIN;
228+
DELETE FROM t2 LIMIT 1;
229+
COMMIT;
230+
BEGIN;
231+
DELETE FROM t2 LIMIT 1;
232+
COMMIT;
233+
BEGIN;
234+
DELETE FROM t2 LIMIT 1;
235+
COMMIT;
236+
BEGIN;
237+
DELETE FROM t2 LIMIT 1;
238+
COMMIT;
239+
BEGIN;
240+
DELETE FROM t2 LIMIT 1;
241+
COMMIT;
242+
BEGIN;
243+
DELETE FROM t2 LIMIT 1;
244+
COMMIT;
245+
BEGIN;
246+
DELETE FROM t2 LIMIT 1;
247+
COMMIT;
248+
BEGIN;
249+
DELETE FROM t2 LIMIT 1;
250+
COMMIT;
251+
BEGIN;
252+
DELETE FROM t2 LIMIT 1;
253+
COMMIT;
254+
BEGIN;
255+
DELETE FROM t2 LIMIT 1;
256+
COMMIT;
257+
BEGIN;
258+
DELETE FROM t2 LIMIT 1;
259+
COMMIT;
260+
BEGIN;
261+
DELETE FROM t2 LIMIT 1;
262+
COMMIT;
263+
BEGIN;
264+
DELETE FROM t2 LIMIT 1;
265+
COMMIT;
266+
COMMIT;
267+
connection slave;
268+
START SLAVE IO_THREAD;
269+
include/wait_for_slave_io_to_start.inc
270+
connection master;
271+
include/sync_slave_io_with_master.inc
272+
connection slave;
273+
START SLAVE UNTIL RELAY_LOG_FILE = 'file_2', RELAY_LOG_POS = <pos_until>;
274+
Proof 1: Correct stop
275+
connection slave;
276+
include/wait_for_slave_sql_to_stop.inc
277+
Proof 2: Resume works out
278+
include/start_slave.inc
279+
connection master;
280+
connection slave;
281+
include/diff_tables.inc [master:t2,slave:t2]
282+
connection slave;
283+
include/stop_slave.inc
284+
SET GLOBAL max_relay_log_size=@old_max_relay_log_size;
285+
SET GLOBAL slave_parallel_mode=@old_parallel_mode;
286+
SET GLOBAL slave_parallel_threads=@old_parallel_threads;
287+
include/start_slave.inc
288+
connection master;
289+
DROP TABLE t1, t2;
290+
connection slave;
291+
include/rpl_end.inc

0 commit comments

Comments
 (0)