Skip to content

Commit b616991

Browse files
committed
MDEV-8031: Parallel replication stops on "connection killed" error (probably incorrectly handled deadlock kill)
There was a rare race, where a deadlock error might not be correctly handled, causing the slave to stop with something like this in the error log: 150423 14:04:10 [ERROR] Slave SQL: Connection was killed, Gtid 0-1-2, Internal MariaDB error code: 1927 150423 14:04:10 [Warning] Slave: Connection was killed Error_code: 1927 150423 14:04:10 [Warning] Slave: Deadlock found when trying to get lock; try restarting transaction Error_code: 1213 150423 14:04:10 [Warning] Slave: Connection was killed Error_code: 1927 150423 14:04:10 [Warning] Slave: Connection was killed Error_code: 1927 150423 14:04:10 [ERROR] Error running query, slave SQL thread aborted. Fix the problem, and restart the slave SQL thread with "SLAVE START". We stopped at log 'master-bin.000001 position 1234 The problem was incorrect error handling. When a deadlock is detected, it causes a KILL CONNECTION on the offending thread. This error is then later converted to a deadlock error, and the transaction is retried. However, the deadlock error was not cleared at the start of the retry, nor was the lingering kill signal. So it was possible to get another deadlock kill early during retry. If this happened with particular thread scheduling/timing, it was possible that the new KILL CONNECTION error was masked by the earlier deadlock error, so that the second kill was not properly converted into a deadlock error and retry. This patch adds code that clears the old error and killed flag before starting the retry. It also adds code to handle a deadlock kill caught in a couple of places where it was not handled before.
1 parent 4760528 commit b616991

File tree

3 files changed

+259
-18
lines changed

3 files changed

+259
-18
lines changed

mysql-test/suite/rpl/r/rpl_parallel.result

Lines changed: 83 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1537,7 +1537,7 @@ a b
15371537
120 0
15381538
121 0
15391539
include/stop_slave.inc
1540-
SET GLOBAL debug_dbug= @old_debug;
1540+
SET GLOBAL debug_dbug= @old_dbug;
15411541
include/start_slave.inc
15421542
*** MDEV-7929: record_gtid() for non-transactional event group calls wakeup_subsequent_commits() too early, causing slave hang. ***
15431543
include/stop_slave.inc
@@ -1566,7 +1566,88 @@ a b
15661566
130 0
15671567
131 0
15681568
include/stop_slave.inc
1569-
SET GLOBAL debug_dbug= @old_debug;
1569+
SET GLOBAL debug_dbug= @old_dbug;
1570+
include/start_slave.inc
1571+
*** MDEV-8031: Parallel replication stops on "connection killed" error (probably incorrectly handled deadlock kill) ***
1572+
INSERT INTO t3 VALUES (201,0), (202,0);
1573+
include/save_master_gtid.inc
1574+
include/sync_with_master_gtid.inc
1575+
include/stop_slave.inc
1576+
SET @old_dbug= @@GLOBAL.debug_dbug;
1577+
SET GLOBAL debug_dbug= '+d,inject_mdev8031';
1578+
SET @old_dbug= @@SESSION.debug_dbug;
1579+
SET SESSION debug_dbug="+d,binlog_force_commit_id";
1580+
SET @commit_id= 10200;
1581+
INSERT INTO t3 VALUES (203, 1);
1582+
INSERT INTO t3 VALUES (204, 1);
1583+
INSERT INTO t3 VALUES (205, 1);
1584+
UPDATE t3 SET b=b+1 WHERE a=201;
1585+
UPDATE t3 SET b=b+1 WHERE a=201;
1586+
UPDATE t3 SET b=b+1 WHERE a=201;
1587+
UPDATE t3 SET b=b+1 WHERE a=202;
1588+
UPDATE t3 SET b=b+1 WHERE a=202;
1589+
UPDATE t3 SET b=b+1 WHERE a=202;
1590+
UPDATE t3 SET b=b+1 WHERE a=202;
1591+
UPDATE t3 SET b=b+1 WHERE a=203;
1592+
UPDATE t3 SET b=b+1 WHERE a=203;
1593+
UPDATE t3 SET b=b+1 WHERE a=204;
1594+
UPDATE t3 SET b=b+1 WHERE a=204;
1595+
UPDATE t3 SET b=b+1 WHERE a=204;
1596+
UPDATE t3 SET b=b+1 WHERE a=203;
1597+
UPDATE t3 SET b=b+1 WHERE a=205;
1598+
UPDATE t3 SET b=b+1 WHERE a=205;
1599+
SET SESSION debug_dbug=@old_dbug;
1600+
SELECT * FROM t3 WHERE a>=200 ORDER BY a;
1601+
a b
1602+
201 3
1603+
202 4
1604+
203 4
1605+
204 4
1606+
205 3
1607+
include/save_master_gtid.inc
1608+
include/start_slave.inc
1609+
include/sync_with_master_gtid.inc
1610+
SELECT * FROM t3 WHERE a>=200 ORDER BY a;
1611+
a b
1612+
201 3
1613+
202 4
1614+
203 4
1615+
204 4
1616+
205 3
1617+
include/stop_slave.inc
1618+
SET GLOBAL debug_dbug= @old_dbug;
1619+
include/start_slave.inc
1620+
*** Check getting deadlock killed inside open_binlog() during retry. ***
1621+
include/stop_slave.inc
1622+
SET @old_dbug= @@GLOBAL.debug_dbug;
1623+
SET GLOBAL debug_dbug= '+d,inject_retry_event_group_open_binlog_kill';
1624+
SET @old_max= @@GLOBAL.max_relay_log_size;
1625+
SET GLOBAL max_relay_log_size= 4096;
1626+
SET @old_dbug= @@SESSION.debug_dbug;
1627+
SET SESSION debug_dbug="+d,binlog_force_commit_id";
1628+
SET @commit_id= 10210;
1629+
Omit long queries that cause relaylog rotations and transaction retries...
1630+
SET SESSION debug_dbug=@old_dbug;
1631+
SELECT * FROM t3 WHERE a>=200 ORDER BY a;
1632+
a b
1633+
201 6
1634+
202 8
1635+
203 7
1636+
204 7
1637+
205 5
1638+
include/save_master_gtid.inc
1639+
include/start_slave.inc
1640+
include/sync_with_master_gtid.inc
1641+
SELECT * FROM t3 WHERE a>=200 ORDER BY a;
1642+
a b
1643+
201 6
1644+
202 8
1645+
203 7
1646+
204 7
1647+
205 5
1648+
include/stop_slave.inc
1649+
SET GLOBAL debug_dbug= @old_debg;
1650+
SET GLOBAL max_relay_log_size= @old_max;
15701651
include/start_slave.inc
15711652
include/stop_slave.inc
15721653
SET GLOBAL slave_parallel_threads=@old_parallel_threads;

mysql-test/suite/rpl/t/rpl_parallel.test

Lines changed: 111 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2155,7 +2155,7 @@ SELECT * FROM t3 WHERE a >= 120 ORDER BY a;
21552155
SELECT * FROM t3 WHERE a >= 120 ORDER BY a;
21562156

21572157
--source include/stop_slave.inc
2158-
SET GLOBAL debug_dbug= @old_debug;
2158+
SET GLOBAL debug_dbug= @old_dbug;
21592159
--source include/start_slave.inc
21602160

21612161

@@ -2202,10 +2202,119 @@ SELECT * FROM t3 WHERE a >= 130 ORDER BY a;
22022202
SELECT * FROM t3 WHERE a >= 130 ORDER BY a;
22032203

22042204
--source include/stop_slave.inc
2205-
SET GLOBAL debug_dbug= @old_debug;
2205+
SET GLOBAL debug_dbug= @old_dbug;
22062206
--source include/start_slave.inc
22072207

22082208

2209+
--echo *** MDEV-8031: Parallel replication stops on "connection killed" error (probably incorrectly handled deadlock kill) ***
2210+
2211+
--connection server_1
2212+
INSERT INTO t3 VALUES (201,0), (202,0);
2213+
--source include/save_master_gtid.inc
2214+
2215+
--connection server_2
2216+
--source include/sync_with_master_gtid.inc
2217+
--source include/stop_slave.inc
2218+
SET @old_dbug= @@GLOBAL.debug_dbug;
2219+
SET GLOBAL debug_dbug= '+d,inject_mdev8031';
2220+
2221+
--connection server_1
2222+
# We artificially create a situation that hopefully resembles the original
2223+
# bug which was only seen "in the wild", and only once.
2224+
# Setup a fake group commit with lots of conflicts that will lead to deadloc
2225+
# kill. The slave DBUG injection causes the slave to be deadlock killed at
2226+
# a particular point during the retry, and then later do a small sleep at
2227+
# another critical point where the prior transaction then has a chance to
2228+
# complete. Finally an extra KILL check catches an unhandled, lingering
2229+
# deadlock kill. So rather artificial, but at least it exercises the
2230+
# relevant code paths.
2231+
SET @old_dbug= @@SESSION.debug_dbug;
2232+
SET SESSION debug_dbug="+d,binlog_force_commit_id";
2233+
2234+
SET @commit_id= 10200;
2235+
INSERT INTO t3 VALUES (203, 1);
2236+
INSERT INTO t3 VALUES (204, 1);
2237+
INSERT INTO t3 VALUES (205, 1);
2238+
UPDATE t3 SET b=b+1 WHERE a=201;
2239+
UPDATE t3 SET b=b+1 WHERE a=201;
2240+
UPDATE t3 SET b=b+1 WHERE a=201;
2241+
UPDATE t3 SET b=b+1 WHERE a=202;
2242+
UPDATE t3 SET b=b+1 WHERE a=202;
2243+
UPDATE t3 SET b=b+1 WHERE a=202;
2244+
UPDATE t3 SET b=b+1 WHERE a=202;
2245+
UPDATE t3 SET b=b+1 WHERE a=203;
2246+
UPDATE t3 SET b=b+1 WHERE a=203;
2247+
UPDATE t3 SET b=b+1 WHERE a=204;
2248+
UPDATE t3 SET b=b+1 WHERE a=204;
2249+
UPDATE t3 SET b=b+1 WHERE a=204;
2250+
UPDATE t3 SET b=b+1 WHERE a=203;
2251+
UPDATE t3 SET b=b+1 WHERE a=205;
2252+
UPDATE t3 SET b=b+1 WHERE a=205;
2253+
SET SESSION debug_dbug=@old_dbug;
2254+
2255+
SELECT * FROM t3 WHERE a>=200 ORDER BY a;
2256+
--source include/save_master_gtid.inc
2257+
2258+
--connection server_2
2259+
--source include/start_slave.inc
2260+
--source include/sync_with_master_gtid.inc
2261+
2262+
SELECT * FROM t3 WHERE a>=200 ORDER BY a;
2263+
--source include/stop_slave.inc
2264+
SET GLOBAL debug_dbug= @old_dbug;
2265+
--source include/start_slave.inc
2266+
2267+
2268+
--echo *** Check getting deadlock killed inside open_binlog() during retry. ***
2269+
2270+
--connection server_2
2271+
--source include/stop_slave.inc
2272+
SET @old_dbug= @@GLOBAL.debug_dbug;
2273+
SET GLOBAL debug_dbug= '+d,inject_retry_event_group_open_binlog_kill';
2274+
SET @old_max= @@GLOBAL.max_relay_log_size;
2275+
SET GLOBAL max_relay_log_size= 4096;
2276+
2277+
--connection server_1
2278+
SET @old_dbug= @@SESSION.debug_dbug;
2279+
SET SESSION debug_dbug="+d,binlog_force_commit_id";
2280+
2281+
--let $large= `SELECT REPEAT("*", 8192)`
2282+
SET @commit_id= 10210;
2283+
--echo Omit long queries that cause relaylog rotations and transaction retries...
2284+
--disable_query_log
2285+
eval UPDATE t3 SET b=b+1 WHERE a=201 /* $large */;
2286+
eval UPDATE t3 SET b=b+1 WHERE a=201 /* $large */;
2287+
eval UPDATE t3 SET b=b+1 WHERE a=201 /* $large */;
2288+
eval UPDATE t3 SET b=b+1 WHERE a=202 /* $large */;
2289+
eval UPDATE t3 SET b=b+1 WHERE a=202 /* $large */;
2290+
eval UPDATE t3 SET b=b+1 WHERE a=202 /* $large */;
2291+
eval UPDATE t3 SET b=b+1 WHERE a=202 /* $large */;
2292+
eval UPDATE t3 SET b=b+1 WHERE a=203 /* $large */;
2293+
eval UPDATE t3 SET b=b+1 WHERE a=203 /* $large */;
2294+
eval UPDATE t3 SET b=b+1 WHERE a=204 /* $large */;
2295+
eval UPDATE t3 SET b=b+1 WHERE a=204 /* $large */;
2296+
eval UPDATE t3 SET b=b+1 WHERE a=204 /* $large */;
2297+
eval UPDATE t3 SET b=b+1 WHERE a=203 /* $large */;
2298+
eval UPDATE t3 SET b=b+1 WHERE a=205 /* $large */;
2299+
eval UPDATE t3 SET b=b+1 WHERE a=205 /* $large */;
2300+
--enable_query_log
2301+
SET SESSION debug_dbug=@old_dbug;
2302+
2303+
SELECT * FROM t3 WHERE a>=200 ORDER BY a;
2304+
--source include/save_master_gtid.inc
2305+
2306+
--connection server_2
2307+
--source include/start_slave.inc
2308+
--source include/sync_with_master_gtid.inc
2309+
2310+
SELECT * FROM t3 WHERE a>=200 ORDER BY a;
2311+
--source include/stop_slave.inc
2312+
SET GLOBAL debug_dbug= @old_debg;
2313+
SET GLOBAL max_relay_log_size= @old_max;
2314+
--source include/start_slave.inc
2315+
2316+
2317+
22092318
# Clean up.
22102319
--connection server_2
22112320
--source include/stop_slave.inc

sql/rpl_parallel.cc

Lines changed: 65 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
#include "rpl_parallel.h"
33
#include "slave.h"
44
#include "rpl_mi.h"
5+
#include "sql_parse.h"
56
#include "debug_sync.h"
67

78
/*
@@ -326,7 +327,7 @@ retry_event_group(rpl_group_info *rgi, rpl_parallel_thread *rpt,
326327
IO_CACHE rlog;
327328
LOG_INFO linfo;
328329
File fd= (File)-1;
329-
const char *errmsg= NULL;
330+
const char *errmsg;
330331
inuse_relaylog *ir= rgi->relay_log;
331332
uint64 event_count;
332333
uint64 events_to_execute= rgi->retry_event_count;
@@ -342,6 +343,7 @@ retry_event_group(rpl_group_info *rgi, rpl_parallel_thread *rpt,
342343
do_retry:
343344
event_count= 0;
344345
err= 0;
346+
errmsg= NULL;
345347

346348
/*
347349
If we already started committing before getting the deadlock (or other
@@ -377,7 +379,16 @@ retry_event_group(rpl_group_info *rgi, rpl_parallel_thread *rpt,
377379
*/
378380
if(thd->wait_for_commit_ptr)
379381
thd->wait_for_commit_ptr->unregister_wait_for_prior_commit();
382+
DBUG_EXECUTE_IF("inject_mdev8031", {
383+
/* Simulate that we get deadlock killed at this exact point. */
384+
rgi->killed_for_retry= true;
385+
mysql_mutex_lock(&thd->LOCK_thd_data);
386+
thd->killed= KILL_CONNECTION;
387+
mysql_mutex_unlock(&thd->LOCK_thd_data);
388+
});
380389
rgi->cleanup_context(thd, 1);
390+
thd->reset_killed();
391+
thd->clear_error();
381392

382393
/*
383394
If we retry due to a deadlock kill that occured during the commit step, we
@@ -418,10 +429,22 @@ retry_event_group(rpl_group_info *rgi, rpl_parallel_thread *rpt,
418429
complete its commit.
419430
*/
420431
thd->clear_error();
432+
thd->reset_killed();
421433
if(thd->wait_for_commit_ptr)
422434
thd->wait_for_commit_ptr->unregister_wait_for_prior_commit();
435+
DBUG_EXECUTE_IF("inject_mdev8031", {
436+
/* Inject a small sleep to give prior transaction a chance to commit. */
437+
my_sleep(100000);
438+
});
423439
}
424440

441+
/*
442+
Let us clear any lingering deadlock kill one more time, here after
443+
wait_for_prior_commit() has completed. This should rule out any
444+
possibility of an old deadlock kill lingering on beyond this point.
445+
*/
446+
thd->reset_killed();
447+
425448
strmake_buf(log_name, ir->name);
426449
if ((fd= open_binlog(&rlog, log_name, &errmsg)) <0)
427450
{
@@ -437,6 +460,14 @@ retry_event_group(rpl_group_info *rgi, rpl_parallel_thread *rpt,
437460
err= 1;
438461
goto err;
439462
}
463+
DBUG_EXECUTE_IF("inject_mdev8031", {
464+
/* Simulate pending KILL caught in read_relay_log_description_event(). */
465+
if (thd->check_killed()) {
466+
thd->send_kill_message();
467+
err= 1;
468+
goto err;
469+
}
470+
});
440471
my_b_seek(&rlog, cur_offset);
441472

442473
do
@@ -459,7 +490,7 @@ retry_event_group(rpl_group_info *rgi, rpl_parallel_thread *rpt,
459490
{
460491
errmsg= "slave SQL thread aborted because of I/O error";
461492
err= 1;
462-
goto err;
493+
goto check_retry;
463494
}
464495
if (rlog.error > 0)
465496
{
@@ -488,10 +519,25 @@ retry_event_group(rpl_group_info *rgi, rpl_parallel_thread *rpt,
488519
}
489520
strmake_buf(log_name ,linfo.log_file_name);
490521

522+
DBUG_EXECUTE_IF("inject_retry_event_group_open_binlog_kill", {
523+
if (retries < 2)
524+
{
525+
/* Simulate that we get deadlock killed during open_binlog(). */
526+
mysql_reset_thd_for_next_command(thd);
527+
rgi->killed_for_retry= true;
528+
mysql_mutex_lock(&thd->LOCK_thd_data);
529+
thd->killed= KILL_CONNECTION;
530+
mysql_mutex_unlock(&thd->LOCK_thd_data);
531+
thd->send_kill_message();
532+
fd= (File)-1;
533+
err= 1;
534+
goto check_retry;
535+
}
536+
});
491537
if ((fd= open_binlog(&rlog, log_name, &errmsg)) <0)
492538
{
493539
err= 1;
494-
goto err;
540+
goto check_retry;
495541
}
496542
/* Loop to try again on the new log file. */
497543
}
@@ -534,26 +580,31 @@ retry_event_group(rpl_group_info *rgi, rpl_parallel_thread *rpt,
534580
if (retries == 0) err= dbug_simulate_tmp_error(rgi, thd););
535581
DBUG_EXECUTE_IF("rpl_parallel_simulate_infinite_temp_err_gtid_0_x_100",
536582
err= dbug_simulate_tmp_error(rgi, thd););
537-
if (err)
583+
if (!err)
584+
continue;
585+
586+
check_retry:
587+
convert_kill_to_deadlock_error(rgi);
588+
if (has_temporary_error(thd))
538589
{
539-
convert_kill_to_deadlock_error(rgi);
540-
if (has_temporary_error(thd))
590+
++retries;
591+
if (retries < slave_trans_retries)
541592
{
542-
++retries;
543-
if (retries < slave_trans_retries)
593+
if (fd >= 0)
544594
{
545595
end_io_cache(&rlog);
546596
mysql_file_close(fd, MYF(MY_WME));
547597
fd= (File)-1;
548-
goto do_retry;
549598
}
550-
sql_print_error("Slave worker thread retried transaction %lu time(s) "
551-
"in vain, giving up. Consider raising the value of "
552-
"the slave_transaction_retries variable.",
553-
slave_trans_retries);
599+
goto do_retry;
554600
}
555-
goto err;
601+
sql_print_error("Slave worker thread retried transaction %lu time(s) "
602+
"in vain, giving up. Consider raising the value of "
603+
"the slave_transaction_retries variable.",
604+
slave_trans_retries);
556605
}
606+
goto err;
607+
557608
} while (event_count < events_to_execute);
558609

559610
err:

0 commit comments

Comments
 (0)