Skip to content

Commit e5f1e84

Browse files
committed
MDEV-8147: Assertion `m_lock_type == 2' failed in handler::ha_close() during parallel replication
When the slave processes the master restart format_description event, parallel replication needs to complete any prior events before processing the restart event (which closes temporary tables and such stuff). This happens in wait_for_workers_idle(), however it was not waiting long enough. The wait was using wait_for_prior_commit(), but at that points table can still be open. This lead to assertion in this case. So change wait_for_workers_idle() to wait until all worker threads have reached finish_event_group(), at which point all tables should have been closed.
1 parent ef99edf commit e5f1e84

File tree

7 files changed

+151
-14
lines changed

7 files changed

+151
-14
lines changed
Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
include/master-slave.inc
2+
[connection master]
3+
SET @old_parallel_threads=@@GLOBAL.slave_parallel_threads;
4+
include/stop_slave.inc
5+
SET @old_dbug= @@GLOBAL.debug_dbug;
6+
SET GLOBAL debug_dbug="+d,inject_wakeup_subsequent_commits_sleep";
7+
SET GLOBAL slave_parallel_threads=8;
8+
*** MDEV-8147: Assertion `m_lock_type == 2' failed in handler::ha_close() during parallel replication ***
9+
CREATE TABLE E (
10+
pk INTEGER AUTO_INCREMENT,
11+
col_int_nokey INTEGER /*! NULL */,
12+
col_int_key INTEGER /*! NULL */,
13+
col_date_key DATE /*! NULL */,
14+
col_date_nokey DATE /*! NULL */,
15+
col_time_key TIME /*! NULL */,
16+
col_time_nokey TIME /*! NULL */,
17+
col_datetime_key DATETIME /*! NULL */,
18+
col_datetime_nokey DATETIME /*! NULL */,
19+
col_varchar_key VARCHAR(1) /*! NULL */,
20+
col_varchar_nokey VARCHAR(1) /*! NULL */,
21+
PRIMARY KEY (pk),
22+
KEY (col_int_key),
23+
KEY (col_date_key),
24+
KEY (col_time_key),
25+
KEY (col_datetime_key),
26+
KEY (col_varchar_key, col_int_key)
27+
) ENGINE=InnoDB;
28+
ALTER TABLE `E` PARTITION BY KEY() PARTITIONS 5;
29+
ALTER TABLE `E` REMOVE PARTITIONING;
30+
CREATE TABLE t1 (a INT PRIMARY KEY);
31+
include/start_slave.inc
32+
include/stop_slave.inc
33+
SET GLOBAL debug_dbug=@old_dbug;
34+
SET GLOBAL slave_parallel_threads=0;
35+
SET GLOBAL slave_parallel_threads=8;
36+
include/start_slave.inc
37+
include/stop_slave.inc
38+
SET GLOBAL slave_parallel_threads=@old_parallel_threads;
39+
include/start_slave.inc
40+
DROP TABLE `E`;
41+
DROP TABLE t1;
42+
include/rpl_end.inc
Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
1+
--source include/have_partition.inc
2+
--source include/have_innodb.inc
3+
--source include/have_debug.inc
4+
--source include/master-slave.inc
5+
6+
--connection server_2
7+
SET @old_parallel_threads=@@GLOBAL.slave_parallel_threads;
8+
--source include/stop_slave.inc
9+
SET @old_dbug= @@GLOBAL.debug_dbug;
10+
SET GLOBAL debug_dbug="+d,inject_wakeup_subsequent_commits_sleep";
11+
SET GLOBAL slave_parallel_threads=8;
12+
13+
--echo *** MDEV-8147: Assertion `m_lock_type == 2' failed in handler::ha_close() during parallel replication ***
14+
--connection server_1
15+
CREATE TABLE E (
16+
pk INTEGER AUTO_INCREMENT,
17+
col_int_nokey INTEGER /*! NULL */,
18+
col_int_key INTEGER /*! NULL */,
19+
20+
col_date_key DATE /*! NULL */,
21+
col_date_nokey DATE /*! NULL */,
22+
23+
col_time_key TIME /*! NULL */,
24+
col_time_nokey TIME /*! NULL */,
25+
26+
col_datetime_key DATETIME /*! NULL */,
27+
col_datetime_nokey DATETIME /*! NULL */,
28+
29+
col_varchar_key VARCHAR(1) /*! NULL */,
30+
col_varchar_nokey VARCHAR(1) /*! NULL */,
31+
32+
PRIMARY KEY (pk),
33+
KEY (col_int_key),
34+
KEY (col_date_key),
35+
KEY (col_time_key),
36+
KEY (col_datetime_key),
37+
KEY (col_varchar_key, col_int_key)
38+
) ENGINE=InnoDB;
39+
40+
ALTER TABLE `E` PARTITION BY KEY() PARTITIONS 5;
41+
ALTER TABLE `E` REMOVE PARTITIONING;
42+
--write_file $MYSQLTEST_VARDIR/tmp/mysqld.1.expect
43+
wait
44+
EOF
45+
--shutdown_server 30
46+
--source include/wait_until_disconnected.inc
47+
--connection default
48+
--source include/wait_until_disconnected.inc
49+
--append_file $MYSQLTEST_VARDIR/tmp/mysqld.1.expect
50+
restart:
51+
EOF
52+
--enable_reconnect
53+
--source include/wait_until_connected_again.inc
54+
--connection server_1
55+
--enable_reconnect
56+
--source include/wait_until_connected_again.inc
57+
CREATE TABLE t1 (a INT PRIMARY KEY);
58+
--save_master_pos
59+
60+
--connection server_2
61+
--source include/start_slave.inc
62+
--sync_with_master
63+
64+
# Re-spawn worker threads to clear dbug injection.
65+
--source include/stop_slave.inc
66+
SET GLOBAL debug_dbug=@old_dbug;
67+
SET GLOBAL slave_parallel_threads=0;
68+
SET GLOBAL slave_parallel_threads=8;
69+
--source include/start_slave.inc
70+
71+
72+
# Clean up.
73+
--connection server_2
74+
--source include/stop_slave.inc
75+
SET GLOBAL slave_parallel_threads=@old_parallel_threads;
76+
--source include/start_slave.inc
77+
78+
--connection server_1
79+
DROP TABLE `E`;
80+
DROP TABLE t1;
81+
--source include/rpl_end.inc

sql/mysqld.cc

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9498,6 +9498,7 @@ PSI_stage_info stage_waiting_for_work_from_sql_thread= { 0, "Waiting for work fr
94989498
PSI_stage_info stage_waiting_for_prior_transaction_to_commit= { 0, "Waiting for prior transaction to commit", 0};
94999499
PSI_stage_info stage_waiting_for_prior_transaction_to_start_commit= { 0, "Waiting for prior transaction to start commit before starting next transaction", 0};
95009500
PSI_stage_info stage_waiting_for_room_in_worker_thread= { 0, "Waiting for room in worker thread event queue", 0};
9501+
PSI_stage_info stage_waiting_for_workers_idle= { 0, "Waiting for worker threads to be idle", 0};
95019502
PSI_stage_info stage_master_gtid_wait_primary= { 0, "Waiting in MASTER_GTID_WAIT() (primary waiter)", 0};
95029503
PSI_stage_info stage_master_gtid_wait= { 0, "Waiting in MASTER_GTID_WAIT()", 0};
95039504
PSI_stage_info stage_gtid_wait_other_connection= { 0, "Waiting for other master connection to process GTID received on multiple master connections", 0};

sql/mysqld.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -452,6 +452,7 @@ extern PSI_stage_info stage_waiting_for_work_from_sql_thread;
452452
extern PSI_stage_info stage_waiting_for_prior_transaction_to_commit;
453453
extern PSI_stage_info stage_waiting_for_prior_transaction_to_start_commit;
454454
extern PSI_stage_info stage_waiting_for_room_in_worker_thread;
455+
extern PSI_stage_info stage_waiting_for_workers_idle;
455456
extern PSI_stage_info stage_master_gtid_wait_primary;
456457
extern PSI_stage_info stage_master_gtid_wait;
457458
extern PSI_stage_info stage_gtid_wait_other_connection;

sql/rpl_parallel.cc

Lines changed: 19 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -168,6 +168,8 @@ finish_event_group(rpl_parallel_thread *rpt, uint64 sub_id,
168168
done and also no longer need waiting for.
169169
*/
170170
entry->last_committed_sub_id= sub_id;
171+
if (entry->need_sub_id_signal)
172+
mysql_cond_broadcast(&entry->COND_parallel_entry);
171173

172174
/* Now free any GCOs in which all transactions have committed. */
173175
group_commit_orderer *tmp_gco= rgi->gco;
@@ -1894,26 +1896,29 @@ rpl_parallel::wait_for_workers_idle(THD *thd)
18941896
max_i= domain_hash.records;
18951897
for (i= 0; i < max_i; ++i)
18961898
{
1897-
bool active;
1898-
wait_for_commit my_orderer;
1899+
PSI_stage_info old_stage;
18991900
struct rpl_parallel_entry *e;
1901+
int err= 0;
19001902

19011903
e= (struct rpl_parallel_entry *)my_hash_element(&domain_hash, i);
19021904
mysql_mutex_lock(&e->LOCK_parallel_entry);
1903-
if ((active= (e->current_sub_id > e->last_committed_sub_id)))
1904-
{
1905-
wait_for_commit *waitee= &e->current_group_info->commit_orderer;
1906-
my_orderer.register_wait_for_prior_commit(waitee);
1907-
thd->wait_for_commit_ptr= &my_orderer;
1908-
}
1909-
mysql_mutex_unlock(&e->LOCK_parallel_entry);
1910-
if (active)
1905+
e->need_sub_id_signal= true;
1906+
thd->ENTER_COND(&e->COND_parallel_entry, &e->LOCK_parallel_entry,
1907+
&stage_waiting_for_workers_idle, &old_stage);
1908+
while (e->current_sub_id > e->last_committed_sub_id)
19111909
{
1912-
int err= my_orderer.wait_for_prior_commit(thd);
1913-
thd->wait_for_commit_ptr= NULL;
1914-
if (err)
1915-
return err;
1910+
if (thd->check_killed())
1911+
{
1912+
thd->send_kill_message();
1913+
err= 1;
1914+
break;
1915+
}
1916+
mysql_cond_wait(&e->COND_parallel_entry, &e->LOCK_parallel_entry);
19161917
}
1918+
e->need_sub_id_signal= false;
1919+
thd->EXIT_COND(&old_stage);
1920+
if (err)
1921+
return err;
19171922
}
19181923
return 0;
19191924
}

sql/rpl_parallel.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -227,6 +227,12 @@ struct rpl_parallel_entry {
227227
waiting for event groups to complete.
228228
*/
229229
bool force_abort;
230+
/*
231+
Set in wait_for_workers_idle() to show that it is waiting, so that
232+
finish_event_group knows to signal it when last_committed_sub_id is
233+
increased.
234+
*/
235+
bool need_sub_id_signal;
230236
/*
231237
At STOP SLAVE (force_abort=true), we do not want to process all events in
232238
the queue (which could unnecessarily delay stop, if a lot of events happen

sql/sql_class.cc

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6609,6 +6609,7 @@ wait_for_commit::wakeup_subsequent_commits2(int wakeup_error)
66096609
a mutex), so no extra explicit barrier is needed here.
66106610
*/
66116611
wakeup_subsequent_commits_running= false;
6612+
DBUG_EXECUTE_IF("inject_wakeup_subsequent_commits_sleep", my_sleep(21000););
66126613
}
66136614

66146615

0 commit comments

Comments
 (0)