Skip to content

Commit

Permalink
MDEV-30260: Slave crashed:reload_acl_and_cache during shutdown
Browse files Browse the repository at this point in the history
The signal handler thread can use various different runtime
resources when processing a SIGHUP (e.g. master-info information)
due to calling into reload_acl_and_cache(). Currently, the shutdown
process waits for the termination of the signal thread after
performing cleanup. However, this could cause resources actively
used by the signal handler to be freed while reload_acl_and_cache()
is processing.

The specific resource that caused MDEV-30260 is a race condition for
the hostname_cache, such that mysqld would delete it in
clean_up()::hostname_cache_free(), before the signal handler would
use it in reload_acl_and_cache()::hostname_cache_refresh().

Another similar resource is the active_mi/master_info_index. There
was a race between its deletion by the main thread in end_slave(),
and their usage by the Signal Handler as a part of
Master_info_index::flush_all_relay_logs.read(active_mi) in
reload_acl_and_cache().

This patch fixes these race conditions by relocating where server
shutdown waits for the signal handler to die until after
server-level threads have been killed (i.e., as a last step of
close_connections()). With respect to the hostname_cache, active_mi
and master_info_cache, this ensures that they cannot be destroyed
while the signal handler is still active, and potentially using
them.

Additionally:

 1) This requires that Events memory is still in place for SIGHUP
handling's mysql_print_status(). So event deinitialization is moved
into clean_up(), but the event scheduler still needs to be stopped
in close_connections() at the same spot.

 2) The function kill_server_thread is no longer used, so it is
deleted

 3) The timeout to wait for the death of the signal thread was not
consistent with the comment. The comment mentioned up to 10 seconds,
whereas it was actually 0.01s. The code has been fixed to wait up to
10 seconds.

 4) A warning has been added if the signal handler thread fails to
exit in time.

 5) Added pthread_join() to end of wait_for_signal_thread_to_end()
if it hadn't ended in 10s with a warning. Note this also removes
the pthread_detached attribute from the signal_thread to allow
for the pthread_join().

Reviewed By:
===========
Vladislav Vaintroub <wlad@mariadb.com>
Andrei Elkin <andrei.elkin@mariadb.com>
  • Loading branch information
bnestere committed Apr 9, 2024
1 parent 4980fcb commit 952ab9a
Show file tree
Hide file tree
Showing 4 changed files with 259 additions and 35 deletions.
50 changes: 50 additions & 0 deletions mysql-test/suite/rpl/r/rpl_shutdown_sighup.result
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
include/master-slave.inc
[connection master]
connection slave;
set statement sql_log_bin=0 for call mtr.add_suppression("Signal handler thread did not exit in a timely manner");
#
# Main test
connection master;
create table t1 (a int);
insert into t1 values (1);
include/save_master_gtid.inc
connection slave;
include/sync_with_master_gtid.inc
set @@global.debug_dbug= "+d,hold_sighup_log_refresh";
# Waiting for sighup to reach reload_acl_and_cache..
set debug_sync="now wait_for in_reload_acl_and_cache";
# Signalling signal handler to proceed to sleep before REFRESH_HOSTS
set debug_sync="now signal refresh_logs";
# Starting shutdown (note this will take 3+ seconds due to DBUG my_sleep in reload_acl_and_cache)
shutdown;
connection server_2;
connection slave;
include/assert_grep.inc [Ensure Mariadbd did not segfault when shutting down]
connection master;
connection slave;
#
# Error testcase to ensure an error message is shown if the signal
# takes longer than the timeout while processing the SIGHUP
connection slave;
set @@global.debug_dbug= "+d,force_sighup_processing_timeout";
set @@global.debug_dbug= "+d,hold_sighup_log_refresh";
connection master;
insert into t1 values (1);
include/save_master_gtid.inc
connection slave;
include/sync_with_master_gtid.inc
# Waiting for sighup to reach reload_acl_and_cache..
set debug_sync="now wait_for in_reload_acl_and_cache";
# Signalling signal handler to proceed to sleep before REFRESH_HOSTS
set debug_sync="now signal refresh_logs";
# Starting shutdown (note this will take 3+ seconds due to DBUG my_sleep in reload_acl_and_cache)
shutdown;
connection server_2;
connection slave;
include/assert_grep.inc [Ensure warning is issued that signal handler thread is still processing]
#
# Cleanup
connection master;
drop table t1;
include/rpl_end.inc
# End of rpl_shutdown_sighup.test
154 changes: 154 additions & 0 deletions mysql-test/suite/rpl/t/rpl_shutdown_sighup.test
Original file line number Diff line number Diff line change
@@ -0,0 +1,154 @@
#
# The signal handler thread can use various different runtime resources when
# processing a SIGHUP (e.g. master-info information), as the logic calls into
# reload_acl_and_cache(). This test ensures that SIGHUP processing, when
# concurrent with server shutdown, the shutdown logic must wait for the SIGHUP
# processing to finish before cleaning up any resources.
#
# Additionally, the error case is tested such that the signal handler thread
# takes too long processing a SIGHUP, and the main mysqld thread must skip its
# wait and output a warning.
#
# Note the SIGHUP is sent via the command-line kill program via a perl script.
#
# References:
# MDEV-30260: Slave crashed:reload_acl_and_cache during shutdown
#

--source include/not_windows.inc
--source include/not_embedded.inc
--source include/have_debug.inc
--source include/have_debug_sync.inc

# Binlog format doesn't matter
--source include/have_binlog_format_statement.inc
--source include/master-slave.inc

# For error test case which forces timeout
--connection slave
set statement sql_log_bin=0 for call mtr.add_suppression("Signal handler thread did not exit in a timely manner");


--echo #
--echo # Main test
--connection master
create table t1 (a int);
insert into t1 values (1);
--source include/save_master_gtid.inc

--connection slave
--source include/sync_with_master_gtid.inc

# Make signal handler handle SIGHUP..
set @@global.debug_dbug= "+d,hold_sighup_log_refresh";
--let KILL_NODE_PIDFILE = `SELECT @@pid_file`
--perl
my $kill_sig = $ENV{'KILL_SIGNAL_VALUE'};
my $pid_filename = $ENV{'KILL_NODE_PIDFILE'};
my $mysqld_pid = `cat $pid_filename`;
chomp($mysqld_pid);
system("kill -HUP $mysqld_pid");
exit(0);
EOF

--echo # Waiting for sighup to reach reload_acl_and_cache..
set debug_sync="now wait_for in_reload_acl_and_cache";
--echo # Signalling signal handler to proceed to sleep before REFRESH_HOSTS
set debug_sync="now signal refresh_logs";

# ..while we are shutting down
--write_file $MYSQLTEST_VARDIR/tmp/mysqld.2.expect
wait
EOF
--echo # Starting shutdown (note this will take 3+ seconds due to DBUG my_sleep in reload_acl_and_cache)
shutdown;

--source include/wait_until_disconnected.inc
--append_file $MYSQLTEST_VARDIR/tmp/mysqld.2.expect
restart: --skip-slave-start=0
EOF

--connection server_2
--enable_reconnect
--source include/wait_until_connected_again.inc

--connection slave
--enable_reconnect
--source include/wait_until_connected_again.inc

--let $assert_text= Ensure Mariadbd did not segfault when shutting down
--let $assert_select= got signal 11
--let $assert_file= $MYSQLTEST_VARDIR/log/mysqld.2.err
--let $assert_count= 0
--let $assert_only_after = CURRENT_TEST: rpl.rpl_shutdown_sighup
--source include/assert_grep.inc

--connection master
--sync_slave_with_master


--echo #
--echo # Error testcase to ensure an error message is shown if the signal
--echo # takes longer than the timeout while processing the SIGHUP

--connection slave
set @@global.debug_dbug= "+d,force_sighup_processing_timeout";
set @@global.debug_dbug= "+d,hold_sighup_log_refresh";

--connection master
insert into t1 values (1);
--source include/save_master_gtid.inc

--connection slave
--source include/sync_with_master_gtid.inc

# Make signal handler handle SIGHUP..
--let KILL_NODE_PIDFILE = `SELECT @@pid_file`
--perl
my $kill_sig = $ENV{'KILL_SIGNAL_VALUE'};
my $pid_filename = $ENV{'KILL_NODE_PIDFILE'};
my $mysqld_pid = `cat $pid_filename`;
chomp($mysqld_pid);
system("kill -HUP $mysqld_pid");
exit(0);
EOF
--echo # Waiting for sighup to reach reload_acl_and_cache..
set debug_sync="now wait_for in_reload_acl_and_cache";
--echo # Signalling signal handler to proceed to sleep before REFRESH_HOSTS
set debug_sync="now signal refresh_logs";

# ..while we are shutting down
--write_file $MYSQLTEST_VARDIR/tmp/mysqld.2.expect
wait
EOF
--echo # Starting shutdown (note this will take 3+ seconds due to DBUG my_sleep in reload_acl_and_cache)
shutdown;

--source include/wait_until_disconnected.inc
--append_file $MYSQLTEST_VARDIR/tmp/mysqld.2.expect
restart: --skip-slave-start=0
EOF

--connection server_2
--enable_reconnect
--source include/wait_until_connected_again.inc

--connection slave
--enable_reconnect
--source include/wait_until_connected_again.inc

--let $assert_text= Ensure warning is issued that signal handler thread is still processing
--let $assert_select= Signal handler thread did not exit in a timely manner.
--let $assert_file= $MYSQLTEST_VARDIR/log/mysqld.2.err
--let $assert_count= 1
--let $assert_only_after = CURRENT_TEST: rpl.rpl_shutdown_sighup
--source include/assert_grep.inc


--echo #
--echo # Cleanup
--connection master
drop table t1;

--source include/rpl_end.inc
--echo # End of rpl_shutdown_sighup.test
67 changes: 32 additions & 35 deletions sql/mysqld.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1766,7 +1766,8 @@ static void close_connections(void)
DBUG_EXECUTE_IF("delay_shutdown_phase_2_after_semisync_wait",
my_sleep(500000););

Events::deinit();
if (Events::inited)
Events::stop();
slave_prepare_for_shutdown();
ack_receiver.stop();

Expand Down Expand Up @@ -1827,6 +1828,12 @@ static void close_connections(void)
}
/* End of kill phase 2 */

/*
The signal thread can use server resources, e.g. when processing SIGHUP,
and it must end gracefully before clean_up()
*/
wait_for_signal_thread_to_end();

DBUG_PRINT("quit",("close_connections thread"));
DBUG_VOID_RETURN;
}
Expand Down Expand Up @@ -1930,14 +1937,8 @@ extern "C" void unireg_abort(int exit_code)
static void mysqld_exit(int exit_code)
{
DBUG_ENTER("mysqld_exit");
/*
Important note: we wait for the signal thread to end,
but if a kill -15 signal was sent, the signal thread did
spawn the kill_server_thread thread, which is running concurrently.
*/
rpl_deinit_gtid_waiting();
rpl_deinit_gtid_slave_state();
wait_for_signal_thread_to_end();
#ifdef WITH_WSREP
wsrep_deinit_server();
wsrep_sst_auth_free();
Expand Down Expand Up @@ -2016,6 +2017,9 @@ static void clean_up(bool print_message)
free_status_vars();
end_thr_alarm(1); /* Free allocated memory */
end_thr_timer();
#ifndef EMBEDDED_LIBRARY
Events::deinit();
#endif
my_free_open_file_info();
if (defaults_argv)
free_defaults(defaults_argv);
Expand Down Expand Up @@ -2083,16 +2087,32 @@ static void clean_up(bool print_message)
*/
static void wait_for_signal_thread_to_end()
{
uint i;
uint i, n_waits= DBUG_EVALUATE("force_sighup_processing_timeout", 5, 100);
int err= 0;
/*
Wait up to 10 seconds for signal thread to die. We use this mainly to
avoid getting warnings that my_thread_end has not been called
*/
for (i= 0 ; i < 100 && signal_thread_in_use; i++)
for (i= 0 ; i < n_waits && signal_thread_in_use; i++)
{
if (pthread_kill(signal_thread, MYSQL_KILL_SIGNAL) == ESRCH)
err= pthread_kill(signal_thread, MYSQL_KILL_SIGNAL);
if (err)
break;
my_sleep(100); // Give it time to die
my_sleep(100000); // Give it time to die, .1s per iteration
}

if (err && err != ESRCH)
{
sql_print_error("Failed to send kill signal to signal handler thread, "
"pthread_kill() errno: %d",
err);
}

if (i == n_waits && signal_thread_in_use)
{
sql_print_warning("Signal handler thread did not exit in a timely manner. "
"Continuing to wait for it to stop..");
pthread_join(signal_thread, NULL);
}
}
#endif /*EMBEDDED_LIBRARY*/
Expand Down Expand Up @@ -2916,7 +2936,6 @@ static void start_signal_handler(void)

(void) pthread_attr_init(&thr_attr);
pthread_attr_setscope(&thr_attr,PTHREAD_SCOPE_SYSTEM);
(void) pthread_attr_setdetachstate(&thr_attr,PTHREAD_CREATE_DETACHED);
(void) my_setstacksize(&thr_attr,my_thread_stack_size);

mysql_mutex_lock(&LOCK_start_thread);
Expand All @@ -2936,18 +2955,6 @@ static void start_signal_handler(void)
}


#if defined(USE_ONE_SIGNAL_HAND)
pthread_handler_t kill_server_thread(void *arg __attribute__((unused)))
{
my_thread_init(); // Initialize new thread
break_connect_loop();
my_thread_end();
pthread_exit(0);
return 0;
}
#endif


/** This threads handles all signals and alarms. */
/* ARGSUSED */
pthread_handler_t signal_hand(void *arg __attribute__((unused)))
Expand Down Expand Up @@ -3004,7 +3011,7 @@ pthread_handler_t signal_hand(void *arg __attribute__((unused)))
int origin;

while ((error= my_sigwait(&set, &sig, &origin)) == EINTR) /* no-op */;
if (cleanup_done)
if (abort_loop)
{
DBUG_PRINT("quit",("signal_handler: calling my_thread_end()"));
my_thread_end();
Expand All @@ -3027,18 +3034,8 @@ pthread_handler_t signal_hand(void *arg __attribute__((unused)))
{
/* Delete the instrumentation for the signal thread */
PSI_CALL_delete_current_thread();
#ifdef USE_ONE_SIGNAL_HAND
pthread_t tmp;
if (unlikely((error= mysql_thread_create(0, /* Not instrumented */
&tmp, &connection_attrib,
kill_server_thread,
(void*) &sig))))
sql_print_error("Can't create thread to kill server (errno= %d)",
error);
#else
my_sigset(sig, SIG_IGN);
break_connect_loop(); // MIT THREAD has a alarm thread
#endif
}
break;
case SIGHUP:
Expand Down
23 changes: 23 additions & 0 deletions sql/sql_reload.cc
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,15 @@ bool reload_acl_and_cache(THD *thd, unsigned long long options,
bool result=0;
select_errors=0; /* Write if more errors */
int tmp_write_to_binlog= *write_to_binlog= 1;
#ifndef DBUG_OFF
/*
When invoked for handling a SIGHUP by rpl_shutdown_sighup.test, we need to
force the signal handler to wait after REFRESH_TABLES, as that will check
for a killed server, and we need to call hostname_cache_refresh after
server cleanup has happened to trigger MDEV-30260.
*/
int do_dbug_sleep= 0;
#endif

DBUG_ASSERT(!thd || !thd->in_sub_stmt);

Expand Down Expand Up @@ -99,6 +108,15 @@ bool reload_acl_and_cache(THD *thd, unsigned long long options,
*/
my_error(ER_UNKNOWN_ERROR, MYF(0));
}

#ifndef DBUG_OFF
DBUG_EXECUTE_IF("hold_sighup_log_refresh", {
DBUG_ASSERT(!debug_sync_set_action(
thd, STRING_WITH_LEN("now SIGNAL in_reload_acl_and_cache "
"WAIT_FOR refresh_logs")));
do_dbug_sleep= 1;
});
#endif
}
opt_noacl= 0;

Expand Down Expand Up @@ -351,6 +369,11 @@ bool reload_acl_and_cache(THD *thd, unsigned long long options,
}
my_dbopt_cleanup();
}

#ifndef DBUG_OFF
if (do_dbug_sleep)
my_sleep(3000000); // 3s
#endif
if (options & REFRESH_HOSTS)
hostname_cache_refresh();
if (thd && (options & REFRESH_STATUS))
Expand Down

0 comments on commit 952ab9a

Please sign in to comment.