Skip to content

Commit 66832b6

Browse files
author
Nirbhay Choubey
committed
MDEV-9598: Donor's rsync SST script hangs if FTWRL fails
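During SST, wsrep_sst_rsync waits for mysqld to create the "tables_flushed" file, which mysqld does only after it has successfully executed FTWRL (FLUSH TABLES WITH READ LOCK). As a result, the script would wait forever if FTWRL failed. Fixed by having mysqld create an "sst_error" file when the flush fails, which the script checks for while it waits and then aborts.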
1 parent 0251232 commit 66832b6

File tree

2 files changed

+104
-41
lines changed

2 files changed

+104
-41
lines changed

scripts/wsrep_sst_rsync.sh

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -127,7 +127,10 @@ then
127127
then
128128

129129
FLUSHED="$WSREP_SST_OPT_DATA/tables_flushed"
130+
ERROR="$WSREP_SST_OPT_DATA/sst_error"
131+
130132
rm -rf "$FLUSHED"
133+
rm -rf "$ERROR"
131134

132135
# Use deltaxfer only for WAN
133136
inv=$(basename $0)
@@ -137,10 +140,20 @@ then
137140
echo "flush tables"
138141

139142
# Wait for :
140-
# (a) tables to be flushed, and
141-
# (b) state ID & wsrep_gtid_domain_id to be written to the file.
143+
# (a) Tables to be flushed, AND
144+
# (b) Cluster state ID & wsrep_gtid_domain_id to be written to the file, OR
145+
# (c) ERROR file, in case flush tables operation failed.
146+
142147
while [ ! -r "$FLUSHED" ] && ! grep -q ':' "$FLUSHED" >/dev/null 2>&1
143148
do
149+
# Check whether ERROR file exists.
150+
if [ -f "$ERROR" ]
151+
then
152+
# Flush tables operation failed.
153+
rm -rf "$ERROR"
154+
exit 255
155+
fi
156+
144157
sleep 0.2
145158
done
146159

sql/wsrep_sst.cc

Lines changed: 89 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -896,6 +896,56 @@ static int sst_donate_mysqldump (const char* addr,
896896

897897
wsrep_seqno_t wsrep_locked_seqno= WSREP_SEQNO_UNDEFINED;
898898

899+
900+
/*
901+
Create a file under data directory.
902+
*/
903+
static int sst_create_file(const char *name, const char *content)
904+
{
905+
int err= 0;
906+
char *real_name;
907+
char *tmp_name;
908+
ssize_t len;
909+
FILE *file;
910+
911+
len= strlen(mysql_real_data_home) + strlen(name) + 2;
912+
real_name= (char *) alloca(len);
913+
914+
snprintf(real_name, (size_t) len, "%s/%s", mysql_real_data_home, name);
915+
916+
tmp_name= (char *) alloca(len + 4);
917+
snprintf(tmp_name, (size_t) len + 4, "%s.tmp", real_name);
918+
919+
file= fopen(tmp_name, "w+");
920+
921+
if (0 == file)
922+
{
923+
err= errno;
924+
WSREP_ERROR("Failed to open '%s': %d (%s)", tmp_name, err, strerror(err));
925+
}
926+
else
927+
{
928+
// Write the specified content into the file.
929+
if (content != NULL)
930+
{
931+
fprintf(file, "%s\n", content);
932+
fsync(fileno(file));
933+
}
934+
935+
fclose(file);
936+
937+
if (rename(tmp_name, real_name) == -1)
938+
{
939+
err= errno;
940+
WSREP_ERROR("Failed to rename '%s' to '%s': %d (%s)", tmp_name,
941+
real_name, err, strerror(err));
942+
}
943+
}
944+
945+
return err;
946+
}
947+
948+
899949
static int run_sql_command(THD *thd, const char *query)
900950
{
901951
thd->set_query((char *)query, strlen(query));
@@ -911,7 +961,7 @@ static int run_sql_command(THD *thd, const char *query)
911961
if (thd->is_error())
912962
{
913963
int const err= thd->get_stmt_da()->sql_errno();
914-
WSREP_WARN ("error executing '%s': %d (%s)%s",
964+
WSREP_WARN ("Error executing '%s': %d (%s)%s",
915965
query, err, thd->get_stmt_da()->message(),
916966
err == ER_UNKNOWN_SYSTEM_VARIABLE ?
917967
". Was mysqld built with --with-innodb-disallow-writes ?" : "");
@@ -921,15 +971,21 @@ static int run_sql_command(THD *thd, const char *query)
921971
return 0;
922972
}
923973

974+
924975
static int sst_flush_tables(THD* thd)
925976
{
926977
WSREP_INFO("Flushing tables for SST...");
927978

928979
int err;
929980
int not_used;
930-
CHARSET_INFO *current_charset;
981+
/*
982+
Files created to notify the SST script about the outcome of table flush
983+
operation.
984+
*/
985+
const char *flush_success= "tables_flushed";
986+
const char *flush_error= "sst_error";
931987

932-
current_charset = thd->variables.character_set_client;
988+
CHARSET_INFO *current_charset= thd->variables.character_set_client;
933989

934990
if (!is_supported_parser_charset(current_charset))
935991
{
@@ -942,61 +998,55 @@ static int sst_flush_tables(THD* thd)
942998

943999
if (run_sql_command(thd, "FLUSH TABLES WITH READ LOCK"))
9441000
{
945-
WSREP_ERROR("Failed to flush and lock tables");
946-
err = -1;
1001+
err= -1;
9471002
}
9481003
else
9491004
{
950-
/* make sure logs are flushed after global read lock acquired */
951-
err= reload_acl_and_cache(thd, REFRESH_ENGINE_LOG | REFRESH_BINARY_LOG,
952-
(TABLE_LIST*) 0, &not_used);
1005+
/*
1006+
Make sure logs are flushed after global read lock acquired. In case
1007+
reload fails, we must also release the acquired FTWRL.
1008+
*/
1009+
if (reload_acl_and_cache(thd, REFRESH_ENGINE_LOG | REFRESH_BINARY_LOG,
1010+
(TABLE_LIST*) 0, &not_used))
1011+
{
1012+
thd->global_read_lock.unlock_global_read_lock(thd);
1013+
err= -1;
1014+
}
9531015
}
9541016

9551017
thd->variables.character_set_client = current_charset;
9561018

957-
9581019
if (err)
9591020
{
960-
WSREP_ERROR("Failed to flush tables: %d (%s)", err, strerror(err));
1021+
WSREP_ERROR("Failed to flush and lock tables");
1022+
1023+
/*
1024+
The SST must be aborted as the flush tables failed. Notify this to SST
1025+
script by creating the error file.
1026+
*/
1027+
int tmp;
1028+
if ((tmp= sst_create_file(flush_error, NULL))) {
1029+
err= tmp;
1030+
}
9611031
}
9621032
else
9631033
{
9641034
WSREP_INFO("Tables flushed.");
965-
const char base_name[]= "tables_flushed";
966-
967-
ssize_t const full_len= strlen(mysql_real_data_home) + strlen(base_name)+2;
968-
char *real_name= (char *) alloca(full_len);
969-
snprintf(real_name, (size_t) full_len, "%s/%s", mysql_real_data_home,
970-
base_name);
971-
char *tmp_name= (char *) alloca(full_len + 4);
972-
snprintf(tmp_name, (size_t) full_len + 4, "%s.tmp", real_name);
9731035

974-
FILE* file= fopen(tmp_name, "w+");
975-
if (0 == file)
976-
{
977-
err= errno;
978-
WSREP_ERROR("Failed to open '%s': %d (%s)", tmp_name, err,strerror(err));
979-
}
980-
else
981-
{
982-
// Write cluster state ID and wsrep_gtid_domain_id.
983-
fprintf(file, "%s:%lld %d\n",
984-
wsrep_cluster_state_uuid, (long long)wsrep_locked_seqno,
985-
wsrep_gtid_domain_id);
986-
fsync(fileno(file));
987-
fclose(file);
988-
if (rename(tmp_name, real_name) == -1)
989-
{
990-
err= errno;
991-
WSREP_ERROR("Failed to rename '%s' to '%s': %d (%s)",
992-
tmp_name, real_name, err,strerror(err));
993-
}
994-
}
1036+
/*
1037+
Tables have been flushed. Create a file with cluster state ID and
1038+
wsrep_gtid_domain_id.
1039+
*/
1040+
char content[100];
1041+
snprintf(content, sizeof(content), "%s:%lld %d\n", wsrep_cluster_state_uuid,
1042+
(long long)wsrep_locked_seqno, wsrep_gtid_domain_id);
1043+
err= sst_create_file(flush_success, content);
9951044
}
9961045

9971046
return err;
9981047
}
9991048

1049+
10001050
static void sst_disallow_writes (THD* thd, bool yes)
10011051
{
10021052
char query_str[64] = { 0, };

0 commit comments

Comments
 (0)