Skip to content

Commit

Permalink
MDEV-25880: rsync may be mistakenly killed when overlapping SST
Browse files Browse the repository at this point in the history
This commit fixes a bug that was originally discovered during the
galera_nbo_sst_slave mtr test for the 10.6 branch. However, it is
relevant for all versions and can lead to intermittent SST
crashes via rsync on very fast server restarts - when a new
SST process (for example, after starting a new server instance)
overlaps the old SST process started by the previous, already
terminated server. This overlap can result in the new rsync
being killed instead of the old rsync, or the pid file of
the new rsync being removed, which then leads to problems.
  • Loading branch information
sysprg committed Jun 15, 2021
1 parent 1c35a3f commit 18d5be5
Show file tree
Hide file tree
Showing 2 changed files with 25 additions and 5 deletions.
2 changes: 1 addition & 1 deletion scripts/wsrep_sst_common.sh
Original file line number Diff line number Diff line change
Expand Up @@ -1190,7 +1190,6 @@ trim_string()
check_pid()
{
local pid_file="$1"
local remove=${2:-0}
if [ -r "$pid_file" ]; then
local pid=$(cat "$pid_file" 2>/dev/null)
if [ -n "$pid" ]; then
Expand All @@ -1201,6 +1200,7 @@ check_pid()
fi
fi
fi
local remove=${2:-0}
if [ $remove -eq 1 ]; then
rm -f "$pid_file"
fi
Expand Down
28 changes: 24 additions & 4 deletions scripts/wsrep_sst_rsync.sh
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,8 @@ cleanup_joiner()
if [ "$WSREP_SST_OPT_ROLE" = 'joiner' ]; then
wsrep_cleanup_progress_file
fi

[ -f "$SST_PID" ] && rm -f "$SST_PID"
}

check_pid_and_port()
Expand Down Expand Up @@ -281,6 +283,7 @@ then
*)
wsrep_log_error "Unrecognized ssl-mode option: '$SSLMODE'"
exit 22 # EINVAL
;;
esac
if [ -z "$CAFILE_OPT" ]; then
wsrep_log_error "Can't have ssl-mode='$SSLMODE' without CA file"
Expand Down Expand Up @@ -499,6 +502,21 @@ elif [ "$WSREP_SST_OPT_ROLE" = 'joiner' ]
then
check_sockets_utils

SST_PID="$WSREP_SST_OPT_DATA/wsrep_rsync_sst.pid"

# give some time for the SST script from a previous SST to complete
check_round=0
while check_pid "$SST_PID" 0
do
wsrep_log_info "previous SST not completed, waiting for it to exit"
check_round=$(( check_round + 1 ))
if [ $check_round -eq 10 ]; then
wsrep_log_error "SST script already running."
exit 114 # EALREADY
fi
sleep 1
done

# give some time for lingering stunnel from previous SST to complete
check_round=0
while check_pid "$STUNNEL_PID" 1
Expand Down Expand Up @@ -583,12 +601,14 @@ EOF
RSYNC_ADDR="*"
fi

echo $$ > "$SST_PID"

if [ -z "$STUNNEL" ]
then
rsync --daemon --no-detach --port "$RSYNC_PORT" --config "$RSYNC_CONF" $RSYNC_EXTRA_ARGS &
RSYNC_REAL_PID=$!
TRANSFER_REAL_PID="$RSYNC_REAL_PID"
TRANSFER_PID=$RSYNC_PID
TRANSFER_REAL_PID=$RSYNC_REAL_PID
TRANSFER_PID="$RSYNC_PID"
else
# Let's check if the path to the config file contains a space?
if [ "${RSYNC_CONF#* }" = "$RSYNC_CONF" ]; then
Expand Down Expand Up @@ -631,8 +651,8 @@ EOF
fi
stunnel "$STUNNEL_CONF" &
STUNNEL_REAL_PID=$!
TRANSFER_REAL_PID="$STUNNEL_REAL_PID"
TRANSFER_PID=$STUNNEL_PID
TRANSFER_REAL_PID=$STUNNEL_REAL_PID
TRANSFER_PID="$STUNNEL_PID"
fi

if [ "${SSLMODE#VERIFY}" != "$SSLMODE" ]
Expand Down

0 comments on commit 18d5be5

Please sign in to comment.