forked from hpcc-systems/HPCC-Platform
/
init_thor.in
executable file
·210 lines (186 loc) · 7.73 KB
/
init_thor.in
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
#!/bin/bash
################################################################################
# HPCC SYSTEMS software Copyright (C) 2012 HPCC Systems®.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
################################################################################
###<REPLACE>###
# ---------------------------------------------------------------------------
# Start-up: load the environment, derive per-component names and prepare the
# instance directory (the current working directory) for the restart loop at
# the bottom of this script.
# NOTE(review): INSTALL_DIR is not assigned anywhere in the visible script;
# presumably it is injected where the ###<REPLACE>### marker above is
# substituted when this template is processed - confirm.
# ---------------------------------------------------------------------------
source ${INSTALL_DIR}/sbin/hpcc_setenv
# Directory of this script as resolved by 'type -path'; if that yields nothing
# the value ends up empty and is replaced by the working directory further down.
deploydir=$(dirname $(type -path $0))
# Shared init helpers. NOTE(review): presumably this is where log, kill_process
# and unlock (used below) are defined - confirm.
source ${INSTALL_DIR}/etc/init.d/hpcc_common
# The component name is the name of the directory the script is run from.
component=$(basename $PWD)
PID_NAME="${PID_DIR}/${component}.pid"
INIT_PID_NAME="${PID_DIR}/init_${component}.pid"
# One timestamp is shared by both log file names below, so the logs written
# for a single run of this script can be matched up.
timestamp="$(date +%Y_%m_%d_%H_%M_%S)"
# Exported; presumably read by log() in hpcc_common - TODO confirm.
export logfile="${LOG_DIR}/${component}/init_${component}_${timestamp}.log"
# Log file name handed to every init_thorslave invocation as its last argument.
logredirect="init_thorslave_${component}_${timestamp}.log"
log "Starting ${component}"
log "removing any previous sentinel file"
# Sentinel file name, relative to the working directory. The restart loop at
# the end of this script stops as soon as this file does not exist.
export SENTINEL="thor.sentinel"
rm -f ${SENTINEL}
# Fall back to the (physical) working directory when the script location
# could not be determined above.
if [[ -z "$deploydir" ]]; then
deploydir=$(pwd -P)
fi
# The instance directory is the physical path of the working directory.
instancedir=$(pwd -P)
# Per-instance settings. The lines below need at least channelsperslave,
# localthorportinc and THORNAME from it; presumably it also provides the
# THOR*/SSH*/localthor values used later in this script - TODO confirm.
source $instancedir/setvars
# Port increment handed to init_thorslave: channels per slave times the
# per-channel port increment.
slaveportinc=$(($channelsperslave * $localthorportinc))
# (Re)create thormaster_<THORNAME> in the working directory as a symlink to
# the deployed thormaster_lcr; the restart loop starts the master through it.
ln -s -f $deploydir/thormaster_lcr thormaster_$THORNAME
# Value of the "configs=" entry found between the [DEFAULT] header and the
# next line containing "[" in ${HPCC_CONFIG}.
# NOTE(review): ENV_DIR is not referenced again in this script and is not
# exported - confirm whether it is still needed.
ENV_DIR=$(cat ${HPCC_CONFIG} | sed -n "/\[DEFAULT\]/,/\[/p" | grep "^configs=" | sed -e 's/^configs=//')
#######################################
# Look up a value in a list of words.
# Arguments: all but the last argument are the candidates;
#            the last argument is the value to search for.
# Outputs:   the 1-based position of the first exact match, or "0" if none.
# Returns:   0 when a match was found, 1 otherwise.
#######################################
contains() {
    local needle_pos=$#
    # Indirect expansion: fetch the positional parameter numbered needle_pos.
    local needle=${!needle_pos}
    local idx=1

    while (( idx < needle_pos )); do
        # The right-hand side is quoted, so this is a literal comparison,
        # not a glob match.
        if [[ "${!idx}" == "${needle}" ]]; then
            echo "${idx}"
            return 0
        fi
        idx=$(( idx + 1 ))
    done

    echo "0"
    return 1
}
#######################################
# Stop the thorslave processes belonging to this component.
# Globals:   reads localthor, deploydir, instancedir, the slave/master
#            settings passed on to init_thorslave and the SSH* settings;
#            writes clusternodes and FRUNSSH_RC.
# Behaviour: with localthor=true, runs init_thorslave stop directly.
#            Otherwise runs it through frunssh against the hosts in
#            $instancedir/uslaves, if that file is readable. When frunssh
#            reports an error the master is stopped, the lock released and
#            the script exits with 255.
# Finally removes $instancedir/uslaves.
#######################################
kill_slaves() {
    log "Killing slaves"

    # NB: many of the parameters to init_thorslave not used by 'stop' command
    if [[ "$localthor" != "true" ]]; then
        # we want to kill only slaves that have already been started in run_thor
        if [[ -r $instancedir/uslaves ]]; then
            clusternodes=$(cat $instancedir/uslaves 2> /dev/null | wc -l)
            $deploydir/frunssh $instancedir/uslaves \
                "/bin/sh -c '$deploydir/init_thorslave stop localhost $slavespernode $THORSLAVEPORT $slaveportinc $THORMASTER $THORMASTERPORT $LOG_DIR $instancedir $deploydir $THORNAME ${INSTALL_DIR}/sbin $logredirect'" \
                -i:$SSHidentityfile -u:$SSHusername -pe:$SSHpassword \
                -t:$SSHtimeout -a:$SSHretries -n:$clusternodes 2>&1
            FRUNSSH_RC=$?
            if [[ ${FRUNSSH_RC} -eq 0 ]]; then
                log "Frunssh successful"
            else
                log "Error ${FRUNSSH_RC} in frunssh"
                log "Please check ${LOG_DIR}/frunssh for more details"
                # frunssh failed: stop the master, release the lock and bail out
                log "Stopping ${component}"
                kill_process ${PID_NAME} thormaster_${component} 25
                unlock ${LOCK_DIR}/$component/${component}.lock
                rm -f $INIT_PID_NAME $instancedir/slaves &> /dev/null
                exit 255
            fi
        fi
    else
        $deploydir/init_thorslave stop localhost $slavespernode $THORSLAVEPORT $slaveportinc \
            $THORMASTER $THORMASTERPORT $LOG_DIR $instancedir $deploydir $THORNAME \
            ${INSTALL_DIR}/sbin $logredirect
    fi

    rm -f $instancedir/uslaves &> /dev/null
}
#######################################
# Full shutdown of this component; never returns.
# Installed as the SIGINT/SIGTERM handler and also called directly by the
# restart loop on unrecoverable conditions.
# Order: stop the master, release the component lock, stop the slaves,
# remove the init pid file and the slaves file, then exit with 255.
#######################################
killed() {
    log "Stopping ${component}"
    kill_process ${PID_NAME} thormaster_${component} 25
    log "$component Stopped"

    unlock ${LOCK_DIR}/$component/${component}.lock
    kill_slaves

    log "removing init.pid file and slaves file"
    rm -f $INIT_PID_NAME $instancedir/slaves &> /dev/null

    exit 255
}
# On SIGINT/SIGTERM run the full shutdown sequence (killed exits the script).
trap 'killed' SIGINT SIGTERM

log "Ensuring a clean working environment ..."
kill_slaves

# Supervisor loop: refresh the slave list, start the slaves, start thormaster
# and wait for it; depending on its exit code either stop or go round again.
# The loop also ends once the sentinel file is gone.
thorpid=0
while true; do
    # update slaves file in case state of environment has been altered since last run
    errorMessage=$( daliadmin server=$DALISERVER clusternodes ${component} $instancedir/slaves 2>&1 )
    errcode=$?
    if [[ ${errcode} -ne 0 ]]; then
        log "failed to lookup dali group for ${component}"
        log "$errorMessage"
        exit 1
    fi

    log "--------------------------"
    log "starting thorslaves ..."

    # Would be simpler, if there was simple way to test if ip is local and get rid of 'localthor' setting
    sort $instancedir/slaves | uniq > $instancedir/uslaves
    if [[ "$localthor" != "true" ]]; then
        # Start the slaves on every distinct host through frunssh.
        clusternodes=$(cat $instancedir/uslaves | wc -l)
        $deploydir/frunssh $instancedir/uslaves \
            "/bin/sh -c '$deploydir/init_thorslave start %a $slavespernode $THORSLAVEPORT $slaveportinc $THORMASTER $THORMASTERPORT $LOG_DIR $instancedir $deploydir $THORNAME ${INSTALL_DIR}/sbin $logredirect'" \
            -i:$SSHidentityfile -u:$SSHusername -pe:$SSHpassword \
            -t:$SSHtimeout -a:$SSHretries -n:$clusternodes 2>&1
        FRUNSSH_RC=$?
        if [[ ${FRUNSSH_RC} -gt 0 ]]; then
            log "Error ${FRUNSSH_RC} in frunssh"
            log "Please check ${LOG_DIR}/frunssh for more details"
            # clean up any slaves it was able to reach
            killed
        fi
    else
        # Local thor: start the slaves directly, using the first listed address.
        slaveip=$(head -n 1 $instancedir/uslaves)
        $deploydir/init_thorslave start $slaveip $slavespernode $THORSLAVEPORT $slaveportinc \
            $THORMASTER $THORMASTERPORT $LOG_DIR $instancedir $deploydir $THORNAME \
            ${INSTALL_DIR}/sbin $logredirect
    fi

    log "thormaster cmd : $instancedir/thormaster_$THORNAME MASTER=$THORMASTER:$THORMASTERPORT"
    nohup $instancedir/thormaster_$THORNAME MASTER=$THORMASTER:$THORMASTERPORT 2> /dev/null 1>/dev/null &
    thorpid=$!

    if [[ "$thorpid" -eq "0" ]]; then
        log "failed to start thormaster_lcr, pausing for 30 seconds"
        sleep 30
        kill_slaves
    else
        log "thormaster_lcr process started pid = $thorpid"
        echo $thorpid > $PID_NAME

        # Block until the master exits, then act on its exit code.
        wait $thorpid
        errcode=$?
        case ${errcode} in
            0)  # TEC_Clean
                log "Thormaster ($thorpid) Exited cleanly"
                rm -f $instancedir/slaves $instancedir/uslaves $PID_NAME $INIT_PID_NAME &> /dev/null
                exit 0
                ;;
            1)  # TEC_CtrlC
                log "Thormaster ($thorpid) Interrupted, Ctrl-C caught"
                killed
                ;;
            2)  # TEC_Idle
                log "Thormaster ($thorpid) Idle"
                kill_slaves
                ;;
            4)  # TEC_SlaveInit
                log "Thormaster ($thorpid) Slaves failed to initialize"
                log "Shutting down"
                killed
                ;;
            *)  # TEC_Swap=5, TEC_Watchdog=3, TEC_DaliDown=6
                case ${errcode} in
                    5) log "Thormaster ($thorpid) Swap node required" ;;
                    3) log "Thormaster ($thorpid) Lost connection to slave(s)" ;;
                    6) log "Thormaster ($thorpid) Unable to connect to Dali" ;;
                    *) log "Thormaster ($thorpid) Unknown error code: [$errcode]" ;;
                esac

                log "Stopping thorslave(s) for restart"
                kill_slaves

                if [[ "$autoSwapNode" == "false" ]]; then
                    log "Autoswap set to false. Restarting"
                else
                    log "Running autoswap $THORNAME :: ($thorpid)"
                    swapnode auto $DALISERVER $component
                    errcode=$?
                    if [[ ${errcode} -ne 0 ]]; then
                        log "Auto swap node failed, errcode=${errcode}"
                        killed
                    fi
                fi
                # restarting thormaster
                ;;
        esac
    fi

    # Without the sentinel file there is no restart.
    if [[ ! -e $SENTINEL ]]; then
        log "$SENTINEL has been removed or thormaster did not fully start - script stopping"
        exit 0
    fi
done