Skip to content

Commit

Permalink
Watch node object for podCIDR instead of crash + wait for restart
Browse files Browse the repository at this point in the history
This can save up to 10 seconds before Kubelet decides to restart
the pod after the crash.
  • Loading branch information
jingyuanliang committed May 24, 2024
1 parent f19530f commit c7585b0
Show file tree
Hide file tree
Showing 32 changed files with 383 additions and 108 deletions.
75 changes: 49 additions & 26 deletions scripts/install-cni.sh
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,11 @@ log() {
echo "$@"
}

# Print the given message on stderr, prefixed with "FATAL:", and terminate the
# script with exit status 1.
# Arguments: message words, joined by single spaces in the output.
fatal() {
  {
    echo 'FATAL:' "$@"
  } 1>&2
  exit 1
}

# shellcheck disable=SC2317,SC2329 # when called with $1=calico_ready
calico_ready() {
log "Listing items matching /host/etc/cni/net.d/*calico*.conflist"
Expand Down Expand Up @@ -110,21 +115,41 @@ else
cni_spec=${cni_spec//@cniBandwidthPlugin/}
fi

token=$(</var/run/secrets/kubernetes.io/serviceaccount/token)
host=${KUBERNETES_SERVICE_HOST}
# If host contains a colon (:), it is an IPv6 address, hence needs wrapping
# with [..].
if [[ "${host}" =~ : ]]; then
host="[$host]"
fi
node_url="https://$host:${KUBERNETES_SERVICE_PORT}/api/v1/nodes/${HOSTNAME}"
response=$(curl -k -s -H "Authorization: Bearer $token" "$node_url")
# Watches this node's object on the Kubernetes API server until an event
# carries a non-null .spec.podCIDR, and stores that object (compact JSON, one
# line) in the global variable node_object.
#
# Globals:
#   KUBERNETES_SERVICE_HOST, KUBERNETES_SERVICE_PORT, HOSTNAME (read)
#   node_object (written; intentionally global, callers read it afterwards)
# Arguments:
#   $1 - maximum number of watch attempts
#   $2 - timeoutSeconds value sent with each watch request
# Returns:
#   0 as soon as node_object is populated. If every attempt ends without a
#   matching object, calls fatal, which exits the script with status 1.
fetch_node_object() {
  local attempts=$1
  local timeout=$2

  local host=${KUBERNETES_SERVICE_HOST}
  # If host contains a colon (:), it is an IPv6 address and must be wrapped
  # in [..] to be usable inside a URL.
  if [[ "${host}" =~ : ]]; then
    host="[${host}]"
  fi

  local token
  local node_url="https://${host}:${KUBERNETES_SERVICE_PORT}/api/v1/nodes?watch=1&timeoutSeconds=${timeout}&fieldSelector=metadata.name=${HOSTNAME}"
  # Keep the loop counter function-scoped so it cannot clobber a global "i".
  local i

  for ((i = 1; i <= attempts; i++)); do
    log "Watching attempt #${i} at ${node_url}"
    # The token is re-read on every attempt rather than cached across retries.
    token=$(</var/run/secrets/kubernetes.io/serviceaccount/token)
    # jq emits .object only for watch events whose .spec.podCIDR is non-null;
    # grep -m1 stops at the first non-empty line so we return on the first
    # match instead of waiting for the watch to time out. A failed pipeline
    # leaves node_object empty and falls through to the next attempt.
    # NOTE(review): curl -k disables TLS certificate verification.
    node_object=$(grep --line-buffered -m1 . <(curl -k -s -N -H "Authorization: Bearer ${token}" "${node_url}" | jq --unbuffered -c '.object | select(.spec.podCIDR != null)')) || node_object=
    if [[ -n "${node_object}" ]]; then
      return 0
    fi
  done

  fatal "Could not successfully watch node and wait for podCIDR."
}

# Watch for up to 1 minute (3 attempts of 20 seconds each). We don't expect
# podCIDR to stay unpopulated for long, but the attempts may also be three
# consecutive failures; if node_object is still not fetched after that, the
# script exits and we wait for kubelet to retry the whole container.
fetch_node_object 3 20
log "Node object fetched:"
log "${node_object}"

if [ "${MIGRATE_TO_DPV2:-}" == "true" ]; then
DPV2_MIGRATION_READY=$(jq '.metadata.labels."cloud.google.com/gke-dpv2-migration-ready"' <<<"$response")
if [[ "${MIGRATE_TO_DPV2:-}" == "true" ]]; then
DPV2_MIGRATION_READY=$(jq -r '.metadata.labels."cloud.google.com/gke-dpv2-migration-ready"' <<<"${node_object}")
log "Migration to DPv2 in progress; node ready: '${DPV2_MIGRATION_READY}'"
if [ "${DPV2_MIGRATION_READY}" != '"true"' ] # DPV2_MIGRATION_READY is a JSON string thus double quotes
then
if [[ "${DPV2_MIGRATION_READY}" != "true" ]]; then
ENABLE_CILIUM_PLUGIN=false
fi
fi
Expand All @@ -134,7 +159,8 @@ if [[ "${ENABLE_CILIUM_PLUGIN}" == "true" ]]; then
if [[ -n "${CILIUM_FAST_START_NAMESPACES:-}" ]]; then
cilium_cni_config=$(jq --arg namespaces "${CILIUM_FAST_START_NAMESPACES:-}" '.["dpv2-fast-start-namespaces"] = $namespaces' <<<"${cilium_cni_config}")
fi
log "Adding Cilium plug-in to the CNI config: ${cilium_cni_config//$'\n'/ }"
log "Adding Cilium plug-in to the CNI config:"
log "${cilium_cni_config//$'\n'/ }"
cni_spec=${cni_spec//@cniCiliumPlugin/, ${cilium_cni_config}}
else
log "Not using Cilium plug-in."
Expand Down Expand Up @@ -218,8 +244,7 @@ function fillSubnetsInCniSpecV2Template {
SUBNETS_REPLACEMENT+=("$(jq -nc --arg subnet "${subnet}" '[{"subnet": $subnet}]')")
ROUTES_REPLACEMENT+=('{"dst": "::/0"}')
else
log "[ERROR] Subnet detected in .spec.podCIDRs '${subnet}' is not a valid IP range"
exit 1
fatal "Subnet detected in .spec.podCIDRs '${subnet}' is not a valid IP range"
fi
done

Expand Down Expand Up @@ -261,13 +286,9 @@ function fillSubnetsInCniSpecLegacyTemplate {
log "PodCIDR IPv4 detected: '${primary_subnet:-}'"
cni_spec=${cni_spec//@ipv4Subnet/[{\"subnet\": \"${primary_subnet:-}\"\}]}
elif is_ipv6_range "${primary_subnet:-}" ; then
log "Primary IPv6 pod range detected '${primary_subnet:-}'. It will only work with new spec template."
exit 1
fatal "Primary IPv6 pod range detected '${primary_subnet:-}'. It will only work with new spec template."
else
log "Response from $node_url"
log "$node"
log "Failed to fetch PodCIDR from K8s API server, primary_subnet=${primary_subnet:-}. Exiting (1)..."
exit 1
fatal "Failed to fetch PodCIDR from K8s API server, primary_subnet=${primary_subnet:-}."
fi

if [ -n "${node_ipv6_addr:-}" ] && [ "${node_ipv6_addr}" != "null" ]; then
Expand Down Expand Up @@ -296,15 +317,15 @@ function fillSubnetsInCniSpec {
}


CLUSTER_STACK_TYPE=$(jq -r '.metadata.labels."cloud.google.com/gke-stack-type"' <<<"$response")
CLUSTER_STACK_TYPE=$(jq -r '.metadata.labels."cloud.google.com/gke-stack-type"' <<<"${node_object}")
log "Node's cluster stack type label: '${CLUSTER_STACK_TYPE:-}'"

node_ipv6_addr=''
if [ "$ENABLE_IPV6" == "true" ] || [ "${CLUSTER_STACK_TYPE:-}" == "IPV4_IPV6" ]; then
node_ipv6_addr=$(curl -s -k --fail "http://metadata.google.internal/computeMetadata/v1/instance/network-interfaces/0/?recursive=true" -H "Metadata-Flavor: Google" | jq -r '.ipv6s[0]' ) ||:
fi

fillSubnetsInCniSpec "$response" "$node_ipv6_addr"
fillSubnetsInCniSpec "${node_object}" "${node_ipv6_addr}"

if [ "$POPULATE_IP6TABLES" == "true" ] ; then
# Ensure the IPv6 firewall rules are as expected.
Expand Down Expand Up @@ -433,7 +454,8 @@ cilium_wait_or_ignore() {
}

# Logs the CNI spec that is about to be installed, writes it to
# ${output_file} via write_file, then calls success.
# Globals: output_file, cni_spec (read)
write_and_success() {
  # The spec content is logged once, on its own line, with newlines flattened
  # to spaces so it stays a single log entry.
  log "Creating CNI spec at '${output_file}' with content:"
  log "${cni_spec//$'\n'/ }"
  write_file "${output_file}" "${cni_spec}"
  success
}
Expand All @@ -449,7 +471,8 @@ if [[ "${RUN_CNI_WATCHDOG:-}" != "true" ]]; then
write_and_success
fi

log "Running CNI watchdog to watch Cilium and manage CNI config at '${output_file}' with content: ${cni_spec//$'\n'/ }"
log "Running CNI watchdog to watch Cilium and manage CNI config at '${output_file}' with content:"
log "${cni_spec//$'\n'/ }"
cilium_watchdog_success_wait=${CILIUM_WATCHDOG_SUCCESS_WAIT:-300}
cilium_watchdog_failure_retry=${CILIUM_WATCHDOG_FAILURE_RETRY:-60}
cilium_watchdog_fast_start_wait=${CILIUM_WATCHDOG_FAST_START_WAIT:-60}
Expand Down
6 changes: 3 additions & 3 deletions scripts/testcase/testcase-basic-v2-ipv4.sh
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,8 @@ function before_test() {
*http://metadata.google.internal/computeMetadata/v1/instance/network-interfaces/0*)
echo '{"ipv6s": ["2600:1900:4000:318:0:7:0:0"]}'
;;
*https://10.0.0.1:443/api/v1/nodes/*)
echo '{
*https://10.0.0.1:443/api/v1/nodes*)
echo '{"object":{
"metadata": {
"labels": {
},
Expand All @@ -38,7 +38,7 @@ function before_test() {
],
"providerID": "gce://my-gke-project/us-central1-c/gke-my-cluster-default-pool-128bc25d-9c94"
}
}'
}}'
;;
*)
#unsupported
Expand Down
6 changes: 3 additions & 3 deletions scripts/testcase/testcase-basic-v2-ipv6.sh
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,8 @@ function before_test() {
*http://metadata.google.internal/computeMetadata/v1/instance/network-interfaces/0*)
echo '{"ipv6s": ["2600:1900:4000:318:0:7:0:0"]}'
;;
*"https://[fd20::1]:443/api/v1/nodes/"*)
echo '{
*"https://[fd20::1]:443/api/v1/nodes"*)
echo '{"object":{
"metadata": {
"labels": {
},
Expand All @@ -38,7 +38,7 @@ function before_test() {
],
"providerID": "gce://my-gke-project/us-central1-c/gke-my-cluster-default-pool-128bc25d-9c94"
}
}'
}}'
;;
*)
#unsupported
Expand Down
6 changes: 3 additions & 3 deletions scripts/testcase/testcase-basic-v2.sh
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,8 @@ function before_test() {
*http://metadata.google.internal/computeMetadata/v1/instance/network-interfaces/0*)
echo '{"ipv6s": ["2600:1900:4000:318:0:7:0:0"]}'
;;
*https://kubernetes.default.svc:443/api/v1/nodes/*)
echo '{
*https://kubernetes.default.svc:443/api/v1/nodes*)
echo '{"object":{
"metadata": {
"labels": {
},
Expand All @@ -38,7 +38,7 @@ function before_test() {
],
"providerID": "gce://my-gke-project/us-central1-c/gke-my-cluster-default-pool-128bc25d-9c94"
}
}'
}}'
;;
*)
#unsupported
Expand Down
6 changes: 3 additions & 3 deletions scripts/testcase/testcase-basic.sh
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,8 @@ function before_test() {
*http://metadata.google.internal/computeMetadata/v1/instance/network-interfaces/0*)
echo '{"ipv6s": ["2600:1900:4000:318:0:7:0:0"]}'
;;
*https://kubernetes.default.svc:443/api/v1/nodes/*)
echo '{
*https://kubernetes.default.svc:443/api/v1/nodes*)
echo '{"object":{
"metadata": {
"labels": {
},
Expand All @@ -36,7 +36,7 @@ function before_test() {
],
"providerID": "gce://my-gke-project/us-central1-c/gke-my-cluster-default-pool-128bc25d-9c94"
}
}'
}}'
;;
*)
#unsupported
Expand Down
6 changes: 3 additions & 3 deletions scripts/testcase/testcase-calico-v2.sh
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,8 @@ function before_test() {
*http://metadata.google.internal/computeMetadata/v1/instance/network-interfaces/0*)
echo '{"ipv6s": ["2600:1900:4000:318:0:7:0:0"]}'
;;
*https://kubernetes.default.svc:443/api/v1/nodes/*)
echo '{
*https://kubernetes.default.svc:443/api/v1/nodes*)
echo '{"object":{
"metadata": {
"labels": {
},
Expand All @@ -43,7 +43,7 @@ function before_test() {
],
"providerID": "gce://my-gke-project/us-central1-c/gke-my-cluster-default-pool-128bc25d-9c94"
}
}'
}}'
;;
*)
#unsupported
Expand Down
6 changes: 3 additions & 3 deletions scripts/testcase/testcase-calico.sh
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,8 @@ function before_test() {
*http://metadata.google.internal/computeMetadata/v1/instance/network-interfaces/0*)
echo '{"ipv6s": ["2600:1900:4000:318:0:7:0:0"]}'
;;
*https://kubernetes.default.svc:443/api/v1/nodes/*)
echo '{
*https://kubernetes.default.svc:443/api/v1/nodes*)
echo '{"object":{
"metadata": {
"labels": {
},
Expand All @@ -41,7 +41,7 @@ function before_test() {
],
"providerID": "gce://my-gke-project/us-central1-c/gke-my-cluster-default-pool-128bc25d-9c94"
}
}'
}}'
;;
*)
#unsupported
Expand Down
6 changes: 3 additions & 3 deletions scripts/testcase/testcase-cilium-faststart-v2.sh
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,8 @@ function before_test() {
*http://metadata.google.internal/computeMetadata/v1/instance/network-interfaces/0*)
echo '{"ipv6s": ["2600:1900:4000:318:0:7:0:0"]}'
;;
*https://kubernetes.default.svc:443/api/v1/nodes/*)
echo '{
*https://kubernetes.default.svc:443/api/v1/nodes*)
echo '{"object":{
"metadata": {
"labels": {
},
Expand All @@ -40,7 +40,7 @@ function before_test() {
],
"providerID": "gce://my-gke-project/us-central1-c/gke-my-cluster-default-pool-128bc25d-9c94"
}
}'
}}'
;;
*http://localhost:63197/*)
echo 'healthz'
Expand Down
6 changes: 3 additions & 3 deletions scripts/testcase/testcase-cilium-faststart.sh
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,8 @@ function before_test() {
*http://metadata.google.internal/computeMetadata/v1/instance/network-interfaces/0*)
echo '{"ipv6s": ["2600:1900:4000:318:0:7:0:0"]}'
;;
*https://kubernetes.default.svc:443/api/v1/nodes/*)
echo '{
*https://kubernetes.default.svc:443/api/v1/nodes*)
echo '{"object":{
"metadata": {
"labels": {
},
Expand All @@ -38,7 +38,7 @@ function before_test() {
],
"providerID": "gce://my-gke-project/us-central1-c/gke-my-cluster-default-pool-128bc25d-9c94"
}
}'
}}'
;;
*http://localhost:63197/*)
echo 'healthz'
Expand Down
6 changes: 3 additions & 3 deletions scripts/testcase/testcase-cilium-v2.sh
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,8 @@ function before_test() {
*http://metadata.google.internal/computeMetadata/v1/instance/network-interfaces/0*)
echo '{"ipv6s": ["2600:1900:4000:318:0:7:0:0"]}'
;;
*https://kubernetes.default.svc:443/api/v1/nodes/*)
echo '{
*https://kubernetes.default.svc:443/api/v1/nodes*)
echo '{"object":{
"metadata": {
"labels": {
},
Expand All @@ -40,7 +40,7 @@ function before_test() {
],
"providerID": "gce://my-gke-project/us-central1-c/gke-my-cluster-default-pool-128bc25d-9c94"
}
}'
}}'
;;
*http://localhost:63197/*)
echo 'healthz'
Expand Down
6 changes: 3 additions & 3 deletions scripts/testcase/testcase-cilium.sh
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,8 @@ function before_test() {
*http://metadata.google.internal/computeMetadata/v1/instance/network-interfaces/0*)
echo '{"ipv6s": ["2600:1900:4000:318:0:7:0:0"]}'
;;
*https://kubernetes.default.svc:443/api/v1/nodes/*)
echo '{
*https://kubernetes.default.svc:443/api/v1/nodes*)
echo '{"object":{
"metadata": {
"labels": {
},
Expand All @@ -38,7 +38,7 @@ function before_test() {
],
"providerID": "gce://my-gke-project/us-central1-c/gke-my-cluster-default-pool-128bc25d-9c94"
}
}'
}}'
;;
*http://localhost:63197/*)
echo 'healthz'
Expand Down
6 changes: 3 additions & 3 deletions scripts/testcase/testcase-directpath-v2.sh
Original file line number Diff line number Diff line change
Expand Up @@ -24,9 +24,9 @@ function before_test() {
# call to GCE metadata server
echo '{"ipv6s": ["2600:1900:4000:318:0:7:0:0"]}'
;;
*https://kubernetes.default.svc:443/api/v1/nodes/*)
*https://kubernetes.default.svc:443/api/v1/nodes*)
# call to kube-apiserver
echo '{
echo '{"object":{
"metadata": {
"labels": {
},
Expand All @@ -42,7 +42,7 @@ function before_test() {
],
"providerID": "gce://my-gke-project/us-central1-c/gke-my-cluster-default-pool-128bc25d-9c94"
}
}'
}}'
;;
*)
# unmatched call
Expand Down
6 changes: 3 additions & 3 deletions scripts/testcase/testcase-directpath.sh
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,9 @@ function before_test() {
# call to GCE metadata server
echo '{"ipv6s": ["2600:1900:4000:318:0:7:0:0"]}'
;;
*https://kubernetes.default.svc:443/api/v1/nodes/*)
*https://kubernetes.default.svc:443/api/v1/nodes*)
# call to kube-apiserver
echo '{
echo '{"object":{
"metadata": {
"labels": {
},
Expand All @@ -40,7 +40,7 @@ function before_test() {
],
"providerID": "gce://my-gke-project/us-central1-c/gke-my-cluster-default-pool-128bc25d-9c94"
}
}'
}}'
;;
*)
# unmatched call
Expand Down

0 comments on commit c7585b0

Please sign in to comment.