diff --git a/.vscode/launch.json b/.vscode/launch.json index 6063f453..edf599d2 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -11,7 +11,7 @@ "mode": "debug", "program": "${workspaceRoot}/operator/cmd/main.go", "cwd": "${workspaceRoot}/operator", - "buildFlags": "--ldflags '-X github.com/NVIDIA/skyhook/internal/version.GIT_SHA=foobars -X github.com/NVIDIA/skyhook/internal/version.VERSION=v0.5.0'", + "buildFlags": "--ldflags '-X github.com/NVIDIA/skyhook/operator/internal/version.GIT_SHA=foobars -X github.com/NVIDIA/skyhook/operator/internal/version.VERSION=v0.5.0'", "env": { "ENABLE_WEBHOOKS": "false", "LOG_ENCODER": "console", diff --git a/README.md b/README.md index 54b1f1a1..58051208 100644 --- a/README.md +++ b/README.md @@ -186,9 +186,19 @@ The operator will apply steps in a package throughout different lifecycle stages The stages are applied in this order: +**Without Interrupts:** +- Uninstall -> Apply -> Config (No Upgrade) +- Upgrade -> Config (With Upgrade) + +**With Interrupts:** +For packages that require interrupts, the node is first cordoned and drained to ensure workloads are safely evacuated before package operations begin: - Uninstall -> Apply -> Config -> Interrupt -> Post-Interrupt (No Upgrade) - Upgrade -> Config -> Interrupt -> Post-Interrupt (With Upgrade) +This ensures that when operations like kernel module unloading or system reboots are required, they happen after workloads have been safely removed and any necessary pre-interrupt package operations have completed. + +**NOTE**: If a package is removed from the SCR, then the uninstall stage for that package will solely be run. + **Semantic versioning is strictly enforced in the operator** in order to support upgrade and uninstall. Semantic versioning allows the operator to know which way the package is going while also enforcing best versioning practices. diff --git a/docs/README.md b/docs/README.md index 4d07c87d..bb8e53cb 100644 --- a/docs/README.md +++ b/docs/README.md @@ -14,6 +14,9 @@ This directory contains user and operator documentation for Skyhook. Here you'll - [Runtime Required](runtime_required.md): How to use the runtime required taint and feature in Skyhook. + - [Interrupt Flow and Ordering](interrupt_flow.md): + Detailed explanation of how Skyhook handles packages with interrupts, including the interrupt sequence. + - [Strict Ordering](ordering_of_skyhooks.md): How and why the operator applies each Skyhook Custom Resource in a deterministic sequential order. - **Resources** diff --git a/docs/interrupt_flow.md b/docs/interrupt_flow.md new file mode 100644 index 00000000..9abab084 --- /dev/null +++ b/docs/interrupt_flow.md @@ -0,0 +1,87 @@ +# Interrupt Flow and Ordering + +This document explains how Skyhook handles packages that require interrupts and the specific ordering of operations to ensure safe and reliable execution. + +## Overview + +When a package requires an interrupt (such as a reboot or service restart), Skyhook follows a specific sequence to ensure that workloads are safely evacuated from the node before any potentially disruptive operations occur. + +## Interrupt Flow Sequence + +### For packages WITH interrupts: + +1. **Uninstall** (if downgrading) - Package uninstallation operations are executed. +2. **Cordon** - Node is marked as unschedulable to prevent new workloads from being scheduled +3. **Wait** - System waits for any conflicting workloads to naturally complete or be rescheduled +4. **Drain** - Remaining workloads are gracefully evicted from the node +5. **Apply** / **Upgrade** (if upgrading) - Package installation/upgrade operations are executed +6. **Config** - Configuration and setup operations are performed +7. **Interrupt** - The actual interrupt operation (reboot, service restart, etc.) is executed +8. **Post-Interrupt** - Any cleanup or verification operations after the interrupt + +### For packages WITHOUT interrupts: + +1. **Uninstall** (if downgrading) - Package uninstallation operations are executed. +2. **Apply** / **Upgrade** (if upgrading) - Package installation/upgrade operations are executed +3. **Config** - Configuration and setup operations are performed + +## Why This Order Matters + +The **uninstall → cordon → wait → drain → apply/upgrade → config → interrupt** sequence is critical for several reasons: + +### Safety First +- Workloads are safely removed before any potentially disruptive operations +- Prevents data loss or service interruption for running applications +- Ensures the node is in a clean state before package operations begin + +### Use Cases +This ordering is particularly important for scenarios such as: + +- **Kernel module changes**: Unloading kernel modules while workloads are present could cause system instability +- **GPU mode switching**: Changing GPU from graphics to compute mode requires exclusive access +- **Driver updates**: Hardware driver changes need exclusive access to the hardware +- **System reboots**: Obviously require all workloads to be evacuated first + +### Example Scenario + +Consider a package that needs to unload a kernel module, perform some operations, and then reboot: + +```yaml +apiVersion: skyhook.nvidia.com/v1alpha1 +kind: Skyhook +metadata: + name: gpu-mode-switch +spec: + packages: + gpu-driver: + version: "1.0.0" + image: "example/gpu-driver" + interrupt: + type: "reboot" +``` + +**Flow:** +1. **Cordon**: Node becomes unschedulable +2. **Wait**: Any non-interrupt workloads are given time to complete +3. **Drain**: Remaining workloads are evicted +4. **Apply**: GPU driver package operations run (unload old module, install new) +5. **Config**: Configuration files are updated +6. **Interrupt**: System reboots to complete the driver change +7. **Post-Interrupt**: Verification that the new driver is loaded correctly + +## Technical Implementation + +The interrupt flow is managed by the `ProcessInterrupt` and `EnsureNodeIsReadyForInterrupt` functions in the Skyhook controller, which: + +- Check for conflicting workloads using label selectors +- Coordinate the cordon and drain operations +- Ensure the node is ready before proceeding with package operations +- Handle the timing and sequencing of all stages + +## Best Practices + +- Always test interrupt-enabled packages in non-production environments first +- Use appropriate `podNonInterruptLabels` selectors to identify important workloads that should block interrupts +- Consider the impact of node cordoning on cluster capacity +- Monitor package logs during interrupt operations for troubleshooting +- Use Grafana dashboards to monitor interrupt operations and track package state transitions across your cluster (see [docs/metrics/](metrics/) for dashboard setup and configuration) diff --git a/docs/operator-status-definitions.md b/docs/operator-status-definitions.md index f68cbca2..6964111a 100644 --- a/docs/operator-status-definitions.md +++ b/docs/operator-status-definitions.md @@ -75,7 +75,10 @@ upgrade → config ``` ### With Interrupts: +When a package requires an interrupt, the node is first cordoned and drained before package operations begin: ``` -uninstall → apply → config → interrupt → post-interrupt -upgrade → config → interrupt → post-interrupt -``` \ No newline at end of file +uninstall (if downgrading) → cordon → wait → drain → apply → config → interrupt → post-interrupt +cordon → wait → drain → upgrade (if upgrading) → config → interrupt → post-interrupt +``` + +**Note**: The cordon, wait, and drain phases ensure that workloads are safely removed from the node before any package operations that require interrupts (such as reboots or kernel module changes) are executed. \ No newline at end of file diff --git a/k8s-tests/chainsaw/skyhook/config-skyhook/assert.yaml b/k8s-tests/chainsaw/skyhook/config-skyhook/assert.yaml index 24651b23..1f1fd759 100644 --- a/k8s-tests/chainsaw/skyhook/config-skyhook/assert.yaml +++ b/k8s-tests/chainsaw/skyhook/config-skyhook/assert.yaml @@ -21,7 +21,7 @@ metadata: skyhook.nvidia.com/test-node: skyhooke2e skyhook.nvidia.com/status_config-skyhook: in_progress annotations: - ("skyhook.nvidia.com/nodeState_config-skyhook" && parse_json("skyhook.nvidia.com/nodeState_config-skyhook")): + ("skyhook.nvidia.com/nodeState_config-skyhook" && parse_json("skyhook.nvidia.com/nodeState_config-skyhook")): { "baxter|3.2.1": { "name": "baxter", @@ -30,22 +30,26 @@ metadata: "stage": "apply", "state": "in_progress" }, - "dexter|1.2.3": { - "name": "dexter", - "version": "1.2.3", + "spencer|3.2.3": { + "name": "spencer", + "version": "3.2.3", "image": "ghcr.io/nvidia/skyhook/agentless", "stage": "apply", "state": "in_progress" }, - "spencer|3.2.3": { - "name": "spencer", - "version": "3.2.3", + "dexter|1.2.3": { + "name": "dexter", + "version": "1.2.3", "image": "ghcr.io/nvidia/skyhook/agentless", "stage": "apply", "state": "in_progress" - } + }, } skyhook.nvidia.com/status_config-skyhook: in_progress +spec: + taints: + - effect: NoSchedule + key: node.kubernetes.io/unschedulable status: (conditions[?type == 'skyhook.nvidia.com/config-skyhook/NotReady']): - reason: "Incomplete" @@ -62,13 +66,7 @@ status: status: in_progress nodeState: (values(@)): - - dexter|1.2.3: - name: dexter - state: in_progress - version: '1.2.3' - stage: apply - image: ghcr.io/nvidia/skyhook/agentless - baxter|3.2.1: + - baxter|3.2.1: name: baxter state: in_progress version: '3.2.1' @@ -80,6 +78,12 @@ status: version: '3.2.3' stage: apply image: ghcr.io/nvidia/skyhook/agentless + dexter|1.2.3: + name: dexter + state: in_progress + version: '1.2.3' + stage: apply + image: ghcr.io/nvidia/skyhook/agentless nodeStatus: # grab values should be one and is complete (values(@)): diff --git a/k8s-tests/chainsaw/skyhook/interrupt-grouping/assert.yaml b/k8s-tests/chainsaw/skyhook/interrupt-grouping/assert.yaml index 06fbc8ad..371ec048 100644 --- a/k8s-tests/chainsaw/skyhook/interrupt-grouping/assert.yaml +++ b/k8s-tests/chainsaw/skyhook/interrupt-grouping/assert.yaml @@ -13,7 +13,27 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - +--- +kind: Node +apiVersion: v1 +metadata: + labels: + skyhook.nvidia.com/test-node: skyhooke2e + skyhook.nvidia.com/status_interrupt-grouping: in_progress + annotations: + skyhook.nvidia.com/status_interrupt-grouping: in_progress +spec: + taints: + - effect: NoSchedule + key: node.kubernetes.io/unschedulable +status: + (conditions[?type == 'skyhook.nvidia.com/interrupt-grouping/NotReady']): + - reason: "Incomplete" + status: "True" + (conditions[?type == 'skyhook.nvidia.com/interrupt-grouping/Erroring']): + - reason: "Not Erroring" + status: "False" +--- kind: Pod apiVersion: v1 metadata: diff --git a/k8s-tests/chainsaw/skyhook/interrupt/assert-a.yaml b/k8s-tests/chainsaw/skyhook/interrupt/assert-a.yaml new file mode 100644 index 00000000..4e865a95 --- /dev/null +++ b/k8s-tests/chainsaw/skyhook/interrupt/assert-a.yaml @@ -0,0 +1,84 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +--- +kind: Pod +apiVersion: v1 +metadata: + namespace: skyhook + labels: + skyhook.nvidia.com/name: interrupt + skyhook.nvidia.com/package: jason-1.3.2 + annotations: + ("skyhook.nvidia.com/package" && parse_json("skyhook.nvidia.com/package")): + { + "name": "jason", + "version": "1.3.2", + "skyhook": "interrupt", + "stage": "apply", + "image": "ghcr.io/nvidia/skyhook/agentless" + } + ownerReferences: + - apiVersion: skyhook.nvidia.com/v1alpha1 + kind: Skyhook + name: interrupt +spec: + initContainers: + - name: jason-init + image: ghcr.io/nvidia/skyhook/agentless:1.3.2 + - name: jason-apply + image: ghcr.io/nvidia/skyhook/agentless:3.2.3 + args: + ([0]): apply + ([1]): /root + (length(@)): 3 + - name: jason-applycheck + image: ghcr.io/nvidia/skyhook/agentless:3.2.3 + args: + ([0]): apply-check + ([1]): /root + (length(@)): 3 +--- +apiVersion: v1 +kind: Node +metadata: + labels: + skyhook.nvidia.com/test-node: skyhooke2e + skyhook.nvidia.com/status_interrupt: in_progress + annotations: + ("skyhook.nvidia.com/nodeState_interrupt" && parse_json("skyhook.nvidia.com/nodeState_interrupt")): + { + "jason|1.3.2": { + "name": "jason", + "version": "1.3.2", + "image": "ghcr.io/nvidia/skyhook/agentless", + "stage": "config", + "state": "complete" + } + } + skyhook.nvidia.com/status_interrupt: in_progress +spec: + taints: + - effect: NoSchedule + key: node.kubernetes.io/unschedulable +status: + (conditions[?type == 'skyhook.nvidia.com/interrupt/NotReady']): + - reason: "Incomplete" + status: "True" + (conditions[?type == 'skyhook.nvidia.com/interrupt/Erroring']): + - reason: "Not Erroring" + status: "False" +--- diff --git a/k8s-tests/chainsaw/skyhook/interrupt/assert.yaml b/k8s-tests/chainsaw/skyhook/interrupt/assert-b.yaml similarity index 83% rename from k8s-tests/chainsaw/skyhook/interrupt/assert.yaml rename to k8s-tests/chainsaw/skyhook/interrupt/assert-b.yaml index e6f0e444..e5d94d42 100644 --- a/k8s-tests/chainsaw/skyhook/interrupt/assert.yaml +++ b/k8s-tests/chainsaw/skyhook/interrupt/assert-b.yaml @@ -52,42 +52,50 @@ # ([1]): /root # (length(@)): 3 --- -kind: Pod apiVersion: v1 +kind: Node metadata: - namespace: skyhook labels: - skyhook.nvidia.com/name: interrupt - skyhook.nvidia.com/package: jason-1.3.2 + skyhook.nvidia.com/test-node: skyhooke2e + skyhook.nvidia.com/status_interrupt: in_progress annotations: - ("skyhook.nvidia.com/package" && parse_json("skyhook.nvidia.com/package")): + ("skyhook.nvidia.com/nodeState_interrupt" && parse_json("skyhook.nvidia.com/nodeState_interrupt")): { - "name": "jason", - "version": "1.3.2", - "skyhook": "interrupt", - "stage": "apply", - "image": "ghcr.io/nvidia/skyhook/agentless" + "jason|1.3.2": { + "name": "jason", + "version": "1.3.2", + "image": "ghcr.io/nvidia/skyhook/agentless", + "stage": "config", + "state": "complete" + }, + "dexter|1.2.3": { + "name": "dexter", + "version": "1.2.3", + "image": "ghcr.io/nvidia/skyhook/agentless", + "stage": "apply", + "state": "in_progress" + }, + "john|1.2.3": { + "name": "john", + "version": "1.2.3", + "image": "ghcr.io/nvidia/skyhook/agentless", + "stage": "apply", + "state": "in_progress" + } } - ownerReferences: - - apiVersion: skyhook.nvidia.com/v1alpha1 - kind: Skyhook - name: interrupt + skyhook.nvidia.com/cordon_interrupt: "true" + skyhook.nvidia.com/status_interrupt: in_progress spec: - initContainers: - - name: jason-init - image: ghcr.io/nvidia/skyhook/agentless:1.3.2 - - name: jason-apply - image: ghcr.io/nvidia/skyhook/agentless:3.2.3 - args: - ([0]): apply - ([1]): /root - (length(@)): 3 - - name: jason-applycheck - image: ghcr.io/nvidia/skyhook/agentless:3.2.3 - args: - ([0]): apply-check - ([1]): /root - (length(@)): 3 + taints: + - effect: NoSchedule + key: node.kubernetes.io/unschedulable +status: + (conditions[?type == 'skyhook.nvidia.com/interrupt/NotReady']): + - reason: "Incomplete" + status: "True" + (conditions[?type == 'skyhook.nvidia.com/interrupt/Erroring']): + - reason: "Not Erroring" + status: "False" --- kind: ConfigMap apiVersion: v1 @@ -142,8 +150,7 @@ spec: ([1]): /root (parse_json(base64_decode([3]))): { - "type": "service_restart", - "services": ["cron"] + "type": "node_restart", } (length(@)): 4 --- diff --git a/k8s-tests/chainsaw/skyhook/interrupt/assert-drain-me.yaml b/k8s-tests/chainsaw/skyhook/interrupt/assert-drain-me.yaml new file mode 100644 index 00000000..6e6b5d36 --- /dev/null +++ b/k8s-tests/chainsaw/skyhook/interrupt/assert-drain-me.yaml @@ -0,0 +1,24 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +--- +apiVersion: v1 +kind: Pod +metadata: + name: drain-on + labels: + drain: me +--- diff --git a/k8s-tests/chainsaw/skyhook/interrupt/assert-important-stuff.yaml b/k8s-tests/chainsaw/skyhook/interrupt/assert-important-stuff.yaml new file mode 100644 index 00000000..c9fabdc7 --- /dev/null +++ b/k8s-tests/chainsaw/skyhook/interrupt/assert-important-stuff.yaml @@ -0,0 +1,24 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +--- +apiVersion: v1 +kind: Pod +metadata: + name: important-stuff + labels: + dontmess: withme +--- diff --git a/k8s-tests/chainsaw/skyhook/interrupt/chainsaw-test.yaml b/k8s-tests/chainsaw/skyhook/interrupt/chainsaw-test.yaml index a42ffdc5..569ce1b1 100644 --- a/k8s-tests/chainsaw/skyhook/interrupt/chainsaw-test.yaml +++ b/k8s-tests/chainsaw/skyhook/interrupt/chainsaw-test.yaml @@ -69,4 +69,14 @@ spec: - create: file: skyhook.yaml - assert: - file: assert.yaml + file: assert-a.yaml + - assert: + file: assert-important-stuff.yaml + - assert: + file: assert-drain-me.yaml + - delete: + file: assert-important-stuff.yaml + - error: + file: assert-drain-me.yaml + - assert: + file: assert-b.yaml \ No newline at end of file diff --git a/k8s-tests/chainsaw/skyhook/interrupt/pod.yaml b/k8s-tests/chainsaw/skyhook/interrupt/pod.yaml index 504e57b2..d05814d0 100644 --- a/k8s-tests/chainsaw/skyhook/interrupt/pod.yaml +++ b/k8s-tests/chainsaw/skyhook/interrupt/pod.yaml @@ -29,7 +29,7 @@ spec: containers: - name: workload image: busybox:latest - command: ["sh", "-c", "sleep 20 && exit"] + command: ["sh", "-c", "sleep 1000 && exit"] --- apiVersion: v1 kind: Pod @@ -45,7 +45,7 @@ spec: containers: - name: workload image: busybox:latest - command: ["sh", "-c", "sleep 100 && exit"] + command: ["sh", "-c", "sleep 1000 && exit"] --- ## testing remove an old version that should, this pod is running from old version of of the SCR - this will be deleted apiVersion: v1 diff --git a/k8s-tests/chainsaw/skyhook/interrupt/skyhook.yaml b/k8s-tests/chainsaw/skyhook/interrupt/skyhook.yaml index 0b69ed14..7d59e672 100644 --- a/k8s-tests/chainsaw/skyhook/interrupt/skyhook.yaml +++ b/k8s-tests/chainsaw/skyhook/interrupt/skyhook.yaml @@ -68,8 +68,7 @@ spec: version: "1.2.3" image: ghcr.io/nvidia/skyhook/agentless interrupt: - type: service - services: [cron] + type: reboot env: - name: SLEEP_LEN value: "3" ## making faster @@ -84,3 +83,12 @@ spec: color.bad=yellow allow.textmode=true how.nice.to.look=fairlyNice + john: + version: "1.2.3" + image: ghcr.io/nvidia/skyhook/agentless + interrupt: + type: service + services: [rsyslog] + env: + - name: SLEEP_LEN + value: "3" ## making faster \ No newline at end of file diff --git a/k8s-tests/chainsaw/skyhook/uninstall-upgrade-skyhook/assert-update-no-packages.yaml b/k8s-tests/chainsaw/skyhook/uninstall-upgrade-skyhook/assert-update-no-packages.yaml index b5b7e077..9b61bfa0 100644 --- a/k8s-tests/chainsaw/skyhook/uninstall-upgrade-skyhook/assert-update-no-packages.yaml +++ b/k8s-tests/chainsaw/skyhook/uninstall-upgrade-skyhook/assert-update-no-packages.yaml @@ -14,6 +14,39 @@ # See the License for the specific language governing permissions and # limitations under the License. +apiVersion: v1 +kind: Node +metadata: + labels: + skyhook.nvidia.com/test-node: skyhooke2e + skyhook.nvidia.com/status_uninstall-upgrade-skyhook: in_progress + annotations: + ("skyhook.nvidia.com/nodeState_uninstall-upgrade-skyhook" && parse_json("skyhook.nvidia.com/nodeState_uninstall-upgrade-skyhook")): + { + "dogs|1.2.5": { + "name": "dogs", + "version": "1.2.5", + "image": "ghcr.io/nvidia/skyhook/agentless", + "stage": "uninstall", + "state": "in_progress" + }, + "nullptr|2.0.1": { + "name": "nullptr", + "version": "2.0.1", + "image": "ghcr.io/nvidia/skyhook/agentless", + "stage": "uninstall", + "state": "in_progress" + }, + } + skyhook.nvidia.com/status_uninstall-upgrade-skyhook: in_progress + (!taints || length(taints)==`0` || (taints && !not_null(taints))): true ## taints should be empty or not exist +status: + (conditions[?type == 'skyhook.nvidia.com/uninstall-upgrade-skyhook/NotReady']): + - reason: "Incomplete" + status: "True" + (conditions[?type == 'skyhook.nvidia.com/uninstall-upgrade-skyhook/Erroring']): + - reason: "Not Erroring" + status: "False" --- kind: Pod apiVersion: v1 diff --git a/k8s-tests/chainsaw/skyhook/uninstall-upgrade-skyhook/assert-update.yaml b/k8s-tests/chainsaw/skyhook/uninstall-upgrade-skyhook/assert-update.yaml index cb8ac0bb..db0454f2 100644 --- a/k8s-tests/chainsaw/skyhook/uninstall-upgrade-skyhook/assert-update.yaml +++ b/k8s-tests/chainsaw/skyhook/uninstall-upgrade-skyhook/assert-update.yaml @@ -14,6 +14,56 @@ # See the License for the specific language governing permissions and # limitations under the License. +apiVersion: v1 +kind: Node +metadata: + labels: + skyhook.nvidia.com/test-node: skyhooke2e + skyhook.nvidia.com/status_uninstall-upgrade-skyhook: in_progress + annotations: + ("skyhook.nvidia.com/nodeState_uninstall-upgrade-skyhook" && parse_json("skyhook.nvidia.com/nodeState_uninstall-upgrade-skyhook")): + { + "dogs|1.2.5": { + "name": "dogs", + "version": "1.2.5", + "image": "ghcr.io/nvidia/skyhook/agentless", + "stage": "uninstall", + "state": "skipped" + }, + "dogs|1.2.6": { + "name": "dogs", + "version": "1.2.6", + "image": "ghcr.io/nvidia/skyhook/agentless", + "stage": "uninstall", + "state": "in_progress" + }, + "nullptr|2.0.1": { + "name": "nullptr", + "version": "2.0.1", + "image": "ghcr.io/nvidia/skyhook/agentless", + "stage": "upgrade", + "state": "in_progress" + }, + "cats|6.2.0": { + "name": "cats", + "version": "6.2.0", + "image": "ghcr.io/nvidia/skyhook/agentless", + "stage": "uninstall", + "state": "in_progress" + } + } + skyhook.nvidia.com/status_uninstall-upgrade-skyhook: in_progress +spec: + taints: + - effect: NoSchedule + key: node.kubernetes.io/unschedulable +status: + (conditions[?type == 'skyhook.nvidia.com/uninstall-upgrade-skyhook/NotReady']): + - reason: "Incomplete" + status: "True" + (conditions[?type == 'skyhook.nvidia.com/uninstall-upgrade-skyhook/Erroring']): + - reason: "Not Erroring" + status: "False" --- kind: Pod apiVersion: v1 diff --git a/k8s-tests/chainsaw/skyhook/uninstall-upgrade-skyhook/assert.yaml b/k8s-tests/chainsaw/skyhook/uninstall-upgrade-skyhook/assert.yaml index 72e5fc9b..0296f040 100644 --- a/k8s-tests/chainsaw/skyhook/uninstall-upgrade-skyhook/assert.yaml +++ b/k8s-tests/chainsaw/skyhook/uninstall-upgrade-skyhook/assert.yaml @@ -14,6 +14,50 @@ # See the License for the specific language governing permissions and # limitations under the License. +apiVersion: v1 +kind: Node +metadata: + labels: + skyhook.nvidia.com/test-node: skyhooke2e + skyhook.nvidia.com/status_uninstall-upgrade-skyhook: in_progress + annotations: + ("skyhook.nvidia.com/nodeState_uninstall-upgrade-skyhook" && parse_json("skyhook.nvidia.com/nodeState_uninstall-upgrade-skyhook")): + { + "cats|6.2.0": { + "name": "cats", + "version": "6.2.0", + "image": "ghcr.io/nvidia/skyhook/agentless", + "stage": "apply", + "state": "in_progress" + }, + "dogs|1.2.6": { + "name": "dogs", + "version": "1.2.6", + "image": "ghcr.io/nvidia/skyhook/agentless", + "stage": "apply", + "state": "in_progress" + }, + "nullptr|2.0.0": { + "name": "nullptr", + "version": "2.0.0", + "image": "ghcr.io/nvidia/skyhook/agentless", + "stage": "apply", + "state": "in_progress" + } + } + skyhook.nvidia.com/status_uninstall-upgrade-skyhook: in_progress +spec: + taints: + - effect: NoSchedule + key: node.kubernetes.io/unschedulable +status: + (conditions[?type == 'skyhook.nvidia.com/uninstall-upgrade-skyhook/NotReady']): + - reason: "Incomplete" + status: "True" + (conditions[?type == 'skyhook.nvidia.com/uninstall-upgrade-skyhook/Erroring']): + - reason: "Not Erroring" + status: "False" +--- apiVersion: v1 kind: Node metadata: diff --git a/operator/api/v1alpha1/skyhook_types.go b/operator/api/v1alpha1/skyhook_types.go index 83f8375a..77ac7823 100644 --- a/operator/api/v1alpha1/skyhook_types.go +++ b/operator/api/v1alpha1/skyhook_types.go @@ -425,8 +425,7 @@ func (ns *NodeState) NextStage(_package *Package, interrupt map[string][]*Interr StageUpgrade: StageConfig, } - hasInterrupt := (*ns).HasInterrupt(*_package, interrupt, config) - if hasInterrupt { + if hasInterrupt := (*ns).HasInterrupt(*_package, interrupt, config); hasInterrupt { nextStage = map[Stage]Stage{ StageUpgrade: StageConfig, StageUninstall: StageApply, diff --git a/operator/internal/controller/cluster_state_v2.go b/operator/internal/controller/cluster_state_v2.go index 02fcdc23..f8eb04c5 100644 --- a/operator/internal/controller/cluster_state_v2.go +++ b/operator/internal/controller/cluster_state_v2.go @@ -385,12 +385,6 @@ func (np *NodePicker) SelectNodes(s SkyhookNodes) []wrapper.SkyhookNode { Operator: corev1.TolerationOpExists, Effect: corev1.TaintEffectNoSchedule, }, - { - Key: SkyhookTaintUnschedulable, - Value: s.GetSkyhook().GetName(), - Operator: corev1.TolerationOpEqual, - Effect: corev1.TaintEffectNoSchedule, - }, }, s.GetSkyhook().Spec.AdditionalTolerations...) if s.GetSkyhook().Spec.RuntimeRequired { diff --git a/operator/internal/controller/pod_controller.go b/operator/internal/controller/pod_controller.go index 939d886b..26ce6790 100644 --- a/operator/internal/controller/pod_controller.go +++ b/operator/internal/controller/pod_controller.go @@ -191,9 +191,6 @@ func (r *SkyhookReconciler) HandleCompletePod(ctx context.Context, skyhookNode w updated := false if containerName == InterruptContainerName { - // cleanup special race preventing taint - skyhookNode.RemoveTaint(SkyhookTaintUnschedulable) - // in this one case do we need a skyhook instance to get packages // kind of sucks, but does not update, just reads so that is better // seems safer to leave it this way unfortunately. diff --git a/operator/internal/controller/skyhook_controller.go b/operator/internal/controller/skyhook_controller.go index 52bf1919..9a3c1851 100644 --- a/operator/internal/controller/skyhook_controller.go +++ b/operator/internal/controller/skyhook_controller.go @@ -64,9 +64,8 @@ const ( EventsReasonNodeReboot = "Reboot" EventTypeNormal = "Normal" // EventTypeWarning = "Warning" - TaintUnschedulable = corev1.TaintNodeUnschedulable - SkyhookTaintUnschedulable = "skyhook.nvidia.com/unschedulable" - InterruptContainerName = "interrupt" + TaintUnschedulable = corev1.TaintNodeUnschedulable + InterruptContainerName = "interrupt" SkyhookFinalizer = "skyhook.nvidia.com/skyhook" ) @@ -1107,7 +1106,6 @@ func (r *SkyhookReconciler) HandleFinalizer(ctx context.Context, skyhook Skyhook patch := client.StrategicMergeFrom(node.GetNode().DeepCopy()) node.Uncordon() - node.RemoveTaint(SkyhookTaintUnschedulable) // if this doesn't change the node then don't patch if !node.Changed() { @@ -1191,26 +1189,34 @@ func (r *SkyhookReconciler) HasRunningPackages(ctx context.Context, skyhookNode return pods != nil && len(pods.Items) > 0, nil } -func (r *SkyhookReconciler) DrainNode(ctx context.Context, skyhookNode wrapper.SkyhookNode) error { - - // cordon node - skyhookNode.Cordon() - - // adding this taint should help preventing things coming back, including new packages - // NOTE: might cause issues if we boot off something that really had to be there... - skyhookNode.Taint(SkyhookTaintUnschedulable) +func (r *SkyhookReconciler) DrainNode(ctx context.Context, skyhookNode wrapper.SkyhookNode, _package *v1alpha1.Package) (bool, error) { + drained, err := r.IsDrained(ctx, skyhookNode) + if err != nil { + return false, err + } + if drained { + return true, nil + } pods, err := r.dal.GetPods(ctx, client.MatchingFields{ "spec.nodeName": skyhookNode.GetNode().Name, }) if err != nil { - return err + return false, err } if pods == nil || len(pods.Items) == 0 { - return nil + return true, nil } + r.recorder.Eventf(skyhookNode.GetNode(), EventTypeNormal, EventsReasonSkyhookInterrupt, + "draining node [%s] package [%s:%s] from [skyhook:%s]", + skyhookNode.GetNode().Name, + _package.Name, + _package.Version, + skyhookNode.GetSkyhook().Name, + ) + errs := make([]error, 0) for _, pod := range pods.Items { @@ -1223,13 +1229,20 @@ func (r *SkyhookReconciler) DrainNode(ctx context.Context, skyhookNode wrapper.S } } - return utilerrors.NewAggregate(errs) + return len(errs) == 0, utilerrors.NewAggregate(errs) } // Interrupt should not be called unless safe to do so, IE already cordoned and drained func (r *SkyhookReconciler) Interrupt(ctx context.Context, skyhookNode wrapper.SkyhookNode, _package *v1alpha1.Package, _interrupt *v1alpha1.Interrupt) error { - skyhookNode.Taint(SkyhookTaintUnschedulable) // adding this to prevent race conditions between SCR instances + hasPackagesRunning, err := r.HasRunningPackages(ctx, skyhookNode) + if err != nil { + return err + } + + if hasPackagesRunning { // keep waiting... + return nil + } exists, err := r.PodExists(ctx, skyhookNode.GetNode().Name, skyhookNode.GetSkyhook().Name, _package) if err != nil { @@ -1518,12 +1531,6 @@ func createInterruptPodForPackage(opts SkyhookOperatorOptions, _interrupt *v1alp Key: TaintUnschedulable, Operator: corev1.TolerationOpExists, }, - { - Key: SkyhookTaintUnschedulable, - Value: skyhook.Name, - Operator: corev1.TolerationOpEqual, - Effect: corev1.TaintEffectNoSchedule, - }, opts.GetRuntimeRequiredToleration(), }, skyhook.Spec.AdditionalTolerations...), Volumes: volumes, @@ -1966,19 +1973,8 @@ func (r *SkyhookReconciler) ProcessInterrupt(ctx context.Context, skyhookNode wr return true, nil } - // cordon node - skyhookNode.Cordon() - - hasWork, err := r.HasNonInterruptWork(ctx, skyhookNode) - if err != nil { - return false, err - } - - if hasWork { // has work, so we need to keep waiting to do - return false, nil - } - - stage := v1alpha1.StageApply // default starting stage + // default starting stage + stage := v1alpha1.StageApply nextStage := skyhookNode.NextStage(_package) if nextStage != nil { stage = *nextStage @@ -1994,54 +1990,27 @@ func (r *SkyhookReconciler) ProcessInterrupt(ctx context.Context, skyhookNode wr // so we need to check if the pod exists and if it does, we need to recreate it if status != nil && (status.State == v1alpha1.StateInProgress || status.State == v1alpha1.StateErroring) && status.Stage == v1alpha1.StageInterrupt { // call interrupt to recreate the pod if missing - err = r.Interrupt(ctx, skyhookNode, _package, interrupt) + err := r.Interrupt(ctx, skyhookNode, _package, interrupt) if err != nil { return false, err } } - if nextStage != nil && *nextStage == v1alpha1.StageInterrupt && runInterrupt { // time to do the interrupt - - hasWork, err := r.HasNonInterruptWork(ctx, skyhookNode) - if err != nil { - return false, err - } - if hasWork { // keep waiting... - return false, nil - } - - hasPackagesRunning, err := r.HasRunningPackages(ctx, skyhookNode) + // drain and cordon node before applying package that has an interrupt + if stage == v1alpha1.StageApply { + ready, err := r.EnsureNodeIsReadyForInterrupt(ctx, skyhookNode, _package) if err != nil { return false, err } - if hasPackagesRunning { // keep waiting... + if !ready { return false, nil } + } - drained, err := r.IsDrained(ctx, skyhookNode) - if err != nil { - return false, err - } - - // TODO: NOTE this could be a dead lock if IsDrained always returns true if it happens to not be correct - if !drained { - err := r.DrainNode(ctx, skyhookNode) - if err != nil { - return false, fmt.Errorf("error draining node [%s]: %w", skyhookNode.GetNode().Name, err) - } - - r.recorder.Eventf(skyhookNode.GetNode(), EventTypeNormal, EventsReasonSkyhookInterrupt, - "draining node [%s] package [%s:%s] from [skyhook:%s]", - skyhookNode.GetNode().Name, - _package.Name, - _package.Version, - skyhookNode.GetSkyhook().Name, - ) - return false, nil // need to wait for it to drain - } - - err = r.Interrupt(ctx, skyhookNode, _package, interrupt) + // time to interrupt (once other packages have finished) + if stage == v1alpha1.StageInterrupt && runInterrupt { + err := r.Interrupt(ctx, skyhookNode, _package, interrupt) if err != nil { return false, err } @@ -2050,7 +2019,7 @@ func (r *SkyhookReconciler) ProcessInterrupt(ctx context.Context, skyhookNode wr } //skipping - if nextStage != nil && *nextStage == v1alpha1.StageInterrupt && !runInterrupt { + if stage == v1alpha1.StageInterrupt && !runInterrupt { err := skyhookNode.Upsert(_package.PackageRef, _package.Image, v1alpha1.StateSkipped, stage, 0) if err != nil { return false, fmt.Errorf("error upserting to skip interrupt: %w", err) @@ -2066,6 +2035,26 @@ func (r *SkyhookReconciler) ProcessInterrupt(ctx context.Context, skyhookNode wr return true, nil } +func (r *SkyhookReconciler) EnsureNodeIsReadyForInterrupt(ctx context.Context, skyhookNode wrapper.SkyhookNode, _package *v1alpha1.Package) (bool, error) { + // cordon node + skyhookNode.Cordon() + + hasWork, err := r.HasNonInterruptWork(ctx, skyhookNode) + if err != nil { + return false, err + } + if hasWork { // keep waiting... + return false, nil + } + + ready, err := r.DrainNode(ctx, skyhookNode, _package) + if err != nil { + return false, fmt.Errorf("error draining node [%s]: %w", skyhookNode.GetNode().Name, err) + } + + return ready, nil +} + // ApplyPackage starts a pod on node for the package func (r *SkyhookReconciler) ApplyPackage(ctx context.Context, logger logr.Logger, clusterState *clusterState, skyhookNode wrapper.SkyhookNode, _package *v1alpha1.Package, runInterrupt bool) error { @@ -2082,12 +2071,8 @@ func (r *SkyhookReconciler) ApplyPackage(ctx context.Context, logger logr.Logger // which is why it needs to be here as well if packageStatus, found := skyhookNode.PackageStatus(_package.GetUniqueName()); found { switch packageStatus.Stage { - case v1alpha1.StageConfig: - stage = v1alpha1.StageConfig - case v1alpha1.StageUpgrade: - stage = v1alpha1.StageUpgrade - case v1alpha1.StageUninstall: - stage = v1alpha1.StageUninstall + case v1alpha1.StageConfig, v1alpha1.StageUpgrade, v1alpha1.StageUninstall: + stage = packageStatus.Stage } } @@ -2170,12 +2155,6 @@ func (r *SkyhookReconciler) ApplyPackage(ctx context.Context, logger logr.Logger skyhookNode.GetSkyhook().Updated = true - // skyhook_package_in_progress_count.WithLabelValues( - // skyhookNode.GetSkyhook().Name, - // _package.Name, - // _package.Version, - // ).Inc() - return err }