From 840c746127e2ce9e06eb877ce1c8a9b9225cd3c2 Mon Sep 17 00:00:00 2001 From: Brian Lockwood Date: Fri, 8 Aug 2025 14:07:42 -0700 Subject: [PATCH 1/4] fix(ci): fix bug in publishing automation --- .gitlab-ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 774b1d36..ff59c263 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -249,7 +249,7 @@ create-chart-json: }, "artifacts": [ { - "name": "chart", + "name": "skyhook-operator", "version": "${CI_COMMIT_TAG#chart/}", "type": "chart" } From 291215e0ffe1140754662782cba11627c7ca6f24 Mon Sep 17 00:00:00 2001 From: Brian Lockwood Date: Mon, 18 Aug 2025 13:36:40 -0700 Subject: [PATCH 2/4] chore: bump helm version and go version --- .github/workflows/operator-ci.yaml | 2 +- operator/Makefile | 2 +- operator/deps.mk | 2 +- operator/go.mod | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/operator-ci.yaml b/.github/workflows/operator-ci.yaml index 70bfac8a..2964b93b 100644 --- a/.github/workflows/operator-ci.yaml +++ b/.github/workflows/operator-ci.yaml @@ -43,7 +43,7 @@ on: env: REGISTRY: ghcr.io IMAGE_NAME: ${{ github.repository }} - GO_VERSION: 1.24.5 + GO_VERSION: 1.24.6 PLATFORMS: linux/amd64,linux/arm64 jobs: diff --git a/operator/Makefile b/operator/Makefile index 4b53b0c0..635ae0ad 100644 --- a/operator/Makefile +++ b/operator/Makefile @@ -16,7 +16,7 @@ include deps.mk ## Version of the operator VERSION ?= $(GIT_TAG_LAST) -GO_VERSION ?= 1.24.5 +GO_VERSION ?= 1.24.6 # Image URL to use all building/pushing image ## TODO: update this to the correct image location diff --git a/operator/deps.mk b/operator/deps.mk index 65cbf40d..14f5d263 100644 --- a/operator/deps.mk +++ b/operator/deps.mk @@ -44,7 +44,7 @@ GOCOVER_VERSION ?= v1.3.0 GINKGO_VERSION ?= v2.22.2 MOCKERY_VERSION ?= v3.5.0 CHAINSAW_VERSION ?= v0.2.10 -HELM_VERSION ?= v3.15.0 +HELM_VERSION ?= v3.18.5 HELMIFY_VERSION ?= v0.4.12 GO_LICENSES_VERSION ?= v1.6.0 diff --git a/operator/go.mod b/operator/go.mod index a9af3390..27e1ddf0 100644 --- a/operator/go.mod +++ b/operator/go.mod @@ -2,7 +2,7 @@ module github.com/NVIDIA/skyhook/operator go 1.24.0 -toolchain go1.24.5 +toolchain go1.24.6 require ( github.com/go-logr/logr v1.4.2 From 91aa4adc76d28bc85fb7eff8bd548dee2b94ffeb Mon Sep 17 00:00:00 2001 From: Riley Rice Date: Mon, 25 Aug 2025 08:43:53 -0700 Subject: [PATCH 3/4] fix(agent): hotfix for interrupts in agent v6.3.0 (#86) --- .../src/skyhook_agent/controller.py | 16 ++++--- agent/skyhook-agent/tests/test_controller.py | 44 ++++++++++++++++--- containers/agent.Dockerfile | 4 +- 3 files changed, 51 insertions(+), 13 deletions(-) diff --git a/agent/skyhook-agent/src/skyhook_agent/controller.py b/agent/skyhook-agent/src/skyhook_agent/controller.py index 46d32f5f..b87914c9 100644 --- a/agent/skyhook-agent/src/skyhook_agent/controller.py +++ b/agent/skyhook-agent/src/skyhook_agent/controller.py @@ -442,10 +442,10 @@ def _make_interrupt_flag(interrupt_dir: str, interrupt_id: int) -> str: return f"{interrupt_dir}/{interrupt_id}.complete" SKYHOOK_RESOURCE_ID, _, _, _ = _get_env_config() - config_data = make_config_data_from_resource_id() interrupt = interrupts.inflate(interrupt_data) + # Check if the interrupt has already been run for this particular skyhook resource interrupt_dir = f"{get_skyhook_directory(root_mount)}/interrupts/flags/{SKYHOOK_RESOURCE_ID}" os.makedirs(interrupt_dir, exist_ok=True) @@ -469,6 +469,7 @@ def _make_interrupt_flag(interrupt_dir: str, interrupt_id: int) -> str: f.write(str(time.time())) return_code = _run( + root_mount, cmd, get_log_file(f"interrupts/{interrupt_id}", copy_dir, config_data, root_mount), write_cmds=True, @@ -476,10 +477,15 @@ def _make_interrupt_flag(interrupt_dir: str, interrupt_id: int) -> str: ) if return_code != 0: - print(f"INTERRUPT FAILED: {cmd} return_code: {return_code}") - # If this is not removed then we will skip all failing interrupts and it will look - # like the interrupt was successful when it was not. - os.remove(interrupt_flag) + # Special case: preserve flags only for reboot with a return code of -15 + # (SIGTERM signal sent to the process by OS because of reboot) + if not (interrupt.type == interrupts.NodeRestart._type() and return_code == -15): + print(f"INTERRUPT FAILED: {cmd} return_code: {return_code}") + + # If this is not removed then we will skip all failing interrupts and it will look + # like the interrupt was successful when it was not. + os.remove(interrupt_flag) + return True return False diff --git a/agent/skyhook-agent/tests/test_controller.py b/agent/skyhook-agent/tests/test_controller.py index 37bdcaf6..c49995cb 100644 --- a/agent/skyhook-agent/tests/test_controller.py +++ b/agent/skyhook-agent/tests/test_controller.py @@ -1068,8 +1068,8 @@ def test_interrupt_applies_all_commands(self, run_mock, datetime_mock): "package_version": "version" } run_mock.assert_has_calls([ - mock.call(["systemctl", "daemon-reload"], controller.get_log_file("interrupts/service_restart_0", copy_dir, config_data, root_dir), write_cmds=True, no_chmod=True), - mock.call(["systemctl", "restart", "containerd"], controller.get_log_file("interrupts/service_restart_1", copy_dir, config_data, root_dir), write_cmds=True, no_chmod=True) + mock.call(root_dir, ["systemctl", "daemon-reload"], controller.get_log_file("interrupts/service_restart_0", copy_dir, config_data, root_dir), write_cmds=True, no_chmod=True), + mock.call(root_dir, ["systemctl", "restart", "containerd"], controller.get_log_file("interrupts/service_restart_1", copy_dir, config_data, root_dir), write_cmds=True, no_chmod=True) ]) @mock.patch("skyhook_agent.controller._run") @@ -1090,7 +1090,7 @@ def test_interrupt_create_flags_per_cmd(self, run_mock): run_mock.return_value = 0 SKYHOOK_RESOURCE_ID="scr-id-1_package_version" with (self._setup_for_main() as (container_root_dir, config_data, root_dir, copy_dir), - set_env(SKYHOOK_RESOURCE_ID=SKYHOOK_RESOURCE_ID)): + set_env(SKYHOOK_RESOURCE_ID=SKYHOOK_RESOURCE_ID)): interrupt_dir = f"{controller.get_skyhook_directory(root_dir)}/interrupts/flags/{SKYHOOK_RESOURCE_ID}" interrupt = interrupts.ServiceRestart(["foo", "bar"]) controller.do_interrupt(interrupt.make_controller_input(), root_dir, copy_dir) @@ -1103,7 +1103,7 @@ def test_interrupt_failures_remove_flag(self, run_mock): run_mock.side_effect = [0,1,0] SKYHOOK_RESOURCE_ID="scr-id-1_package_version" with (self._setup_for_main() as (container_root_dir, config_data, root_dir, copy_dir), - set_env(SKYHOOK_RESOURCE_ID=SKYHOOK_RESOURCE_ID)): + set_env(SKYHOOK_RESOURCE_ID=SKYHOOK_RESOURCE_ID)): interrupt_dir = f"{controller.get_skyhook_directory(root_dir)}/interrupts/flags/{SKYHOOK_RESOURCE_ID}" interrupt = interrupts.ServiceRestart(["foo", "bar"]) controller.do_interrupt(interrupt.make_controller_input(), root_dir, copy_dir) @@ -1111,6 +1111,38 @@ def test_interrupt_failures_remove_flag(self, run_mock): self.assertTrue(os.path.exists(f"{interrupt_dir}/{interrupt._type()}_0.complete")) self.assertFalse(os.path.exists(f"{interrupt_dir}/{interrupt._type()}_1.complete")) self.assertFalse(os.path.exists(f"{interrupt_dir}/{interrupt._type()}_1.complete")) + + @mock.patch("skyhook_agent.controller._run") + def test_interrupt_reboot_SIGTERM_preserves_flag(self, run_mock): + run_mock.side_effect = [0, -15, 0] + SKYHOOK_RESOURCE_ID="scr-id-1_package_version" + with (self._setup_for_main() as (container_root_dir, config_data, root_dir, copy_dir), + set_env(SKYHOOK_RESOURCE_ID=SKYHOOK_RESOURCE_ID)): + interrupt_dir = f"{controller.get_skyhook_directory(root_dir)}/interrupts/flags/{SKYHOOK_RESOURCE_ID}" + interrupt = interrupts.NodeRestart() + controller.do_interrupt(interrupt.make_controller_input(), root_dir, copy_dir) + + self.assertTrue(os.path.exists(f"{interrupt_dir}/{interrupt._type()}_0.complete")) + + @mock.patch("skyhook_agent.controller._run") + def test_interrupt_calls_run_with_correct_parameters(self, run_mock): + run_mock.return_value = 0 + SKYHOOK_RESOURCE_ID = "scr-id-1_package_version" + + with (self._setup_for_main() as (container_root_dir, config_data, root_dir, copy_dir), + set_env(SKYHOOK_RESOURCE_ID=SKYHOOK_RESOURCE_ID)): + + interrupt = interrupts.ServiceRestart(["foo", "bar"]) + result = controller.do_interrupt(interrupt.make_controller_input(), root_dir, copy_dir) + + self.assertEqual(result, False) + expected_calls = [ + mock.call(root_dir, ["systemctl", "daemon-reload"], mock.ANY, write_cmds=True, no_chmod=True), + mock.call(root_dir, ["systemctl", "restart", "foo"], mock.ANY, write_cmds=True, no_chmod=True), + mock.call(root_dir, ["systemctl", "restart", "bar"], mock.ANY, write_cmds=True, no_chmod=True) + ] + run_mock.assert_has_calls(expected_calls) + self.assertEqual(run_mock.call_count, 3) @mock.patch("skyhook_agent.controller.datetime") @mock.patch("skyhook_agent.controller._run") @@ -1140,7 +1172,7 @@ def test_interrupt_failure_fails_controller(self, run_mock, datetime_mock): "package_version": "version" } run_mock.assert_has_calls([ - mock.call(["systemctl", "daemon-reload"], controller.get_log_file("interrupts/service_restart_0", "copy_dir", config_data, root_dir), write_cmds=True, no_chmod=True) + mock.call(root_dir, ["systemctl", "daemon-reload"], controller.get_log_file("interrupts/service_restart_0", "copy_dir", config_data, root_dir), write_cmds=True, no_chmod=True) ]) self.assertEqual(result, True) @@ -1173,7 +1205,7 @@ def test_interrupt_makes_config_from_skyhook_resource_id(self, run_mock, datetim "package_version": "version" } run_mock.assert_has_calls([ - mock.call(["systemctl", "daemon-reload"], controller.get_log_file("interrupts/service_restart_0", "copy_dir", config_data, root_dir), write_cmds=True, no_chmod=True) + mock.call(root_dir, ["systemctl", "daemon-reload"], controller.get_log_file("interrupts/service_restart_0", "copy_dir", config_data, root_dir), write_cmds=True, no_chmod=True) ]) def test_interrupt_noop_makes_the_flag_file(self): diff --git a/containers/agent.Dockerfile b/containers/agent.Dockerfile index f06eff89..419934c3 100644 --- a/containers/agent.Dockerfile +++ b/containers/agent.Dockerfile @@ -36,13 +36,13 @@ RUN make build build_version=${AGENT_VERSION} # Install the wheel in the builder stage RUN python3 -m venv venv && ./venv/bin/pip install /code/skyhook-agent/dist/skyhook_agent*.whl -FROM nvcr.io/nvidia/distroless/python:3.12-v3.4.13 +FROM nvcr.io/nvidia/distroless/python:3.12-v3.4.15 ARG AGENT_VERSION ARG GIT_SHA ## https://github.com/opencontainers/image-spec/blob/main/annotations.md -LABEL org.opencontainers.image.base.name="nvcr.io/nvidia/distroless/python:3.12-v3.4.13" \ +LABEL org.opencontainers.image.base.name="nvcr.io/nvidia/distroless/python:3.12-v3.4.15" \ org.opencontainers.image.licenses="Apache-2.0" \ org.opencontainers.image.title="skyhook-agent" \ org.opencontainers.image.version="${AGENT_VERSION}" \ From 0333bf45263ed60445eafdf3881c84d3d69e1c11 Mon Sep 17 00:00:00 2001 From: Brian Lockwood Date: Mon, 25 Aug 2025 12:59:44 -0700 Subject: [PATCH 4/4] chore: update agent version --- chart/values.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/chart/values.yaml b/chart/values.yaml index 59305113..08c88490 100644 --- a/chart/values.yaml +++ b/chart/values.yaml @@ -85,7 +85,7 @@ controllerManager: ## agentImage: is the image used for the agent container. This image is the default for this install, but can be overridden in the CR at package level. agent: repository: nvcr.io/nvidia/skyhook/agent - tag: "v6.3.0" + tag: "v6.3.1" # resources: If this is defined it will override the default calculation for resources # from estimatedNodeCount and estimatedPackageCount. The below values are