Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/operator-ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ on:
env:
REGISTRY: ghcr.io
IMAGE_NAME: ${{ github.repository }}
GO_VERSION: 1.24.5
GO_VERSION: 1.24.6
PLATFORMS: linux/amd64,linux/arm64

jobs:
Expand Down
2 changes: 1 addition & 1 deletion .gitlab-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -249,7 +249,7 @@ create-chart-json:
},
"artifacts": [
{
"name": "chart",
"name": "skyhook-operator",
"version": "${CI_COMMIT_TAG#chart/}",
"type": "chart"
}
Expand Down
16 changes: 11 additions & 5 deletions agent/skyhook-agent/src/skyhook_agent/controller.py
Original file line number Diff line number Diff line change
Expand Up @@ -442,10 +442,10 @@ def _make_interrupt_flag(interrupt_dir: str, interrupt_id: int) -> str:
return f"{interrupt_dir}/{interrupt_id}.complete"

SKYHOOK_RESOURCE_ID, _, _, _ = _get_env_config()

config_data = make_config_data_from_resource_id()

interrupt = interrupts.inflate(interrupt_data)

# Check if the interrupt has already been run for this particular skyhook resource
interrupt_dir = f"{get_skyhook_directory(root_mount)}/interrupts/flags/{SKYHOOK_RESOURCE_ID}"
os.makedirs(interrupt_dir, exist_ok=True)
Expand All @@ -469,17 +469,23 @@ def _make_interrupt_flag(interrupt_dir: str, interrupt_id: int) -> str:
f.write(str(time.time()))

return_code = _run(
root_mount,
cmd,
get_log_file(f"interrupts/{interrupt_id}", copy_dir, config_data, root_mount),
write_cmds=True,
no_chmod=True
)

if return_code != 0:
print(f"INTERRUPT FAILED: {cmd} return_code: {return_code}")
# If this is not removed then we will skip all failing interrupts and it will look
# like the interrupt was successful when it was not.
os.remove(interrupt_flag)
# Special case: preserve flags only for reboot with a return code of -15
# (SIGTERM signal sent to the process by OS because of reboot)
if not (interrupt.type == interrupts.NodeRestart._type() and return_code == -15):
print(f"INTERRUPT FAILED: {cmd} return_code: {return_code}")

# If this is not removed then we will skip all failing interrupts and it will look
# like the interrupt was successful when it was not.
os.remove(interrupt_flag)

return True

return False
Expand Down
44 changes: 38 additions & 6 deletions agent/skyhook-agent/tests/test_controller.py
Original file line number Diff line number Diff line change
Expand Up @@ -1068,8 +1068,8 @@ def test_interrupt_applies_all_commands(self, run_mock, datetime_mock):
"package_version": "version"
}
run_mock.assert_has_calls([
mock.call(["systemctl", "daemon-reload"], controller.get_log_file("interrupts/service_restart_0", copy_dir, config_data, root_dir), write_cmds=True, no_chmod=True),
mock.call(["systemctl", "restart", "containerd"], controller.get_log_file("interrupts/service_restart_1", copy_dir, config_data, root_dir), write_cmds=True, no_chmod=True)
mock.call(root_dir, ["systemctl", "daemon-reload"], controller.get_log_file("interrupts/service_restart_0", copy_dir, config_data, root_dir), write_cmds=True, no_chmod=True),
mock.call(root_dir, ["systemctl", "restart", "containerd"], controller.get_log_file("interrupts/service_restart_1", copy_dir, config_data, root_dir), write_cmds=True, no_chmod=True)
])

@mock.patch("skyhook_agent.controller._run")
Expand All @@ -1090,7 +1090,7 @@ def test_interrupt_create_flags_per_cmd(self, run_mock):
run_mock.return_value = 0
SKYHOOK_RESOURCE_ID="scr-id-1_package_version"
with (self._setup_for_main() as (container_root_dir, config_data, root_dir, copy_dir),
set_env(SKYHOOK_RESOURCE_ID=SKYHOOK_RESOURCE_ID)):
set_env(SKYHOOK_RESOURCE_ID=SKYHOOK_RESOURCE_ID)):
interrupt_dir = f"{controller.get_skyhook_directory(root_dir)}/interrupts/flags/{SKYHOOK_RESOURCE_ID}"
interrupt = interrupts.ServiceRestart(["foo", "bar"])
controller.do_interrupt(interrupt.make_controller_input(), root_dir, copy_dir)
Expand All @@ -1103,14 +1103,46 @@ def test_interrupt_failures_remove_flag(self, run_mock):
run_mock.side_effect = [0,1,0]
SKYHOOK_RESOURCE_ID="scr-id-1_package_version"
with (self._setup_for_main() as (container_root_dir, config_data, root_dir, copy_dir),
set_env(SKYHOOK_RESOURCE_ID=SKYHOOK_RESOURCE_ID)):
set_env(SKYHOOK_RESOURCE_ID=SKYHOOK_RESOURCE_ID)):
interrupt_dir = f"{controller.get_skyhook_directory(root_dir)}/interrupts/flags/{SKYHOOK_RESOURCE_ID}"
interrupt = interrupts.ServiceRestart(["foo", "bar"])
controller.do_interrupt(interrupt.make_controller_input(), root_dir, copy_dir)

self.assertTrue(os.path.exists(f"{interrupt_dir}/{interrupt._type()}_0.complete"))
self.assertFalse(os.path.exists(f"{interrupt_dir}/{interrupt._type()}_1.complete"))
self.assertFalse(os.path.exists(f"{interrupt_dir}/{interrupt._type()}_1.complete"))

@mock.patch("skyhook_agent.controller._run")
def test_interrupt_reboot_SIGTERM_preserves_flag(self, run_mock):
run_mock.side_effect = [0, -15, 0]
SKYHOOK_RESOURCE_ID="scr-id-1_package_version"
with (self._setup_for_main() as (container_root_dir, config_data, root_dir, copy_dir),
set_env(SKYHOOK_RESOURCE_ID=SKYHOOK_RESOURCE_ID)):
interrupt_dir = f"{controller.get_skyhook_directory(root_dir)}/interrupts/flags/{SKYHOOK_RESOURCE_ID}"
interrupt = interrupts.NodeRestart()
controller.do_interrupt(interrupt.make_controller_input(), root_dir, copy_dir)

self.assertTrue(os.path.exists(f"{interrupt_dir}/{interrupt._type()}_0.complete"))

@mock.patch("skyhook_agent.controller._run")
def test_interrupt_calls_run_with_correct_parameters(self, run_mock):
run_mock.return_value = 0
SKYHOOK_RESOURCE_ID = "scr-id-1_package_version"

with (self._setup_for_main() as (container_root_dir, config_data, root_dir, copy_dir),
set_env(SKYHOOK_RESOURCE_ID=SKYHOOK_RESOURCE_ID)):

interrupt = interrupts.ServiceRestart(["foo", "bar"])
result = controller.do_interrupt(interrupt.make_controller_input(), root_dir, copy_dir)

self.assertEqual(result, False)
expected_calls = [
mock.call(root_dir, ["systemctl", "daemon-reload"], mock.ANY, write_cmds=True, no_chmod=True),
mock.call(root_dir, ["systemctl", "restart", "foo"], mock.ANY, write_cmds=True, no_chmod=True),
mock.call(root_dir, ["systemctl", "restart", "bar"], mock.ANY, write_cmds=True, no_chmod=True)
]
run_mock.assert_has_calls(expected_calls)
self.assertEqual(run_mock.call_count, 3)

@mock.patch("skyhook_agent.controller.datetime")
@mock.patch("skyhook_agent.controller._run")
Expand Down Expand Up @@ -1140,7 +1172,7 @@ def test_interrupt_failure_fails_controller(self, run_mock, datetime_mock):
"package_version": "version"
}
run_mock.assert_has_calls([
mock.call(["systemctl", "daemon-reload"], controller.get_log_file("interrupts/service_restart_0", "copy_dir", config_data, root_dir), write_cmds=True, no_chmod=True)
mock.call(root_dir, ["systemctl", "daemon-reload"], controller.get_log_file("interrupts/service_restart_0", "copy_dir", config_data, root_dir), write_cmds=True, no_chmod=True)
])

self.assertEqual(result, True)
Expand Down Expand Up @@ -1173,7 +1205,7 @@ def test_interrupt_makes_config_from_skyhook_resource_id(self, run_mock, datetim
"package_version": "version"
}
run_mock.assert_has_calls([
mock.call(["systemctl", "daemon-reload"], controller.get_log_file("interrupts/service_restart_0", "copy_dir", config_data, root_dir), write_cmds=True, no_chmod=True)
mock.call(root_dir, ["systemctl", "daemon-reload"], controller.get_log_file("interrupts/service_restart_0", "copy_dir", config_data, root_dir), write_cmds=True, no_chmod=True)
])

def test_interrupt_noop_makes_the_flag_file(self):
Expand Down
2 changes: 1 addition & 1 deletion chart/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ controllerManager:
## agentImage: is the image used for the agent container. This image is the default for this install, but can be overridden in the CR at package level.
agent:
repository: nvcr.io/nvidia/skyhook/agent
tag: "v6.3.0"
tag: "v6.3.1"

# resources: If this is defined it will override the default calculation for resources
# from estimatedNodeCount and estimatedPackageCount. The below values are
Expand Down
4 changes: 2 additions & 2 deletions containers/agent.Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -36,13 +36,13 @@ RUN make build build_version=${AGENT_VERSION}
# Install the wheel in the builder stage
RUN python3 -m venv venv && ./venv/bin/pip install /code/skyhook-agent/dist/skyhook_agent*.whl

FROM nvcr.io/nvidia/distroless/python:3.12-v3.4.13
FROM nvcr.io/nvidia/distroless/python:3.12-v3.4.15

ARG AGENT_VERSION
ARG GIT_SHA

## https://github.com/opencontainers/image-spec/blob/main/annotations.md
LABEL org.opencontainers.image.base.name="nvcr.io/nvidia/distroless/python:3.12-v3.4.13" \
LABEL org.opencontainers.image.base.name="nvcr.io/nvidia/distroless/python:3.12-v3.4.15" \
org.opencontainers.image.licenses="Apache-2.0" \
org.opencontainers.image.title="skyhook-agent" \
org.opencontainers.image.version="${AGENT_VERSION}" \
Expand Down
2 changes: 1 addition & 1 deletion operator/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ include deps.mk

## Version of the operator
VERSION ?= $(GIT_TAG_LAST)
GO_VERSION ?= 1.24.5
GO_VERSION ?= 1.24.6

# Image URL to use all building/pushing image
## TODO: update this to the correct image location
Expand Down
2 changes: 1 addition & 1 deletion operator/deps.mk
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ GOCOVER_VERSION ?= v1.3.0
GINKGO_VERSION ?= v2.22.2
MOCKERY_VERSION ?= v3.5.0
CHAINSAW_VERSION ?= v0.2.10
HELM_VERSION ?= v3.15.0
HELM_VERSION ?= v3.18.5
HELMIFY_VERSION ?= v0.4.12
GO_LICENSES_VERSION ?= v1.6.0

Expand Down
2 changes: 1 addition & 1 deletion operator/go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ module github.com/NVIDIA/skyhook/operator

go 1.24.0

toolchain go1.24.5
toolchain go1.24.6

require (
github.com/go-logr/logr v1.4.2
Expand Down
Loading