From 8993b0258ce00f29f5713e64e724310d57ca5eb8 Mon Sep 17 00:00:00 2001 From: Tommy Lam Date: Wed, 3 Sep 2025 12:26:00 -0700 Subject: [PATCH] feat: add package configuration to node config map --- .../assert-update-no-interrupt.yaml | 6 +- .../assert-update-while-running.yaml | 6 +- .../skyhook/config-skyhook/assert-update.yaml | 8 ++- .../skyhook/config-skyhook/assert.yaml | 8 ++- .../skyhook/delete-skyhook/assert.yaml | 6 +- .../skyhook/interrupt-grouping/assert.yaml | 5 +- .../chainsaw/skyhook/interrupt/assert-b.yaml | 5 +- .../skyhook/simple-skyhook/assert.yaml | 5 +- .../simple-update-skyhook/assert-update.yaml | 6 +- .../skyhook/simple-update-skyhook/assert.yaml | 6 +- .../internal/controller/skyhook_controller.go | 51 +++++++++++++- .../controller/skyhook_controller_test.go | 43 ++++++++++++ .../internal/controller/skyhook_metadata.go | 67 +++++++++++++++++++ operator/internal/controller/suite_test.go | 9 +++ 14 files changed, 218 insertions(+), 13 deletions(-) create mode 100644 operator/internal/controller/skyhook_metadata.go diff --git a/k8s-tests/chainsaw/skyhook/config-skyhook/assert-update-no-interrupt.yaml b/k8s-tests/chainsaw/skyhook/config-skyhook/assert-update-no-interrupt.yaml index 3e038bd2..03438f34 100644 --- a/k8s-tests/chainsaw/skyhook/config-skyhook/assert-update-no-interrupt.yaml +++ b/k8s-tests/chainsaw/skyhook/config-skyhook/assert-update-no-interrupt.yaml @@ -138,10 +138,14 @@ metadata: kind: Skyhook name: config-skyhook data: - (length(@)): 2 + (length(@)): 3 labels.json: (contains(@, 'skyhook.nvidia.com/test-node')): true (contains(@, 'skyhook.nvidia.com/status_config-skyhook')): true annotations.json: (contains(@, 'skyhook.nvidia.com/status_config-skyhook')): true (contains(@, 'skyhook.nvidia.com/nodeState_config-skyhook')): true + packages.json: + (contains(@, '"agentVersion"')): true + (contains(@, '"dexter"')): true + (contains(@, '"3.2.3"')): true diff --git a/k8s-tests/chainsaw/skyhook/config-skyhook/assert-update-while-running.yaml b/k8s-tests/chainsaw/skyhook/config-skyhook/assert-update-while-running.yaml index 6d1aa560..51518b6c 100644 --- a/k8s-tests/chainsaw/skyhook/config-skyhook/assert-update-while-running.yaml +++ b/k8s-tests/chainsaw/skyhook/config-skyhook/assert-update-while-running.yaml @@ -147,10 +147,14 @@ metadata: kind: Skyhook name: config-skyhook data: - (length(@)): 2 + (length(@)): 3 labels.json: (contains(@, 'skyhook.nvidia.com/test-node')): true (contains(@, 'skyhook.nvidia.com/status_config-skyhook')): true annotations.json: (contains(@, 'skyhook.nvidia.com/status_config-skyhook')): true (contains(@, 'skyhook.nvidia.com/nodeState_config-skyhook')): true + packages.json: + (contains(@, '"agentVersion"')): true + (contains(@, '"dexter"')): true + (contains(@, '"3.2.3"')): true \ No newline at end of file diff --git a/k8s-tests/chainsaw/skyhook/config-skyhook/assert-update.yaml b/k8s-tests/chainsaw/skyhook/config-skyhook/assert-update.yaml index c5352e16..370d55e3 100644 --- a/k8s-tests/chainsaw/skyhook/config-skyhook/assert-update.yaml +++ b/k8s-tests/chainsaw/skyhook/config-skyhook/assert-update.yaml @@ -144,10 +144,16 @@ metadata: kind: Skyhook name: config-skyhook data: - (length(@)): 2 + (length(@)): 3 labels.json: (contains(@, 'skyhook.nvidia.com/test-node')): true (contains(@, 'skyhook.nvidia.com/status_config-skyhook')): true annotations.json: (contains(@, 'skyhook.nvidia.com/status_config-skyhook')): true (contains(@, 'skyhook.nvidia.com/nodeState_config-skyhook')): true + packages.json: + (contains(@, '"agentVersion"')): true + (contains(@, '"dexter"')): true + (contains(@, '"baxter"')): true + (contains(@, '"spencer"')): true + (contains(@, '"3.2.3"')): true diff --git a/k8s-tests/chainsaw/skyhook/config-skyhook/assert.yaml b/k8s-tests/chainsaw/skyhook/config-skyhook/assert.yaml index 1f1fd759..dd7b1005 100644 --- a/k8s-tests/chainsaw/skyhook/config-skyhook/assert.yaml +++ b/k8s-tests/chainsaw/skyhook/config-skyhook/assert.yaml @@ -152,10 +152,16 @@ metadata: kind: Skyhook name: config-skyhook data: - (length(@)): 2 + (length(@)): 3 labels.json: (contains(@, 'skyhook.nvidia.com/test-node')): true (contains(@, 'skyhook.nvidia.com/status_config-skyhook')): true annotations.json: (contains(@, 'skyhook.nvidia.com/status_config-skyhook')): true (contains(@, 'skyhook.nvidia.com/nodeState_config-skyhook')): true + packages.json: + (contains(@, '"agentVersion"')): true + (contains(@, '"dexter"')): true + (contains(@, '"baxter"')): true + (contains(@, '"spencer"')): true + (contains(@, '"3.2.3"')): true diff --git a/k8s-tests/chainsaw/skyhook/delete-skyhook/assert.yaml b/k8s-tests/chainsaw/skyhook/delete-skyhook/assert.yaml index dd2b20e3..75811327 100644 --- a/k8s-tests/chainsaw/skyhook/delete-skyhook/assert.yaml +++ b/k8s-tests/chainsaw/skyhook/delete-skyhook/assert.yaml @@ -124,10 +124,14 @@ metadata: kind: Skyhook name: delete-skyhook data: - (length(@)): 2 + (length(@)): 3 labels.json: (contains(@, 'skyhook.nvidia.com/test-node')): true (contains(@, 'skyhook.nvidia.com/status_delete-skyhook')): true annotations.json: (contains(@, 'skyhook.nvidia.com/status_delete-skyhook')): true (contains(@, 'skyhook.nvidia.com/nodeState_delete-skyhook')): true + packages.json: + (contains(@, '"agentVersion"')): true + (contains(@, '"dexter"')): true + (contains(@, '"3.2.3"')): true diff --git a/k8s-tests/chainsaw/skyhook/interrupt-grouping/assert.yaml b/k8s-tests/chainsaw/skyhook/interrupt-grouping/assert.yaml index 371ec048..71365650 100644 --- a/k8s-tests/chainsaw/skyhook/interrupt-grouping/assert.yaml +++ b/k8s-tests/chainsaw/skyhook/interrupt-grouping/assert.yaml @@ -229,10 +229,13 @@ metadata: kind: Skyhook name: interrupt-grouping data: - (length(@)): 2 + (length(@)): 3 labels.json: (contains(@, 'skyhook.nvidia.com/test-node')): true (contains(@, 'skyhook.nvidia.com/status_interrupt-grouping')): true annotations.json: (contains(@, 'skyhook.nvidia.com/status_interrupt-grouping')): true (contains(@, 'skyhook.nvidia.com/nodeState_interrupt-grouping')): true + packages.json: + (contains(@, '"agentVersion"')): true + (contains(@, '"dax"')): true diff --git a/k8s-tests/chainsaw/skyhook/interrupt/assert-b.yaml b/k8s-tests/chainsaw/skyhook/interrupt/assert-b.yaml index e5d94d42..95711d58 100644 --- a/k8s-tests/chainsaw/skyhook/interrupt/assert-b.yaml +++ b/k8s-tests/chainsaw/skyhook/interrupt/assert-b.yaml @@ -260,10 +260,13 @@ metadata: kind: Skyhook name: interrupt data: - (length(@)): 2 + (length(@)): 3 labels.json: (contains(@, 'skyhook.nvidia.com/test-node')): true (contains(@, 'skyhook.nvidia.com/status_interrupt')): true annotations.json: (contains(@, 'skyhook.nvidia.com/status_interrupt')): true (contains(@, 'skyhook.nvidia.com/nodeState_interrupt')): true + packages.json: + (contains(@, '"agentVersion"')): true + (contains(@, '"dexter"')): true diff --git a/k8s-tests/chainsaw/skyhook/simple-skyhook/assert.yaml b/k8s-tests/chainsaw/skyhook/simple-skyhook/assert.yaml index f40ae922..1740cf02 100644 --- a/k8s-tests/chainsaw/skyhook/simple-skyhook/assert.yaml +++ b/k8s-tests/chainsaw/skyhook/simple-skyhook/assert.yaml @@ -124,10 +124,13 @@ metadata: kind: Skyhook name: simple-skyhook data: - (length(@)): 2 + (length(@)): 3 labels.json: (contains(@, 'skyhook.nvidia.com/test-node')): true (contains(@, 'skyhook.nvidia.com/status_simple-skyhook')): true annotations.json: (contains(@, 'skyhook.nvidia.com/status_simple-skyhook')): true (contains(@, 'skyhook.nvidia.com/nodeState_simple-skyhook')): true + packages.json: + (contains(@, '"agentVersion"')): true + (contains(@, '"dexter"')): true diff --git a/k8s-tests/chainsaw/skyhook/simple-update-skyhook/assert-update.yaml b/k8s-tests/chainsaw/skyhook/simple-update-skyhook/assert-update.yaml index aa215221..40447cd7 100644 --- a/k8s-tests/chainsaw/skyhook/simple-update-skyhook/assert-update.yaml +++ b/k8s-tests/chainsaw/skyhook/simple-update-skyhook/assert-update.yaml @@ -151,10 +151,14 @@ metadata: kind: Skyhook name: simple-update-skyhook data: - (length(@)): 2 + (length(@)): 3 labels.json: (contains(@, 'skyhook.nvidia.com/test-node')): true (contains(@, 'skyhook.nvidia.com/status_simple-update-skyhook')): true annotations.json: (contains(@, 'skyhook.nvidia.com/status_simple-update-skyhook')): true (contains(@, 'skyhook.nvidia.com/nodeState_simple-update-skyhook')): true + packages.json: + (contains(@, '"agentVersion"')): true + (contains(@, '"baxter"')): true + (contains(@, '"2.3.1-test"')): true diff --git a/k8s-tests/chainsaw/skyhook/simple-update-skyhook/assert.yaml b/k8s-tests/chainsaw/skyhook/simple-update-skyhook/assert.yaml index 20d4c685..99bcdabb 100644 --- a/k8s-tests/chainsaw/skyhook/simple-update-skyhook/assert.yaml +++ b/k8s-tests/chainsaw/skyhook/simple-update-skyhook/assert.yaml @@ -129,10 +129,14 @@ metadata: kind: Skyhook name: simple-update-skyhook data: - (length(@)): 2 + (length(@)): 3 labels.json: (contains(@, 'skyhook.nvidia.com/test-node')): true (contains(@, 'skyhook.nvidia.com/status_simple-update-skyhook')): true annotations.json: (contains(@, 'skyhook.nvidia.com/status_simple-update-skyhook')): true (contains(@, 'skyhook.nvidia.com/nodeState_simple-update-skyhook')): true + packages.json: + (contains(@, '"agentVersion"')): true + (contains(@, '"dexter"')): true + (contains(@, '"1.2.3"')): true diff --git a/operator/internal/controller/skyhook_controller.go b/operator/internal/controller/skyhook_controller.go index 9a3c1851..4a25e98c 100644 --- a/operator/internal/controller/skyhook_controller.go +++ b/operator/internal/controller/skyhook_controller.go @@ -135,6 +135,12 @@ func (o *SkyhookOperatorOptions) Validate() error { return nil } +// AgentVersion returns the image tag portion of AgentImage +func (o *SkyhookOperatorOptions) AgentVersion() string { + parts := strings.Split(o.AgentImage, ":") + return parts[len(parts)-1] +} + func (o *SkyhookOperatorOptions) GetRuntimeRequiredTaint() corev1.Taint { to_add, _, _ := taints.ParseTaints([]string{o.RuntimeRequiredTaint}) return to_add[0] @@ -619,9 +625,9 @@ func (r *SkyhookReconciler) SaveNodesAndSkyhook(ctx context.Context, clusterStat } saved = true - err = r.UpsertNodeLabelsAnnotations(ctx, skyhook.GetSkyhook(), node.GetNode()) + err = r.UpsertNodeLabelsAnnotationsPackages(ctx, skyhook.GetSkyhook(), node.GetNode()) if err != nil { - errs = append(errs, fmt.Errorf("error upserting labels and annotations config map for node [%s]: %w", node.GetNode().Name, err)) + errs = append(errs, fmt.Errorf("error upserting labels, annotations, and packages config map for node [%s]: %w", node.GetNode().Name, err)) } if node.IsComplete() { @@ -792,7 +798,7 @@ func generateSafeName(maxLen int, nameParts ...string) string { return strings.ToLower(fmt.Sprintf("%s-%s", name, uniqueStr)) } -func (r *SkyhookReconciler) UpsertNodeLabelsAnnotations(ctx context.Context, skyhook *wrapper.Skyhook, node *corev1.Node) error { +func (r *SkyhookReconciler) UpsertNodeLabelsAnnotationsPackages(ctx context.Context, skyhook *wrapper.Skyhook, node *corev1.Node) error { // No work to do if there is no labels or annotations for node if len(node.Labels) == 0 && len(node.Annotations) == 0 { return nil @@ -808,6 +814,13 @@ func (r *SkyhookReconciler) UpsertNodeLabelsAnnotations(ctx context.Context, sky return fmt.Errorf("error converting labels into byte array: %w", err) } + // marshal intermediary package metadata for the agent + metadata := NewSkyhookMetadata(r.opts, skyhook) + packages, err := metadata.Marshal() + if err != nil { + return fmt.Errorf("error converting packages into byte array: %w", err) + } + configMapName := generateSafeName(253, skyhook.Name, node.Name, "metadata") newCM := &corev1.ConfigMap{ ObjectMeta: metav1.ObjectMeta{ @@ -824,6 +837,7 @@ func (r *SkyhookReconciler) UpsertNodeLabelsAnnotations(ctx context.Context, sky Data: map[string]string{ "annotations.json": string(annotations), "labels.json": string(labels), + "packages.json": string(packages), }, } @@ -1401,6 +1415,37 @@ func (r *SkyhookReconciler) ValidateNodeConfigmaps(ctx context.Context, skyhookN } } + // Ensure packages.json is present and up-to-date for expected configmaps + skyhookCR, err := r.dal.GetSkyhook(ctx, skyhookName) + if err != nil { + return update, fmt.Errorf("error getting skyhook for metadata validation: %w", err) + } + skyhookWrapper := wrapper.NewSkyhookWrapper(skyhookCR) + metadata := NewSkyhookMetadata(r.opts, skyhookWrapper) + expectedBytes, err := metadata.Marshal() + if err != nil { + return update, fmt.Errorf("error marshalling metadata for validation: %w", err) + } + expected := string(expectedBytes) + + for i := range list.Items { + cm := &list.Items[i] + if _, ok := shouldExist[cm.Name]; !ok { + continue + } + if cm.Data == nil { + cm.Data = make(map[string]string) + } + if cm.Data["packages.json"] != expected { + cm.Data["packages.json"] = expected + if err := r.Update(ctx, cm); err != nil { + errs = append(errs, fmt.Errorf("error updating packages.json on config map [%s]: %w", cm.Name, err)) + } else { + update = true + } + } + } + return update, utilerrors.NewAggregate(errs) } diff --git a/operator/internal/controller/skyhook_controller_test.go b/operator/internal/controller/skyhook_controller_test.go index 43231378..edad43de 100644 --- a/operator/internal/controller/skyhook_controller_test.go +++ b/operator/internal/controller/skyhook_controller_test.go @@ -20,6 +20,7 @@ package controller import ( "context" + "encoding/json" "fmt" "testing" "time" @@ -1100,6 +1101,48 @@ var _ = Describe("skyhook controller tests", func() { Expect(result).To(MatchRegexp(`^[a-z0-9]([-a-z0-9]*[a-z0-9])?$`), "configmap name should match kubernetes naming requirements") } }) + + It("should create metadata configmap with packages.json including agentVersion and packages", func() { + // build minimal skyhook and node + skyhookCR := &v1alpha1.Skyhook{ + ObjectMeta: metav1.ObjectMeta{ + Name: "skyhook-meta", + UID: "uid-1234", + }, + Spec: v1alpha1.SkyhookSpec{ + Packages: v1alpha1.Packages{ + "pkg1": { + PackageRef: v1alpha1.PackageRef{Name: "pkg1", Version: "1.0.0"}, + Image: "ghcr.io/org/pkg1", + }, + }, + }, + } + sw := wrapper.NewSkyhookWrapper(skyhookCR) + + node := &corev1.Node{ObjectMeta: metav1.ObjectMeta{Name: "node-a", Labels: map[string]string{"a": "b"}}} + + // use initialized reconciler + r := operator + + // upsert configmap + Expect(r.UpsertNodeLabelsAnnotationsPackages(ctx, sw, node)).To(Succeed()) + + // fetch configmap + cmName := generateSafeName(253, sw.Name, node.Name, "metadata") + var cm corev1.ConfigMap + Expect(k8sClient.Get(ctx, types.NamespacedName{Name: cmName, Namespace: opts.Namespace}, &cm)).To(Succeed()) + + // validate packages.json exists and has expected agentVersion and packages + Expect(cm.Data).To(HaveKey("packages.json")) + var meta struct { + AgentVersion string `json:"agentVersion"` + Packages map[string]any `json:"packages"` + } + Expect(json.Unmarshal([]byte(cm.Data["packages.json"]), &meta)).To(Succeed()) + Expect(meta.AgentVersion).To(Equal(opts.AgentVersion())) + Expect(meta.Packages).To(HaveKey("pkg1")) + }) }) var _ = Describe("Resource Comparison", func() { diff --git a/operator/internal/controller/skyhook_metadata.go b/operator/internal/controller/skyhook_metadata.go new file mode 100644 index 00000000..1dba9ae1 --- /dev/null +++ b/operator/internal/controller/skyhook_metadata.go @@ -0,0 +1,67 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package controller + +import ( + "encoding/json" + + "github.com/NVIDIA/skyhook/operator/api/v1alpha1" + "github.com/NVIDIA/skyhook/operator/internal/wrapper" +) + +// PackageMetadata defines the intermediary contract for a single package that the agent can consume +type PackageMetadata struct { + Name string `json:"name"` + Version string `json:"version"` + Image string `json:"image"` + AgentImageOverride string `json:"agentImageOverride,omitempty"` + Interrupt *v1alpha1.Interrupt `json:"interrupt,omitempty"` + ConfigInterrupts map[string]v1alpha1.Interrupt `json:"configInterrupts,omitempty"` +} + +// SkyhookMetadata defines the node metadata contract exposed to the agent +type SkyhookMetadata struct { + AgentVersion string `json:"agentVersion"` + Packages map[string]PackageMetadata `json:"packages"` +} + +// NewSkyhookMetadata builds the intermediary SkyhookMetadata from the CR spec and operator options +func NewSkyhookMetadata(opts SkyhookOperatorOptions, s *wrapper.Skyhook) SkyhookMetadata { + packages := make(map[string]PackageMetadata) + for name, p := range s.Spec.Packages { + packages[name] = PackageMetadata{ + Name: p.Name, + Version: p.Version, + Image: p.Image, + AgentImageOverride: p.AgentImageOverride, + Interrupt: p.Interrupt, + ConfigInterrupts: p.ConfigInterrupts, + } + } + + return SkyhookMetadata{ + AgentVersion: opts.AgentVersion(), + Packages: packages, + } +} + +// Marshal returns the JSON encoding for inclusion in the node metadata configmap +func (m SkyhookMetadata) Marshal() ([]byte, error) { + return json.Marshal(m) +} diff --git a/operator/internal/controller/suite_test.go b/operator/internal/controller/suite_test.go index 6c248196..c9d126bc 100644 --- a/operator/internal/controller/suite_test.go +++ b/operator/internal/controller/suite_test.go @@ -39,6 +39,8 @@ import ( "sigs.k8s.io/controller-runtime/pkg/log/zap" corev1 "k8s.io/api/core/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" //+kubebuilder:scaffold:imports ) @@ -110,6 +112,13 @@ var _ = BeforeSuite(func() { Expect(err).ToNot(HaveOccurred()) Expect(opts.Validate()).ToNot(HaveOccurred()) + // Ensure the operator namespace exists for tests that create namespaced resources + ns := &corev1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: opts.Namespace}} + err = k8sClient.Create(ctx, ns) + if err != nil && !apierrors.IsAlreadyExists(err) { + Expect(err).ToNot(HaveOccurred()) + } + operator, err = NewSkyhookReconciler( k8sManager.GetScheme(), k8sManager.GetClient(),