From dff6acd717fce0337c84e7bb766bdf35a9019dfc Mon Sep 17 00:00:00 2001 From: Brian Lockwood Date: Fri, 14 Mar 2025 13:26:30 -0700 Subject: [PATCH 1/4] fix(ci): fix bad path filters --- .github/workflows/agent-ci.yaml | 10 ++-------- .github/workflows/operator-ci.yaml | 4 ++-- 2 files changed, 4 insertions(+), 10 deletions(-) diff --git a/.github/workflows/agent-ci.yaml b/.github/workflows/agent-ci.yaml index f855013a..5d1d19e9 100644 --- a/.github/workflows/agent-ci.yaml +++ b/.github/workflows/agent-ci.yaml @@ -18,12 +18,6 @@ # LICENSE END # - - - - - - name: Agent CI on: pull_request: @@ -32,7 +26,7 @@ on: paths: - agent/** - containers/agent.Dockerfile - - .github/workflows/agent-container.yaml + - .github/workflows/agent-ci.yaml push: branches: - main @@ -41,7 +35,7 @@ on: paths: - agent/** - containers/agent.Dockerfile - - .github/workflows/agent-container.yaml + - .github/workflows/agent-ci.yaml env: REGISTRY: ghcr.io IMAGE_NAME: ${{ github.repository }} diff --git a/.github/workflows/operator-ci.yaml b/.github/workflows/operator-ci.yaml index 9df60332..aa645b9b 100644 --- a/.github/workflows/operator-ci.yaml +++ b/.github/workflows/operator-ci.yaml @@ -28,7 +28,7 @@ on: paths: - operator/** - containers/operator.Dockerfile - - .github/workflows/operator-container.yaml + - .github/workflows/operator-ci.yaml push: branches: - main @@ -37,7 +37,7 @@ on: paths: - operator/**/*.go - containers/operator.Dockerfile - - .github/workflows/operator-container.yaml + - .github/workflows/operator-ci.yaml # NOTE: we may want to switch to matrix build for multi-platform support if this is taking too long # https://docs.docker.com/build/ci/github-actions/multi-platform/#distribute-build-across-multiple-runners From 00e01837688c5cf6b42d7b3a910dabc0afa6c4ac Mon Sep 17 00:00:00 2001 From: Brian Lockwood Date: Fri, 14 Mar 2025 13:38:43 -0700 Subject: [PATCH 2/4] fix(ci): fix invalid syntax from last pr --- .github/workflows/operator-ci.yaml | 4 +--- 1 file changed, 1 
insertion(+), 3 deletions(-) diff --git a/.github/workflows/operator-ci.yaml b/.github/workflows/operator-ci.yaml index aa645b9b..8a607a26 100644 --- a/.github/workflows/operator-ci.yaml +++ b/.github/workflows/operator-ci.yaml @@ -123,8 +123,6 @@ jobs: - name: Build the operator container image id: build - with: - platforms: ${{ env.PLATFORMS }} run: | apt-get update && apt-get install -y make git jq cd operator @@ -151,7 +149,7 @@ jobs: --build-arg VERSION=${OPERATOR_VERSION} \ --build-arg GO_VERSION=${GO_VERSION} \ --push \ - --platform ${{ env.platforms }} \ + --platform ${{ env.PLATFORMS }} \ ${TAGS@L} \ --metadata-file=metadata.json \ -f ../containers/operator.Dockerfile . From 16abe95336043876e6b649bc7c6ce5db0c66b069 Mon Sep 17 00:00:00 2001 From: Brian Lockwood Date: Fri, 14 Mar 2025 14:08:30 -0700 Subject: [PATCH 3/4] fix(ci): added a missing path filter for e2e test --- .github/workflows/operator-ci.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/operator-ci.yaml b/.github/workflows/operator-ci.yaml index 8a607a26..670c3e91 100644 --- a/.github/workflows/operator-ci.yaml +++ b/.github/workflows/operator-ci.yaml @@ -29,6 +29,7 @@ on: - operator/** - containers/operator.Dockerfile - .github/workflows/operator-ci.yaml + - k8s-tests/** push: branches: - main @@ -38,6 +39,7 @@ on: - operator/**/*.go - containers/operator.Dockerfile - .github/workflows/operator-ci.yaml + - k8s-tests/** # NOTE: we may want to switch to matrix build for multi-platform support if this is taking too long # https://docs.docker.com/build/ci/github-actions/multi-platform/#distribute-build-across-multiple-runners From a5a604c34e5941476df6db2c5277c14cd2b99d13 Mon Sep 17 00:00:00 2001 From: Brian Lockwood Date: Fri, 14 Mar 2025 14:10:08 -0700 Subject: [PATCH 4/4] docs: update readme with more details about what skyhook was built for --- README.md | 59 +++++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 42 insertions(+), 17 deletions(-) diff 
--git a/README.md b/README.md index fd6b2fec..7d11e939 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,47 @@ # skyhook -Skyhook was developed for modifying the underlying host OS in Kubernetes clusters. Think of it as a package manager like apt/yum for linux but for whole cluster management. The package manager (Skyhook Operator) manages the lifecycle (install/configure/uninstall/upgrade) of the packages (Skyhook Custom Resource, often SCR for short). It is Kubernetes aware, making cluster modifications easy. This enables Skyhook to schedule updates around important workloads and do rolling updates. It can be used in any cluster environment: self-managed clusters, on-prem clusters, cloud clusters, etc. +**Skyhook** is a Kubernetes-aware package manager for cluster administrators to safely modify and maintain the underlying hosts declaratively at scale. + +## Why Skyhook? + +Managing and updating Kubernetes clusters is challenging. While Kubernetes advocates treating compute as disposable, certain scenarios make this difficult: + +- **Updating hosts without re-imaging:** + - Limited excess hardware/capacity for rolling replacements + - Long node replacement times (for example, this can take hours with some cloud providers) +- **OS image management:** + - Maintain a common base image with workload-specific overlays instead of multiple OS images +- **Workload sensitivity:** + - Some workloads can't be moved, are difficult to move, or take a long time to migrate + +## What is Skyhook? + +Skyhook functions like a package manager but for your entire Kubernetes cluster, with three main components: + +1. **Skyhook Operator** - Manages installing, updating, and removing packages +2. **Skyhook Custom Resource (SCR)** - Declarative definitions of changes to apply +3. 
**Packages** - The actual modifications you want to implement + +## Where and When to use Skyhook + +Skyhook works in any Kubernetes environment (self-managed, on-prem, cloud) and shines when you need: + +- Kubernetes-aware scheduling that protects important workloads +- Rolling or simultaneous updates across your cluster +- Declarative configuration management for host-level changes + +## Benefits + - **Native Kubernetes integration** - Packages are standard Kubernetes resources compatible with GitOps tools like ArgoCD, Helm, and Flux + - **Autoscaling support** - Ensure newly created nodes are properly configured before they become schedulable + - **First-class upgrades** - Deploys changes with minimal disruption, waiting for running workloads to complete when needed + +## Key Features +- **Interruption Budget:** percent of nodes or count +- **Node Selectors:** selectors for which nodes to apply to (node labels) +- **Pod Non Interrupt Labels:** labels for pods to **never** interrupt +- **Package Interrupt:** service (containerd, cron, anything systemd), or reboot +- **Additional Tolerations:** tolerations added to the packages +- [**Runtime Required**](docs/runtime_required.md): requires a node to come into the cluster with a taint, and will do work prior to removing the custom taint. ## Pre-built Packages @@ -67,22 +108,6 @@ The Status will show the overall package status as well as the status of each no kubectl get nodes -o jsonpath='{range .items[?(@.metadata.labels.skyhook\.nvidia\.com/test-node=="demo")]}{.metadata.annotations.skyhook\.nvidia\.com/nodeState_demo}{"\n"}{end}' ``` -## Benefits - - The requested changes (the Packages) are native Kubernetes resources they can be combined and applied with common tools like ArgoCD, Helm, Flux etc. This means that all the tooling to manage applications can package customizations right alongside them to get applied, removed and upgraded as the applications themselves are. 
- - Autoscaling, with skyhook if you want to enable autoscaling on your cluster but need to modify all Nodes added to a cluster, you need something that is kubernetes aware. Skyhook as feature to make sure you nodes are ready before then enter the cluster. - - Upgrades are first class, with skyhook you can make deploy changes to your cluster and can wait for running workloads to finish before applying changes. - -## Key Features -- **interruptionBudget:** percent of nodes or count -- **nodeSelectors:** selectors for which nodes to apply too (node labels) -- **podNonInterruptLabels:** labels for pods to **never** interrupt -- **package interrupt:** service (containerd, cron, any thing systemd), or reboot -- **config interrupt:** service, or reboot when a certain key's value changes in the configmap -- **configMap:** per package -- **env vars:** per package -- **additionalTolerations:** are tolerations added to the packages -- [**runtimeRequired**](docs/runtime_required.md): requires node to come into the cluster with a taint, and will do work prior to removing custom taint. - ### Stages The operator will apply steps in a package throughout different lifecycle stages. This ensures that the right steps are applied in the right situations and in the correct order. - Upgrade: This stage will be ran whenever a package's version is upgraded in the SCR.