Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
The table of contents is too big for display.
Diff view
Diff view
  •  
  •  
  •  
37 changes: 22 additions & 15 deletions .github/workflows/operator-ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -43,49 +43,52 @@ on:
- k8s-tests/**
- chart/**

# NOTE: we may want to switch to matrix build for multi-platform support if this is taking too long
# https://docs.docker.com/build/ci/github-actions/multi-platform/#distribute-build-across-multiple-runners


# Defines the custom environment variables for the workflow: the Container registry domain, the name of the Docker image that this workflow builds, the Go version, and the target build platforms.
## these envs control the build and test process below
env:
REGISTRY: ghcr.io
IMAGE_NAME: ${{ github.repository }}
GO_VERSION: 1.23.8
GO_VERSION: 1.24.4
PLATFORMS: linux/amd64,linux/arm64

# The jobs in this workflow (`tests`, then `build-and-push-operator`) are configured to run on the latest available version of Ubuntu.
jobs:
# Test operator across supported Kubernetes versions
tests:
runs-on: ubuntu-latest
strategy:
matrix:
# Test on all supported K8s versions (matches docs/kubernetes-support.md)
k8s-version: ["1.30.13", "1.31.9", "1.32.5", "1.33.1"]
fail-fast: false # Continue testing other versions if one fails
steps:
- uses: actions/checkout@v4
with:
fetch-tags: true
fetch-depth: 0
- name: Setup Go 1.23
- name: Setup Go ${{ env.GO_VERSION }}
uses: actions/setup-go@v5
with:
go-version: 1.23.9
go-version: ${{ env.GO_VERSION }}
cache-dependency-path: operator/go.sum
- name: Log in to the Container registry
uses: docker/login-action@v3
with:
registry: ${{ env.REGISTRY }}
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}
- name: Kubernetes KinD Cluster
- name: Kubernetes KinD Cluster v${{ matrix.k8s-version }}
id: kind
uses: helm/kind-action@v1
with:
version: v0.26.0
kubernetes_version: v${{ matrix.k8s-version }}
install_only: true
# Cache build tools and dependencies for faster builds
- name: Restore cached Binaries
id: cached-binaries
uses: actions/cache/restore@v4
with:
key: ${{ runner.os }}-${{ runner.arch }}-bin-${{ hashFiles('operator/deps.mk') }}
restore-keys: ${{ runner.os }}-${{ runner.arch }}-bin-
key: ${{ env.GO_VERSION }}-${{ runner.os }}-${{ runner.arch }}-bin-${{ hashFiles('operator/deps.mk') }}
restore-keys: ${{ env.GO_VERSION }}-${{ runner.os }}-${{ runner.arch }}-bin-
path: |
${{ github.workspace }}/operator/bin
~/.cache/go-build
Expand All @@ -99,15 +102,18 @@ jobs:
if: steps.cached-binaries.outputs.cache-hit != 'true'
uses: actions/cache/save@v4
with:
key: ${{ runner.os }}-${{ runner.arch }}-bin-${{ hashFiles('operator/deps.mk') }}
key: ${{ env.GO_VERSION }}-${{ runner.os }}-${{ runner.arch }}-bin-${{ hashFiles('operator/deps.mk') }}
path: |
${{ github.workspace }}/operator/bin
~/.cache/go-build
# Run full test suite including e2e tests
- name: end-to-end-tests
run: |
cd operator
GITHUB_TOKEN=${{ secrets.github_token }} make create-kind-cluster
make test

# Build multi-platform container image and push to registry
build-and-push-operator:
runs-on: ubuntu-latest
needs: [tests] # Don't run the build and push if the k8s tests fail
Expand All @@ -130,13 +136,14 @@ jobs:
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}

# Setup for multi-platform
# Setup for multi-platform builds (linux/amd64, linux/arm64)
- name: Set up QEMU
uses: docker/setup-qemu-action@v3

- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3

# Build and tag container image based on git ref type
- name: Build the operator container image
id: build
env:
Expand Down Expand Up @@ -178,7 +185,7 @@ jobs:
echo "digest=$(cat metadata.json | jq -r .\"containerimage.digest\")" >> $GITHUB_OUTPUT
cat $GITHUB_OUTPUT

# This step generates an artifact attestation for the image, which is an unforgeable statement about where and how it was built. It increases supply chain security for people who consume the image. For more information, see [AUTOTITLE](/actions/security-guides/using-artifact-attestations-to-establish-provenance-for-builds).
# Generate supply chain security attestation
- name: Generate artifact attestation
uses: actions/attest-build-provenance@v2
with:
Expand Down
26 changes: 5 additions & 21 deletions chart/templates/skyhook-crd.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ kind: CustomResourceDefinition
metadata:
name: skyhooks.skyhook.nvidia.com
annotations:
controller-gen.kubebuilder.io/version: v0.15.0
controller-gen.kubebuilder.io/version: v0.18.0
labels:
{{- include "chart.labels" . | nindent 4 }}
spec:
Expand Down Expand Up @@ -262,9 +262,7 @@ spec:
This field is effectively required, but due to backwards compatibility is
allowed to be empty. Instances of this type with an empty value here are
almost certainly wrong.
TODO: Add other useful fields. apiVersion, kind, uid?
More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names
TODO: Drop `kubebuilder:default` when controller-gen doesn't need it https://github.com/kubernetes-sigs/kubebuilder/issues/3896.
type: string
optional:
description: Specify whether the ConfigMap or its
Expand Down Expand Up @@ -330,9 +328,7 @@ spec:
This field is effectively required, but due to backwards compatibility is
allowed to be empty. Instances of this type with an empty value here are
almost certainly wrong.
TODO: Add other useful fields. apiVersion, kind, uid?
More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names
TODO: Drop `kubebuilder:default` when controller-gen doesn't need it https://github.com/kubernetes-sigs/kubebuilder/issues/3896.
type: string
optional:
description: Specify whether the Secret or its key
Expand Down Expand Up @@ -500,16 +496,8 @@ spec:
Represents the observations of a skyhook's current state.
Known .status.conditions.type are: "Available", "Progressing", and "Degraded" // TODO
items:
description: "Condition contains details for one aspect of the current
state of this API Resource.\n---\nThis struct is intended for direct
use as an array at the field path .status.conditions. For example,\n\n\n\ttype
FooStatus struct{\n\t // Represents the observations of a foo's
current state.\n\t // Known .status.conditions.type are: \"Available\",
\"Progressing\", and \"Degraded\"\n\t // +patchMergeKey=type\n\t
\ // +patchStrategy=merge\n\t // +listType=map\n\t // +listMapKey=type\n\t
\ Conditions []metav1.Condition `json:\"conditions,omitempty\"
patchStrategy:\"merge\" patchMergeKey:\"type\" protobuf:\"bytes,1,rep,name=conditions\"`\n\n\n\t
\ // other fields\n\t}"
description: Condition contains details for one aspect of the current
state of this API Resource.
properties:
lastTransitionTime:
description: |-
Expand Down Expand Up @@ -550,12 +538,7 @@ spec:
- Unknown
type: string
type:
description: |-
type of condition in CamelCase or in foo.example.com/CamelCase.
---
Many .condition.type values are consistent across resources like Available, but because arbitrary conditions can be
useful (see .node.status.conditions), the ability to deconflict is important.
The regex it matches is (dns1123SubdomainFmt/)?(qualifiedNameFmt)
description: type of condition in CamelCase or in foo.example.com/CamelCase.
maxLength: 316
pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$
type: string
Expand Down Expand Up @@ -633,6 +616,7 @@ spec:
- image
- name
- stage
- state
- version
type: object
type: object
Expand Down
113 changes: 113 additions & 0 deletions docs/kubernetes-support.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
# Kubernetes Version Support

This document outlines Skyhook's approach to supporting different Kubernetes versions.

## Current Support Matrix

| Kubernetes Version | Skyhook Version | Status | Notes |
|--------------------|-----------------|---------|-------|
| 1.33, 1.32, 1.31 | v0.9.0+ | ✅ Fully Supported | Current stable versions |
| 1.30 | v0.8.x | ⚠️ Use older Skyhook | K8s 1.30 EOL: June 28, 2025 |
| 1.29 and older | v0.8.x or older | ⚠️ Use older Skyhook | No longer maintained |

## Support Policy

**Release Window Approach:** Each Skyhook release supports the Kubernetes versions that were actively maintained (non-EOL) at the time of that release.

### Our Strategy

- **Support all current non-EOL Kubernetes versions** (typically 3 versions)
- **Wait 4+ weeks** before adopting brand new Kubernetes versions (let them stabilize)
- **Older Skyhook versions** remain available for users on older Kubernetes clusters
- **Clear compatibility** - each release has a defined K8s support window

### What This Means

- **✅ Fully Supported:** We test and support these K8s versions in the current Skyhook release
- **⚠️ Use older Skyhook:** The current Skyhook release does not support your K8s version; use an older Skyhook release that does
- **❌ Not Supported:** Upgrade your Kubernetes cluster or use a much older Skyhook version

### When Versions Change

**For new Kubernetes releases:**
1. Wait **4+ weeks** after K8s release for ecosystem stability
2. Add to our CI testing matrix
3. Include in next Skyhook release

**For EOL Kubernetes versions:**
1. Stop including in new Skyhook releases
2. Existing Skyhook versions continue to work
3. Users should upgrade K8s and then upgrade Skyhook

## Upgrade Strategy

### Our Approach
- Update Kubernetes client libraries when we add support for new versions
- Test on all supported Kubernetes versions before each release
- Provide clear migration guidance when dropping version support

### For Users
We understand many installations run slightly older Kubernetes versions. Our strategy balances staying current with giving users time to upgrade:

- **6-week notice** before dropping support for a Kubernetes version
- **Clear documentation** about which Skyhook version to use for your Kubernetes version
- **Gradual transitions** rather than sudden jumps when possible

## Version Selection Guide

**Choose your Skyhook version based on your Kubernetes version:**

- **Kubernetes 1.33, 1.32, or 1.31:** Use latest Skyhook (v0.9.0+)
- **Kubernetes 1.30:** Use Skyhook v0.8.x (K8s 1.30 is EOL but v0.8.x still works)
- **Kubernetes 1.29 or older:** Use Skyhook v0.8.x or older (check release notes for compatibility)

### Migration Path

**If you're on an older Kubernetes version:**
1. **First:** Upgrade your Kubernetes cluster to a supported version (1.31, 1.32, or 1.33)
2. **Then:** Upgrade to the latest Skyhook version

**If you're on Kubernetes 1.30:**
- **Option A:** Upgrade to K8s 1.31/1.32/1.33, then use latest Skyhook
- **Option B:** Stay on Skyhook v0.8.x until you can upgrade Kubernetes

**Recommended:** If you can choose your Kubernetes version, use 1.33 or 1.32 for the longest support runway.

## FAQ

### Why don't you support EOL Kubernetes versions in new releases?

As a small project, we focus our efforts on actively maintained Kubernetes versions. This allows us to:
- Ensure better quality and security
- Adopt new Kubernetes features when they're stable
- Keep our testing matrix manageable
- Provide clearer upgrade paths

### What if I'm stuck on an older Kubernetes version?

**You can still use Skyhook!** Just use an older Skyhook version that was built for your K8s version:
- Older releases continue to work and don't disappear
- Check our release notes for which Skyhook version supports your K8s version
- Plan your Kubernetes upgrade timeline, then upgrade Skyhook afterward

### Why wait 4 weeks before supporting new Kubernetes versions?

We've learned that brand new Kubernetes versions often have:
- Ecosystem compatibility issues
- Updated client library dependencies
- Undiscovered bugs that get fixed in patch releases

Waiting 4+ weeks lets the ecosystem stabilize and gives us confidence in supporting the new version.

### How do you test compatibility?

For each Skyhook release, we test against all supported Kubernetes versions using:
- GitHub Actions matrix builds with multiple K8s versions (currently 1.31, 1.32, 1.33)
- Local testing with [kind](https://kind.sigs.k8s.io/)
- Basic functionality and integration tests

## Notes

This is a living document that will evolve as the project grows. Our current approach supports all actively maintained Kubernetes versions (typically 3 versions) while providing reasonable predictability for users.

For questions about Kubernetes support, please open an issue in our [GitHub repository](https://github.com/NVIDIA/skyhook).
2 changes: 1 addition & 1 deletion docs/metrics/prometheus_values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -36,4 +36,4 @@ extraScrapeConfigs: |

global:
scrape_interval: 5s
scrape_timeout: 5s
scrape_timeout: 5s
70 changes: 44 additions & 26 deletions operator/.golangci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,48 +18,66 @@
# LICENSE END
#

version: "2"
run:
deadline: 5m
allow-parallel-runners: true
modules-download-mode: vendor

issues:
# don't skip warning about doc comments
# don't exclude the default set of lint
exclude-use-default: false
# restore some of the defaults
# (fill in the rest as needed)
exclude-rules:
- path: "api/*"
linters:
- lll
- path: "internal/*"
linters:
- dupl
- lll
allow-parallel-runners: true
linters:
disable-all: true
default: none
enable:
- copyloopvar
- dupl
- errcheck
- copyloopvar
- goconst
- gocyclo
- gofmt
- goimports
- gosimple
- govet
- ineffassign
- lll
- misspell
- nakedret
- prealloc
- staticcheck
- typecheck
- unconvert
- unparam
- unused
settings:
lll:
line-length: 200
staticcheck:
# SAxxxx checks in https://staticcheck.dev/docs/configuration/options/#checks
# Example (to disable some checks): [ "all", "-SA1000", "-SA1001"]
# Default: ["all", "-ST1000", "-ST1003", "-ST1016", "-ST1020", "-ST1021", "-ST1022"]
checks:
- all
- -ST1000
- -ST1003
- -ST1016
- -ST1021
# Omit embedded fields from selector expression.
# https://staticcheck.dev/docs/checks/#QF1008
- -QF1008

linters-settings:
lll:
line-length: 200
exclusions:
generated: lax
rules:
- linters:
- lll
path: api/*
- linters:
- dupl
- lll
path: internal/*
paths:
- third_party$
- builtin$
- examples$
formatters:
enable:
- gofmt
- goimports
exclusions:
generated: lax
paths:
- third_party$
- builtin$
- examples$
Loading
Loading