diff --git a/.github/workflows/operator-container.yaml b/.github/workflows/operator-container.yaml
new file mode 100644
index 00000000..c88bd414
--- /dev/null
+++ b/.github/workflows/operator-container.yaml
@@ -0,0 +1,107 @@
+# Build when operator code changes
+name: Build and push operator container image
+
+on:
+  pull_request:
+    branches:
+      - main
+    paths:
+      - operator/**/*.go
+      - containers/operator.Dockerfile
+      - .github/workflows/operator-container.yaml
+  push:
+    branches:
+      - main
+    tags:
+      - operator/*
+    paths:
+      - operator/**/*.go
+      - containers/operator.Dockerfile
+      - .github/workflows/operator-container.yaml
+
+# NOTE: we may want to switch to matrix build for multi-platform support if this is taking too long
+# https://docs.docker.com/build/ci/github-actions/multi-platform/#distribute-build-across-multiple-runners
+
+
+# Defines three custom environment variables for the workflow: the Container registry domain, the name of the Docker image that this workflow builds, and the Go version passed to the image build as a build argument.
+env:
+  REGISTRY: ghcr.io
+  IMAGE_NAME: ${{ github.repository }}
+  GO_VERSION: "1.23.4"
+
+# There is a single job in this workflow. It's configured to run on the latest available version of Ubuntu.
+jobs:
+  build-and-push-operator:
+    runs-on: ubuntu-latest
+    # Sets the permissions granted to the `GITHUB_TOKEN` for the actions in this job.
+    permissions:
+      contents: read
+      packages: write
+      attestations: write
+      id-token: write
+    steps:
+      # Fetch the full history including tags: the build step derives the version with
+      # `git describe --tags`, which finds no tag in the default shallow (depth 1) checkout.
+      - name: Checkout repository
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+      # Uses the `docker/login-action` action to log in to the Container registry using the account and password that will publish the packages. Once published, the packages are scoped to the account defined here.
+      - name: Log in to the Container registry
+        uses: docker/login-action@v3
+        with:
+          registry: ${{ env.REGISTRY }}
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+
+      # Setup for multi-platform
+      - name: Set up QEMU
+        uses: docker/setup-qemu-action@v3
+
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+
+      - name: Build the operator container image
+        id: build
+        run: |
+          sudo apt-get update && sudo apt-get install -y make git jq
+          cd operator
+          # Every build is tagged with the commit sha and `latest`; tag builds additionally get the version tag.
+          TAGS="-t ${REGISTRY@L}/${{env.IMAGE_NAME}}/operator:${{ github.sha }} -t ${REGISTRY@L}/${{env.IMAGE_NAME}}/operator:latest"
+          case ${{ github.ref_type }} in
+            branch)
+              # The last tag + current git sha
+              export OPERATOR_VERSION=$(git describe --tags --abbrev=0 2>/dev/null || echo "0.0.0")+${{ github.sha }}
+              ;;
+            tag)
+              # The version part of the tag
+              export OPERATOR_VERSION=$(echo "${{ github.ref_name }}" | cut -f 2 -d /)
+              TAGS="$TAGS -t ${REGISTRY@L}/${{env.IMAGE_NAME}}/operator:${OPERATOR_VERSION}"
+              ;;
+            *)
+              echo "Unknown type ${{ github.ref_type }}"
+              exit 1
+              ;;
+          esac
+          set -x
+          docker buildx build \
+            --build-arg GIT_SHA=${{ github.sha }} \
+            --build-arg VERSION=${OPERATOR_VERSION} \
+            --build-arg GO_VERSION=${GO_VERSION} \
+            --push \
+            --platform linux/amd64 \
+            ${TAGS@L} \
+            --metadata-file=metadata.json \
+            -f ../containers/operator.Dockerfile .
+          cat metadata.json
+          echo "digest=$(jq -r '."containerimage.digest"' metadata.json)" >> "$GITHUB_OUTPUT"
+          cat "$GITHUB_OUTPUT"
+
+      # This step generates an artifact attestation for the image, which is an unforgeable statement about where and how it was built. It increases supply chain security for people who consume the image. For more information, see https://docs.github.com/en/actions/security-guides/using-artifact-attestations-to-establish-provenance-for-builds.
+      - name: Generate artifact attestation
+        uses: actions/attest-build-provenance@v2
+        with:
+          subject-name: ${{ env.REGISTRY }}/${{env.IMAGE_NAME}}/operator
+          subject-digest: ${{ steps.build.outputs.digest }}
+          push-to-registry: true
+          # NOTE(review): subject-name is not lowercased the way the pushed tags are (${TAGS@L}); confirm this works when the repository owner contains uppercase letters.
diff --git a/containers/operator.Dockerfile b/containers/operator.Dockerfile
index 54a87fb5..62ace625 100644
--- a/containers/operator.Dockerfile
+++ b/containers/operator.Dockerfile
@@ -13,7 +13,10 @@
 # limitations under the License.
 
 # Build the manager binary
-FROM gitlab-master.nvidia.com:5005/dgx/infra/skyhook-operator/ci:latest as builder
+# GO_VERSION selects the golang base image tag. The default keeps a plain `docker build` working; CI overrides it with --build-arg GO_VERSION (keep both values in sync).
+ARG GO_VERSION=1.23.4
+
+FROM golang:${GO_VERSION}-bookworm AS builder
 ARG TARGETOS
 ARG TARGETARCH
 
diff --git a/operator/releases.md b/docs/releases.md
similarity index 100%
rename from operator/releases.md
rename to docs/releases.md
diff --git a/operator/runtime_required.md b/operator/runtime_required.md
deleted file mode 100644
index c95bc894..00000000
--- a/operator/runtime_required.md
+++ /dev/null
@@ -1,30 +0,0 @@
-# What it is
-
-Runtime required is a special mode that packages can be run in. This mode is for when a set of Packages must complete before any other workloads are allowed to run on the node.
-
-# How to use it
-
-## Pre-requisites
-1. A node MUST join the cluster with a pre define taint
-1. That same taint must be set as the chart value `controllerManager.manager.env.runtimeRequiredTaint`
-    1. The default value for this taint is `skyhook.nvidia.com=runtime-required:NoSchedule`
-
-## Required Skyhooks
-
-Once the pre-requisites are satisfied any Skyhook Custom Resource (SCR) may be marked with `runtimeRequired: true`. This flag indicates that all packages within this SCR must complete
-before the nodes that it targets are considered available for general use.
-
-## What runtimeRequired: true will NOT do
-1. It will NOT add the taint to any nodes targeted by a SCR with `runtimeRequired: true`
-
-# Details
-## When is a node considered ready
-When all of the following conditions are true per node:
-1. All SCRs with `runtimeRequired: true` are complete
-
-## What happens happens when a node is considered ready
-1. The runtime required taint is removed from the node if it exists.
-
-
-# Why would you use runtime required
-This is useful when you want to gate other work behind the successful completion of some set of Skyhook Packages. This can be for security reasons or for scheduling.
\ No newline at end of file