Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion .github/workflows/operator-ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ on:
env:
REGISTRY: ghcr.io
IMAGE_NAME: ${{ github.repository }}
GO_VERSION: 1.23.6
GO_VERSION: 1.23.7
PLATFORMS: linux/amd64,linux/arm64

# There is a single job in this workflow. It's configured to run on the latest available version of Ubuntu.
Expand Down Expand Up @@ -125,6 +125,8 @@ jobs:

- name: Build the operator container image
id: build
env:
platforms: ${{ env.PLATFORMS }}
run: |
apt-get update && apt-get install -y make git jq
cd operator
Expand Down
6 changes: 1 addition & 5 deletions .gitlab-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
default:
tags:
- baseos-infra
image: gitlab-master.nvidia.com:5005/dgx/infra/skyhook-operator/ci:latest
image: gitlab-master.nvidia.com:5005/dgx/infra/skyhook-operator/ci:latest ## TODO: this is no longer getting updated, we need to address this

variables:
KUBERNETES_CPU_LIMIT: "4"
Expand All @@ -35,10 +35,6 @@ workflow:
rules:
- if: $CI_COMMIT_TAG

include:
- project: dgx/infra/gitlint-ci
ref: main
file: gitlint.yml

## setup vault creds
bootstrap:
Expand Down
2 changes: 1 addition & 1 deletion chart/LICENSE
19 changes: 19 additions & 0 deletions containers/agent.Dockerfile
Original file line number Diff line number Diff line change
@@ -1,3 +1,22 @@
#
# LICENSE START
#
# Copyright (c) NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# LICENSE END
#
FROM python:3.12-alpine AS builder

ARG AGENT_VERSION
Expand Down
27 changes: 17 additions & 10 deletions containers/agentless/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,16 +1,23 @@
# Copyright (c) NVIDIA CORPORATION. All rights reserved.
#
# LICENSE START
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# Copyright (c) NVIDIA CORPORATION. All rights reserved.
#
# http://www.apache.org/licenses/LICENSE-2.0
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# LICENSE END
#


ARG BUSYBOX_TAG=1.36.1

Expand Down
12 changes: 0 additions & 12 deletions containers/agentless/entrypoint.sh
Original file line number Diff line number Diff line change
Expand Up @@ -21,18 +21,6 @@
#














SLEEP_LEN=${SLEEP_LEN:-$(($RANDOM % 5 + 5))}

echo "agentless ["$@"] sleep for ${SLEEP_LEN} and exit with ${EXIT_CODE}"
Expand Down
12 changes: 0 additions & 12 deletions containers/agentless/versions.sh
Original file line number Diff line number Diff line change
Expand Up @@ -21,18 +21,6 @@
#














## This is the list of versions that will be tagged for the test container so it can be
## used in the e2e tests.
##
Expand Down
26 changes: 16 additions & 10 deletions containers/ci.Dockerfile
Original file line number Diff line number Diff line change
@@ -1,16 +1,22 @@
# Copyright (c) NVIDIA CORPORATION. All rights reserved.
#
# LICENSE START
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# Copyright (c) NVIDIA CORPORATION. All rights reserved.
#
# http://www.apache.org/licenses/LICENSE-2.0
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# LICENSE END
#

ARG GO_VERSION

Expand Down
26 changes: 16 additions & 10 deletions containers/operator.Dockerfile
Original file line number Diff line number Diff line change
@@ -1,16 +1,22 @@
# Copyright (c) NVIDIA CORPORATION. All rights reserved.
#
# LICENSE START
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# Copyright (c) NVIDIA CORPORATION. All rights reserved.
#
# http://www.apache.org/licenses/LICENSE-2.0
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# LICENSE END
#

# Build the manager binary
ARG GO_VERSION
Expand Down
12 changes: 3 additions & 9 deletions k8s-tests/chainsaw/helm/helm-chart-test/chainsaw-test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -18,24 +18,18 @@
# LICENSE END
#







# yaml-language-server: $schema=https://raw.githubusercontent.com/kyverno/chainsaw/main/.schemas/json/test-chainsaw-v1alpha1.json
apiVersion: chainsaw.kyverno.io/v1alpha1
kind: Test
metadata:
name: helm-chart
spec:
description: This test asserts that the helm chart is working as expected. Specifcally it asserts that the helm chart works when given a different
description: This test asserts that the helm chart is working as expected. Specifically it asserts that the helm chart works when given a different
deployment name than skyhook-operator and that the tolerations that are given to the chart through a values file work as expected.
concurrent: false
timeouts:
assert: 120s
exec: 120s
assert: 240s
exec: 240s
steps:
- try:
- script:
Expand Down
10 changes: 2 additions & 8 deletions k8s-tests/chainsaw/helm/helm-scale-test/chainsaw-test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -18,12 +18,6 @@
# LICENSE END
#







# yaml-language-server: $schema=https://raw.githubusercontent.com/kyverno/chainsaw/main/.schemas/json/test-chainsaw-v1alpha1.json
apiVersion: chainsaw.kyverno.io/v1alpha1
kind: Test
Expand All @@ -34,8 +28,8 @@ spec:
deployment name than skyhook-operator and that the tolerations that are given to the chart through a values file work as expected.
concurrent: false
timeouts:
assert: 120s
exec: 120s
assert: 240s
exec: 240s
steps:
- try:
- script:
Expand Down
2 changes: 1 addition & 1 deletion operator/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -223,7 +223,7 @@ merge-coverage:
echo "mode: set" > $(REPORTING)/cover.out
## skip first line with +2
tail -n +2 $(REPORTING)/temp-cover.out | sed '/mode: set/d' >> $(REPORTING)/cover.out
$(sedrp) 's/^\/.*(\/skyhook-operator.*)/gitlab-master\.nvidia\.com\/dgx\/infra\1/g' $(REPORTING)/cover.out
$(sedrp) 's|^/.*skyhook/operator/(.*)$$|github\.com/NVIDIA/skyhook/\1|g' $(REPORTING)/cover.out

.PHONY: lint
lint: golangci-lint license-check ## Run golangci-lint linter & yamllint
Expand Down
82 changes: 56 additions & 26 deletions operator/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,45 +27,70 @@ spec:
interruptionBudget:
percent: 33
packages:
nvssh:
version: 2024.05.10
image: nvcr.io/nvidian/swgpu-baseos/nvssh:2024.05.10
configInterrupts:
nvssh_vars.sh:
type: service
services: [cron]
something_important:
version: 1.0.0
image: ghcr.io/nvidia/skyhook-packages/shellscript
depends_on:
tuning: 1.0.0
configMap:
nvssh_vars.sh: |-
apply.sh: |-
#!/bin/bash
echo "hello world" > /skyhook-hello-world
sleep 60
apply_check.sh: |-
#!/bin/bash
cat /skyhook-hello-world
sleep 30
config.sh: |-
#!/bin/bash
nvssh_allowed_roles=access-azure-nv-ngc-prod-dgxc-admin
nvssh_allowed_sudo_roles=access-azure-nv-ngc-prod-dgxc-admin
echo $0
bcp:
version: 2024.05.13
image: nvcr.io/nvidian/swgpu-baseos/bcp:2024.05.13
env:
- name: CSP
value: azure
interrupt:
type: service
services: [containerd]
echo "a config is run" >> /skyhook-hello-world
sleep 60
config_check.sh: |-
#!/bin/bash
grep "config" /skyhook-hello-world
sleep 30
tuning:
version: 1.0.0
image: ghcr.io/nvidia/skyhook-packages/tuning
interrupt:
type: reboot
configInterrupts:
grub.conf:
type: reboot
sysctl.conf:
type: restart_all_services
configMap:
grub.conf: |-
hugepagesz=1G
hugepages=2
hugepagesz=2M
hugepages=5128
sysctl.conf: |-
fs.inotify.max_user_instances=65535
fs.inotify.max_user_watches=524288
kernel.threads-max=16512444
vm.max_map_count=262144
vm.min_free_kbytes=65536
ulimit.conf: |-
memlock: 128
fsize: 1000
```

Packages can depend on each other, so if you needed bcp to be installed before nvssh you can define that like this:
Packages can depend on each other. For example, if you need `something_important` to be installed after `tuning`, you can define that dependency like this:

```yaml
nvssh:
something_important:
...
dependsOn:
bcp: "3.0"
bcp:
tuning: "1.0.0"
tuning:
...
```

## Development

### Prerequisites
- go version v1.23.4+
- go version v1.23.7+
- docker version 17.03+ or podman 4.9.4+ (the project Makefile largely assumes podman)
- kubectl version v1.27.3+.
- Access to a Kubernetes v1.27+ cluster. (We test on 1.27; older versions should work if needed, but are not tested.)
Expand Down Expand Up @@ -130,6 +155,7 @@ Development
generate-mocks Generate code for interface mocking
license-report Run run license report
license-check Run go-licenses check against code.
license-fmt Run add license header to code.
fmt Run go fmt against code.
vet Run go vet against code.
test Run all tests.
Expand All @@ -149,7 +175,12 @@ Build
docker-build Build docker image with the manager.

Deployment
create-namespace Create the namespace in the K8s cluster specified in ~/.kube/config.
install Install CRDs into the K8s cluster specified in ~/.kube/config.
install-cert-manager Install cert-manager into the K8s cluster specified in ~/.kube/config.
install-helm-chart Install helm chart into the K8s cluster specified in ~/.kube/config.
uninstall-helm-chart Uninstall helm chart from the K8s cluster specified in ~/.kube/config.
uninstall-cert-manager Uninstall cert-manager from the K8s cluster specified in ~/.kube/config.
uninstall Uninstall CRDs from the K8s cluster specified in ~/.kube/config. Call with ignore-not-found=true to ignore resource not found errors during deletion.
deploy Deploy controller to the K8s cluster specified in ~/.kube/config.
generate-helm Generates new helm chart using helmify. Be-careful, this can break things, it overwrites files, make sure to look at you git diff.
Expand All @@ -167,7 +198,6 @@ Build Dependencies
chainsaw Download chainsaw locally if necessary.
helm Download helm locally if necessary.
helmify Download helmify locally if necessary.
go-license Download go-license locally if necessary.
go-licenses Download go-licenses locally if necessary.
```

Expand Down
3 changes: 0 additions & 3 deletions operator/api/v1alpha1/groupversion_info.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,6 @@
* LICENSE END
*/




// Package v1alpha1 contains API Schema definitions for the skyhook v1alpha1 API group
// +kubebuilder:object:generate=true
// +groupName=skyhook.nvidia.com
Expand Down
Loading
Loading