diff --git a/cmd/action/ci/entrypoint.go b/cmd/action/ci/entrypoint.go index e4292182a..64641873d 100644 --- a/cmd/action/ci/entrypoint.go +++ b/cmd/action/ci/entrypoint.go @@ -96,7 +96,7 @@ func entrypoint(log *logger.FunLogger) error { } if cfg.Spec.Kubernetes.Install { - err = utils.GetKubeConfig(log, &cfg, hostUrl, kubeconfig) + err = utils.GetKubeConfig(log, &cfg, hostUrl, kubeconfig, "") if err != nil { return fmt.Errorf("failed to get kubeconfig: %w", err) } diff --git a/cmd/cli/common/host.go b/cmd/cli/common/host.go index e4fe0aee5..3cdfdce72 100644 --- a/cmd/cli/common/host.go +++ b/cmd/cli/common/host.go @@ -27,6 +27,7 @@ import ( "github.com/NVIDIA/holodeck/internal/logger" "github.com/NVIDIA/holodeck/pkg/provider/aws" "github.com/NVIDIA/holodeck/pkg/sshutil" + "github.com/NVIDIA/holodeck/pkg/utils" ) // GetHostURL resolves the SSH-reachable host URL for an environment. @@ -82,6 +83,10 @@ const ( // ConnectSSH establishes an SSH connection with retries. // Host key verification uses Trust-On-First-Use (TOFU). 
func ConnectSSH(log *logger.FunLogger, keyPath, userName, hostUrl string) (*ssh.Client, error) { + keyPath, err := utils.ExpandPath(keyPath) + if err != nil { + return nil, fmt.Errorf("expanding key path: %w", err) + } key, err := os.ReadFile(keyPath) //nolint:gosec // keyPath is from trusted env config if err != nil { return nil, fmt.Errorf("failed to read key file %s: %w", keyPath, err) diff --git a/cmd/cli/create/create.go b/cmd/cli/create/create.go index dffa420a8..442f85c19 100644 --- a/cmd/cli/create/create.go +++ b/cmd/cli/create/create.go @@ -485,7 +485,7 @@ func runSingleNodeProvision(log *logger.FunLogger, opts *options) error { break } } - if err = utils.GetKubeConfig(log, &opts.cache, hostUrl, opts.kubeconfig); err != nil { + if err = utils.GetKubeConfig(log, &opts.cache, hostUrl, opts.kubeconfig, ""); err != nil { return fmt.Errorf("failed to get kubeconfig: %w", err) } } @@ -563,7 +563,7 @@ func runMultinodeProvision(log *logger.FunLogger, opts *options) error { } } if hostUrl != "" { - if err := utils.GetKubeConfig(log, &opts.cache, hostUrl, opts.kubeconfig); err != nil { + if err := utils.GetKubeConfig(log, &opts.cache, hostUrl, opts.kubeconfig, ""); err != nil { return fmt.Errorf("failed to get kubeconfig: %w", err) } } diff --git a/cmd/cli/delete/delete_test.go b/cmd/cli/delete/delete_test.go index 508c191d3..322b2d7ee 100644 --- a/cmd/cli/delete/delete_test.go +++ b/cmd/cli/delete/delete_test.go @@ -131,8 +131,8 @@ var _ = Describe("Delete Command", func() { It("should delete single SSH instance successfully", func() { // Create a valid SSH cache file - yaml := sshCacheYAML("sshdelete1", "ssh-delete-test") - cacheFile := filepath.Join(tempDir, "sshdelete1.yaml") + yaml := sshCacheYAML("a1b2c3d4", "ssh-delete-test") + cacheFile := filepath.Join(tempDir, "a1b2c3d4.yaml") err := os.WriteFile(cacheFile, []byte(yaml), 0600) Expect(err).NotTo(HaveOccurred()) @@ -141,7 +141,7 @@ var _ = Describe("Delete Command", func() { Commands: []*cli.Command{cmd}, 
} - err = app.Run([]string{"holodeck", "delete", "--cachepath", tempDir, "sshdelete1"}) + err = app.Run([]string{"holodeck", "delete", "--cachepath", tempDir, "a1b2c3d4"}) Expect(err).NotTo(HaveOccurred()) // Verify cache file was removed @@ -150,15 +150,15 @@ var _ = Describe("Delete Command", func() { // Verify success message Expect(buf.String()).To(ContainSubstring("Successfully deleted")) - Expect(buf.String()).To(ContainSubstring("sshdelete1")) + Expect(buf.String()).To(ContainSubstring("a1b2c3d4")) }) It("should delete multiple SSH instances successfully", func() { // Create two cache files - yaml1 := sshCacheYAML("sshmulti1", "ssh-multi-1") - yaml2 := sshCacheYAML("sshmulti2", "ssh-multi-2") - cacheFile1 := filepath.Join(tempDir, "sshmulti1.yaml") - cacheFile2 := filepath.Join(tempDir, "sshmulti2.yaml") + yaml1 := sshCacheYAML("e5f6a7b8", "ssh-multi-1") + yaml2 := sshCacheYAML("c9d0e1f2", "ssh-multi-2") + cacheFile1 := filepath.Join(tempDir, "e5f6a7b8.yaml") + cacheFile2 := filepath.Join(tempDir, "c9d0e1f2.yaml") err := os.WriteFile(cacheFile1, []byte(yaml1), 0600) Expect(err).NotTo(HaveOccurred()) err = os.WriteFile(cacheFile2, []byte(yaml2), 0600) @@ -169,7 +169,7 @@ var _ = Describe("Delete Command", func() { Commands: []*cli.Command{cmd}, } - err = app.Run([]string{"holodeck", "delete", "--cachepath", tempDir, "sshmulti1", "sshmulti2"}) + err = app.Run([]string{"holodeck", "delete", "--cachepath", tempDir, "e5f6a7b8", "c9d0e1f2"}) Expect(err).NotTo(HaveOccurred()) // Verify both cache files were removed @@ -179,14 +179,14 @@ var _ = Describe("Delete Command", func() { Expect(os.IsNotExist(err)).To(BeTrue()) // Verify success messages for both - Expect(buf.String()).To(ContainSubstring("sshmulti1")) - Expect(buf.String()).To(ContainSubstring("sshmulti2")) + Expect(buf.String()).To(ContainSubstring("e5f6a7b8")) + Expect(buf.String()).To(ContainSubstring("c9d0e1f2")) }) It("should stop on first error with multiple instances", func() { // Create only one 
valid cache file - yaml := sshCacheYAML("sshvalid1", "ssh-valid") - cacheFile := filepath.Join(tempDir, "sshvalid1.yaml") + yaml := sshCacheYAML("a3b4c5d6", "ssh-valid") + cacheFile := filepath.Join(tempDir, "a3b4c5d6.yaml") err := os.WriteFile(cacheFile, []byte(yaml), 0600) Expect(err).NotTo(HaveOccurred()) @@ -196,7 +196,7 @@ var _ = Describe("Delete Command", func() { } // First instance doesn't exist, should fail before processing second - err = app.Run([]string{"holodeck", "delete", "--cachepath", tempDir, "nonexistent", "sshvalid1"}) + err = app.Run([]string{"holodeck", "delete", "--cachepath", tempDir, "nonexistent", "a3b4c5d6"}) Expect(err).To(HaveOccurred()) Expect(err.Error()).To(ContainSubstring("failed to get instance nonexistent")) @@ -207,8 +207,8 @@ var _ = Describe("Delete Command", func() { It("should fail if second instance doesn't exist", func() { // Create only one valid cache file - yaml := sshCacheYAML("sshfirst1", "ssh-first") - cacheFile := filepath.Join(tempDir, "sshfirst1.yaml") + yaml := sshCacheYAML("e7f8a9b0", "ssh-first") + cacheFile := filepath.Join(tempDir, "e7f8a9b0.yaml") err := os.WriteFile(cacheFile, []byte(yaml), 0600) Expect(err).NotTo(HaveOccurred()) @@ -218,7 +218,7 @@ var _ = Describe("Delete Command", func() { } // First succeeds, second fails - err = app.Run([]string{"holodeck", "delete", "--cachepath", tempDir, "sshfirst1", "nonexistent"}) + err = app.Run([]string{"holodeck", "delete", "--cachepath", tempDir, "e7f8a9b0", "nonexistent"}) Expect(err).To(HaveOccurred()) Expect(err.Error()).To(ContainSubstring("failed to get instance nonexistent")) @@ -235,8 +235,8 @@ var _ = Describe("Delete Command", func() { DeferCleanup(os.RemoveAll, tempDir) // Create a cache file in custom path - yaml := sshCacheYAML("customdel", "custom-delete") - cacheFile := filepath.Join(tempDir, "customdel.yaml") + yaml := sshCacheYAML("f1e2d3c4", "custom-delete") + cacheFile := filepath.Join(tempDir, "f1e2d3c4.yaml") err = os.WriteFile(cacheFile, 
[]byte(yaml), 0600) Expect(err).NotTo(HaveOccurred()) @@ -246,7 +246,7 @@ var _ = Describe("Delete Command", func() { } // Use -c alias for cachepath - err = app.Run([]string{"holodeck", "delete", "-c", tempDir, "customdel"}) + err = app.Run([]string{"holodeck", "delete", "-c", tempDir, "f1e2d3c4"}) Expect(err).NotTo(HaveOccurred()) // Verify cache file was removed diff --git a/cmd/cli/dryrun/dryrun.go b/cmd/cli/dryrun/dryrun.go index 7bb147b77..f622ee086 100644 --- a/cmd/cli/dryrun/dryrun.go +++ b/cmd/cli/dryrun/dryrun.go @@ -27,6 +27,7 @@ import ( "github.com/NVIDIA/holodeck/pkg/provider/aws" "github.com/NVIDIA/holodeck/pkg/provisioner" "github.com/NVIDIA/holodeck/pkg/sshutil" + "github.com/NVIDIA/holodeck/pkg/utils" cli "github.com/urfave/cli/v2" "golang.org/x/crypto/ssh" @@ -131,6 +132,10 @@ func validateAWS(log *logger.FunLogger, opts *options) error { // createSshClient creates a ssh client, and retries if it fails to connect func connectOrDie(keyPath, userName, hostUrl string) error { var err error + keyPath, err = utils.ExpandPath(keyPath) + if err != nil { + return fmt.Errorf("expanding key path: %w", err) + } key, err := os.ReadFile(keyPath) // nolint:gosec if err != nil { return fmt.Errorf("failed to read key file: %w", err) diff --git a/cmd/cli/get/get.go b/cmd/cli/get/get.go index 4a80d157d..31b6f1f78 100644 --- a/cmd/cli/get/get.go +++ b/cmd/cli/get/get.go @@ -181,7 +181,7 @@ func (m command) runKubeconfig(instanceID string) error { } // Download kubeconfig - if err := utils.GetKubeConfig(m.log, &env, hostUrl, outputPath); err != nil { + if err := utils.GetKubeConfig(m.log, &env, hostUrl, outputPath, ""); err != nil { return fmt.Errorf("failed to download kubeconfig: %w", err) } diff --git a/cmd/cli/status/status_test.go b/cmd/cli/status/status_test.go index 33f517aac..48a5b4f04 100644 --- a/cmd/cli/status/status_test.go +++ b/cmd/cli/status/status_test.go @@ -130,14 +130,14 @@ var _ = Describe("Status Command", func() { 
Expect(err).NotTo(HaveOccurred()) DeferCleanup(os.RemoveAll, tempDir) - instanceID := "test12345678" + instanceID := "ab12cd34" cacheFile := filepath.Join(tempDir, instanceID+".yaml") validYAML := `apiVersion: holodeck.nvidia.com/v1alpha1 kind: Environment metadata: name: test-environment labels: - holodeck-instance-id: test12345678 + holodeck-instance-id: ab12cd34 spec: provider: ssh username: testuser diff --git a/docs/examples/README.md b/docs/examples/README.md index 5b621399e..6b000f29d 100644 --- a/docs/examples/README.md +++ b/docs/examples/README.md @@ -129,9 +129,12 @@ Install the NVIDIA driver using the official `.run` installer. **Files:** -- [`examples/ctk_package_pinned.yaml`](../../examples/ctk_package_pinned.yaml) — CTK pinned to a specific version -- [`examples/ctk_git_source.yaml`](../../examples/ctk_git_source.yaml) — CTK built from git -- [`examples/ctk_latest_source.yaml`](../../examples/ctk_latest_source.yaml) — CTK tracking latest branch +- [`ctk_package_pinned.yaml`](../../examples/ctk_package_pinned.yaml) + — CTK pinned to a specific version +- [`ctk_git_source.yaml`](../../examples/ctk_git_source.yaml) + — CTK built from git +- [`ctk_latest_source.yaml`](../../examples/ctk_latest_source.yaml) + — CTK tracking latest branch See the [CTK Installation Sources Guide](../guides/ctk-sources.md) for detailed configuration options. @@ -140,8 +143,10 @@ detailed configuration options. **Files:** -- [`examples/runtime_containerd_git.yaml`](../../examples/runtime_containerd_git.yaml) — Containerd built from git -- [`examples/runtime_containerd_latest.yaml`](../../examples/runtime_containerd_latest.yaml) — Containerd tracking latest +- [`runtime_containerd_git.yaml`](../../examples/runtime_containerd_git.yaml) + — Containerd built from git +- [`runtime_containerd_latest.yaml`](../../examples/runtime_containerd_latest.yaml) + — Containerd tracking latest See the [Container Runtime Sources Guide](../guides/runtime-sources.md) for all runtime options. 
diff --git a/docs/guides/custom-templates.md b/docs/guides/custom-templates.md index 3a511b218..f4d71caff 100644 --- a/docs/guides/custom-templates.md +++ b/docs/guides/custom-templates.md @@ -178,15 +178,15 @@ customTemplates: 1. **Use `pre-install` for system prerequisites** like package repos, kernel parameters, or certificates. -2. **Use `post-kubernetes` for workload deployment** since the +1. **Use `post-kubernetes` for workload deployment** since the cluster is ready at that point. -3. **Use `post-install` for validation scripts** that verify +1. **Use `post-install` for validation scripts** that verify the full stack. -4. **Set `continueOnError: true` for non-critical scripts** +1. **Set `continueOnError: true` for non-critical scripts** like monitoring or logging. -5. **Add checksums for URL sources** to ensure script integrity. -6. **Keep scripts idempotent** so re-runs produce the same result. -7. **Test with `holodeck dryrun`** to validate configuration +1. **Add checksums for URL sources** to ensure script integrity. +1. **Keep scripts idempotent** so re-runs produce the same result. +1. **Test with `holodeck dryrun`** to validate configuration before provisioning. ## Related diff --git a/pkg/provider/aws/cluster.go b/pkg/provider/aws/cluster.go index 718a91995..55f938331 100644 --- a/pkg/provider/aws/cluster.go +++ b/pkg/provider/aws/cluster.go @@ -323,6 +323,32 @@ func (p *Provider) createClusterSecurityGroup(cache *ClusterCache) error { }, } + // Self-referencing rules: allow all traffic between instances in this SG. + // Covers webhooks (dynamic ports), NodePort (30000-32767), IPIP (Calico), + // and any future K8s inter-node communication. + // Uses explicit TCP+UDP+ICMP (not protocol -1) for stricter compliance. 
+ sgRef := []types.UserIdGroupPair{{GroupId: sgOutput.GroupId}} + permissions = append(permissions, + types.IpPermission{ + FromPort: aws.Int32(0), + ToPort: aws.Int32(65535), + IpProtocol: aws.String("tcp"), + UserIdGroupPairs: sgRef, + }, + types.IpPermission{ + FromPort: aws.Int32(0), + ToPort: aws.Int32(65535), + IpProtocol: aws.String("udp"), + UserIdGroupPairs: sgRef, + }, + types.IpPermission{ + FromPort: aws.Int32(-1), + ToPort: aws.Int32(-1), + IpProtocol: aws.String("icmp"), + UserIdGroupPairs: sgRef, + }, + ) + irInput := &ec2.AuthorizeSecurityGroupIngressInput{ GroupId: sgOutput.GroupId, IpPermissions: permissions, diff --git a/pkg/provider/aws/create.go b/pkg/provider/aws/create.go index f0470565e..6add5dec3 100644 --- a/pkg/provider/aws/create.go +++ b/pkg/provider/aws/create.go @@ -404,6 +404,25 @@ func (p *Provider) createSecurityGroup(cache *AWS) error { IpProtocol: &tcp, IpRanges: ipRanges, }, + // Self-referencing: allow all TCP/UDP/ICMP between SG members + { + FromPort: aws.Int32(0), + ToPort: aws.Int32(65535), + IpProtocol: aws.String("tcp"), + UserIdGroupPairs: []types.UserIdGroupPair{{GroupId: sgOutput.GroupId}}, + }, + { + FromPort: aws.Int32(0), + ToPort: aws.Int32(65535), + IpProtocol: aws.String("udp"), + UserIdGroupPairs: []types.UserIdGroupPair{{GroupId: sgOutput.GroupId}}, + }, + { + FromPort: aws.Int32(-1), + ToPort: aws.Int32(-1), + IpProtocol: aws.String("icmp"), + UserIdGroupPairs: []types.UserIdGroupPair{{GroupId: sgOutput.GroupId}}, + }, }, } diff --git a/pkg/provider/aws/nlb.go b/pkg/provider/aws/nlb.go index 93c23856c..2507d3f76 100644 --- a/pkg/provider/aws/nlb.go +++ b/pkg/provider/aws/nlb.go @@ -45,7 +45,14 @@ func (p *Provider) createNLB(cache *ClusterCache) error { cancelLoading := p.log.Loading("Creating Network Load Balancer") lbType := elbv2types.LoadBalancerTypeEnumNetwork - lbName := fmt.Sprintf("%s-nlb", p.ObjectMeta.Name) + // AWS load balancer names are limited to 32 characters. 
+ const nlbSuffix = "-nlb" + maxNLBNameLen := 32 - len(nlbSuffix) + nlbBaseName := p.ObjectMeta.Name + if len(nlbBaseName) > maxNLBNameLen { + nlbBaseName = nlbBaseName[:maxNLBNameLen] + } + lbName := nlbBaseName + nlbSuffix // Determine subnet IDs (use the same subnet for NLB) subnetIDs := []string{cache.Subnetid} @@ -91,7 +98,15 @@ func (p *Provider) createTargetGroup(cache *ClusterCache) error { cancelLoading := p.log.Loading("Creating target group for Kubernetes API") - tgName := fmt.Sprintf("%s-k8s-api-tg", p.ObjectMeta.Name) + // AWS target group names are limited to 32 characters. + // Truncate the environment name to fit within the limit. + const tgSuffix = "-k8s-tg" + maxNameLen := 32 - len(tgSuffix) + name := p.ObjectMeta.Name + if len(name) > maxNameLen { + name = name[:maxNameLen] + } + tgName := name + tgSuffix // Create target group for Kubernetes API (port 6443) createTGInput := &elasticloadbalancingv2.CreateTargetGroupInput{ diff --git a/pkg/provisioner/cluster.go b/pkg/provisioner/cluster.go index 32ce74002..65805662f 100644 --- a/pkg/provisioner/cluster.go +++ b/pkg/provisioner/cluster.go @@ -149,13 +149,17 @@ func (cp *ClusterProvisioner) ProvisionCluster(nodes []NodeInfo) error { return nil } -// determineControlPlaneEndpoint returns the control plane endpoint +// determineControlPlaneEndpoint returns the control plane endpoint for cluster-internal +// communication (kubeadm init, join, API server binding). For HA with NLB, returns the +// NLB DNS. For non-HA, returns the first CP's private IP since all nodes are in the +// same VPC and the private IP is always routable. External access (kubeconfig) is +// handled separately by RewriteKubeConfigServer. 
func (cp *ClusterProvisioner) determineControlPlaneEndpoint(firstCP NodeInfo) string { // Check if HA is enabled and we have a load balancer DNS if cp.Environment.Status.Cluster != nil && cp.Environment.Status.Cluster.LoadBalancerDNS != "" { return cp.Environment.Status.Cluster.LoadBalancerDNS } - // Fall back to first control-plane private IP + // Use private IP for intra-VPC communication (init + join) return firstCP.PrivateIP } diff --git a/pkg/provisioner/cluster_test.go b/pkg/provisioner/cluster_test.go index 6d64cd517..fd3b23e13 100644 --- a/pkg/provisioner/cluster_test.go +++ b/pkg/provisioner/cluster_test.go @@ -310,6 +310,7 @@ func TestClusterProvisioner_determineControlPlaneEndpoint(t *testing.T) { Status: v1alpha1.EnvironmentStatus{}, }, firstCP: NodeInfo{ + PublicIP: "54.1.2.3", PrivateIP: "10.0.0.1", }, expected: "10.0.0.1", @@ -322,6 +323,7 @@ func TestClusterProvisioner_determineControlPlaneEndpoint(t *testing.T) { }, }, firstCP: NodeInfo{ + PublicIP: "54.1.2.4", PrivateIP: "10.0.0.2", }, expected: "10.0.0.2", diff --git a/pkg/provisioner/provisioner.go b/pkg/provisioner/provisioner.go index eaae789d4..a48d02ca0 100644 --- a/pkg/provisioner/provisioner.go +++ b/pkg/provisioner/provisioner.go @@ -451,6 +451,17 @@ func addScriptHeader(tpl *bytes.Buffer) error { func connectOrDie(keyPath, userName, hostUrl string) (*ssh.Client, error) { var client *ssh.Client var err error + if strings.HasPrefix(keyPath, "~") { + home, homeErr := os.UserHomeDir() + if homeErr != nil { + return nil, fmt.Errorf("expanding key path: %w", homeErr) + } + if keyPath == "~" { + keyPath = home + } else { + keyPath = filepath.Join(home, keyPath[2:]) + } + } key, err := os.ReadFile(keyPath) // nolint:gosec if err != nil { return nil, fmt.Errorf("failed to read key file: %w", err) diff --git a/pkg/provisioner/templates/common.go b/pkg/provisioner/templates/common.go index 328be124b..0afad2f27 100644 --- a/pkg/provisioner/templates/common.go +++ 
b/pkg/provisioner/templates/common.go @@ -312,6 +312,14 @@ holodeck_verify_crio() { holodeck_verify_toolkit() { command -v nvidia-ctk &>/dev/null || return 1 nvidia-ctk --version &>/dev/null || return 1 + # Verify the runtime binary exists (needed by container runtimes) + if ! command -v nvidia-container-runtime &>/dev/null; then + holodeck_log "WARN" "nvidia-container-toolkit" \ + "nvidia-container-runtime binary not found, creating symlink from nvidia-ctk" + local ctk_path + ctk_path=$(command -v nvidia-ctk) + sudo ln -sf "$ctk_path" /usr/bin/nvidia-container-runtime + fi return 0 } diff --git a/pkg/provisioner/templates/container-toolkit.go b/pkg/provisioner/templates/container-toolkit.go index 5329e4f55..42c925053 100644 --- a/pkg/provisioner/templates/container-toolkit.go +++ b/pkg/provisioner/templates/container-toolkit.go @@ -84,6 +84,11 @@ case "${HOLODECK_OS_FAMILY}" in if [[ ! -f /etc/yum.repos.d/nvidia-container-toolkit.repo ]]; then sudo curl -fsSL -o /etc/yum.repos.d/nvidia-container-toolkit.repo \ "https://nvidia.github.io/libnvidia-container/${CHANNEL}/rpm/nvidia-container-toolkit.repo" + # Disable repo metadata GPG check — upstream repomd.xml signature + # is intermittently broken. Individual RPM packages are still + # GPG-verified via gpgcheck=1. + sudo sed -i 's/^repo_gpgcheck=1/repo_gpgcheck=0/' \ + /etc/yum.repos.d/nvidia-container-toolkit.repo fi holodeck_retry 3 "$COMPONENT" pkg_update ;; @@ -306,6 +311,24 @@ else GHCR_DIGEST="source-build" fi +# Ensure nvidia-container-runtime exists (newer toolkit versions may not +# build it as a separate binary). Create a symlink so container runtimes +# can find it at the expected path. +if ! 
command -v nvidia-container-runtime &>/dev/null; then + holodeck_log "INFO" "$COMPONENT" "Creating nvidia-container-runtime symlink from nvidia-ctk" + CTK_PATH=$(command -v nvidia-ctk) + sudo ln -sf "$CTK_PATH" "$(dirname "$CTK_PATH")/nvidia-container-runtime" +fi +# nvidia-ctk runtime configure hardcodes /usr/bin/nvidia-container-runtime in +# the container runtime config. Ensure a binary or symlink exists there even +# when the actual binary was installed elsewhere (e.g. /usr/local/bin from +# a source build). +if [[ ! -f /usr/bin/nvidia-container-runtime ]]; then + RUNTIME_SRC=$(command -v nvidia-container-runtime 2>/dev/null || command -v nvidia-ctk) + holodeck_log "INFO" "$COMPONENT" "Symlinking ${RUNTIME_SRC} -> /usr/bin/nvidia-container-runtime" + sudo ln -sf "$RUNTIME_SRC" /usr/bin/nvidia-container-runtime +fi + holodeck_progress "$COMPONENT" 5 5 "Configuring runtime" sudo nvidia-ctk runtime configure \ @@ -510,6 +533,24 @@ else GHCR_DIGEST="source-build" fi +# Ensure nvidia-container-runtime exists (newer toolkit versions may not +# build it as a separate binary). Create a symlink so container runtimes +# can find it at the expected path. +if ! command -v nvidia-container-runtime &>/dev/null; then + holodeck_log "INFO" "$COMPONENT" "Creating nvidia-container-runtime symlink from nvidia-ctk" + CTK_PATH=$(command -v nvidia-ctk) + sudo ln -sf "$CTK_PATH" "$(dirname "$CTK_PATH")/nvidia-container-runtime" +fi +# nvidia-ctk runtime configure hardcodes /usr/bin/nvidia-container-runtime in +# the container runtime config. Ensure a binary or symlink exists there even +# when the actual binary was installed elsewhere (e.g. /usr/local/bin from +# a source build). +if [[ ! 
-f /usr/bin/nvidia-container-runtime ]]; then + RUNTIME_SRC=$(command -v nvidia-container-runtime 2>/dev/null || command -v nvidia-ctk) + holodeck_log "INFO" "$COMPONENT" "Symlinking ${RUNTIME_SRC} -> /usr/bin/nvidia-container-runtime" + sudo ln -sf "$RUNTIME_SRC" /usr/bin/nvidia-container-runtime +fi + holodeck_progress "$COMPONENT" 5 5 "Configuring runtime" sudo nvidia-ctk runtime configure \ diff --git a/pkg/provisioner/templates/kubeadm_cluster.go b/pkg/provisioner/templates/kubeadm_cluster.go index d4a77abb1..be0ed4ed0 100644 --- a/pkg/provisioner/templates/kubeadm_cluster.go +++ b/pkg/provisioner/templates/kubeadm_cluster.go @@ -138,10 +138,43 @@ holodeck_progress "$COMPONENT" 5 8 "Initializing Kubernetes cluster" # Initialize cluster if [[ ! -f /etc/kubernetes/admin.conf ]]; then + # Wait for control-plane endpoint to be resolvable (NLB DNS may take time) + if [[ "$CONTROL_PLANE_ENDPOINT" == *"elb.amazonaws.com"* ]] || \ + [[ "$CONTROL_PLANE_ENDPOINT" == *"amazonaws.com"* ]]; then + holodeck_log "INFO" "$COMPONENT" "Waiting for NLB DNS to resolve: ${CONTROL_PLANE_ENDPOINT}" + for i in {1..30}; do + if host "${CONTROL_PLANE_ENDPOINT}" &>/dev/null || \ + getent hosts "${CONTROL_PLANE_ENDPOINT}" &>/dev/null; then + holodeck_log "INFO" "$COMPONENT" "NLB DNS resolved successfully" + break + fi + if [[ $i -eq 30 ]]; then + holodeck_log "WARN" "$COMPONENT" "NLB DNS not yet resolved after 5 min, proceeding anyway" + fi + sleep 10 + done + fi + + # Detect this node's private IP for API server binding + NODE_PRIVATE_IP=$(hostname -I | awk '{print $1}') + + # Always use local IP for init health checks: kubeadm v1.33+ validates the API + # server via control-plane-endpoint, which may not be routable from within the + # instance during init (public IPs, NLB DNS, etc.). Use private IP for init and + # include the original endpoint in cert SANs so external access works. 
+ if [[ "$CONTROL_PLANE_ENDPOINT" != "$NODE_PRIVATE_IP" ]]; then + INIT_ENDPOINT="${NODE_PRIVATE_IP}" + holodeck_log "INFO" "$COMPONENT" "Using local IP ${NODE_PRIVATE_IP} for init (endpoint: ${CONTROL_PLANE_ENDPOINT} in cert SANs)" + else + INIT_ENDPOINT="${CONTROL_PLANE_ENDPOINT}" + fi + INIT_ARGS=( --kubernetes-version="${K8S_VERSION}" --pod-network-cidr=192.168.0.0/16 - --control-plane-endpoint="${CONTROL_PLANE_ENDPOINT}:6443" + --control-plane-endpoint="${INIT_ENDPOINT}:6443" + --apiserver-advertise-address="${NODE_PRIVATE_IP}" + --apiserver-cert-extra-sans="${CONTROL_PLANE_ENDPOINT},${NODE_PRIVATE_IP},${INIT_ENDPOINT}" --ignore-preflight-errors=all ) @@ -150,8 +183,22 @@ if [[ ! -f /etc/kubernetes/admin.conf ]]; then INIT_ARGS+=(--upload-certs) fi - holodeck_log "INFO" "$COMPONENT" "Running kubeadm init" + holodeck_log "INFO" "$COMPONENT" "Running kubeadm init with args: ${INIT_ARGS[*]}" holodeck_retry 3 "$COMPONENT" sudo kubeadm init "${INIT_ARGS[@]}" + + # For HA with NLB: after init succeeds, update the cluster config to use NLB DNS + # so that join tokens reference the NLB endpoint (reachable by other nodes). 
+ if [[ "$IS_HA" == "true" ]] && [[ "$INIT_ENDPOINT" != "$CONTROL_PLANE_ENDPOINT" ]]; then + holodeck_log "INFO" "$COMPONENT" "Updating cluster config to use NLB endpoint: ${CONTROL_PLANE_ENDPOINT}:6443" + # Update the kubeadm-config ConfigMap + sudo kubectl --kubeconfig=/etc/kubernetes/admin.conf -n kube-system get configmap kubeadm-config -o yaml | \ + sed "s|controlPlaneEndpoint: ${INIT_ENDPOINT}:6443|controlPlaneEndpoint: ${CONTROL_PLANE_ENDPOINT}:6443|g" | \ + sudo kubectl --kubeconfig=/etc/kubernetes/admin.conf apply -f - || \ + holodeck_log "WARN" "$COMPONENT" "Could not update kubeadm-config, join may need manual endpoint" + # Also update admin.conf kubeconfig to use the NLB + sudo sed -i "s|server: https://${INIT_ENDPOINT}:6443|server: https://${CONTROL_PLANE_ENDPOINT}:6443|g" \ + /etc/kubernetes/admin.conf + fi fi # Setup kubeconfig diff --git a/pkg/provisioner/templates/kubernetes.go b/pkg/provisioner/templates/kubernetes.go index 52d91992e..e1d7ba3f7 100644 --- a/pkg/provisioner/templates/kubernetes.go +++ b/pkg/provisioner/templates/kubernetes.go @@ -179,6 +179,18 @@ sudo systemctl enable --now kubelet holodeck_progress "$COMPONENT" 5 8 "Initializing Kubernetes cluster" +# Ensure CRI socket service is running before kubeadm init. +# When Docker is the runtime, CTK installation restarts dockerd between the +# Docker and kubeadm provisioning steps. cri-dockerd loses its Docker +# connection and crashes. With systemd StartLimitBurst=3 in 60s, it may +# not auto-recover by the time kubeadm runs. +{{- if eq .CriSocket "unix:///run/cri-dockerd.sock" }} +holodeck_log "INFO" "$COMPONENT" "Ensuring cri-dockerd is running" +sudo systemctl reset-failed cri-docker.service 2>/dev/null || true +sudo systemctl restart cri-docker.service +sleep 2 +{{- end }} + # Initialize cluster only if not already initialized if [[ ! -f /etc/kubernetes/admin.conf ]]; then # Pre-pull images before init. kubeadm init with --ignore-preflight-errors=all @@ -187,7 +199,8 @@ if [[ ! 
-f /etc/kubernetes/admin.conf ]]; then holodeck_log "INFO" "$COMPONENT" "Pre-pulling control plane images" {{- if .UseLegacyInit }} holodeck_retry 3 "$COMPONENT" sudo kubeadm config images pull \ - --kubernetes-version="${K8S_VERSION}" + --kubernetes-version="${K8S_VERSION}" \ + --cri-socket "{{ .CriSocket }}" {{- else }} holodeck_retry 3 "$COMPONENT" sudo kubeadm config images pull \ --config /etc/kubernetes/kubeadm-config.yaml @@ -203,12 +216,46 @@ if [[ ! -f /etc/kubernetes/admin.conf ]]; then set +e {{- if .UseLegacyInit }} + # Use private IP for init health checks (kubeadm v1.33+ checks via control-plane-endpoint). + # Public DNS may not be routable from within the instance during init. + # Include public endpoint in cert SANs so kubectl works externally. + KUBEADM_NODE_IP=$(hostname -I | awk '{print $1}') sudo kubeadm init \ --kubernetes-version="${K8S_VERSION}" \ + --cri-socket "{{ .CriSocket }}" \ --pod-network-cidr=192.168.0.0/16 \ - --control-plane-endpoint="${K8S_ENDPOINT_HOST}:6443" \ + --control-plane-endpoint="${KUBEADM_NODE_IP}:6443" \ + --apiserver-advertise-address="${KUBEADM_NODE_IP}" \ + --apiserver-cert-extra-sans="${K8S_ENDPOINT_HOST},${KUBEADM_NODE_IP},localhost" \ --ignore-preflight-errors=all {{- else }} + # Use private IP for init health checks (kubeadm v1.33+ checks via control-plane-endpoint). + # The config file has the public DNS as controlPlaneEndpoint, which may not be + # routable from within the instance during init. Replace with private IP and + # add cert SANs so kubectl works externally. + KUBEADM_NODE_IP=$(hostname -I | awk '{print $1}') + sudo sed -i "s|controlPlaneEndpoint: \"${K8S_ENDPOINT_HOST}:6443\"|controlPlaneEndpoint: \"${KUBEADM_NODE_IP}:6443\"|" \ + /etc/kubernetes/kubeadm-config.yaml + # Inject certSANs into ClusterConfiguration so the API server cert + # covers both the public endpoint and the private IP we use for init. + if ! 
grep -q 'certSANs' /etc/kubernetes/kubeadm-config.yaml; then + if grep -q '^apiServer:' /etc/kubernetes/kubeadm-config.yaml; then + # apiServer block exists (e.g. feature gates) — inject certSANs into it + sudo sed -i "/^apiServer:/a\\ + certSANs:\\ + - \"${K8S_ENDPOINT_HOST}\"\\ + - \"${KUBEADM_NODE_IP}\"\\ + - \"localhost\"" /etc/kubernetes/kubeadm-config.yaml + else + # No apiServer block — create one after controlPlaneEndpoint + sudo sed -i "/^controlPlaneEndpoint:/a\\ +apiServer:\\ + certSANs:\\ + - \"${K8S_ENDPOINT_HOST}\"\\ + - \"${KUBEADM_NODE_IP}\"\\ + - \"localhost\"" /etc/kubernetes/kubeadm-config.yaml + fi + fi sudo kubeadm init \ --config /etc/kubernetes/kubeadm-config.yaml \ --ignore-preflight-errors=all @@ -226,7 +273,7 @@ if [[ ! -f /etc/kubernetes/admin.conf ]]; then holodeck_log "INFO" "$COMPONENT" "--- kubelet logs (last 30 lines) ---" sudo journalctl -u kubelet --no-pager -n 30 2>&1 || true holodeck_log "INFO" "$COMPONENT" "--- container status via crictl ---" - sudo crictl --runtime-endpoint unix:///run/cri-dockerd.sock ps -a 2>&1 || true + sudo crictl --runtime-endpoint {{ .CriSocket }} ps -a 2>&1 || true holodeck_log "INFO" "$COMPONENT" "--- container status via docker ---" sudo docker ps -a 2>&1 || true holodeck_log "INFO" "$COMPONENT" "--- kubeadm-flags.env ---" @@ -235,7 +282,7 @@ if [[ ! 
-f /etc/kubernetes/admin.conf ]]; then if [[ $KUBEADM_ATTEMPT -lt $KUBEADM_MAX_ATTEMPTS ]]; then holodeck_log "INFO" "$COMPONENT" "Resetting cluster state before retry" - sudo kubeadm reset -f --cri-socket "unix:///run/cri-dockerd.sock" 2>&1 || true + sudo kubeadm reset -f --cri-socket "{{ .CriSocket }}" 2>&1 || true # Re-enable kubelet after reset sudo systemctl daemon-reload sudo systemctl restart kubelet diff --git a/pkg/provisioner/templates/kubernetes_test.go b/pkg/provisioner/templates/kubernetes_test.go index d421050d8..bea0ba7b6 100644 --- a/pkg/provisioner/templates/kubernetes_test.go +++ b/pkg/provisioner/templates/kubernetes_test.go @@ -266,7 +266,7 @@ func TestKubernetes_Execute(t *testing.T) { }, wantErr: false, checkTemplate: true, - expectedString: `--control-plane-endpoint="${K8S_ENDPOINT_HOST}:6443"`, + expectedString: `--control-plane-endpoint="${KUBEADM_NODE_IP}:6443"`, checkSafeExit: true, }, { diff --git a/pkg/utils/kubeconfig.go b/pkg/utils/kubeconfig.go index 6c15d402d..8c10decfc 100644 --- a/pkg/utils/kubeconfig.go +++ b/pkg/utils/kubeconfig.go @@ -21,13 +21,68 @@ import ( "io" "os" + "sigs.k8s.io/yaml" + "github.com/NVIDIA/holodeck/api/holodeck/v1alpha1" "github.com/NVIDIA/holodeck/internal/logger" "github.com/NVIDIA/holodeck/pkg/provisioner" ) +// kubeConfig is a minimal representation for server URL rewriting. +type kubeConfig struct { + APIVersion string `json:"apiVersion"` + Kind string `json:"kind"` + Clusters []kubeConfigClusterEntry `json:"clusters"` + Contexts []any `json:"contexts"` + CurrentContext string `json:"current-context"` + Users []any `json:"users"` +} + +type kubeConfigClusterEntry struct { + Name string `json:"name"` + Cluster kubeConfigCluster `json:"cluster"` +} + +type kubeConfigCluster struct { + Server string `json:"server"` + CertificateAuthorityData string `json:"certificate-authority-data,omitempty"` +} + +// RewriteKubeConfigServer rewrites the server URL in a kubeconfig file. 
+// If serverURL is empty, this is a no-op. +func RewriteKubeConfigServer(path string, serverURL string) error { + if serverURL == "" { + return nil + } + + data, err := os.ReadFile(path) //nolint:gosec // path is caller-provided kubeconfig + if err != nil { + return fmt.Errorf("reading kubeconfig: %w", err) + } + + var cfg kubeConfig + if err := yaml.Unmarshal(data, &cfg); err != nil { + return fmt.Errorf("parsing kubeconfig: %w", err) + } + + for i := range cfg.Clusters { + cfg.Clusters[i].Cluster.Server = serverURL + } + + out, err := yaml.Marshal(&cfg) + if err != nil { + return fmt.Errorf("marshaling kubeconfig: %w", err) + } + + if err := os.WriteFile(path, out, 0600); err != nil { + return fmt.Errorf("writing kubeconfig: %w", err) + } + + return nil +} + // GetKubeConfig downloads the kubeconfig file from the remote host -func GetKubeConfig(log *logger.FunLogger, cfg *v1alpha1.Environment, hostUrl string, dest string) error { +func GetKubeConfig(log *logger.FunLogger, cfg *v1alpha1.Environment, hostUrl string, dest string, desiredServerURL string) error { remoteFilePath := "${HOME}/.kube/config" // Create a new ssh session @@ -76,5 +131,12 @@ func GetKubeConfig(log *logger.FunLogger, cfg *v1alpha1.Environment, hostUrl str log.Info(fmt.Sprintf("Kubeconfig saved to %s\n", dest)) + if desiredServerURL != "" { + if err := RewriteKubeConfigServer(dest, desiredServerURL); err != nil { + return fmt.Errorf("failed to rewrite kubeconfig server URL: %w", err) + } + log.Info(fmt.Sprintf("Kubeconfig server URL rewritten to %s\n", desiredServerURL)) + } + return nil } diff --git a/pkg/utils/kubeconfig_test.go b/pkg/utils/kubeconfig_test.go new file mode 100644 index 000000000..acf093088 --- /dev/null +++ b/pkg/utils/kubeconfig_test.go @@ -0,0 +1,109 @@ +/* + * Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package utils + +import ( + "os" + "path/filepath" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestRewriteKubeConfigServer(t *testing.T) { + tests := []struct { + name string + input string + serverURL string + expected string + }{ + { + name: "rewrite private IP to public IP", + input: `apiVersion: v1 +clusters: +- cluster: + certificate-authority-data: dGVzdA== + server: https://10.0.0.1:6443 + name: kubernetes +contexts: +- context: + cluster: kubernetes + user: kubernetes-admin + name: kubernetes-admin@kubernetes +current-context: kubernetes-admin@kubernetes +kind: Config +users: +- name: kubernetes-admin + user: + client-certificate-data: dGVzdA== + client-key-data: dGVzdA== +`, + serverURL: "https://54.1.2.3:6443", + expected: "https://54.1.2.3:6443", + }, + { + name: "rewrite to NLB DNS", + input: `apiVersion: v1 +clusters: +- cluster: + server: https://10.0.0.1:6443 + name: kubernetes +contexts: +- context: + cluster: kubernetes + user: admin + name: admin@kubernetes +current-context: admin@kubernetes +kind: Config +users: +- name: admin + user: + client-certificate-data: dGVzdA== +`, + serverURL: "https://my-nlb.elb.amazonaws.com:6443", + expected: "https://my-nlb.elb.amazonaws.com:6443", + }, + { + name: "empty server URL is no-op", + input: "apiVersion: v1\nclusters:\n- cluster:\n server: https://10.0.0.1:6443\n name: kubernetes\nkind: Config\n", + serverURL: "", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + dir := t.TempDir() + path := 
filepath.Join(dir, "kubeconfig") + err := os.WriteFile(path, []byte(tt.input), 0600) + require.NoError(t, err) + + err = RewriteKubeConfigServer(path, tt.serverURL) + require.NoError(t, err) + + data, err := os.ReadFile(path) //nolint:gosec // test file from t.TempDir() + require.NoError(t, err) + + if tt.serverURL == "" { + assert.Contains(t, string(data), "https://10.0.0.1:6443") + } else { + assert.Contains(t, string(data), tt.expected) + assert.NotContains(t, string(data), "10.0.0.1") + } + }) + } +} diff --git a/pkg/utils/path.go b/pkg/utils/path.go new file mode 100644 index 000000000..e4e17294f --- /dev/null +++ b/pkg/utils/path.go @@ -0,0 +1,43 @@ +/* + * Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package utils + +import ( + "os" + "path/filepath" + "strings" +) + +// ExpandPath expands a leading ~ to the user's home directory. +// Non-tilde paths are returned unchanged. 
+func ExpandPath(path string) (string, error) {
+	if path != "~" && !strings.HasPrefix(path, "~/") {
+		return path, nil
+	}
+
+	home, err := os.UserHomeDir()
+	if err != nil {
+		return "", err
+	}
+
+	if path == "~" {
+		return home, nil
+	}
+
+	// ~/foo/bar → /home/user/foo/bar (a ~user prefix is not expanded)
+	return filepath.Join(home, path[2:]), nil
+}
diff --git a/pkg/utils/path_test.go b/pkg/utils/path_test.go
new file mode 100644
index 000000000..b35ebf689
--- /dev/null
+++ b/pkg/utils/path_test.go
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +package utils + +import ( + "os" + "path/filepath" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestExpandPath(t *testing.T) { + home, err := os.UserHomeDir() + require.NoError(t, err) + + tests := []struct { + name string + input string + expected string + }{ + { + name: "tilde only", + input: "~", + expected: home, + }, + { + name: "tilde with path", + input: "~/.ssh/id_rsa", + expected: filepath.Join(home, ".ssh/id_rsa"), + }, + { + name: "absolute path unchanged", + input: "/etc/holodeck/key.pem", + expected: "/etc/holodeck/key.pem", + }, + { + name: "relative path unchanged", + input: "keys/my.pem", + expected: "keys/my.pem", + }, + { + name: "tilde in middle unchanged", + input: "/home/user/~/.ssh/key", + expected: "/home/user/~/.ssh/key", + }, + { + name: "empty string", + input: "", + expected: "", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result, err := ExpandPath(tt.input) + require.NoError(t, err) + assert.Equal(t, tt.expected, result) + }) + } +} diff --git a/tests/data/test_rpm_al2023_containerd.yml b/tests/data/test_rpm_al2023_containerd.yml index 45ca634d3..6cec657df 100644 --- a/tests/data/test_rpm_al2023_containerd.yml +++ b/tests/data/test_rpm_al2023_containerd.yml @@ -24,3 +24,6 @@ spec: kubernetes: install: true installer: kubeadm + source: release + release: + version: v1.31.0 diff --git a/tests/data/test_rpm_al2023_crio.yml b/tests/data/test_rpm_al2023_crio.yml index f729db96a..7c266d59e 100644 --- a/tests/data/test_rpm_al2023_crio.yml +++ b/tests/data/test_rpm_al2023_crio.yml @@ -24,3 +24,6 @@ spec: kubernetes: install: true installer: kubeadm + source: release + release: + version: v1.31.0 diff --git a/tests/data/test_rpm_al2023_docker.yml b/tests/data/test_rpm_al2023_docker.yml index ca10c0c4c..00aad0a79 100644 --- a/tests/data/test_rpm_al2023_docker.yml +++ b/tests/data/test_rpm_al2023_docker.yml @@ -24,3 +24,6 @@ spec: kubernetes: 
install: true installer: kubeadm + source: release + release: + version: v1.31.0 diff --git a/tests/data/test_rpm_fedora42_containerd.yml b/tests/data/test_rpm_fedora42_containerd.yml index 9f9e9c3a5..f597f266f 100644 --- a/tests/data/test_rpm_fedora42_containerd.yml +++ b/tests/data/test_rpm_fedora42_containerd.yml @@ -24,3 +24,6 @@ spec: kubernetes: install: true installer: kubeadm + source: release + release: + version: v1.31.0 diff --git a/tests/data/test_rpm_fedora42_crio.yml b/tests/data/test_rpm_fedora42_crio.yml index c4fcbac3f..cc201410e 100644 --- a/tests/data/test_rpm_fedora42_crio.yml +++ b/tests/data/test_rpm_fedora42_crio.yml @@ -24,3 +24,6 @@ spec: kubernetes: install: true installer: kubeadm + source: release + release: + version: v1.31.0 diff --git a/tests/data/test_rpm_rocky9_containerd.yml b/tests/data/test_rpm_rocky9_containerd.yml index 2bf30f4c1..aa063cb1f 100644 --- a/tests/data/test_rpm_rocky9_containerd.yml +++ b/tests/data/test_rpm_rocky9_containerd.yml @@ -24,3 +24,6 @@ spec: kubernetes: install: true installer: kubeadm + source: release + release: + version: v1.31.0 diff --git a/tests/data/test_rpm_rocky9_crio.yml b/tests/data/test_rpm_rocky9_crio.yml index b3180cab1..39f04fb00 100644 --- a/tests/data/test_rpm_rocky9_crio.yml +++ b/tests/data/test_rpm_rocky9_crio.yml @@ -24,3 +24,6 @@ spec: kubernetes: install: true installer: kubeadm + source: release + release: + version: v1.31.0