From 7f8f06f25a2f224d49ca5838311b4b9fba9bff5b Mon Sep 17 00:00:00 2001 From: Carlos Eduardo Arango Gutierrez Date: Mon, 9 Mar 2026 14:53:05 +0100 Subject: [PATCH 01/19] fix: use local IP for kubeadm init health checks (single-node) kubeadm v1.33+ validates API server health via the control-plane-endpoint URL. When set to EC2 public DNS, this can timeout because the public DNS isn't routable from within the instance during init. Use the node's private IP for init and include the public DNS in cert SANs for external access. Signed-off-by: Carlos Eduardo Arango Gutierrez --- cmd/cli/delete/delete_test.go | 40 ++++++++++---------- cmd/cli/status/status_test.go | 4 +- pkg/provisioner/templates/kubernetes.go | 24 +++++++++++- pkg/provisioner/templates/kubernetes_test.go | 2 +- 4 files changed, 46 insertions(+), 24 deletions(-) diff --git a/cmd/cli/delete/delete_test.go b/cmd/cli/delete/delete_test.go index 508c191d3..322b2d7ee 100644 --- a/cmd/cli/delete/delete_test.go +++ b/cmd/cli/delete/delete_test.go @@ -131,8 +131,8 @@ var _ = Describe("Delete Command", func() { It("should delete single SSH instance successfully", func() { // Create a valid SSH cache file - yaml := sshCacheYAML("sshdelete1", "ssh-delete-test") - cacheFile := filepath.Join(tempDir, "sshdelete1.yaml") + yaml := sshCacheYAML("a1b2c3d4", "ssh-delete-test") + cacheFile := filepath.Join(tempDir, "a1b2c3d4.yaml") err := os.WriteFile(cacheFile, []byte(yaml), 0600) Expect(err).NotTo(HaveOccurred()) @@ -141,7 +141,7 @@ var _ = Describe("Delete Command", func() { Commands: []*cli.Command{cmd}, } - err = app.Run([]string{"holodeck", "delete", "--cachepath", tempDir, "sshdelete1"}) + err = app.Run([]string{"holodeck", "delete", "--cachepath", tempDir, "a1b2c3d4"}) Expect(err).NotTo(HaveOccurred()) // Verify cache file was removed @@ -150,15 +150,15 @@ var _ = Describe("Delete Command", func() { // Verify success message Expect(buf.String()).To(ContainSubstring("Successfully deleted")) - 
Expect(buf.String()).To(ContainSubstring("sshdelete1")) + Expect(buf.String()).To(ContainSubstring("a1b2c3d4")) }) It("should delete multiple SSH instances successfully", func() { // Create two cache files - yaml1 := sshCacheYAML("sshmulti1", "ssh-multi-1") - yaml2 := sshCacheYAML("sshmulti2", "ssh-multi-2") - cacheFile1 := filepath.Join(tempDir, "sshmulti1.yaml") - cacheFile2 := filepath.Join(tempDir, "sshmulti2.yaml") + yaml1 := sshCacheYAML("e5f6a7b8", "ssh-multi-1") + yaml2 := sshCacheYAML("c9d0e1f2", "ssh-multi-2") + cacheFile1 := filepath.Join(tempDir, "e5f6a7b8.yaml") + cacheFile2 := filepath.Join(tempDir, "c9d0e1f2.yaml") err := os.WriteFile(cacheFile1, []byte(yaml1), 0600) Expect(err).NotTo(HaveOccurred()) err = os.WriteFile(cacheFile2, []byte(yaml2), 0600) @@ -169,7 +169,7 @@ var _ = Describe("Delete Command", func() { Commands: []*cli.Command{cmd}, } - err = app.Run([]string{"holodeck", "delete", "--cachepath", tempDir, "sshmulti1", "sshmulti2"}) + err = app.Run([]string{"holodeck", "delete", "--cachepath", tempDir, "e5f6a7b8", "c9d0e1f2"}) Expect(err).NotTo(HaveOccurred()) // Verify both cache files were removed @@ -179,14 +179,14 @@ var _ = Describe("Delete Command", func() { Expect(os.IsNotExist(err)).To(BeTrue()) // Verify success messages for both - Expect(buf.String()).To(ContainSubstring("sshmulti1")) - Expect(buf.String()).To(ContainSubstring("sshmulti2")) + Expect(buf.String()).To(ContainSubstring("e5f6a7b8")) + Expect(buf.String()).To(ContainSubstring("c9d0e1f2")) }) It("should stop on first error with multiple instances", func() { // Create only one valid cache file - yaml := sshCacheYAML("sshvalid1", "ssh-valid") - cacheFile := filepath.Join(tempDir, "sshvalid1.yaml") + yaml := sshCacheYAML("a3b4c5d6", "ssh-valid") + cacheFile := filepath.Join(tempDir, "a3b4c5d6.yaml") err := os.WriteFile(cacheFile, []byte(yaml), 0600) Expect(err).NotTo(HaveOccurred()) @@ -196,7 +196,7 @@ var _ = Describe("Delete Command", func() { } // First instance doesn't 
exist, should fail before processing second - err = app.Run([]string{"holodeck", "delete", "--cachepath", tempDir, "nonexistent", "sshvalid1"}) + err = app.Run([]string{"holodeck", "delete", "--cachepath", tempDir, "nonexistent", "a3b4c5d6"}) Expect(err).To(HaveOccurred()) Expect(err.Error()).To(ContainSubstring("failed to get instance nonexistent")) @@ -207,8 +207,8 @@ var _ = Describe("Delete Command", func() { It("should fail if second instance doesn't exist", func() { // Create only one valid cache file - yaml := sshCacheYAML("sshfirst1", "ssh-first") - cacheFile := filepath.Join(tempDir, "sshfirst1.yaml") + yaml := sshCacheYAML("e7f8a9b0", "ssh-first") + cacheFile := filepath.Join(tempDir, "e7f8a9b0.yaml") err := os.WriteFile(cacheFile, []byte(yaml), 0600) Expect(err).NotTo(HaveOccurred()) @@ -218,7 +218,7 @@ var _ = Describe("Delete Command", func() { } // First succeeds, second fails - err = app.Run([]string{"holodeck", "delete", "--cachepath", tempDir, "sshfirst1", "nonexistent"}) + err = app.Run([]string{"holodeck", "delete", "--cachepath", tempDir, "e7f8a9b0", "nonexistent"}) Expect(err).To(HaveOccurred()) Expect(err.Error()).To(ContainSubstring("failed to get instance nonexistent")) @@ -235,8 +235,8 @@ var _ = Describe("Delete Command", func() { DeferCleanup(os.RemoveAll, tempDir) // Create a cache file in custom path - yaml := sshCacheYAML("customdel", "custom-delete") - cacheFile := filepath.Join(tempDir, "customdel.yaml") + yaml := sshCacheYAML("f1e2d3c4", "custom-delete") + cacheFile := filepath.Join(tempDir, "f1e2d3c4.yaml") err = os.WriteFile(cacheFile, []byte(yaml), 0600) Expect(err).NotTo(HaveOccurred()) @@ -246,7 +246,7 @@ var _ = Describe("Delete Command", func() { } // Use -c alias for cachepath - err = app.Run([]string{"holodeck", "delete", "-c", tempDir, "customdel"}) + err = app.Run([]string{"holodeck", "delete", "-c", tempDir, "f1e2d3c4"}) Expect(err).NotTo(HaveOccurred()) // Verify cache file was removed diff --git 
a/cmd/cli/status/status_test.go b/cmd/cli/status/status_test.go index 33f517aac..48a5b4f04 100644 --- a/cmd/cli/status/status_test.go +++ b/cmd/cli/status/status_test.go @@ -130,14 +130,14 @@ var _ = Describe("Status Command", func() { Expect(err).NotTo(HaveOccurred()) DeferCleanup(os.RemoveAll, tempDir) - instanceID := "test12345678" + instanceID := "ab12cd34" cacheFile := filepath.Join(tempDir, instanceID+".yaml") validYAML := `apiVersion: holodeck.nvidia.com/v1alpha1 kind: Environment metadata: name: test-environment labels: - holodeck-instance-id: test12345678 + holodeck-instance-id: ab12cd34 spec: provider: ssh username: testuser diff --git a/pkg/provisioner/templates/kubernetes.go b/pkg/provisioner/templates/kubernetes.go index 52d91992e..0914c8eec 100644 --- a/pkg/provisioner/templates/kubernetes.go +++ b/pkg/provisioner/templates/kubernetes.go @@ -203,12 +203,34 @@ if [[ ! -f /etc/kubernetes/admin.conf ]]; then set +e {{- if .UseLegacyInit }} + # Use private IP for init health checks (kubeadm v1.33+ checks via control-plane-endpoint). + # Public DNS may not be routable from within the instance during init. + # Include public endpoint in cert SANs so kubectl works externally. + KUBEADM_NODE_IP=$(hostname -I | awk '{print $1}') sudo kubeadm init \ --kubernetes-version="${K8S_VERSION}" \ --pod-network-cidr=192.168.0.0/16 \ - --control-plane-endpoint="${K8S_ENDPOINT_HOST}:6443" \ + --control-plane-endpoint="${KUBEADM_NODE_IP}:6443" \ + --apiserver-advertise-address="${KUBEADM_NODE_IP}" \ + --apiserver-cert-extra-sans="${K8S_ENDPOINT_HOST},${KUBEADM_NODE_IP},localhost" \ --ignore-preflight-errors=all {{- else }} + # Use private IP for init health checks (kubeadm v1.33+ checks via control-plane-endpoint). + # The config file has the public DNS as controlPlaneEndpoint, which may not be + # routable from within the instance during init. Replace with private IP and + # add cert SANs so kubectl works externally. 
+ KUBEADM_NODE_IP=$(hostname -I | awk '{print $1}') + sudo sed -i "s|controlPlaneEndpoint: \"${K8S_ENDPOINT_HOST}:6443\"|controlPlaneEndpoint: \"${KUBEADM_NODE_IP}:6443\"|" \ + /etc/kubernetes/kubeadm-config.yaml + # Inject certSANs into ClusterConfiguration if not already present + if ! grep -q 'certSANs' /etc/kubernetes/kubeadm-config.yaml; then + sudo sed -i "/^controlPlaneEndpoint:/a\\ +apiServer:\\ + certSANs:\\ + - \"${K8S_ENDPOINT_HOST}\"\\ + - \"${KUBEADM_NODE_IP}\"\\ + - \"localhost\"" /etc/kubernetes/kubeadm-config.yaml + fi sudo kubeadm init \ --config /etc/kubernetes/kubeadm-config.yaml \ --ignore-preflight-errors=all diff --git a/pkg/provisioner/templates/kubernetes_test.go b/pkg/provisioner/templates/kubernetes_test.go index d421050d8..bea0ba7b6 100644 --- a/pkg/provisioner/templates/kubernetes_test.go +++ b/pkg/provisioner/templates/kubernetes_test.go @@ -266,7 +266,7 @@ func TestKubernetes_Execute(t *testing.T) { }, wantErr: false, checkTemplate: true, - expectedString: `--control-plane-endpoint="${K8S_ENDPOINT_HOST}:6443"`, + expectedString: `--control-plane-endpoint="${KUBEADM_NODE_IP}:6443"`, checkSafeExit: true, }, { From e857b510c9622820e146f27f6b816945f4e841b8 Mon Sep 17 00:00:00 2001 From: Carlos Eduardo Arango Gutierrez Date: Mon, 9 Mar 2026 14:54:32 +0100 Subject: [PATCH 02/19] fix: HA kubeadm init uses local IP to avoid NLB chicken-and-egg kubeadm v1.33+ health checks reach the API server via the control-plane- endpoint. With NLB, this creates a deadlock: NLB can't route until API server is healthy, but kubeadm won't report healthy until NLB routes. Fix: init with local private IP, include NLB DNS in cert SANs, then update kubeadm-config and admin.conf to reference NLB after init. 
Signed-off-by: Carlos Eduardo Arango Gutierrez --- pkg/provisioner/templates/kubeadm_cluster.go | 52 +++++++++++++++++++- 1 file changed, 50 insertions(+), 2 deletions(-) diff --git a/pkg/provisioner/templates/kubeadm_cluster.go b/pkg/provisioner/templates/kubeadm_cluster.go index d4a77abb1..0e6c49fe3 100644 --- a/pkg/provisioner/templates/kubeadm_cluster.go +++ b/pkg/provisioner/templates/kubeadm_cluster.go @@ -138,10 +138,44 @@ holodeck_progress "$COMPONENT" 5 8 "Initializing Kubernetes cluster" # Initialize cluster if [[ ! -f /etc/kubernetes/admin.conf ]]; then + # Wait for control-plane endpoint to be resolvable (NLB DNS may take time) + if [[ "$CONTROL_PLANE_ENDPOINT" == *"elb.amazonaws.com"* ]] || \ + [[ "$CONTROL_PLANE_ENDPOINT" == *"amazonaws.com"* ]]; then + holodeck_log "INFO" "$COMPONENT" "Waiting for NLB DNS to resolve: ${CONTROL_PLANE_ENDPOINT}" + for i in {1..30}; do + if host "${CONTROL_PLANE_ENDPOINT}" &>/dev/null || \ + getent hosts "${CONTROL_PLANE_ENDPOINT}" &>/dev/null; then + holodeck_log "INFO" "$COMPONENT" "NLB DNS resolved successfully" + break + fi + if [[ $i -eq 30 ]]; then + holodeck_log "WARN" "$COMPONENT" "NLB DNS not yet resolved after 5 min, proceeding anyway" + fi + sleep 10 + done + fi + + # Detect this node's private IP for API server binding + NODE_PRIVATE_IP=$(hostname -I | awk '{print $1}') + + # For HA with NLB, use local IP for init health checks to avoid chicken-and-egg: + # NLB can't route to API server until it's healthy, but kubeadm won't report + # healthy until it can reach the control-plane-endpoint (NLB). + # Solution: init with local endpoint, include NLB in cert SANs for later joins. 
+ if [[ "$IS_HA" == "true" ]] && { [[ "$CONTROL_PLANE_ENDPOINT" == *"elb.amazonaws.com"* ]] || \ + [[ "$CONTROL_PLANE_ENDPOINT" == *"amazonaws.com"* ]]; }; then + INIT_ENDPOINT="${NODE_PRIVATE_IP}" + holodeck_log "INFO" "$COMPONENT" "HA mode: using local IP ${NODE_PRIVATE_IP} for init, NLB in cert SANs" + else + INIT_ENDPOINT="${CONTROL_PLANE_ENDPOINT}" + fi + INIT_ARGS=( --kubernetes-version="${K8S_VERSION}" --pod-network-cidr=192.168.0.0/16 - --control-plane-endpoint="${CONTROL_PLANE_ENDPOINT}:6443" + --control-plane-endpoint="${INIT_ENDPOINT}:6443" + --apiserver-advertise-address="${NODE_PRIVATE_IP}" + --apiserver-cert-extra-sans="${CONTROL_PLANE_ENDPOINT},${NODE_PRIVATE_IP},${INIT_ENDPOINT}" --ignore-preflight-errors=all ) @@ -150,8 +184,22 @@ if [[ ! -f /etc/kubernetes/admin.conf ]]; then INIT_ARGS+=(--upload-certs) fi - holodeck_log "INFO" "$COMPONENT" "Running kubeadm init" + holodeck_log "INFO" "$COMPONENT" "Running kubeadm init with args: ${INIT_ARGS[*]}" holodeck_retry 3 "$COMPONENT" sudo kubeadm init "${INIT_ARGS[@]}" + + # For HA with NLB: after init succeeds, update the cluster config to use NLB DNS + # so that join tokens reference the NLB endpoint (reachable by other nodes). 
+ if [[ "$IS_HA" == "true" ]] && [[ "$INIT_ENDPOINT" != "$CONTROL_PLANE_ENDPOINT" ]]; then + holodeck_log "INFO" "$COMPONENT" "Updating cluster config to use NLB endpoint: ${CONTROL_PLANE_ENDPOINT}:6443" + # Update the kubeadm-config ConfigMap + sudo kubectl --kubeconfig=/etc/kubernetes/admin.conf -n kube-system get configmap kubeadm-config -o yaml | \ + sed "s|controlPlaneEndpoint: ${INIT_ENDPOINT}:6443|controlPlaneEndpoint: ${CONTROL_PLANE_ENDPOINT}:6443|g" | \ + sudo kubectl --kubeconfig=/etc/kubernetes/admin.conf apply -f - || \ + holodeck_log "WARN" "$COMPONENT" "Could not update kubeadm-config, join may need manual endpoint" + # Also update admin.conf kubeconfig to use the NLB + sudo sed -i "s|server: https://${INIT_ENDPOINT}:6443|server: https://${CONTROL_PLANE_ENDPOINT}:6443|g" \ + /etc/kubernetes/admin.conf + fi fi # Setup kubeconfig From cb02c6839dd1995e336632e4b41647058d47ddc5 Mon Sep 17 00:00:00 2001 From: Carlos Eduardo Arango Gutierrez Date: Mon, 9 Mar 2026 14:54:46 +0100 Subject: [PATCH 03/19] fix: create nvidia-container-runtime symlink on RPM platforms nvidia-container-toolkit provides nvidia-ctk but may not provide the nvidia-container-runtime binary on RPM distros. Container runtimes (containerd, CRI-O) require it. Create symlink if missing. Signed-off-by: Carlos Eduardo Arango Gutierrez --- pkg/provisioner/templates/common.go | 8 ++++++ .../templates/container-toolkit.go | 25 +++++++++++++++++++ 2 files changed, 33 insertions(+) diff --git a/pkg/provisioner/templates/common.go b/pkg/provisioner/templates/common.go index 328be124b..0afad2f27 100644 --- a/pkg/provisioner/templates/common.go +++ b/pkg/provisioner/templates/common.go @@ -312,6 +312,14 @@ holodeck_verify_crio() { holodeck_verify_toolkit() { command -v nvidia-ctk &>/dev/null || return 1 nvidia-ctk --version &>/dev/null || return 1 + # Verify the runtime binary exists (needed by container runtimes) + if ! 
command -v nvidia-container-runtime &>/dev/null; then + holodeck_log "WARN" "nvidia-container-toolkit" \ + "nvidia-container-runtime binary not found, creating symlink from nvidia-ctk" + local ctk_path + ctk_path=$(command -v nvidia-ctk) + sudo ln -sf "$ctk_path" /usr/bin/nvidia-container-runtime + fi return 0 } diff --git a/pkg/provisioner/templates/container-toolkit.go b/pkg/provisioner/templates/container-toolkit.go index 5329e4f55..0d2ebfa8b 100644 --- a/pkg/provisioner/templates/container-toolkit.go +++ b/pkg/provisioner/templates/container-toolkit.go @@ -306,6 +306,19 @@ else GHCR_DIGEST="source-build" fi +# Ensure nvidia-container-runtime exists (newer toolkit versions may not +# build it as a separate binary). Create a symlink so container runtimes +# can find it at the expected path. +if ! command -v nvidia-container-runtime &>/dev/null; then + holodeck_log "INFO" "$COMPONENT" "Creating nvidia-container-runtime symlink from nvidia-ctk" + CTK_PATH=$(command -v nvidia-ctk) + sudo ln -sf "$CTK_PATH" "$(dirname "$CTK_PATH")/nvidia-container-runtime" + # Also ensure it's in /usr/bin for container runtime configs + if [[ ! -f /usr/bin/nvidia-container-runtime ]]; then + sudo ln -sf "$CTK_PATH" /usr/bin/nvidia-container-runtime + fi +fi + holodeck_progress "$COMPONENT" 5 5 "Configuring runtime" sudo nvidia-ctk runtime configure \ @@ -510,6 +523,18 @@ else GHCR_DIGEST="source-build" fi +# Ensure nvidia-container-runtime exists (newer toolkit versions may not +# build it as a separate binary). Create a symlink so container runtimes +# can find it at the expected path. +if ! command -v nvidia-container-runtime &>/dev/null; then + holodeck_log "INFO" "$COMPONENT" "Creating nvidia-container-runtime symlink from nvidia-ctk" + CTK_PATH=$(command -v nvidia-ctk) + sudo ln -sf "$CTK_PATH" "$(dirname "$CTK_PATH")/nvidia-container-runtime" + if [[ ! 
-f /usr/bin/nvidia-container-runtime ]]; then + sudo ln -sf "$CTK_PATH" /usr/bin/nvidia-container-runtime + fi +fi + holodeck_progress "$COMPONENT" 5 5 "Configuring runtime" sudo nvidia-ctk runtime configure \ From 38ffde8822b9b43354db098af0b7abfb68131001 Mon Sep 17 00:00:00 2001 From: Carlos Eduardo Arango Gutierrez Date: Mon, 9 Mar 2026 14:58:46 +0100 Subject: [PATCH 04/19] fix: truncate NLB target group name to AWS 32-char limit AWS enforces a 32-character limit on target group names. Long holodeck cluster names could exceed this, causing NLB creation to fail. Signed-off-by: Carlos Eduardo Arango Gutierrez --- pkg/provider/aws/nlb.go | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/pkg/provider/aws/nlb.go b/pkg/provider/aws/nlb.go index 93c23856c..2507d3f76 100644 --- a/pkg/provider/aws/nlb.go +++ b/pkg/provider/aws/nlb.go @@ -45,7 +45,14 @@ func (p *Provider) createNLB(cache *ClusterCache) error { cancelLoading := p.log.Loading("Creating Network Load Balancer") lbType := elbv2types.LoadBalancerTypeEnumNetwork - lbName := fmt.Sprintf("%s-nlb", p.ObjectMeta.Name) + // AWS load balancer names are limited to 32 characters. + const nlbSuffix = "-nlb" + maxNLBNameLen := 32 - len(nlbSuffix) + nlbBaseName := p.ObjectMeta.Name + if len(nlbBaseName) > maxNLBNameLen { + nlbBaseName = nlbBaseName[:maxNLBNameLen] + } + lbName := nlbBaseName + nlbSuffix // Determine subnet IDs (use the same subnet for NLB) subnetIDs := []string{cache.Subnetid} @@ -91,7 +98,15 @@ func (p *Provider) createTargetGroup(cache *ClusterCache) error { cancelLoading := p.log.Loading("Creating target group for Kubernetes API") - tgName := fmt.Sprintf("%s-k8s-api-tg", p.ObjectMeta.Name) + // AWS target group names are limited to 32 characters. + // Truncate the environment name to fit within the limit. 
+ const tgSuffix = "-k8s-tg" + maxNameLen := 32 - len(tgSuffix) + name := p.ObjectMeta.Name + if len(name) > maxNameLen { + name = name[:maxNameLen] + } + tgName := name + tgSuffix // Create target group for Kubernetes API (port 6443) createTGInput := &elasticloadbalancingv2.CreateTargetGroupInput{ From 5a2e19a6af54a5c0a0fb10db434057a203f43c80 Mon Sep 17 00:00:00 2001 From: Carlos Eduardo Arango Gutierrez Date: Mon, 9 Mar 2026 15:00:39 +0100 Subject: [PATCH 05/19] test: add explicit K8s version to RPM test configs Without an explicit version, RPM tests pull the latest K8s release which may be incompatible with the test infrastructure. Signed-off-by: Carlos Eduardo Arango Gutierrez --- tests/data/test_rpm_al2023_containerd.yml | 3 +++ tests/data/test_rpm_al2023_crio.yml | 3 +++ tests/data/test_rpm_al2023_docker.yml | 3 +++ tests/data/test_rpm_fedora42_containerd.yml | 3 +++ tests/data/test_rpm_fedora42_crio.yml | 3 +++ tests/data/test_rpm_rocky9_containerd.yml | 3 +++ tests/data/test_rpm_rocky9_crio.yml | 3 +++ 7 files changed, 21 insertions(+) diff --git a/tests/data/test_rpm_al2023_containerd.yml b/tests/data/test_rpm_al2023_containerd.yml index 45ca634d3..6cec657df 100644 --- a/tests/data/test_rpm_al2023_containerd.yml +++ b/tests/data/test_rpm_al2023_containerd.yml @@ -24,3 +24,6 @@ spec: kubernetes: install: true installer: kubeadm + source: release + release: + version: v1.31.0 diff --git a/tests/data/test_rpm_al2023_crio.yml b/tests/data/test_rpm_al2023_crio.yml index f729db96a..7c266d59e 100644 --- a/tests/data/test_rpm_al2023_crio.yml +++ b/tests/data/test_rpm_al2023_crio.yml @@ -24,3 +24,6 @@ spec: kubernetes: install: true installer: kubeadm + source: release + release: + version: v1.31.0 diff --git a/tests/data/test_rpm_al2023_docker.yml b/tests/data/test_rpm_al2023_docker.yml index ca10c0c4c..00aad0a79 100644 --- a/tests/data/test_rpm_al2023_docker.yml +++ b/tests/data/test_rpm_al2023_docker.yml @@ -24,3 +24,6 @@ spec: kubernetes: install: true 
installer: kubeadm + source: release + release: + version: v1.31.0 diff --git a/tests/data/test_rpm_fedora42_containerd.yml b/tests/data/test_rpm_fedora42_containerd.yml index 9f9e9c3a5..f597f266f 100644 --- a/tests/data/test_rpm_fedora42_containerd.yml +++ b/tests/data/test_rpm_fedora42_containerd.yml @@ -24,3 +24,6 @@ spec: kubernetes: install: true installer: kubeadm + source: release + release: + version: v1.31.0 diff --git a/tests/data/test_rpm_fedora42_crio.yml b/tests/data/test_rpm_fedora42_crio.yml index c4fcbac3f..cc201410e 100644 --- a/tests/data/test_rpm_fedora42_crio.yml +++ b/tests/data/test_rpm_fedora42_crio.yml @@ -24,3 +24,6 @@ spec: kubernetes: install: true installer: kubeadm + source: release + release: + version: v1.31.0 diff --git a/tests/data/test_rpm_rocky9_containerd.yml b/tests/data/test_rpm_rocky9_containerd.yml index 2bf30f4c1..aa063cb1f 100644 --- a/tests/data/test_rpm_rocky9_containerd.yml +++ b/tests/data/test_rpm_rocky9_containerd.yml @@ -24,3 +24,6 @@ spec: kubernetes: install: true installer: kubeadm + source: release + release: + version: v1.31.0 diff --git a/tests/data/test_rpm_rocky9_crio.yml b/tests/data/test_rpm_rocky9_crio.yml index b3180cab1..39f04fb00 100644 --- a/tests/data/test_rpm_rocky9_crio.yml +++ b/tests/data/test_rpm_rocky9_crio.yml @@ -24,3 +24,6 @@ spec: kubernetes: install: true installer: kubeadm + source: release + release: + version: v1.31.0 From ce675cae872996007f2c845c2b177220d983e871 Mon Sep 17 00:00:00 2001 From: Carlos Eduardo Arango Gutierrez Date: Mon, 9 Mar 2026 15:02:06 +0100 Subject: [PATCH 06/19] fix: use public IP for control plane endpoint when no LB determineControlPlaneEndpoint returned PrivateIP when no LoadBalancerDNS, making kubeconfig unreachable from outside the VPC. Use PublicIP instead. 
Signed-off-by: Carlos Eduardo Arango Gutierrez --- pkg/provisioner/cluster.go | 4 ++-- pkg/provisioner/cluster_test.go | 8 +++++--- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/pkg/provisioner/cluster.go b/pkg/provisioner/cluster.go index 32ce74002..6320da9b6 100644 --- a/pkg/provisioner/cluster.go +++ b/pkg/provisioner/cluster.go @@ -155,8 +155,8 @@ func (cp *ClusterProvisioner) determineControlPlaneEndpoint(firstCP NodeInfo) st if cp.Environment.Status.Cluster != nil && cp.Environment.Status.Cluster.LoadBalancerDNS != "" { return cp.Environment.Status.Cluster.LoadBalancerDNS } - // Fall back to first control-plane private IP - return firstCP.PrivateIP + // Fall back to first control-plane public IP so kubeconfig is reachable externally + return firstCP.PublicIP } // provisionBaseOnAllNodes provisions base dependencies (kernel, driver, runtime, toolkit) diff --git a/pkg/provisioner/cluster_test.go b/pkg/provisioner/cluster_test.go index 6d64cd517..bdc7a5b2a 100644 --- a/pkg/provisioner/cluster_test.go +++ b/pkg/provisioner/cluster_test.go @@ -302,7 +302,7 @@ func TestClusterProvisioner_determineControlPlaneEndpoint(t *testing.T) { expected: "my-lb.elb.amazonaws.com", }, { - name: "Fall back to first CP private IP", + name: "Fall back to first CP public IP", env: &v1alpha1.Environment{ Spec: v1alpha1.EnvironmentSpec{ Cluster: &v1alpha1.ClusterSpec{}, @@ -310,9 +310,10 @@ func TestClusterProvisioner_determineControlPlaneEndpoint(t *testing.T) { Status: v1alpha1.EnvironmentStatus{}, }, firstCP: NodeInfo{ + PublicIP: "54.1.2.3", PrivateIP: "10.0.0.1", }, - expected: "10.0.0.1", + expected: "54.1.2.3", }, { name: "No cluster status", @@ -322,9 +323,10 @@ func TestClusterProvisioner_determineControlPlaneEndpoint(t *testing.T) { }, }, firstCP: NodeInfo{ + PublicIP: "54.1.2.4", PrivateIP: "10.0.0.2", }, - expected: "10.0.0.2", + expected: "54.1.2.4", }, } From c8ec13b7f1ad06ae1cf1615d2116e2de0ec5ac4d Mon Sep 17 00:00:00 2001 From: Carlos Eduardo Arango 
Gutierrez Date: Mon, 9 Mar 2026 15:08:38 +0100 Subject: [PATCH 07/19] fix: add self-referencing SG rules for cluster inter-node traffic Existing VPC CIDR rules only cover specific ports. Self-referencing SG rules allow all TCP/UDP/ICMP between instances in the same security group, covering webhooks, NodePort, and future TCP/UDP-based K8s services. IP-in-IP (Calico IPIP, IP protocol 4) is not matched by these rules. Uses explicit protocols (not -1) for stricter compliance. Signed-off-by: Carlos Eduardo Arango Gutierrez --- pkg/provider/aws/cluster.go | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/pkg/provider/aws/cluster.go b/pkg/provider/aws/cluster.go index 718a91995..55f938331 100644 --- a/pkg/provider/aws/cluster.go +++ b/pkg/provider/aws/cluster.go @@ -323,6 +323,32 @@ func (p *Provider) createClusterSecurityGroup(cache *ClusterCache) error { }, } + // Self-referencing rules: allow all TCP/UDP/ICMP traffic between instances in this SG. + // Covers webhooks (dynamic ports), NodePort (30000-32767), and other TCP/UDP + // inter-node communication; IP-in-IP (Calico IPIP, protocol 4) is NOT matched. + // Uses explicit TCP+UDP+ICMP (not protocol -1) for stricter compliance.
+ sgRef := []types.UserIdGroupPair{{GroupId: sgOutput.GroupId}} + permissions = append(permissions, + types.IpPermission{ + FromPort: aws.Int32(0), + ToPort: aws.Int32(65535), + IpProtocol: aws.String("tcp"), + UserIdGroupPairs: sgRef, + }, + types.IpPermission{ + FromPort: aws.Int32(0), + ToPort: aws.Int32(65535), + IpProtocol: aws.String("udp"), + UserIdGroupPairs: sgRef, + }, + types.IpPermission{ + FromPort: aws.Int32(-1), + ToPort: aws.Int32(-1), + IpProtocol: aws.String("icmp"), + UserIdGroupPairs: sgRef, + }, + ) + irInput := &ec2.AuthorizeSecurityGroupIngressInput{ GroupId: sgOutput.GroupId, IpPermissions: permissions, From f835f00aa2ff2410611ec82e3981647df24136f1 Mon Sep 17 00:00:00 2001 From: Carlos Eduardo Arango Gutierrez Date: Mon, 9 Mar 2026 15:08:47 +0100 Subject: [PATCH 08/19] fix: add self-referencing SG rules to single-node security group Consistent with cluster SG: adds TCP/UDP/ICMP self-referencing rules so instances in the same SG can communicate freely. Signed-off-by: Carlos Eduardo Arango Gutierrez --- pkg/provider/aws/create.go | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/pkg/provider/aws/create.go b/pkg/provider/aws/create.go index f0470565e..6add5dec3 100644 --- a/pkg/provider/aws/create.go +++ b/pkg/provider/aws/create.go @@ -404,6 +404,25 @@ func (p *Provider) createSecurityGroup(cache *AWS) error { IpProtocol: &tcp, IpRanges: ipRanges, }, + // Self-referencing: allow all TCP/UDP/ICMP between SG members + { + FromPort: aws.Int32(0), + ToPort: aws.Int32(65535), + IpProtocol: aws.String("tcp"), + UserIdGroupPairs: []types.UserIdGroupPair{{GroupId: sgOutput.GroupId}}, + }, + { + FromPort: aws.Int32(0), + ToPort: aws.Int32(65535), + IpProtocol: aws.String("udp"), + UserIdGroupPairs: []types.UserIdGroupPair{{GroupId: sgOutput.GroupId}}, + }, + { + FromPort: aws.Int32(-1), + ToPort: aws.Int32(-1), + IpProtocol: aws.String("icmp"), + UserIdGroupPairs: []types.UserIdGroupPair{{GroupId: sgOutput.GroupId}}, + }, }, } From 
318f0de86c8bb5290295a668d33bf502365a329a Mon Sep 17 00:00:00 2001 From: Carlos Eduardo Arango Gutierrez Date: Mon, 9 Mar 2026 15:28:36 +0100 Subject: [PATCH 09/19] fix: rewrite kubeconfig server URL to public endpoint after download Kubeconfig downloaded from remote nodes contains private IPs in the server URL, making it unusable from outside the VPC. Add structured YAML parsing to rewrite the server URL to the public IP or NLB DNS. Update all GetKubeConfig callers with the new desiredServerURL parameter. Signed-off-by: Carlos Eduardo Arango Gutierrez --- cmd/action/ci/entrypoint.go | 2 +- cmd/cli/create/create.go | 4 +- cmd/cli/get/get.go | 2 +- pkg/utils/kubeconfig.go | 63 +++++++++++++++++++- pkg/utils/kubeconfig_test.go | 109 +++++++++++++++++++++++++++++++++++ 5 files changed, 175 insertions(+), 5 deletions(-) create mode 100644 pkg/utils/kubeconfig_test.go diff --git a/cmd/action/ci/entrypoint.go b/cmd/action/ci/entrypoint.go index e4292182a..64641873d 100644 --- a/cmd/action/ci/entrypoint.go +++ b/cmd/action/ci/entrypoint.go @@ -96,7 +96,7 @@ func entrypoint(log *logger.FunLogger) error { } if cfg.Spec.Kubernetes.Install { - err = utils.GetKubeConfig(log, &cfg, hostUrl, kubeconfig) + err = utils.GetKubeConfig(log, &cfg, hostUrl, kubeconfig, "") if err != nil { return fmt.Errorf("failed to get kubeconfig: %w", err) } diff --git a/cmd/cli/create/create.go b/cmd/cli/create/create.go index dffa420a8..442f85c19 100644 --- a/cmd/cli/create/create.go +++ b/cmd/cli/create/create.go @@ -485,7 +485,7 @@ func runSingleNodeProvision(log *logger.FunLogger, opts *options) error { break } } - if err = utils.GetKubeConfig(log, &opts.cache, hostUrl, opts.kubeconfig); err != nil { + if err = utils.GetKubeConfig(log, &opts.cache, hostUrl, opts.kubeconfig, ""); err != nil { return fmt.Errorf("failed to get kubeconfig: %w", err) } } @@ -563,7 +563,7 @@ func runMultinodeProvision(log *logger.FunLogger, opts *options) error { } } if hostUrl != "" { - if err := 
utils.GetKubeConfig(log, &opts.cache, hostUrl, opts.kubeconfig); err != nil { + if err := utils.GetKubeConfig(log, &opts.cache, hostUrl, opts.kubeconfig, ""); err != nil { return fmt.Errorf("failed to get kubeconfig: %w", err) } } diff --git a/cmd/cli/get/get.go b/cmd/cli/get/get.go index 4a80d157d..31b6f1f78 100644 --- a/cmd/cli/get/get.go +++ b/cmd/cli/get/get.go @@ -181,7 +181,7 @@ func (m command) runKubeconfig(instanceID string) error { } // Download kubeconfig - if err := utils.GetKubeConfig(m.log, &env, hostUrl, outputPath); err != nil { + if err := utils.GetKubeConfig(m.log, &env, hostUrl, outputPath, ""); err != nil { return fmt.Errorf("failed to download kubeconfig: %w", err) } diff --git a/pkg/utils/kubeconfig.go b/pkg/utils/kubeconfig.go index 6c15d402d..2c358e642 100644 --- a/pkg/utils/kubeconfig.go +++ b/pkg/utils/kubeconfig.go @@ -24,10 +24,64 @@ import ( "github.com/NVIDIA/holodeck/api/holodeck/v1alpha1" "github.com/NVIDIA/holodeck/internal/logger" "github.com/NVIDIA/holodeck/pkg/provisioner" + "sigs.k8s.io/yaml" ) +// kubeConfig is a minimal representation for server URL rewriting. +type kubeConfig struct { + APIVersion string `json:"apiVersion"` + Kind string `json:"kind"` + Clusters []kubeConfigClusterEntry `json:"clusters"` + Contexts []any `json:"contexts"` + CurrentContext string `json:"current-context"` + Users []any `json:"users"` +} + +type kubeConfigClusterEntry struct { + Name string `json:"name"` + Cluster kubeConfigCluster `json:"cluster"` +} + +type kubeConfigCluster struct { + Server string `json:"server"` + CertificateAuthorityData string `json:"certificate-authority-data,omitempty"` +} + +// RewriteKubeConfigServer rewrites the server URL in a kubeconfig file. +// If serverURL is empty, this is a no-op. 
+func RewriteKubeConfigServer(path string, serverURL string) error { + if serverURL == "" { + return nil + } + + data, err := os.ReadFile(path) //nolint:gosec // path is caller-provided kubeconfig + if err != nil { + return fmt.Errorf("reading kubeconfig: %w", err) + } + + var cfg kubeConfig + if err := yaml.Unmarshal(data, &cfg); err != nil { + return fmt.Errorf("parsing kubeconfig: %w", err) + } + + for i := range cfg.Clusters { + cfg.Clusters[i].Cluster.Server = serverURL + } + + out, err := yaml.Marshal(&cfg) + if err != nil { + return fmt.Errorf("marshaling kubeconfig: %w", err) + } + + if err := os.WriteFile(path, out, 0600); err != nil { + return fmt.Errorf("writing kubeconfig: %w", err) + } + + return nil +} + // GetKubeConfig downloads the kubeconfig file from the remote host -func GetKubeConfig(log *logger.FunLogger, cfg *v1alpha1.Environment, hostUrl string, dest string) error { +func GetKubeConfig(log *logger.FunLogger, cfg *v1alpha1.Environment, hostUrl string, dest string, desiredServerURL string) error { remoteFilePath := "${HOME}/.kube/config" // Create a new ssh session @@ -76,5 +130,12 @@ func GetKubeConfig(log *logger.FunLogger, cfg *v1alpha1.Environment, hostUrl str log.Info(fmt.Sprintf("Kubeconfig saved to %s\n", dest)) + if desiredServerURL != "" { + if err := RewriteKubeConfigServer(dest, desiredServerURL); err != nil { + return fmt.Errorf("failed to rewrite kubeconfig server URL: %w", err) + } + log.Info(fmt.Sprintf("Kubeconfig server URL rewritten to %s\n", desiredServerURL)) + } + return nil } diff --git a/pkg/utils/kubeconfig_test.go b/pkg/utils/kubeconfig_test.go new file mode 100644 index 000000000..910f19e05 --- /dev/null +++ b/pkg/utils/kubeconfig_test.go @@ -0,0 +1,109 @@ +/* + * Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package utils + +import ( + "os" + "path/filepath" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestRewriteKubeConfigServer(t *testing.T) { + tests := []struct { + name string + input string + serverURL string + expected string + }{ + { + name: "rewrite private IP to public IP", + input: `apiVersion: v1 +clusters: +- cluster: + certificate-authority-data: dGVzdA== + server: https://10.0.0.1:6443 + name: kubernetes +contexts: +- context: + cluster: kubernetes + user: kubernetes-admin + name: kubernetes-admin@kubernetes +current-context: kubernetes-admin@kubernetes +kind: Config +users: +- name: kubernetes-admin + user: + client-certificate-data: dGVzdA== + client-key-data: dGVzdA== +`, + serverURL: "https://54.1.2.3:6443", + expected: "https://54.1.2.3:6443", + }, + { + name: "rewrite to NLB DNS", + input: `apiVersion: v1 +clusters: +- cluster: + server: https://10.0.0.1:6443 + name: kubernetes +contexts: +- context: + cluster: kubernetes + user: admin + name: admin@kubernetes +current-context: admin@kubernetes +kind: Config +users: +- name: admin + user: + client-certificate-data: dGVzdA== +`, + serverURL: "https://my-nlb.elb.amazonaws.com:6443", + expected: "https://my-nlb.elb.amazonaws.com:6443", + }, + { + name: "empty server URL is no-op", + input: "apiVersion: v1\nclusters:\n- cluster:\n server: https://10.0.0.1:6443\n name: kubernetes\nkind: Config\n", + serverURL: "", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + dir := t.TempDir() + path := 
filepath.Join(dir, "kubeconfig") + err := os.WriteFile(path, []byte(tt.input), 0600) + require.NoError(t, err) + + err = RewriteKubeConfigServer(path, tt.serverURL) + require.NoError(t, err) + + data, err := os.ReadFile(path) + require.NoError(t, err) + + if tt.serverURL == "" { + assert.Contains(t, string(data), "https://10.0.0.1:6443") + } else { + assert.Contains(t, string(data), tt.expected) + assert.NotContains(t, string(data), "10.0.0.1") + } + }) + } +} From 05e6dc9947920a8838a255ef7d4135ce4413011b Mon Sep 17 00:00:00 2001 From: Carlos Eduardo Arango Gutierrez Date: Mon, 9 Mar 2026 15:31:33 +0100 Subject: [PATCH 10/19] feat: add ExpandPath utility for tilde expansion os.ReadFile does not expand ~ in paths. ExpandPath replaces a leading tilde with the user's home directory. Used for privateKey paths. Signed-off-by: Carlos Eduardo Arango Gutierrez --- pkg/utils/path.go | 43 ++++++++++++++++++++++++ pkg/utils/path_test.go | 76 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 119 insertions(+) create mode 100644 pkg/utils/path.go create mode 100644 pkg/utils/path_test.go diff --git a/pkg/utils/path.go b/pkg/utils/path.go new file mode 100644 index 000000000..e4e17294f --- /dev/null +++ b/pkg/utils/path.go @@ -0,0 +1,43 @@ +/* + * Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
// ExpandPath expands a leading "~" to the current user's home directory.
//
// Only the forms "~" and "~" followed by a path separator (for example
// "~/.ssh/id_rsa") are expanded. Every other input is returned unchanged,
// including the empty string, paths with a tilde elsewhere, and "~name/..."
// forms: those refer to another user's home directory, which this function
// does not resolve, so they must not be rewritten under the current user's
// home.
//
// An error is returned only when the home directory cannot be determined.
func ExpandPath(path string) (string, error) {
	if !strings.HasPrefix(path, "~") {
		return path, nil
	}

	// "~alice/key" or "~backup": not a reference to the current user's home.
	// Slicing off two bytes here would silently drop the first character of
	// the name, so leave such paths alone.
	if len(path) > 1 && !os.IsPathSeparator(path[1]) {
		return path, nil
	}

	home, err := os.UserHomeDir()
	if err != nil {
		return "", err
	}

	if path == "~" {
		return home, nil
	}

	// ~/foo/bar → /home/user/foo/bar
	return filepath.Join(home, path[2:]), nil
}
+ */ + +package utils + +import ( + "os" + "path/filepath" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestExpandPath(t *testing.T) { + home, err := os.UserHomeDir() + require.NoError(t, err) + + tests := []struct { + name string + input string + expected string + }{ + { + name: "tilde only", + input: "~", + expected: home, + }, + { + name: "tilde with path", + input: "~/.ssh/id_rsa", + expected: filepath.Join(home, ".ssh/id_rsa"), + }, + { + name: "absolute path unchanged", + input: "/etc/holodeck/key.pem", + expected: "/etc/holodeck/key.pem", + }, + { + name: "relative path unchanged", + input: "keys/my.pem", + expected: "keys/my.pem", + }, + { + name: "tilde in middle unchanged", + input: "/home/user/~/.ssh/key", + expected: "/home/user/~/.ssh/key", + }, + { + name: "empty string", + input: "", + expected: "", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result, err := ExpandPath(tt.input) + require.NoError(t, err) + assert.Equal(t, tt.expected, result) + }) + } +} From 16c165c17b22b8e134dd3b86d019a47384f29305 Mon Sep 17 00:00:00 2001 From: Carlos Eduardo Arango Gutierrez Date: Mon, 9 Mar 2026 15:36:58 +0100 Subject: [PATCH 11/19] fix: expand tilde in privateKey paths before reading All three SSH connection sites now expand ~ to the user's home directory, allowing users to specify paths like ~/.ssh/my-key.pem. 
Signed-off-by: Carlos Eduardo Arango Gutierrez --- cmd/cli/common/host.go | 5 +++++ cmd/cli/dryrun/dryrun.go | 5 +++++ pkg/provisioner/provisioner.go | 11 +++++++++++ 3 files changed, 21 insertions(+) diff --git a/cmd/cli/common/host.go b/cmd/cli/common/host.go index e4fe0aee5..3cdfdce72 100644 --- a/cmd/cli/common/host.go +++ b/cmd/cli/common/host.go @@ -27,6 +27,7 @@ import ( "github.com/NVIDIA/holodeck/internal/logger" "github.com/NVIDIA/holodeck/pkg/provider/aws" "github.com/NVIDIA/holodeck/pkg/sshutil" + "github.com/NVIDIA/holodeck/pkg/utils" ) // GetHostURL resolves the SSH-reachable host URL for an environment. @@ -82,6 +83,10 @@ const ( // ConnectSSH establishes an SSH connection with retries. // Host key verification uses Trust-On-First-Use (TOFU). func ConnectSSH(log *logger.FunLogger, keyPath, userName, hostUrl string) (*ssh.Client, error) { + keyPath, err := utils.ExpandPath(keyPath) + if err != nil { + return nil, fmt.Errorf("expanding key path: %w", err) + } key, err := os.ReadFile(keyPath) //nolint:gosec // keyPath is from trusted env config if err != nil { return nil, fmt.Errorf("failed to read key file %s: %w", keyPath, err) diff --git a/cmd/cli/dryrun/dryrun.go b/cmd/cli/dryrun/dryrun.go index 7bb147b77..f622ee086 100644 --- a/cmd/cli/dryrun/dryrun.go +++ b/cmd/cli/dryrun/dryrun.go @@ -27,6 +27,7 @@ import ( "github.com/NVIDIA/holodeck/pkg/provider/aws" "github.com/NVIDIA/holodeck/pkg/provisioner" "github.com/NVIDIA/holodeck/pkg/sshutil" + "github.com/NVIDIA/holodeck/pkg/utils" cli "github.com/urfave/cli/v2" "golang.org/x/crypto/ssh" @@ -131,6 +132,10 @@ func validateAWS(log *logger.FunLogger, opts *options) error { // createSshClient creates a ssh client, and retries if it fails to connect func connectOrDie(keyPath, userName, hostUrl string) error { var err error + keyPath, err = utils.ExpandPath(keyPath) + if err != nil { + return fmt.Errorf("expanding key path: %w", err) + } key, err := os.ReadFile(keyPath) // nolint:gosec if err != nil { 
return fmt.Errorf("failed to read key file: %w", err) diff --git a/pkg/provisioner/provisioner.go b/pkg/provisioner/provisioner.go index eaae789d4..a48d02ca0 100644 --- a/pkg/provisioner/provisioner.go +++ b/pkg/provisioner/provisioner.go @@ -451,6 +451,17 @@ func addScriptHeader(tpl *bytes.Buffer) error { func connectOrDie(keyPath, userName, hostUrl string) (*ssh.Client, error) { var client *ssh.Client var err error + if strings.HasPrefix(keyPath, "~") { + home, homeErr := os.UserHomeDir() + if homeErr != nil { + return nil, fmt.Errorf("expanding key path: %w", homeErr) + } + if keyPath == "~" { + keyPath = home + } else { + keyPath = filepath.Join(home, keyPath[2:]) + } + } key, err := os.ReadFile(keyPath) // nolint:gosec if err != nil { return nil, fmt.Errorf("failed to read key file: %w", err) From 9b34cc58bdb78ef3ae0b12a0a8f03d39c13aa33c Mon Sep 17 00:00:00 2001 From: Carlos Eduardo Arango Gutierrez Date: Mon, 9 Mar 2026 16:56:09 +0100 Subject: [PATCH 12/19] fix: use local IP for kubeadm init in all cluster modes, not just HA kubeadm v1.33+ validates the API server via control-plane-endpoint during init. When this is set to a public IP or DNS name, the health check times out because the endpoint isn't routable from within the instance during bootstrap. Previously this was only fixed for HA mode with NLB; now all cluster configurations use the local IP for init. Signed-off-by: Carlos Eduardo Arango Gutierrez --- pkg/provisioner/templates/kubeadm_cluster.go | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/pkg/provisioner/templates/kubeadm_cluster.go b/pkg/provisioner/templates/kubeadm_cluster.go index 0e6c49fe3..be0ed4ed0 100644 --- a/pkg/provisioner/templates/kubeadm_cluster.go +++ b/pkg/provisioner/templates/kubeadm_cluster.go @@ -158,14 +158,13 @@ if [[ ! 
-f /etc/kubernetes/admin.conf ]]; then # Detect this node's private IP for API server binding NODE_PRIVATE_IP=$(hostname -I | awk '{print $1}') - # For HA with NLB, use local IP for init health checks to avoid chicken-and-egg: - # NLB can't route to API server until it's healthy, but kubeadm won't report - # healthy until it can reach the control-plane-endpoint (NLB). - # Solution: init with local endpoint, include NLB in cert SANs for later joins. - if [[ "$IS_HA" == "true" ]] && { [[ "$CONTROL_PLANE_ENDPOINT" == *"elb.amazonaws.com"* ]] || \ - [[ "$CONTROL_PLANE_ENDPOINT" == *"amazonaws.com"* ]]; }; then + # Always use local IP for init health checks: kubeadm v1.33+ validates the API + # server via control-plane-endpoint, which may not be routable from within the + # instance during init (public IPs, NLB DNS, etc.). Use private IP for init and + # include the original endpoint in cert SANs so external access works. + if [[ "$CONTROL_PLANE_ENDPOINT" != "$NODE_PRIVATE_IP" ]]; then INIT_ENDPOINT="${NODE_PRIVATE_IP}" - holodeck_log "INFO" "$COMPONENT" "HA mode: using local IP ${NODE_PRIVATE_IP} for init, NLB in cert SANs" + holodeck_log "INFO" "$COMPONENT" "Using local IP ${NODE_PRIVATE_IP} for init (endpoint: ${CONTROL_PLANE_ENDPOINT} in cert SANs)" else INIT_ENDPOINT="${CONTROL_PLANE_ENDPOINT}" fi From cbb6aa45993ee0102d51fe3bc74a0dee679f723e Mon Sep 17 00:00:00 2001 From: Carlos Eduardo Arango Gutierrez Date: Mon, 9 Mar 2026 18:12:55 +0100 Subject: [PATCH 13/19] fix: use private IP for cluster endpoint in non-HA mode For non-HA clusters, all nodes are in the same VPC so the private IP is always routable. Using the public IP for kubeadm join fails because intra-VPC traffic via public IPs goes through the IGW and may timeout. External access (kubeconfig) is handled by RewriteKubeConfigServer. 
Signed-off-by: Carlos Eduardo Arango Gutierrez --- pkg/provisioner/cluster.go | 10 +++++++--- pkg/provisioner/cluster_test.go | 6 +++--- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/pkg/provisioner/cluster.go b/pkg/provisioner/cluster.go index 6320da9b6..65805662f 100644 --- a/pkg/provisioner/cluster.go +++ b/pkg/provisioner/cluster.go @@ -149,14 +149,18 @@ func (cp *ClusterProvisioner) ProvisionCluster(nodes []NodeInfo) error { return nil } -// determineControlPlaneEndpoint returns the control plane endpoint +// determineControlPlaneEndpoint returns the control plane endpoint for cluster-internal +// communication (kubeadm init, join, API server binding). For HA with NLB, returns the +// NLB DNS. For non-HA, returns the first CP's private IP since all nodes are in the +// same VPC and the private IP is always routable. External access (kubeconfig) is +// handled separately by RewriteKubeConfigServer. func (cp *ClusterProvisioner) determineControlPlaneEndpoint(firstCP NodeInfo) string { // Check if HA is enabled and we have a load balancer DNS if cp.Environment.Status.Cluster != nil && cp.Environment.Status.Cluster.LoadBalancerDNS != "" { return cp.Environment.Status.Cluster.LoadBalancerDNS } - // Fall back to first control-plane public IP so kubeconfig is reachable externally - return firstCP.PublicIP + // Use private IP for intra-VPC communication (init + join) + return firstCP.PrivateIP } // provisionBaseOnAllNodes provisions base dependencies (kernel, driver, runtime, toolkit) diff --git a/pkg/provisioner/cluster_test.go b/pkg/provisioner/cluster_test.go index bdc7a5b2a..fd3b23e13 100644 --- a/pkg/provisioner/cluster_test.go +++ b/pkg/provisioner/cluster_test.go @@ -302,7 +302,7 @@ func TestClusterProvisioner_determineControlPlaneEndpoint(t *testing.T) { expected: "my-lb.elb.amazonaws.com", }, { - name: "Fall back to first CP public IP", + name: "Fall back to first CP private IP", env: &v1alpha1.Environment{ Spec: v1alpha1.EnvironmentSpec{ 
Cluster: &v1alpha1.ClusterSpec{}, @@ -313,7 +313,7 @@ func TestClusterProvisioner_determineControlPlaneEndpoint(t *testing.T) { PublicIP: "54.1.2.3", PrivateIP: "10.0.0.1", }, - expected: "54.1.2.3", + expected: "10.0.0.1", }, { name: "No cluster status", @@ -326,7 +326,7 @@ func TestClusterProvisioner_determineControlPlaneEndpoint(t *testing.T) { PublicIP: "54.1.2.4", PrivateIP: "10.0.0.2", }, - expected: "54.1.2.4", + expected: "10.0.0.2", }, } From b99b4ae8216645ada80713066de103f17c1c0723 Mon Sep 17 00:00:00 2001 From: Carlos Eduardo Arango Gutierrez Date: Tue, 10 Mar 2026 14:49:26 +0100 Subject: [PATCH 14/19] fix: restart cri-dockerd before kubeadm init to prevent socket errors When Docker is the container runtime, CTK installation restarts dockerd between provisioning steps. cri-dockerd loses its Docker connection and crashes. With systemd's StartLimitBurst=3 in 60s, it may not auto-recover by the time kubeadm runs, resulting in "no such file or directory" errors for /run/cri-dockerd.sock. Add a systemctl reset-failed + restart for cri-docker.service before kubeadm init when Docker is the runtime. Also fix hardcoded cri-dockerd.sock references in diagnostics and kubeadm reset to use the template's CriSocket variable, making them work with all runtimes. Signed-off-by: Carlos Eduardo Arango Gutierrez --- pkg/provisioner/templates/kubernetes.go | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/pkg/provisioner/templates/kubernetes.go b/pkg/provisioner/templates/kubernetes.go index 0914c8eec..2a79e258e 100644 --- a/pkg/provisioner/templates/kubernetes.go +++ b/pkg/provisioner/templates/kubernetes.go @@ -179,6 +179,18 @@ sudo systemctl enable --now kubelet holodeck_progress "$COMPONENT" 5 8 "Initializing Kubernetes cluster" +# Ensure CRI socket service is running before kubeadm init. +# When Docker is the runtime, CTK installation restarts dockerd between the +# Docker and kubeadm provisioning steps. 
cri-dockerd loses its Docker +# connection and crashes. With systemd StartLimitBurst=3 in 60s, it may +# not auto-recover by the time kubeadm runs. +{{- if eq .CriSocket "unix:///run/cri-dockerd.sock" }} +holodeck_log "INFO" "$COMPONENT" "Ensuring cri-dockerd is running" +sudo systemctl reset-failed cri-docker.service 2>/dev/null || true +sudo systemctl restart cri-docker.service +sleep 2 +{{- end }} + # Initialize cluster only if not already initialized if [[ ! -f /etc/kubernetes/admin.conf ]]; then # Pre-pull images before init. kubeadm init with --ignore-preflight-errors=all @@ -248,7 +260,7 @@ apiServer:\\ holodeck_log "INFO" "$COMPONENT" "--- kubelet logs (last 30 lines) ---" sudo journalctl -u kubelet --no-pager -n 30 2>&1 || true holodeck_log "INFO" "$COMPONENT" "--- container status via crictl ---" - sudo crictl --runtime-endpoint unix:///run/cri-dockerd.sock ps -a 2>&1 || true + sudo crictl --runtime-endpoint {{ .CriSocket }} ps -a 2>&1 || true holodeck_log "INFO" "$COMPONENT" "--- container status via docker ---" sudo docker ps -a 2>&1 || true holodeck_log "INFO" "$COMPONENT" "--- kubeadm-flags.env ---" @@ -257,7 +269,7 @@ apiServer:\\ if [[ $KUBEADM_ATTEMPT -lt $KUBEADM_MAX_ATTEMPTS ]]; then holodeck_log "INFO" "$COMPONENT" "Resetting cluster state before retry" - sudo kubeadm reset -f --cri-socket "unix:///run/cri-dockerd.sock" 2>&1 || true + sudo kubeadm reset -f --cri-socket "{{ .CriSocket }}" 2>&1 || true # Re-enable kubelet after reset sudo systemctl daemon-reload sudo systemctl restart kubelet From 5b8fda3cd08dd99a5bf5ca2f1158d67ddaef2b09 Mon Sep 17 00:00:00 2001 From: Carlos Eduardo Arango Gutierrez Date: Tue, 10 Mar 2026 14:59:41 +0100 Subject: [PATCH 15/19] fix: ensure nvidia-container-runtime exists at /usr/bin on all platforms nvidia-ctk runtime configure hardcodes /usr/bin/nvidia-container-runtime in the container runtime config. 
When CTK is built from source, the binary gets installed to /usr/local/bin, causing containerd to fail with "fork/exec /usr/bin/nvidia-container-runtime: no such file or directory" for all pods including control plane components. Split the symlink logic into two steps: 1. Ensure nvidia-container-runtime is in PATH (symlink from nvidia-ctk) 2. Unconditionally ensure /usr/bin/nvidia-container-runtime exists Applied to both git and latest CTK templates. Signed-off-by: Carlos Eduardo Arango Gutierrez --- .../templates/container-toolkit.go | 25 +++++++++++++------ 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/pkg/provisioner/templates/container-toolkit.go b/pkg/provisioner/templates/container-toolkit.go index 0d2ebfa8b..d9c541ff3 100644 --- a/pkg/provisioner/templates/container-toolkit.go +++ b/pkg/provisioner/templates/container-toolkit.go @@ -313,10 +313,15 @@ if ! command -v nvidia-container-runtime &>/dev/null; then holodeck_log "INFO" "$COMPONENT" "Creating nvidia-container-runtime symlink from nvidia-ctk" CTK_PATH=$(command -v nvidia-ctk) sudo ln -sf "$CTK_PATH" "$(dirname "$CTK_PATH")/nvidia-container-runtime" - # Also ensure it's in /usr/bin for container runtime configs - if [[ ! -f /usr/bin/nvidia-container-runtime ]]; then - sudo ln -sf "$CTK_PATH" /usr/bin/nvidia-container-runtime - fi +fi +# nvidia-ctk runtime configure hardcodes /usr/bin/nvidia-container-runtime in +# the container runtime config. Ensure a binary or symlink exists there even +# when the actual binary was installed elsewhere (e.g. /usr/local/bin from +# a source build). +if [[ ! -f /usr/bin/nvidia-container-runtime ]]; then + RUNTIME_SRC=$(command -v nvidia-container-runtime 2>/dev/null || command -v nvidia-ctk) + holodeck_log "INFO" "$COMPONENT" "Symlinking ${RUNTIME_SRC} -> /usr/bin/nvidia-container-runtime" + sudo ln -sf "$RUNTIME_SRC" /usr/bin/nvidia-container-runtime fi holodeck_progress "$COMPONENT" 5 5 "Configuring runtime" @@ -530,9 +535,15 @@ if ! 
command -v nvidia-container-runtime &>/dev/null; then holodeck_log "INFO" "$COMPONENT" "Creating nvidia-container-runtime symlink from nvidia-ctk" CTK_PATH=$(command -v nvidia-ctk) sudo ln -sf "$CTK_PATH" "$(dirname "$CTK_PATH")/nvidia-container-runtime" - if [[ ! -f /usr/bin/nvidia-container-runtime ]]; then - sudo ln -sf "$CTK_PATH" /usr/bin/nvidia-container-runtime - fi +fi +# nvidia-ctk runtime configure hardcodes /usr/bin/nvidia-container-runtime in +# the container runtime config. Ensure a binary or symlink exists there even +# when the actual binary was installed elsewhere (e.g. /usr/local/bin from +# a source build). +if [[ ! -f /usr/bin/nvidia-container-runtime ]]; then + RUNTIME_SRC=$(command -v nvidia-container-runtime 2>/dev/null || command -v nvidia-ctk) + holodeck_log "INFO" "$COMPONENT" "Symlinking ${RUNTIME_SRC} -> /usr/bin/nvidia-container-runtime" + sudo ln -sf "$RUNTIME_SRC" /usr/bin/nvidia-container-runtime fi holodeck_progress "$COMPONENT" 5 5 "Configuring runtime" From f40a5625b3cd11a36d65b9b86bd0377959440cc3 Mon Sep 17 00:00:00 2001 From: Carlos Eduardo Arango Gutierrez Date: Tue, 10 Mar 2026 15:02:25 +0100 Subject: [PATCH 16/19] fix: prevent duplicate apiServer block in kubeadm config with feature gates When feature gates are enabled (e.g. DRA), the kubeadm config template already contains an apiServer: block with extraArgs. The certSANs injection via sed was unconditionally appending a new apiServer: block, creating duplicate YAML keys. kubeadm uses the last occurrence, so the certSANs were silently ignored. Now detect whether apiServer: already exists in the config and inject certSANs into it rather than creating a duplicate block. 
Signed-off-by: Carlos Eduardo Arango Gutierrez --- pkg/provisioner/templates/kubernetes.go | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/pkg/provisioner/templates/kubernetes.go b/pkg/provisioner/templates/kubernetes.go index 2a79e258e..aac5e700e 100644 --- a/pkg/provisioner/templates/kubernetes.go +++ b/pkg/provisioner/templates/kubernetes.go @@ -234,14 +234,25 @@ if [[ ! -f /etc/kubernetes/admin.conf ]]; then KUBEADM_NODE_IP=$(hostname -I | awk '{print $1}') sudo sed -i "s|controlPlaneEndpoint: \"${K8S_ENDPOINT_HOST}:6443\"|controlPlaneEndpoint: \"${KUBEADM_NODE_IP}:6443\"|" \ /etc/kubernetes/kubeadm-config.yaml - # Inject certSANs into ClusterConfiguration if not already present + # Inject certSANs into ClusterConfiguration so the API server cert + # covers both the public endpoint and the private IP we use for init. if ! grep -q 'certSANs' /etc/kubernetes/kubeadm-config.yaml; then - sudo sed -i "/^controlPlaneEndpoint:/a\\ + if grep -q '^apiServer:' /etc/kubernetes/kubeadm-config.yaml; then + # apiServer block exists (e.g. 
feature gates) — inject certSANs into it + sudo sed -i "/^apiServer:/a\\ + certSANs:\\ + - \"${K8S_ENDPOINT_HOST}\"\\ + - \"${KUBEADM_NODE_IP}\"\\ + - \"localhost\"" /etc/kubernetes/kubeadm-config.yaml + else + # No apiServer block — create one after controlPlaneEndpoint + sudo sed -i "/^controlPlaneEndpoint:/a\\ apiServer:\\ certSANs:\\ - \"${K8S_ENDPOINT_HOST}\"\\ - \"${KUBEADM_NODE_IP}\"\\ - \"localhost\"" /etc/kubernetes/kubeadm-config.yaml + fi fi sudo kubeadm init \ --config /etc/kubernetes/kubeadm-config.yaml \ From 3f479cd0d7ad1a4855a259db328e1ec185b66880 Mon Sep 17 00:00:00 2001 From: Carlos Eduardo Arango Gutierrez Date: Tue, 10 Mar 2026 18:01:09 +0100 Subject: [PATCH 17/19] fix: resolve CI lint failures (goimports, gosec, markdown) - Fix goimports grouping in kubeconfig.go (third-party before internal) - Suppress gosec G304 in test file (path from t.TempDir()) - Fix MD029 ordered list prefix in custom-templates.md - Fix MD013 line length in examples/README.md Signed-off-by: Carlos Eduardo Arango Gutierrez --- docs/examples/README.md | 15 ++++++++++----- docs/guides/custom-templates.md | 12 ++++++------ pkg/utils/kubeconfig.go | 3 ++- pkg/utils/kubeconfig_test.go | 2 +- 4 files changed, 19 insertions(+), 13 deletions(-) diff --git a/docs/examples/README.md b/docs/examples/README.md index 5b621399e..6b000f29d 100644 --- a/docs/examples/README.md +++ b/docs/examples/README.md @@ -129,9 +129,12 @@ Install the NVIDIA driver using the official `.run` installer. 
**Files:** -- [`examples/ctk_package_pinned.yaml`](../../examples/ctk_package_pinned.yaml) — CTK pinned to a specific version -- [`examples/ctk_git_source.yaml`](../../examples/ctk_git_source.yaml) — CTK built from git -- [`examples/ctk_latest_source.yaml`](../../examples/ctk_latest_source.yaml) — CTK tracking latest branch +- [`ctk_package_pinned.yaml`](../../examples/ctk_package_pinned.yaml) + — CTK pinned to a specific version +- [`ctk_git_source.yaml`](../../examples/ctk_git_source.yaml) + — CTK built from git +- [`ctk_latest_source.yaml`](../../examples/ctk_latest_source.yaml) + — CTK tracking latest branch See the [CTK Installation Sources Guide](../guides/ctk-sources.md) for detailed configuration options. @@ -140,8 +143,10 @@ detailed configuration options. **Files:** -- [`examples/runtime_containerd_git.yaml`](../../examples/runtime_containerd_git.yaml) — Containerd built from git -- [`examples/runtime_containerd_latest.yaml`](../../examples/runtime_containerd_latest.yaml) — Containerd tracking latest +- [`runtime_containerd_git.yaml`](../../examples/runtime_containerd_git.yaml) + — Containerd built from git +- [`runtime_containerd_latest.yaml`](../../examples/runtime_containerd_latest.yaml) + — Containerd tracking latest See the [Container Runtime Sources Guide](../guides/runtime-sources.md) for all runtime options. diff --git a/docs/guides/custom-templates.md b/docs/guides/custom-templates.md index 3a511b218..f4d71caff 100644 --- a/docs/guides/custom-templates.md +++ b/docs/guides/custom-templates.md @@ -178,15 +178,15 @@ customTemplates: 1. **Use `pre-install` for system prerequisites** like package repos, kernel parameters, or certificates. -2. **Use `post-kubernetes` for workload deployment** since the +1. **Use `post-kubernetes` for workload deployment** since the cluster is ready at that point. -3. **Use `post-install` for validation scripts** that verify +1. **Use `post-install` for validation scripts** that verify the full stack. -4. 
**Set `continueOnError: true` for non-critical scripts** +1. **Set `continueOnError: true` for non-critical scripts** like monitoring or logging. -5. **Add checksums for URL sources** to ensure script integrity. -6. **Keep scripts idempotent** so re-runs produce the same result. -7. **Test with `holodeck dryrun`** to validate configuration +1. **Add checksums for URL sources** to ensure script integrity. +1. **Keep scripts idempotent** so re-runs produce the same result. +1. **Test with `holodeck dryrun`** to validate configuration before provisioning. ## Related diff --git a/pkg/utils/kubeconfig.go b/pkg/utils/kubeconfig.go index 2c358e642..8c10decfc 100644 --- a/pkg/utils/kubeconfig.go +++ b/pkg/utils/kubeconfig.go @@ -21,10 +21,11 @@ import ( "io" "os" + "sigs.k8s.io/yaml" + "github.com/NVIDIA/holodeck/api/holodeck/v1alpha1" "github.com/NVIDIA/holodeck/internal/logger" "github.com/NVIDIA/holodeck/pkg/provisioner" - "sigs.k8s.io/yaml" ) // kubeConfig is a minimal representation for server URL rewriting. diff --git a/pkg/utils/kubeconfig_test.go b/pkg/utils/kubeconfig_test.go index 910f19e05..acf093088 100644 --- a/pkg/utils/kubeconfig_test.go +++ b/pkg/utils/kubeconfig_test.go @@ -95,7 +95,7 @@ users: err = RewriteKubeConfigServer(path, tt.serverURL) require.NoError(t, err) - data, err := os.ReadFile(path) + data, err := os.ReadFile(path) //nolint:gosec // test file from t.TempDir() require.NoError(t, err) if tt.serverURL == "" { From bf0b12aeb50e6825e0f417d5789bb3ae3377dc2f Mon Sep 17 00:00:00 2001 From: Carlos Eduardo Arango Gutierrez Date: Tue, 10 Mar 2026 19:16:35 +0100 Subject: [PATCH 18/19] fix: disable repo_gpgcheck for NVIDIA CTK repo on RPM platforms The upstream NVIDIA container toolkit repository has intermittently broken repomd.xml GPG signatures, causing dnf metadata download failures on Amazon Linux 2023 and other RPM-based distros. 
Disable repo-level GPG check (repo_gpgcheck=0) while keeping individual RPM package GPG verification (gpgcheck=1) intact. Signed-off-by: Carlos Eduardo Arango Gutierrez --- pkg/provisioner/templates/container-toolkit.go | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pkg/provisioner/templates/container-toolkit.go b/pkg/provisioner/templates/container-toolkit.go index d9c541ff3..42c925053 100644 --- a/pkg/provisioner/templates/container-toolkit.go +++ b/pkg/provisioner/templates/container-toolkit.go @@ -84,6 +84,11 @@ case "${HOLODECK_OS_FAMILY}" in if [[ ! -f /etc/yum.repos.d/nvidia-container-toolkit.repo ]]; then sudo curl -fsSL -o /etc/yum.repos.d/nvidia-container-toolkit.repo \ "https://nvidia.github.io/libnvidia-container/${CHANNEL}/rpm/nvidia-container-toolkit.repo" + # Disable repo metadata GPG check — upstream repomd.xml signature + # is intermittently broken. Individual RPM packages are still + # GPG-verified via gpgcheck=1. + sudo sed -i 's/^repo_gpgcheck=1/repo_gpgcheck=0/' \ + /etc/yum.repos.d/nvidia-container-toolkit.repo fi holodeck_retry 3 "$COMPONENT" pkg_update ;; From c8c19652d5597b8c2e4f3abbbff53cf916bdbc72 Mon Sep 17 00:00:00 2001 From: Carlos Eduardo Arango Gutierrez Date: Tue, 10 Mar 2026 20:19:28 +0100 Subject: [PATCH 19/19] fix: pass --cri-socket to legacy kubeadm commands on multi-CRI hosts On AL2023, both containerd and CRI-O sockets are present when using Docker or CRI-O runtimes. The legacy kubeadm init path (k8s < v1.32) did not pass --cri-socket, causing kubeadm to fail with "found multiple CRI endpoints". The config-file path is unaffected because the kubeadm config already specifies the socket. 
Signed-off-by: Carlos Eduardo Arango Gutierrez --- pkg/provisioner/templates/kubernetes.go | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pkg/provisioner/templates/kubernetes.go b/pkg/provisioner/templates/kubernetes.go index aac5e700e..e1d7ba3f7 100644 --- a/pkg/provisioner/templates/kubernetes.go +++ b/pkg/provisioner/templates/kubernetes.go @@ -199,7 +199,8 @@ if [[ ! -f /etc/kubernetes/admin.conf ]]; then holodeck_log "INFO" "$COMPONENT" "Pre-pulling control plane images" {{- if .UseLegacyInit }} holodeck_retry 3 "$COMPONENT" sudo kubeadm config images pull \ - --kubernetes-version="${K8S_VERSION}" + --kubernetes-version="${K8S_VERSION}" \ + --cri-socket "{{ .CriSocket }}" {{- else }} holodeck_retry 3 "$COMPONENT" sudo kubeadm config images pull \ --config /etc/kubernetes/kubeadm-config.yaml @@ -221,6 +222,7 @@ if [[ ! -f /etc/kubernetes/admin.conf ]]; then KUBEADM_NODE_IP=$(hostname -I | awk '{print $1}') sudo kubeadm init \ --kubernetes-version="${K8S_VERSION}" \ + --cri-socket "{{ .CriSocket }}" \ --pod-network-cidr=192.168.0.0/16 \ --control-plane-endpoint="${KUBEADM_NODE_IP}:6443" \ --apiserver-advertise-address="${KUBEADM_NODE_IP}" \