Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
19 commits
Select commit Hold shift + click to select a range
7f8f06f
fix: use local IP for kubeadm init health checks (single-node)
ArangoGutierrez Mar 9, 2026
e857b51
fix: HA kubeadm init uses local IP to avoid NLB chicken-and-egg
ArangoGutierrez Mar 9, 2026
cb02c68
fix: create nvidia-container-runtime symlink on RPM platforms
ArangoGutierrez Mar 9, 2026
38ffde8
fix: truncate NLB target group name to AWS 32-char limit
ArangoGutierrez Mar 9, 2026
5a2e19a
test: add explicit K8s version to RPM test configs
ArangoGutierrez Mar 9, 2026
ce675ca
fix: use public IP for control plane endpoint when no LB
ArangoGutierrez Mar 9, 2026
c8ec13b
fix: add self-referencing SG rules for cluster inter-node traffic
ArangoGutierrez Mar 9, 2026
f835f00
fix: add self-referencing SG rules to single-node security group
ArangoGutierrez Mar 9, 2026
318f0de
fix: rewrite kubeconfig server URL to public endpoint after download
ArangoGutierrez Mar 9, 2026
05e6dc9
feat: add ExpandPath utility for tilde expansion
ArangoGutierrez Mar 9, 2026
16c165c
fix: expand tilde in privateKey paths before reading
ArangoGutierrez Mar 9, 2026
9b34cc5
fix: use local IP for kubeadm init in all cluster modes, not just HA
ArangoGutierrez Mar 9, 2026
cbb6aa4
fix: use private IP for cluster endpoint in non-HA mode
ArangoGutierrez Mar 9, 2026
b99b4ae
fix: restart cri-dockerd before kubeadm init to prevent socket errors
ArangoGutierrez Mar 10, 2026
5b8fda3
fix: ensure nvidia-container-runtime exists at /usr/bin on all platforms
ArangoGutierrez Mar 10, 2026
f40a562
fix: prevent duplicate apiServer block in kubeadm config with feature…
ArangoGutierrez Mar 10, 2026
3f479cd
fix: resolve CI lint failures (goimports, gosec, markdown)
ArangoGutierrez Mar 10, 2026
bf0b12a
fix: disable repo_gpgcheck for NVIDIA CTK repo on RPM platforms
ArangoGutierrez Mar 10, 2026
c8c1965
fix: pass --cri-socket to legacy kubeadm commands on multi-CRI hosts
ArangoGutierrez Mar 10, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion cmd/action/ci/entrypoint.go
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@ func entrypoint(log *logger.FunLogger) error {
}

if cfg.Spec.Kubernetes.Install {
err = utils.GetKubeConfig(log, &cfg, hostUrl, kubeconfig)
err = utils.GetKubeConfig(log, &cfg, hostUrl, kubeconfig, "")
if err != nil {
return fmt.Errorf("failed to get kubeconfig: %w", err)
}
Expand Down
5 changes: 5 additions & 0 deletions cmd/cli/common/host.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ import (
"github.com/NVIDIA/holodeck/internal/logger"
"github.com/NVIDIA/holodeck/pkg/provider/aws"
"github.com/NVIDIA/holodeck/pkg/sshutil"
"github.com/NVIDIA/holodeck/pkg/utils"
)

// GetHostURL resolves the SSH-reachable host URL for an environment.
Expand Down Expand Up @@ -82,6 +83,10 @@ const (
// ConnectSSH establishes an SSH connection with retries.
// Host key verification uses Trust-On-First-Use (TOFU).
func ConnectSSH(log *logger.FunLogger, keyPath, userName, hostUrl string) (*ssh.Client, error) {
keyPath, err := utils.ExpandPath(keyPath)
if err != nil {
return nil, fmt.Errorf("expanding key path: %w", err)
}
key, err := os.ReadFile(keyPath) //nolint:gosec // keyPath is from trusted env config
if err != nil {
return nil, fmt.Errorf("failed to read key file %s: %w", keyPath, err)
Expand Down
4 changes: 2 additions & 2 deletions cmd/cli/create/create.go
Original file line number Diff line number Diff line change
Expand Up @@ -485,7 +485,7 @@ func runSingleNodeProvision(log *logger.FunLogger, opts *options) error {
break
}
}
if err = utils.GetKubeConfig(log, &opts.cache, hostUrl, opts.kubeconfig); err != nil {
if err = utils.GetKubeConfig(log, &opts.cache, hostUrl, opts.kubeconfig, ""); err != nil {
return fmt.Errorf("failed to get kubeconfig: %w", err)
}
}
Expand Down Expand Up @@ -563,7 +563,7 @@ func runMultinodeProvision(log *logger.FunLogger, opts *options) error {
}
}
if hostUrl != "" {
if err := utils.GetKubeConfig(log, &opts.cache, hostUrl, opts.kubeconfig); err != nil {
if err := utils.GetKubeConfig(log, &opts.cache, hostUrl, opts.kubeconfig, ""); err != nil {
return fmt.Errorf("failed to get kubeconfig: %w", err)
}
}
Expand Down
40 changes: 20 additions & 20 deletions cmd/cli/delete/delete_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -131,8 +131,8 @@ var _ = Describe("Delete Command", func() {

It("should delete single SSH instance successfully", func() {
// Create a valid SSH cache file
yaml := sshCacheYAML("sshdelete1", "ssh-delete-test")
cacheFile := filepath.Join(tempDir, "sshdelete1.yaml")
yaml := sshCacheYAML("a1b2c3d4", "ssh-delete-test")
cacheFile := filepath.Join(tempDir, "a1b2c3d4.yaml")
err := os.WriteFile(cacheFile, []byte(yaml), 0600)
Expect(err).NotTo(HaveOccurred())

Expand All @@ -141,7 +141,7 @@ var _ = Describe("Delete Command", func() {
Commands: []*cli.Command{cmd},
}

err = app.Run([]string{"holodeck", "delete", "--cachepath", tempDir, "sshdelete1"})
err = app.Run([]string{"holodeck", "delete", "--cachepath", tempDir, "a1b2c3d4"})
Expect(err).NotTo(HaveOccurred())

// Verify cache file was removed
Expand All @@ -150,15 +150,15 @@ var _ = Describe("Delete Command", func() {

// Verify success message
Expect(buf.String()).To(ContainSubstring("Successfully deleted"))
Expect(buf.String()).To(ContainSubstring("sshdelete1"))
Expect(buf.String()).To(ContainSubstring("a1b2c3d4"))
})

It("should delete multiple SSH instances successfully", func() {
// Create two cache files
yaml1 := sshCacheYAML("sshmulti1", "ssh-multi-1")
yaml2 := sshCacheYAML("sshmulti2", "ssh-multi-2")
cacheFile1 := filepath.Join(tempDir, "sshmulti1.yaml")
cacheFile2 := filepath.Join(tempDir, "sshmulti2.yaml")
yaml1 := sshCacheYAML("e5f6a7b8", "ssh-multi-1")
yaml2 := sshCacheYAML("c9d0e1f2", "ssh-multi-2")
cacheFile1 := filepath.Join(tempDir, "e5f6a7b8.yaml")
cacheFile2 := filepath.Join(tempDir, "c9d0e1f2.yaml")
err := os.WriteFile(cacheFile1, []byte(yaml1), 0600)
Expect(err).NotTo(HaveOccurred())
err = os.WriteFile(cacheFile2, []byte(yaml2), 0600)
Expand All @@ -169,7 +169,7 @@ var _ = Describe("Delete Command", func() {
Commands: []*cli.Command{cmd},
}

err = app.Run([]string{"holodeck", "delete", "--cachepath", tempDir, "sshmulti1", "sshmulti2"})
err = app.Run([]string{"holodeck", "delete", "--cachepath", tempDir, "e5f6a7b8", "c9d0e1f2"})
Expect(err).NotTo(HaveOccurred())

// Verify both cache files were removed
Expand All @@ -179,14 +179,14 @@ var _ = Describe("Delete Command", func() {
Expect(os.IsNotExist(err)).To(BeTrue())

// Verify success messages for both
Expect(buf.String()).To(ContainSubstring("sshmulti1"))
Expect(buf.String()).To(ContainSubstring("sshmulti2"))
Expect(buf.String()).To(ContainSubstring("e5f6a7b8"))
Expect(buf.String()).To(ContainSubstring("c9d0e1f2"))
})

It("should stop on first error with multiple instances", func() {
// Create only one valid cache file
yaml := sshCacheYAML("sshvalid1", "ssh-valid")
cacheFile := filepath.Join(tempDir, "sshvalid1.yaml")
yaml := sshCacheYAML("a3b4c5d6", "ssh-valid")
cacheFile := filepath.Join(tempDir, "a3b4c5d6.yaml")
err := os.WriteFile(cacheFile, []byte(yaml), 0600)
Expect(err).NotTo(HaveOccurred())

Expand All @@ -196,7 +196,7 @@ var _ = Describe("Delete Command", func() {
}

// First instance doesn't exist, should fail before processing second
err = app.Run([]string{"holodeck", "delete", "--cachepath", tempDir, "nonexistent", "sshvalid1"})
err = app.Run([]string{"holodeck", "delete", "--cachepath", tempDir, "nonexistent", "a3b4c5d6"})
Expect(err).To(HaveOccurred())
Expect(err.Error()).To(ContainSubstring("failed to get instance nonexistent"))

Expand All @@ -207,8 +207,8 @@ var _ = Describe("Delete Command", func() {

It("should fail if second instance doesn't exist", func() {
// Create only one valid cache file
yaml := sshCacheYAML("sshfirst1", "ssh-first")
cacheFile := filepath.Join(tempDir, "sshfirst1.yaml")
yaml := sshCacheYAML("e7f8a9b0", "ssh-first")
cacheFile := filepath.Join(tempDir, "e7f8a9b0.yaml")
err := os.WriteFile(cacheFile, []byte(yaml), 0600)
Expect(err).NotTo(HaveOccurred())

Expand All @@ -218,7 +218,7 @@ var _ = Describe("Delete Command", func() {
}

// First succeeds, second fails
err = app.Run([]string{"holodeck", "delete", "--cachepath", tempDir, "sshfirst1", "nonexistent"})
err = app.Run([]string{"holodeck", "delete", "--cachepath", tempDir, "e7f8a9b0", "nonexistent"})
Expect(err).To(HaveOccurred())
Expect(err.Error()).To(ContainSubstring("failed to get instance nonexistent"))

Expand All @@ -235,8 +235,8 @@ var _ = Describe("Delete Command", func() {
DeferCleanup(os.RemoveAll, tempDir)

// Create a cache file in custom path
yaml := sshCacheYAML("customdel", "custom-delete")
cacheFile := filepath.Join(tempDir, "customdel.yaml")
yaml := sshCacheYAML("f1e2d3c4", "custom-delete")
cacheFile := filepath.Join(tempDir, "f1e2d3c4.yaml")
err = os.WriteFile(cacheFile, []byte(yaml), 0600)
Expect(err).NotTo(HaveOccurred())

Expand All @@ -246,7 +246,7 @@ var _ = Describe("Delete Command", func() {
}

// Use -c alias for cachepath
err = app.Run([]string{"holodeck", "delete", "-c", tempDir, "customdel"})
err = app.Run([]string{"holodeck", "delete", "-c", tempDir, "f1e2d3c4"})
Expect(err).NotTo(HaveOccurred())

// Verify cache file was removed
Expand Down
5 changes: 5 additions & 0 deletions cmd/cli/dryrun/dryrun.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ import (
"github.com/NVIDIA/holodeck/pkg/provider/aws"
"github.com/NVIDIA/holodeck/pkg/provisioner"
"github.com/NVIDIA/holodeck/pkg/sshutil"
"github.com/NVIDIA/holodeck/pkg/utils"

cli "github.com/urfave/cli/v2"
"golang.org/x/crypto/ssh"
Expand Down Expand Up @@ -131,6 +132,10 @@ func validateAWS(log *logger.FunLogger, opts *options) error {
// createSshClient creates a ssh client, and retries if it fails to connect
func connectOrDie(keyPath, userName, hostUrl string) error {
var err error
keyPath, err = utils.ExpandPath(keyPath)
if err != nil {
return fmt.Errorf("expanding key path: %w", err)
}
key, err := os.ReadFile(keyPath) // nolint:gosec
if err != nil {
return fmt.Errorf("failed to read key file: %w", err)
Expand Down
2 changes: 1 addition & 1 deletion cmd/cli/get/get.go
Original file line number Diff line number Diff line change
Expand Up @@ -181,7 +181,7 @@ func (m command) runKubeconfig(instanceID string) error {
}

// Download kubeconfig
if err := utils.GetKubeConfig(m.log, &env, hostUrl, outputPath); err != nil {
if err := utils.GetKubeConfig(m.log, &env, hostUrl, outputPath, ""); err != nil {
return fmt.Errorf("failed to download kubeconfig: %w", err)
}

Expand Down
4 changes: 2 additions & 2 deletions cmd/cli/status/status_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -130,14 +130,14 @@ var _ = Describe("Status Command", func() {
Expect(err).NotTo(HaveOccurred())
DeferCleanup(os.RemoveAll, tempDir)

instanceID := "test12345678"
instanceID := "ab12cd34"
cacheFile := filepath.Join(tempDir, instanceID+".yaml")
validYAML := `apiVersion: holodeck.nvidia.com/v1alpha1
kind: Environment
metadata:
name: test-environment
labels:
holodeck-instance-id: test12345678
holodeck-instance-id: ab12cd34
spec:
provider: ssh
username: testuser
Expand Down
15 changes: 10 additions & 5 deletions docs/examples/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -129,9 +129,12 @@ Install the NVIDIA driver using the official `.run` installer.

**Files:**

- [`examples/ctk_package_pinned.yaml`](../../examples/ctk_package_pinned.yaml) — CTK pinned to a specific version
- [`examples/ctk_git_source.yaml`](../../examples/ctk_git_source.yaml) — CTK built from git
- [`examples/ctk_latest_source.yaml`](../../examples/ctk_latest_source.yaml) — CTK tracking latest branch
- [`ctk_package_pinned.yaml`](../../examples/ctk_package_pinned.yaml)
— CTK pinned to a specific version
- [`ctk_git_source.yaml`](../../examples/ctk_git_source.yaml)
— CTK built from git
- [`ctk_latest_source.yaml`](../../examples/ctk_latest_source.yaml)
— CTK tracking latest branch

See the [CTK Installation Sources Guide](../guides/ctk-sources.md) for
detailed configuration options.
Expand All @@ -140,8 +143,10 @@ detailed configuration options.

**Files:**

- [`examples/runtime_containerd_git.yaml`](../../examples/runtime_containerd_git.yaml) — Containerd built from git
- [`examples/runtime_containerd_latest.yaml`](../../examples/runtime_containerd_latest.yaml) — Containerd tracking latest
- [`runtime_containerd_git.yaml`](../../examples/runtime_containerd_git.yaml)
— Containerd built from git
- [`runtime_containerd_latest.yaml`](../../examples/runtime_containerd_latest.yaml)
— Containerd tracking latest

See the [Container Runtime Sources Guide](../guides/runtime-sources.md)
for all runtime options.
Expand Down
12 changes: 6 additions & 6 deletions docs/guides/custom-templates.md
Original file line number Diff line number Diff line change
Expand Up @@ -178,15 +178,15 @@ customTemplates:

1. **Use `pre-install` for system prerequisites** like package
repos, kernel parameters, or certificates.
2. **Use `post-kubernetes` for workload deployment** since the
1. **Use `post-kubernetes` for workload deployment** since the
cluster is ready at that point.
3. **Use `post-install` for validation scripts** that verify
1. **Use `post-install` for validation scripts** that verify
the full stack.
4. **Set `continueOnError: true` for non-critical scripts**
1. **Set `continueOnError: true` for non-critical scripts**
like monitoring or logging.
5. **Add checksums for URL sources** to ensure script integrity.
6. **Keep scripts idempotent** so re-runs produce the same result.
7. **Test with `holodeck dryrun`** to validate configuration
1. **Add checksums for URL sources** to ensure script integrity.
1. **Keep scripts idempotent** so re-runs produce the same result.
1. **Test with `holodeck dryrun`** to validate configuration
before provisioning.

## Related
Expand Down
26 changes: 26 additions & 0 deletions pkg/provider/aws/cluster.go
Original file line number Diff line number Diff line change
Expand Up @@ -323,6 +323,32 @@ func (p *Provider) createClusterSecurityGroup(cache *ClusterCache) error {
},
}

// Self-referencing rules: allow all traffic between instances in this SG.
// Covers webhooks (dynamic ports), NodePort (30000-32767), IPIP (Calico),
// and any future K8s inter-node communication.
// Uses explicit TCP+UDP+ICMP (not protocol -1) for stricter compliance.
sgRef := []types.UserIdGroupPair{{GroupId: sgOutput.GroupId}}
permissions = append(permissions,
types.IpPermission{
FromPort: aws.Int32(0),
ToPort: aws.Int32(65535),
IpProtocol: aws.String("tcp"),
UserIdGroupPairs: sgRef,
},
types.IpPermission{
FromPort: aws.Int32(0),
ToPort: aws.Int32(65535),
IpProtocol: aws.String("udp"),
UserIdGroupPairs: sgRef,
},
types.IpPermission{
FromPort: aws.Int32(-1),
ToPort: aws.Int32(-1),
IpProtocol: aws.String("icmp"),
UserIdGroupPairs: sgRef,
},
)

irInput := &ec2.AuthorizeSecurityGroupIngressInput{
GroupId: sgOutput.GroupId,
IpPermissions: permissions,
Expand Down
19 changes: 19 additions & 0 deletions pkg/provider/aws/create.go
Original file line number Diff line number Diff line change
Expand Up @@ -404,6 +404,25 @@ func (p *Provider) createSecurityGroup(cache *AWS) error {
IpProtocol: &tcp,
IpRanges: ipRanges,
},
// Self-referencing: allow all TCP/UDP/ICMP between SG members
{
FromPort: aws.Int32(0),
ToPort: aws.Int32(65535),
IpProtocol: aws.String("tcp"),
UserIdGroupPairs: []types.UserIdGroupPair{{GroupId: sgOutput.GroupId}},
},
{
FromPort: aws.Int32(0),
ToPort: aws.Int32(65535),
IpProtocol: aws.String("udp"),
UserIdGroupPairs: []types.UserIdGroupPair{{GroupId: sgOutput.GroupId}},
},
{
FromPort: aws.Int32(-1),
ToPort: aws.Int32(-1),
IpProtocol: aws.String("icmp"),
UserIdGroupPairs: []types.UserIdGroupPair{{GroupId: sgOutput.GroupId}},
},
},
}

Expand Down
19 changes: 17 additions & 2 deletions pkg/provider/aws/nlb.go
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,14 @@ func (p *Provider) createNLB(cache *ClusterCache) error {
cancelLoading := p.log.Loading("Creating Network Load Balancer")

lbType := elbv2types.LoadBalancerTypeEnumNetwork
lbName := fmt.Sprintf("%s-nlb", p.ObjectMeta.Name)
// AWS load balancer names are limited to 32 characters.
const nlbSuffix = "-nlb"
maxNLBNameLen := 32 - len(nlbSuffix)
nlbBaseName := p.ObjectMeta.Name
if len(nlbBaseName) > maxNLBNameLen {
nlbBaseName = nlbBaseName[:maxNLBNameLen]
}
lbName := nlbBaseName + nlbSuffix
Comment on lines +48 to +55
Copy link

Copilot AI Mar 10, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Truncating the environment name to fit the 32-char AWS limit can cause name collisions for different environments that share a long common prefix, leading to CreateLoadBalancer/CreateTargetGroup failures that are hard to diagnose. Consider incorporating a short, deterministic suffix (e.g., a hash of the full env name) when truncation is needed so names remain unique while still meeting the length constraint.

Copilot uses AI. Check for mistakes.

// Determine subnet IDs (use the same subnet for NLB)
subnetIDs := []string{cache.Subnetid}
Expand Down Expand Up @@ -91,7 +98,15 @@ func (p *Provider) createTargetGroup(cache *ClusterCache) error {

cancelLoading := p.log.Loading("Creating target group for Kubernetes API")

tgName := fmt.Sprintf("%s-k8s-api-tg", p.ObjectMeta.Name)
// AWS target group names are limited to 32 characters.
// Truncate the environment name to fit within the limit.
const tgSuffix = "-k8s-tg"
maxNameLen := 32 - len(tgSuffix)
name := p.ObjectMeta.Name
if len(name) > maxNameLen {
name = name[:maxNameLen]
}
tgName := name + tgSuffix

// Create target group for Kubernetes API (port 6443)
createTGInput := &elasticloadbalancingv2.CreateTargetGroupInput{
Expand Down
8 changes: 6 additions & 2 deletions pkg/provisioner/cluster.go
Original file line number Diff line number Diff line change
Expand Up @@ -149,13 +149,17 @@ func (cp *ClusterProvisioner) ProvisionCluster(nodes []NodeInfo) error {
return nil
}

// determineControlPlaneEndpoint returns the endpoint that cluster-internal
// operations (kubeadm init, join, API server binding) should target. When HA
// is enabled and an NLB has been provisioned, its DNS name is used so control
// plane traffic is balanced across nodes. Otherwise the first control plane's
// private IP is returned: every node lives in the same VPC, so the private
// address is always routable. External (kubeconfig) access is rewritten
// separately by RewriteKubeConfigServer.
func (cp *ClusterProvisioner) determineControlPlaneEndpoint(firstCP NodeInfo) string {
	cluster := cp.Environment.Status.Cluster
	if cluster == nil || cluster.LoadBalancerDNS == "" {
		// Non-HA path: intra-VPC traffic uses the first CP's private IP.
		return firstCP.PrivateIP
	}
	// HA path: route through the NLB to avoid pinning to a single node.
	return cluster.LoadBalancerDNS
}

Expand Down
2 changes: 2 additions & 0 deletions pkg/provisioner/cluster_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -310,6 +310,7 @@ func TestClusterProvisioner_determineControlPlaneEndpoint(t *testing.T) {
Status: v1alpha1.EnvironmentStatus{},
},
firstCP: NodeInfo{
PublicIP: "54.1.2.3",
PrivateIP: "10.0.0.1",
},
expected: "10.0.0.1",
Expand All @@ -322,6 +323,7 @@ func TestClusterProvisioner_determineControlPlaneEndpoint(t *testing.T) {
},
},
firstCP: NodeInfo{
PublicIP: "54.1.2.4",
PrivateIP: "10.0.0.2",
},
expected: "10.0.0.2",
Expand Down
Loading
Loading