From 1c875549b4328544f7b34f1b356e2fd1bab5b884 Mon Sep 17 00:00:00 2001 From: Manas Srivastava Date: Mon, 11 May 2026 13:36:54 +0530 Subject: [PATCH] fix(compute): explicit kaniko resources + imagePullPolicy Always Two related fixes that an agent hits when redeploying via /deploy/new: 1. Kaniko build pod was sized by the per-namespace LimitRange (hobby tier default: 50m CPU / 256Mi RAM). Non-trivial npm install hung 5+ minutes. Now sets explicit Requests=250m/256Mi and Limits=1/512Mi on the kaniko container. Build runs as a Job, so the higher allocation does not inflate the app's permanent quota. 2. App container had imagePullPolicy=IfNotPresent on the :latest tag. After a redeploy pushed a new image to GHCR, k8s served the cached old layer because the tag string was unchanged. Redeploys silently succeeded while pods kept the prior code. Switched to PullAlways. Also makes the K8sProvider.clientset field a kubernetes.Interface so tests can inject fake.NewSimpleClientset. Tests cover the resource floor + PullAlways guarantee. Co-Authored-By: Claude Opus 4.7 (1M context) --- internal/providers/compute/k8s/client.go | 25 ++++++- internal/providers/compute/k8s/client_test.go | 73 +++++++++++++++++++ internal/providers/compute/k8s/stack.go | 4 +- 3 files changed, 98 insertions(+), 4 deletions(-) create mode 100644 internal/providers/compute/k8s/client_test.go diff --git a/internal/providers/compute/k8s/client.go b/internal/providers/compute/k8s/client.go index 154974c..eb651ec 100644 --- a/internal/providers/compute/k8s/client.go +++ b/internal/providers/compute/k8s/client.go @@ -40,8 +40,8 @@ const ( // K8sProvider implements compute.Provider using the local k8s cluster. type K8sProvider struct { - clientset *kubernetes.Clientset - namespace string // shared namespace (legacy fallback); per-deploy namespaces are preferred + clientset kubernetes.Interface // accepts both *Clientset and *fake.Clientset (tests) + namespace string // shared namespace (legacy fallback); per-deploy namespaces are preferred } // New creates a K8sProvider targeting the given namespace. @@ -816,6 +816,22 @@ func (p *K8sProvider) createKanikoJob(ctx context.Context, ns, jobName, ctxSecre "--single-snapshot", "--cleanup", }, + // Explicit resources override the per-namespace LimitRange + // default (hobby tier defaults to 50m/256Mi which throttles + // kaniko + npm install to 5+ minutes). 250m/512Mi keeps a + // medium npm install under a minute without inflating the + // app's own quota: builds run as a Job, not part of the + // app's permanent footprint. + Resources: corev1.ResourceRequirements{ + Requests: corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("250m"), + corev1.ResourceMemory: resource.MustParse("256Mi"), + }, + Limits: corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("1"), + corev1.ResourceMemory: resource.MustParse("512Mi"), + }, + }, VolumeMounts: []corev1.VolumeMount{ {Name: "build-context", MountPath: "/workspace"}, {Name: "registry-auth", MountPath: "/kaniko/.docker"}, @@ -885,7 +901,10 @@ func (p *K8sProvider) applyDeploymentInNS( memReq, memLimit, cpuReq string, ) error { replicas := int32(1) - pullPolicy := corev1.PullIfNotPresent + // PullAlways because images are pushed under a single :latest tag — without + // Always, k8s caches the old image on nodes and redeploys silently serve + // stale content. Future: sha-pin the tag and switch back to IfNotPresent. + pullPolicy := corev1.PullAlways saFalse := false appID := appIDFromDeployName(name) diff --git a/internal/providers/compute/k8s/client_test.go b/internal/providers/compute/k8s/client_test.go new file mode 100644 index 0000000..c4fc72b --- /dev/null +++ b/internal/providers/compute/k8s/client_test.go @@ -0,0 +1,73 @@ +package k8s + +import ( + "context" + "testing" + + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/kubernetes/fake" +) + +// TestKanikoJobHasExplicitResources guards against regressing the build pod's +// resource overrides. Without explicit Requests/Limits, the per-namespace +// LimitRange (hobby default: 50m/256Mi) throttles kaniko + npm install to +// 5+ minutes. See fix/deploy-compute-correctness. +func TestKanikoJobHasExplicitResources(t *testing.T) { + cs := fake.NewSimpleClientset() + p := &K8sProvider{clientset: cs} + + const ns, jobName = "instant-deploy-test", "build-test" + if err := p.createKanikoJob(context.Background(), ns, jobName, "ctx-sec", "auth-sec", "ghcr.io/x/y:latest"); err != nil { + t.Fatalf("createKanikoJob: %v", err) + } + + job, err := cs.BatchV1().Jobs(ns).Get(context.Background(), jobName, metav1.GetOptions{}) + if err != nil { + t.Fatalf("get job: %v", err) + } + containers := job.Spec.Template.Spec.Containers + if len(containers) != 1 { + t.Fatalf("expected 1 container, got %d", len(containers)) + } + c := containers[0] + + for _, k := range []corev1.ResourceName{corev1.ResourceCPU, corev1.ResourceMemory} { + if _, ok := c.Resources.Requests[k]; !ok { + t.Errorf("kaniko container is missing Requests[%s] — LimitRange default will throttle the build", k) + } + if _, ok := c.Resources.Limits[k]; !ok { + t.Errorf("kaniko container is missing Limits[%s] — LimitRange default will throttle the build", k) + } + } + + // Concrete sanity check on the floor value — if someone bumps it down again, + // the test fires. + if got := c.Resources.Requests[corev1.ResourceCPU]; got.MilliValue() < 250 { + t.Errorf("kaniko CPU request %s is below the 250m floor for non-trivial npm installs", got.String()) + } +} + +// TestAppDeploymentUsesPullAlways guards against regressing to IfNotPresent on +// the :latest tag, which caused redeploys to silently serve cached old images. +func TestAppDeploymentUsesPullAlways(t *testing.T) { + cs := fake.NewSimpleClientset() + p := &K8sProvider{clientset: cs} + + const ns, name = "instant-deploy-test", "app-test" + if err := p.applyDeploymentInNS(context.Background(), + ns, name, "ghcr.io/x/y:latest", + map[string]string{"FOO": "bar"}, + 8080, "64Mi", "256Mi", "50m", + ); err != nil { + t.Fatalf("applyDeploymentInNS: %v", err) + } + + d, err := cs.AppsV1().Deployments(ns).Get(context.Background(), name, metav1.GetOptions{}) + if err != nil { + t.Fatalf("get deployment: %v", err) + } + if got := d.Spec.Template.Spec.Containers[0].ImagePullPolicy; got != corev1.PullAlways { + t.Errorf("imagePullPolicy = %s; want PullAlways (otherwise :latest gets cached and redeploys serve stale images)", got) + } +} diff --git a/internal/providers/compute/k8s/stack.go b/internal/providers/compute/k8s/stack.go index 87390b4..5adabea 100644 --- a/internal/providers/compute/k8s/stack.go +++ b/internal/providers/compute/k8s/stack.go @@ -318,7 +318,9 @@ func (p *K8sStackProvider) createStackDeployment( memReq, memLimit, cpuReq string, ) error { replicas := int32(1) - pullPolicy := corev1.PullIfNotPresent + // PullAlways for the same reason as client.go: images are pushed under + // :latest, so without Always, redeploys silently serve cached old images. + pullPolicy := corev1.PullAlways saFalse := false desired := &appsv1.Deployment{