From a65bb16a3800f2731b44777cb54d557e8afb0bcc Mon Sep 17 00:00:00 2001
From: justinsb
Date: Fri, 30 Aug 2024 07:30:46 -0400
Subject: [PATCH] metal: support `kops toolbox enroll` on a control-plane machine

In particular, we want to build the full cluster and instance group.
The control plane does not yet start, because etcd is not configured
correctly.
---
 pkg/commands/toolbox_enroll.go          | 170 +++++++++++++++++-------
 pkg/model/bootstrapscript.go            |  18 ++-
 tests/e2e/scenarios/bare-metal/run-test |  10 ++
 upup/pkg/fi/cloudup/new_cluster.go      |   3 +
 4 files changed, 150 insertions(+), 51 deletions(-)

diff --git a/pkg/commands/toolbox_enroll.go b/pkg/commands/toolbox_enroll.go
index b55d2c933affa..c0e62e949abb6 100644
--- a/pkg/commands/toolbox_enroll.go
+++ b/pkg/commands/toolbox_enroll.go
@@ -28,6 +28,7 @@ import (
 	"net"
 	"os"
 	"path"
+	"path/filepath"
 	"sort"
 	"strconv"
 	"strings"
@@ -40,6 +41,7 @@ import (
 	"k8s.io/client-go/rest"
 	"k8s.io/klog/v2"
 	"sigs.k8s.io/controller-runtime/pkg/client"
+	"sigs.k8s.io/yaml"

 	"k8s.io/kops/pkg/apis/kops"
 	"k8s.io/kops/pkg/apis/kops/v1alpha2"
@@ -50,11 +52,9 @@ import (
 	"k8s.io/kops/pkg/model"
 	"k8s.io/kops/pkg/model/resources"
 	"k8s.io/kops/pkg/nodemodel"
-	"k8s.io/kops/pkg/nodemodel/wellknownassets"
 	"k8s.io/kops/pkg/wellknownservices"
 	"k8s.io/kops/upup/pkg/fi"
 	"k8s.io/kops/upup/pkg/fi/cloudup"
-	"k8s.io/kops/util/pkg/architectures"
 	"k8s.io/kops/util/pkg/vfs"
 )

@@ -92,12 +92,16 @@ func RunToolboxEnroll(ctx context.Context, f commandutils.Factory, out io.Writer
 	if err != nil {
 		return err
 	}
-
 	if cluster == nil {
 		return fmt.Errorf("cluster not found %q", options.ClusterName)
 	}

-	ig, err := clientset.InstanceGroupsFor(cluster).Get(ctx, options.InstanceGroup, metav1.GetOptions{})
+	channel, err := cloudup.ChannelForCluster(clientset.VFSContext(), cluster)
+	if err != nil {
+		return fmt.Errorf("getting channel for cluster %q: %w", options.ClusterName, err)
+	}
+
+	instanceGroupList, err := clientset.InstanceGroupsFor(cluster).List(ctx, metav1.ListOptions{})
 	if err != nil {
 		return err
 	}
@@ -107,10 +111,62 @@ func RunToolboxEnroll(ctx context.Context, f commandutils.Factory, out io.Writer
 		return err
 	}

-	wellKnownAddresses := make(model.WellKnownAddresses)
+	// The assetBuilder is used primarily to remap images.
+	var assetBuilder *assets.AssetBuilder
+	{
+		// ApplyClusterCmd is used to get the assets.
+		// We use DryRun and GetAssets to do this without applying any changes.
+		apply := &cloudup.ApplyClusterCmd{
+			Cloud:      cloud,
+			Cluster:    cluster,
+			Clientset:  clientset,
+			DryRun:     true,
+			GetAssets:  true,
+			TargetName: cloudup.TargetDryRun,
+		}
+		applyResults, err := apply.Run(ctx)
+		if err != nil {
+			return fmt.Errorf("error during apply: %w", err)
+		}
+		assetBuilder = applyResults.AssetBuilder
+	}
+
+	// Populate the full cluster and instanceGroup specs.
+ var fullInstanceGroup *kops.InstanceGroup + var fullCluster *kops.Cluster + { + var instanceGroups []*kops.InstanceGroup + for i := range instanceGroupList.Items { + instanceGroup := &instanceGroupList.Items[i] + instanceGroups = append(instanceGroups, instanceGroup) + } + populatedCluster, err := cloudup.PopulateClusterSpec(ctx, clientset, cluster, instanceGroups, cloud, assetBuilder) + if err != nil { + return fmt.Errorf("building full cluster spec: %w", err) + } + fullCluster = populatedCluster + + // Build full IG spec to ensure we end up with a valid IG + for _, ig := range instanceGroups { + if ig.Name != options.InstanceGroup { + continue + } + populated, err := cloudup.PopulateInstanceGroupSpec(fullCluster, ig, cloud, channel) + if err != nil { + return err + } + fullInstanceGroup = populated + } + } + if fullInstanceGroup == nil { + return fmt.Errorf("instance group %q not found", options.InstanceGroup) + } + + // Determine the well-known addresses for the cluster. + wellKnownAddresses := make(model.WellKnownAddresses) { - ingresses, err := cloud.GetApiIngressStatus(cluster) + ingresses, err := cloud.GetApiIngressStatus(fullCluster) if err != nil { return fmt.Errorf("error getting ingress status: %v", err) } @@ -125,24 +181,24 @@ func RunToolboxEnroll(ctx context.Context, f commandutils.Factory, out io.Writer } } } - if len(wellKnownAddresses[wellknownservices.KubeAPIServer]) == 0 { // TODO: Should we support DNS? return fmt.Errorf("unable to determine IP address for kube-apiserver") } - for k := range wellKnownAddresses { sort.Strings(wellKnownAddresses[k]) } - scriptBytes, err := buildBootstrapData(ctx, clientset, cluster, ig, wellKnownAddresses) + // Build the bootstrap data for this node. + bootstrapData, err := buildBootstrapData(ctx, clientset, fullCluster, fullInstanceGroup, wellKnownAddresses) if err != nil { - return err + return fmt.Errorf("building bootstrap data: %w", err) } + // Enroll the node over SSH. if options.Host != "" { // TODO: This is the pattern we use a lot, but should we try to access it directly? - contextName := cluster.ObjectMeta.Name + contextName := fullCluster.ObjectMeta.Name clientGetter := genericclioptions.NewConfigFlags(true) clientGetter.Context = &contextName @@ -151,14 +207,15 @@ func RunToolboxEnroll(ctx context.Context, f commandutils.Factory, out io.Writer return fmt.Errorf("cannot load kubecfg settings for %q: %w", contextName, err) } - if err := enrollHost(ctx, options, string(scriptBytes), restConfig); err != nil { + if err := enrollHost(ctx, fullInstanceGroup, options, bootstrapData, restConfig); err != nil { return err } } + return nil } -func enrollHost(ctx context.Context, options *ToolboxEnrollOptions, nodeupScript string, restConfig *rest.Config) error { +func enrollHost(ctx context.Context, ig *kops.InstanceGroup, options *ToolboxEnrollOptions, bootstrapData *bootstrapData, restConfig *rest.Config) error { scheme := runtime.NewScheme() if err := v1alpha2.AddToScheme(scheme); err != nil { return fmt.Errorf("building kubernetes scheme: %w", err) @@ -211,19 +268,29 @@ func enrollHost(ctx context.Context, options *ToolboxEnrollOptions, nodeupScript return err } - if err := createHost(ctx, options, hostname, publicKeyBytes, kubeClient); err != nil { - return err + // We can't create the host resource in the API server for control-plane nodes, + // because the API server (likely) isn't running yet. 
+	if !ig.IsControlPlane() {
+		if err := createHostResourceInAPIServer(ctx, options, hostname, publicKeyBytes, kubeClient); err != nil {
+			return err
+		}
+	}
+
+	for k, v := range bootstrapData.configFiles {
+		if err := host.writeFile(ctx, k, bytes.NewReader(v)); err != nil {
+			return fmt.Errorf("writing file %q over SSH: %w", k, err)
+		}
 	}

-	if len(nodeupScript) != 0 {
-		if _, err := host.runScript(ctx, nodeupScript, ExecOptions{Sudo: sudo, Echo: true}); err != nil {
+	if len(bootstrapData.nodeupScript) != 0 {
+		if _, err := host.runScript(ctx, string(bootstrapData.nodeupScript), ExecOptions{Sudo: sudo, Echo: true}); err != nil {
 			return err
 		}
 	}
 	return nil
 }

-func createHost(ctx context.Context, options *ToolboxEnrollOptions, nodeName string, publicKey []byte, client client.Client) error {
+func createHostResourceInAPIServer(ctx context.Context, options *ToolboxEnrollOptions, nodeName string, publicKey []byte, client client.Client) error {
 	host := &v1alpha2.Host{}
 	host.Namespace = "kops-system"
 	host.Name = nodeName
@@ -317,6 +384,11 @@ func (s *SSHHost) readFile(ctx context.Context, path string) ([]byte, error) {
 	return p.ReadFile(ctx)
 }

+func (s *SSHHost) writeFile(ctx context.Context, path string, data io.ReadSeeker) error {
+	p := vfs.NewSSHPath(s.sshClient, s.hostname, path, s.sudo)
+	return p.WriteFile(ctx, data, nil)
+}
+
 func (s *SSHHost) runScript(ctx context.Context, script string, options ExecOptions) (*CommandOutput, error) {
 	var tempDir string
 	{
@@ -398,10 +470,14 @@ func (s *SSHHost) getHostname(ctx context.Context) (string, error) {
 	return hostname, nil
 }

-func buildBootstrapData(ctx context.Context, clientset simple.Clientset, cluster *kops.Cluster, ig *kops.InstanceGroup, wellknownAddresses model.WellKnownAddresses) ([]byte, error) {
-	if cluster.Spec.KubeAPIServer == nil {
-		cluster.Spec.KubeAPIServer = &kops.KubeAPIServerConfig{}
-	}
+type bootstrapData struct {
+	nodeupScript []byte
+	configFiles  map[string][]byte
+}
+
+func buildBootstrapData(ctx context.Context, clientset simple.Clientset, cluster *kops.Cluster, ig *kops.InstanceGroup, wellknownAddresses model.WellKnownAddresses) (*bootstrapData, error) {
+	bootstrapData := &bootstrapData{}
+	bootstrapData.configFiles = make(map[string][]byte)

 	getAssets := false
 	assetBuilder := assets.NewAssetBuilder(clientset.VFSContext(), cluster.Spec.Assets, cluster.Spec.KubernetesVersion, getAssets)
@@ -423,17 +499,12 @@ func buildBootstrapData(ctx context.Context, clientset simple.Clientset, cluster
 	//	encryptionConfigSecretHash = base64.URLEncoding.EncodeToString(hashBytes[:])
 	// }

-	nodeUpAssets := make(map[architectures.Architecture]*assets.MirroredAsset)
-	for _, arch := range architectures.GetSupported() {
-		asset, err := wellknownassets.NodeUpAsset(assetBuilder, arch)
-		if err != nil {
-			return nil, err
-		}
-		nodeUpAssets[arch] = asset
+	fileAssets := &nodemodel.FileAssets{Cluster: cluster}
+	if err := fileAssets.AddFileAssets(assetBuilder); err != nil {
+		return nil, err
 	}

-	assets := make(map[architectures.Architecture][]*assets.MirroredAsset)
-	configBuilder, err := nodemodel.NewNodeUpConfigBuilder(cluster, assetBuilder, assets, encryptionConfigSecretHash)
+	configBuilder, err := nodemodel.NewNodeUpConfigBuilder(cluster, assetBuilder, fileAssets.Assets, encryptionConfigSecretHash)
 	if err != nil {
 		return nil, err
 	}
@@ -445,7 +516,8 @@ func buildBootstrapData(ctx context.Context, clientset simple.Clientset, cluster
 		return nil, err
 	}

-	for _, keyName := range []string{"kubernetes-ca"} {
+	keyNames := model.KeypairNamesForInstanceGroup(cluster, ig)
+	for _, keyName := range keyNames {
 		keyset, err := keystore.FindKeyset(ctx, keyName)
 		if err != nil {
 			return nil, fmt.Errorf("getting keyset %q: %w", keyName, err)
@@ -458,23 +530,13 @@ func buildBootstrapData(ctx context.Context, clientset simple.Clientset, cluster
 		keysets[keyName] = keyset
 	}

-	_, bootConfig, err := configBuilder.BuildConfig(ig, wellknownAddresses, keysets)
+	nodeupConfig, bootConfig, err := configBuilder.BuildConfig(ig, wellknownAddresses, keysets)
 	if err != nil {
 		return nil, err
 	}

-	bootConfig.CloudProvider = "metal"
-
-	// TODO: Should we / can we specify the node config hash?
-	// configData, err := utils.YamlMarshal(config)
-	// if err != nil {
-	//	return nil, fmt.Errorf("error converting nodeup config to yaml: %v", err)
-	// }
-	// sum256 := sha256.Sum256(configData)
-	// bootConfig.NodeupConfigHash = base64.StdEncoding.EncodeToString(sum256[:])
-
 	var nodeupScript resources.NodeUpScript
-	nodeupScript.NodeUpAssets = nodeUpAssets
+	nodeupScript.NodeUpAssets = fileAssets.NodeUpAssets
 	nodeupScript.BootConfig = bootConfig

 	nodeupScript.WithEnvironmentVariables(cluster, ig)
@@ -483,15 +545,31 @@ func buildBootstrapData(ctx context.Context, clientset simple.Clientset, cluster

 	nodeupScript.CloudProvider = string(cluster.GetCloudProvider())

+	bootConfig.ConfigBase = fi.PtrTo("file:///etc/kubernetes/kops/config")
+
 	nodeupScriptResource, err := nodeupScript.Build()
 	if err != nil {
 		return nil, err
 	}

-	b, err := fi.ResourceAsBytes(nodeupScriptResource)
+	if bootConfig.InstanceGroupRole == kops.InstanceGroupRoleControlPlane {
+		nodeupConfigBytes, err := yaml.Marshal(nodeupConfig)
+		if err != nil {
+			return nil, fmt.Errorf("error converting nodeup config to yaml: %w", err)
+		}
+		// Not much reason to hash this, since we're reading it from the local file system
+		// sum256 := sha256.Sum256(nodeupConfigBytes)
+		// bootConfig.NodeupConfigHash = base64.StdEncoding.EncodeToString(sum256[:])
+
+		p := filepath.Join("/etc/kubernetes/kops/config", "igconfig", bootConfig.InstanceGroupRole.ToLowerString(), ig.Name, "nodeupconfig.yaml")
+		bootstrapData.configFiles[p] = nodeupConfigBytes
+	}
+
+	nodeupScriptBytes, err := fi.ResourceAsBytes(nodeupScriptResource)
 	if err != nil {
 		return nil, err
 	}
+	bootstrapData.nodeupScript = nodeupScriptBytes

-	return b, nil
+	return bootstrapData, nil
 }
diff --git a/pkg/model/bootstrapscript.go b/pkg/model/bootstrapscript.go
index 0e85eb22d7c77..f35f88c08266a 100644
--- a/pkg/model/bootstrapscript.go
+++ b/pkg/model/bootstrapscript.go
@@ -124,11 +124,9 @@ func (b *BootstrapScript) kubeEnv(ig *kops.InstanceGroup, c *fi.CloudupContext)
 	return bootConfig, nil
 }

-// ResourceNodeUp generates and returns a nodeup (bootstrap) script from a
-// template file, substituting in specific env vars & cluster spec configuration
-func (b *BootstrapScriptBuilder) ResourceNodeUp(c *fi.CloudupModelBuilderContext, ig *kops.InstanceGroup) (fi.Resource, error) {
+func KeypairNamesForInstanceGroup(cluster *kops.Cluster, ig *kops.InstanceGroup) []string {
 	keypairs := []string{"kubernetes-ca", "etcd-clients-ca"}
-	for _, etcdCluster := range b.Cluster.Spec.EtcdClusters {
+	for _, etcdCluster := range cluster.Spec.EtcdClusters {
 		k := etcdCluster.Name
 		keypairs = append(keypairs, "etcd-manager-ca-"+k, "etcd-peers-ca-"+k)
 		if k != "events" && k != "main" {
@@ -142,7 +140,17 @@ func (b *BootstrapScriptBuilder) ResourceNodeUp(c *fi.CloudupModelBuilderContext

 	if ig.IsBastion() {
 		keypairs = nil
+	}
+
+	return keypairs
+}

+// ResourceNodeUp generates and returns a nodeup (bootstrap) script from a
+// template file, substituting in specific env vars & cluster spec configuration
+func (b *BootstrapScriptBuilder) ResourceNodeUp(c *fi.CloudupModelBuilderContext, ig *kops.InstanceGroup) (fi.Resource, error) {
+	keypairNames := KeypairNamesForInstanceGroup(b.Cluster, ig)
+
+	if ig.IsBastion() {
 		// Bastions can have AdditionalUserData, but if there isn't any skip this part
 		if len(ig.Spec.AdditionalUserData) == 0 {
 			return nil, nil
@@ -150,7 +158,7 @@ func (b *BootstrapScriptBuilder) ResourceNodeUp(c *fi.CloudupModelBuilderContext
 	}

 	caTasks := map[string]*fitasks.Keypair{}
-	for _, keypair := range keypairs {
+	for _, keypair := range keypairNames {
 		caTaskObject, found := c.Tasks["Keypair/"+keypair]
 		if !found {
 			return nil, fmt.Errorf("keypair/%s task not found", keypair)
diff --git a/tests/e2e/scenarios/bare-metal/run-test b/tests/e2e/scenarios/bare-metal/run-test
index 059bfcdcfc6f9..c5eeb6a72ae12 100755
--- a/tests/e2e/scenarios/bare-metal/run-test
+++ b/tests/e2e/scenarios/bare-metal/run-test
@@ -53,6 +53,9 @@ ssh -o StrictHostKeyChecking=accept-new -i ${REPO_ROOT}/.build/.ssh/id_ed25519 r

 cd ${REPO_ROOT}

+# Enable feature flag for bare metal
+export KOPS_FEATURE_FLAGS=Metal
+
 # Set up the AWS credentials
 export AWS_SECRET_ACCESS_KEY=secret
 export AWS_ACCESS_KEY_ID=accesskey
@@ -90,4 +93,11 @@ ${KOPS} get ig --name metal.k8s.local -oyaml
 ${KOPS} update cluster metal.k8s.local
 ${KOPS} update cluster metal.k8s.local --yes --admin

+# Start an SSH agent; enroll assumes SSH connectivity to the VMs, with the key in the agent
+eval $(ssh-agent)
+ssh-add ${REPO_ROOT}/.build/.ssh/id_ed25519
+
+# Enroll the control-plane VM
+${KOPS} toolbox enroll --cluster metal.k8s.local --instance-group control-plane-main --host 10.123.45.10 --v=8
+
 echo "Test successful"
\ No newline at end of file
diff --git a/upup/pkg/fi/cloudup/new_cluster.go b/upup/pkg/fi/cloudup/new_cluster.go
index 7a91b085835fa..e028b63f82cee 100644
--- a/upup/pkg/fi/cloudup/new_cluster.go
+++ b/upup/pkg/fi/cloudup/new_cluster.go
@@ -360,6 +360,9 @@ func NewCluster(opt *NewClusterOptions, clientset simple.Clientset) (*NewCluster
 	case api.CloudProviderScaleway:
 		cluster.Spec.CloudProvider.Scaleway = &api.ScalewaySpec{}
 	case api.CloudProviderMetal:
+		if !featureflag.Metal.Enabled() {
+			return nil, fmt.Errorf("bare-metal support requires the Metal feature flag to be enabled")
+		}
 		if cluster.Labels == nil {
 			cluster.Labels = make(map[string]string)
 		}
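
Usage sketch: the commands below mirror the run-test additions above and show
how a control-plane machine would be enrolled end to end. The cluster name,
instance-group name, host address, and key path are the e2e scenario's values
and are illustrative only, not required defaults.

  # Bare-metal support is gated behind the Metal feature flag
  export KOPS_FEATURE_FLAGS=Metal

  # enroll connects to the machine over SSH, so the key must be in an agent
  eval $(ssh-agent)
  ssh-add ~/.ssh/id_ed25519   # illustrative key path

  # Builds the full cluster and instance group spec, then bootstraps the
  # machine over SSH. For a control-plane group, the Host resource is not
  # created (the API server isn't running yet) and nodeupconfig.yaml is
  # written under /etc/kubernetes/kops/config on the machine.
  kops toolbox enroll \
    --cluster metal.k8s.local \
    --instance-group control-plane-main \
    --host 10.123.45.10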