forked from openshift/cluster-node-tuning-operator
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Bug 2105123: tuned: disable irqbalance (openshift#396)
* test: perfprof: utils: make sure to unquote cpus Make sure to unquote the cpumask output to prevent false negatives Signed-off-by: Francesco Romani <fromani@redhat.com> * perfprof: utils: robustness fixes Add testcases and log enhancement emerged during the local testing. Signed-off-by: Francesco Romani <fromani@redhat.com> * perfprof: tuned: disable the irqbalance plugin The tuned irqbalance plugin clears the irqbalance banned CPUs list when tuned starts. The list is then managed dynamically by the runtime handlers. On node restart, the tuned pod can be started AFTER the workload pods (neither kubelet nor kubernetes offers ordering guarantees when recovering the node state); clearing the banned CPUs list while pods are running, compromising the IRQ isolation guarantees. Same holds true if the NTO pod restarts for whatever reason. To prevent this disruption, we disable the irqbalance plugin entirely. Another component in the stack must now clear the irqbalance cpu ban list once per node reboot. Fixes: https://bugzilla.redhat.com/show_bug.cgi?id=2105123 Signed-off-by: Francesco Romani <fromani@redhat.com> * e2e: perfprof: add tests for cpu ban list handling Add a test to verify that a tuned restart will not clear the irqbalance cpu ban list, which is the key reason why we disabled the irqbalance tuned plugin earlier. Note there is no guarantee that any component in the stack will reset the irqbalance cpu ban list once. crio unconditionally does a restore depending on a snapshot taken the first time the server runs, which is likely but not guaranteed to be correct. There's no way to declare or check the content of the value crio would reset. Signed-off-by: Francesco Romani <fromani@redhat.com> * perfprof: functests: utils: fix cpu mask padding Testing the irqbalance handling on tuned restart highlighted a crio bug when handling odd-numbered cpu affinity masks. 
We expect this bug to have little impact on production environments because it's unlikely they will have a number of CPUs that is a multiple of 4 but not a multiple of 8, but still we filed cri-o/cri-o#6145. In order to have a backportable change, we fix our utility code to deal with incorrect padding. Signed-off-by: Francesco Romani <fromani@redhat.com> Signed-off-by: Francesco Romani <fromani@redhat.com>
- Loading branch information
Showing
8 changed files
with
321 additions
and
14 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
247 changes: 247 additions & 0 deletions
247
test/e2e/performanceprofile/functests/1_performance/irqbalance.go
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,247 @@ | ||
package __performance | ||
|
||
import ( | ||
"bufio" | ||
"context" | ||
"encoding/json" | ||
"fmt" | ||
"os" | ||
"strings" | ||
"time" | ||
|
||
corev1 "k8s.io/api/core/v1" | ||
"k8s.io/kubernetes/pkg/kubelet/cm/cpuset" | ||
|
||
. "github.com/onsi/ginkgo" | ||
. "github.com/onsi/gomega" | ||
|
||
machineconfigv1 "github.com/openshift/machine-config-operator/pkg/apis/machineconfiguration.openshift.io/v1" | ||
|
||
performancev2 "github.com/openshift/cluster-node-tuning-operator/pkg/apis/performanceprofile/v2" | ||
"github.com/openshift/cluster-node-tuning-operator/pkg/performanceprofile/controller/performanceprofile/components" | ||
"github.com/openshift/cluster-node-tuning-operator/pkg/performanceprofile/controller/performanceprofile/components/tuned" | ||
testutils "github.com/openshift/cluster-node-tuning-operator/test/e2e/performanceprofile/functests/utils" | ||
testclient "github.com/openshift/cluster-node-tuning-operator/test/e2e/performanceprofile/functests/utils/client" | ||
"github.com/openshift/cluster-node-tuning-operator/test/e2e/performanceprofile/functests/utils/discovery" | ||
testlog "github.com/openshift/cluster-node-tuning-operator/test/e2e/performanceprofile/functests/utils/log" | ||
"github.com/openshift/cluster-node-tuning-operator/test/e2e/performanceprofile/functests/utils/mcps" | ||
"github.com/openshift/cluster-node-tuning-operator/test/e2e/performanceprofile/functests/utils/nodes" | ||
"github.com/openshift/cluster-node-tuning-operator/test/e2e/performanceprofile/functests/utils/pods" | ||
"github.com/openshift/cluster-node-tuning-operator/test/e2e/performanceprofile/functests/utils/profiles" | ||
"github.com/openshift/cluster-node-tuning-operator/test/e2e/util" | ||
"github.com/openshift/cluster-node-tuning-operator/test/framework" | ||
) | ||
|
||
var (
	// cs is the shared framework clientset used by this suite to reach the
	// cluster under test (e.g. to locate the TuneD pod for a node).
	cs = framework.NewClientSet()
)
|
||
var _ = Describe("[performance] Checking IRQBalance settings", func() { | ||
var workerRTNodes []corev1.Node | ||
var targetNode *corev1.Node | ||
var profile *performancev2.PerformanceProfile | ||
var performanceMCP string | ||
var err error | ||
|
||
BeforeEach(func() { | ||
if discovery.Enabled() && testutils.ProfileNotFound { | ||
Skip("Discovery mode enabled, performance profile not found") | ||
} | ||
|
||
workerRTNodes, err = nodes.GetByLabels(testutils.NodeSelectorLabels) | ||
Expect(err).ToNot(HaveOccurred()) | ||
profile, err = profiles.GetByNodeLabels(testutils.NodeSelectorLabels) | ||
Expect(err).ToNot(HaveOccurred()) | ||
performanceMCP, err = mcps.GetByProfile(profile) | ||
Expect(err).ToNot(HaveOccurred()) | ||
|
||
// Verify that worker and performance MCP have updated state equals to true | ||
for _, mcpName := range []string{testutils.RoleWorker, performanceMCP} { | ||
mcps.WaitForCondition(mcpName, machineconfigv1.MachineConfigPoolUpdated, corev1.ConditionTrue) | ||
} | ||
|
||
nodeIdx := pickNodeIdx(workerRTNodes) | ||
targetNode = &workerRTNodes[nodeIdx] | ||
By(fmt.Sprintf("verifying worker node %q", targetNode.Name)) | ||
}) | ||
|
||
Context("Verify irqbalance configuration handling", func() { | ||
|
||
It("Should not overwrite the banned CPU set on tuned restart", func() { | ||
if profile.Status.RuntimeClass == nil { | ||
Skip("runtime class not generated") | ||
} | ||
|
||
if tuned.IsIRQBalancingGloballyDisabled(profile) { | ||
Skip("this test needs dynamic IRQ balancing") | ||
} | ||
|
||
Expect(targetNode).ToNot(BeNil(), "missing target node") | ||
|
||
irqAffBegin, err := getIrqDefaultSMPAffinity(targetNode) | ||
Expect(err).ToNot(HaveOccurred(), "failed to extract the default IRQ affinity from node %q", targetNode.Name) | ||
testlog.Infof("IRQ Default affinity on %q when test begins: {%s}", targetNode.Name, irqAffBegin) | ||
|
||
bannedCPUs, err := getIrqBalanceBannedCPUs(targetNode) | ||
Expect(err).ToNot(HaveOccurred(), "failed to extract the banned CPUs from node %q", targetNode.Name) | ||
testlog.Infof("banned CPUs on %q when test begins: {%s}", targetNode.Name, bannedCPUs.String()) | ||
|
||
smpAffinitySet, err := nodes.GetDefaultSmpAffinitySet(targetNode) | ||
Expect(err).ToNot(HaveOccurred(), "failed to get default smp affinity") | ||
|
||
onlineCPUsSet, err := nodes.GetOnlineCPUsSet(targetNode) | ||
Expect(err).ToNot(HaveOccurred(), "failed to get Online CPUs list") | ||
|
||
// expect no irqbalance run in the system already, AKA start from pristine conditions. | ||
// This is not an hard requirement, just the easier state to manage and check | ||
Expect(smpAffinitySet.Equals(onlineCPUsSet)).To(BeTrue(), "found default_smp_affinity %v, expected %v - IRQBalance already run?", smpAffinitySet, onlineCPUsSet) | ||
|
||
cpuRequest := 2 // minimum amount to be reasonably sure we're SMT-aligned | ||
annotations := map[string]string{ | ||
"irq-load-balancing.crio.io": "disable", | ||
"cpu-quota.crio.io": "disable", | ||
} | ||
testpod := getTestPodWithProfileAndAnnotations(profile, annotations, cpuRequest) | ||
testpod.Spec.NodeName = targetNode.Name | ||
|
||
data, _ := json.Marshal(testpod) | ||
testlog.Infof("using testpod:\n%s", string(data)) | ||
|
||
err = testclient.Client.Create(context.TODO(), testpod) | ||
Expect(err).ToNot(HaveOccurred()) | ||
defer func() { | ||
if testpod != nil { | ||
testlog.Infof("deleting pod %q", testpod.Name) | ||
deleteTestPod(testpod) | ||
} | ||
bannedCPUs, err := getIrqBalanceBannedCPUs(targetNode) | ||
Expect(err).ToNot(HaveOccurred(), "failed to extract the banned CPUs from node %q", targetNode.Name) | ||
|
||
testlog.Infof("banned CPUs on %q when test ends: {%s}", targetNode.Name, bannedCPUs.String()) | ||
|
||
irqAffBegin, err := getIrqDefaultSMPAffinity(targetNode) | ||
Expect(err).ToNot(HaveOccurred(), "failed to extract the default IRQ affinity from node %q", targetNode.Name) | ||
|
||
testlog.Infof("IRQ Default affinity on %q when test ends: {%s}", targetNode.Name, irqAffBegin) | ||
}() | ||
|
||
err = pods.WaitForCondition(testpod, corev1.PodReady, corev1.ConditionTrue, 10*time.Minute) | ||
logEventsForPod(testpod) | ||
Expect(err).ToNot(HaveOccurred()) | ||
|
||
// now we have something in the IRQBalance cpu list. Let's make sure the restart doesn't overwrite this data. | ||
postCreateBannedCPUs, err := getIrqBalanceBannedCPUs(targetNode) | ||
Expect(err).ToNot(HaveOccurred(), "failed to extract the banned CPUs from node %q", targetNode.Name) | ||
testlog.Infof("banned CPUs on %q just before the tuned restart: {%s}", targetNode.Name, postCreateBannedCPUs.String()) | ||
|
||
Expect(postCreateBannedCPUs.IsEmpty()).To(BeFalse(), "banned CPUs %v should not be empty on node %q", postCreateBannedCPUs, targetNode.Name) | ||
|
||
By(fmt.Sprintf("getting a TuneD Pod running on node %s", targetNode.Name)) | ||
pod, err := util.GetTunedForNode(cs, targetNode) | ||
Expect(err).NotTo(HaveOccurred()) | ||
|
||
By(fmt.Sprintf("causing a restart of the tuned pod (deleting the pod) on %s", targetNode.Name)) | ||
_, _, err = util.ExecAndLogCommand("oc", "delete", "pod", "--wait=true", "-n", pod.Namespace, pod.Name) | ||
Expect(err).NotTo(HaveOccurred()) | ||
|
||
Eventually(func() error { | ||
By(fmt.Sprintf("getting again a TuneD Pod running on node %s", targetNode.Name)) | ||
pod, err = util.GetTunedForNode(cs, targetNode) | ||
if err != nil { | ||
return err | ||
} | ||
|
||
By(fmt.Sprintf("waiting for the TuneD daemon running on node %s", targetNode.Name)) | ||
_, err = util.WaitForCmdInPod(5*time.Second, 5*time.Minute, pod, "test", "-e", "/run/tuned/tuned.pid") | ||
return err | ||
}).WithTimeout(5 * time.Minute).WithPolling(10 * time.Second).ShouldNot(HaveOccurred()) | ||
|
||
By(fmt.Sprintf("re-verifying worker node %q after TuneD restart", targetNode.Name)) | ||
postRestartBannedCPUs, err := getIrqBalanceBannedCPUs(targetNode) | ||
Expect(err).ToNot(HaveOccurred(), "failed to extract the banned CPUs from node %q", targetNode.Name) | ||
testlog.Infof("banned CPUs on %q after the tuned restart: {%s}", targetNode.Name, postRestartBannedCPUs.String()) | ||
|
||
Expect(postRestartBannedCPUs.ToSlice()).To(Equal(postCreateBannedCPUs.ToSlice()), "banned CPUs changed post tuned restart on node %q", postRestartBannedCPUs.ToSlice(), targetNode.Name) | ||
}) | ||
}) | ||
}) | ||
|
||
// nodes.BannedCPUs fails (!!!) if the current banned list is empty because, deep down, ExecCommandOnNode expects non-empty stdout. | ||
// In turn, we do this to at least have a chance to detect failed commands vs failed to execute commands (we had this issue in | ||
// not-so-distant past, legit command output lost somewhere in the communication). Fixing ExecCommandOnNode isn't trivial and | ||
// require close attention. For the time being we reimplement a form of nodes.BannedCPUs which can handle empty ban list. | ||
func getIrqBalanceBannedCPUs(node *corev1.Node) (cpuset.CPUSet, error) { | ||
cmd := []string{"cat", "/rootfs/etc/sysconfig/irqbalance"} | ||
conf, err := nodes.ExecCommandOnNode(cmd, node) | ||
if err != nil { | ||
return cpuset.NewCPUSet(), err | ||
} | ||
|
||
keyValue := findIrqBalanceBannedCPUsVarFromConf(conf) | ||
if len(keyValue) == 0 { | ||
// can happen: everything commented out (default if no tuning ever) | ||
testlog.Warningf("cannot find the CPU ban list in the configuration (\n%s)\n", conf) | ||
return cpuset.NewCPUSet(), nil | ||
} | ||
|
||
testlog.Infof("banned CPUs setting: %q", keyValue) | ||
|
||
items := strings.FieldsFunc(keyValue, func(c rune) bool { | ||
return c == '=' | ||
}) | ||
if len(items) != 2 { | ||
return cpuset.NewCPUSet(), fmt.Errorf("malformed CPU ban list in the configuration") | ||
} | ||
|
||
bannedCPUs := unquote(strings.TrimSpace(items[1])) | ||
testlog.Infof("banned CPUs: %q", bannedCPUs) | ||
|
||
banned, err := components.CPUMaskToCPUSet(bannedCPUs) | ||
if err != nil { | ||
return cpuset.NewCPUSet(), fmt.Errorf("failed to parse the banned CPUs: %v", err) | ||
} | ||
|
||
return banned, nil | ||
} | ||
|
||
func getIrqDefaultSMPAffinity(node *corev1.Node) (string, error) { | ||
cmd := []string{"cat", "/rootfs/proc/irq/default_smp_affinity"} | ||
return nodes.ExecCommandOnNode(cmd, node) | ||
} | ||
|
||
// findIrqBalanceBannedCPUsVarFromConf scans an irqbalance sysconfig blob and
// returns the first non-commented line declaring IRQBALANCE_BANNED_CPUS
// (whitespace-trimmed, key and value included), or "" if none is present.
func findIrqBalanceBannedCPUsVarFromConf(conf string) string {
	sc := bufio.NewScanner(strings.NewReader(conf))
	for sc.Scan() {
		entry := strings.TrimSpace(sc.Text())
		isComment := strings.HasPrefix(entry, "#")
		if !isComment && strings.HasPrefix(entry, "IRQBALANCE_BANNED_CPUS") {
			return entry
		}
	}
	return ""
}
|
||
func pickNodeIdx(nodes []corev1.Node) int { | ||
name, ok := os.LookupEnv("E2E_PAO_TARGET_NODE") | ||
if !ok { | ||
return 0 // "random" default | ||
} | ||
for idx := range nodes { | ||
if nodes[idx].Name == name { | ||
testlog.Infof("node %q found among candidates, picking", name) | ||
return idx | ||
} | ||
} | ||
testlog.Infof("node %q not found among candidates, fall back to random one", name) | ||
return 0 // "safe" default | ||
} | ||
|
||
// unquote strips at most one leading and one trailing double quote from s.
// Unlike strconv.Unquote it never fails, leaves unquoted input untouched,
// and tolerates a quote on only one side.
func unquote(s string) string {
	const quote = `"`
	return strings.TrimSuffix(strings.TrimPrefix(s, quote), quote)
}
Oops, something went wrong.