Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
53 changes: 53 additions & 0 deletions api/nvidia/v1/clusterpolicy_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -970,6 +970,11 @@ type DCGMExporterSpec struct {
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:advanced,urn:alm:descriptor:com.tectonic.ui:text"
Args []string `json:"args,omitempty"`

// Optional: Annotations is an unstructured key value map stored with a resource that may be
// set by external tools to store and retrieve arbitrary metadata. They are not
// queryable and should be preserved when modifying objects.
Annotations map[string]string `json:"annotations,omitempty"`

// Optional: List of environment variables
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Environment Variables"
Expand Down Expand Up @@ -1011,6 +1016,30 @@ type DCGMExporterSpec struct {
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="HPC Job Mapping Configuration"
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:advanced"
HPCJobMapping *DCGMExporterHPCJobMappingConfig `json:"hpcJobMapping,omitempty"`

// Enable Kubernetes pod labels as Prometheus label dimensions in DCGM exporter metrics.
// (Requires cluster-level read access to pods.)
// +kubebuilder:validation:Optional
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Enable pod-label enrichment"
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:booleanSwitch"
EnablePodLabels *bool `json:"enablePodLabels,omitempty"`

// Enable Kubernetes pod UID as a Prometheus label dimension in DCGM exporter metrics.
// (Requires cluster-level read access to pods.)
// +kubebuilder:validation:Optional
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Enable pod-UID enrichment"
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:booleanSwitch"
EnablePodUID *bool `json:"enablePodUID,omitempty"`

// Regex list for filtering which Kubernetes pod labels are included in DCGM exporter metrics.
// Empty means all pod labels are included.
// +kubebuilder:validation:Optional
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Pod label allowlist regex"
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:advanced,urn:alm:descriptor:com.tectonic.ui:text"
PodLabelAllowlistRegex []string `json:"podLabelAllowlistRegex,omitempty"`
}

// DCGMExporterHPCJobMappingConfig defines HPC job mapping configuration for NVIDIA DCGM Exporter
Expand Down Expand Up @@ -2206,6 +2235,30 @@ func (e *DCGMExporterSpec) GetHPCJobMappingDirectory() string {
return e.HPCJobMapping.Directory
}

// IsPodLabelsEnabled reports whether pod-label enrichment has been turned on
// for DCGM Exporter. An unset EnablePodLabels field is treated as false.
func (e *DCGMExporterSpec) IsPodLabelsEnabled() bool {
	// Enrichment is opt-in: only an explicit true enables it.
	return e.EnablePodLabels != nil && *e.EnablePodLabels
}

// IsPodUIDEnabled reports whether pod-UID enrichment has been turned on for
// DCGM Exporter. An unset EnablePodUID field is treated as false.
func (e *DCGMExporterSpec) IsPodUIDEnabled() bool {
	// Enrichment is opt-in: only an explicit true enables it.
	return e.EnablePodUID != nil && *e.EnablePodUID
}

// IsKubernetesPodMetadataEnabled reports whether at least one form of
// Kubernetes pod metadata enrichment (pod labels or pod UID) is enabled for
// DCGM Exporter.
func (e *DCGMExporterSpec) IsKubernetesPodMetadataEnabled() bool {
	if e.IsPodLabelsEnabled() {
		return true
	}
	return e.IsPodUIDEnabled()
}

// IsEnabled returns true if gpu-feature-discovery is enabled(default) through gpu-operator
func (g *GPUFeatureDiscoverySpec) IsEnabled() bool {
if g.Enabled == nil {
Expand Down
22 changes: 22 additions & 0 deletions api/nvidia/v1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

16 changes: 16 additions & 0 deletions assets/state-dcgm-exporter/0210_clusterrole.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# ClusterRole granting read-only (get/list/watch) access to pods in the core
# API group. The operator only keeps this object when DCGM exporter pod
# metadata enrichment (enablePodLabels or enablePodUID) is enabled in the
# ClusterPolicy; otherwise the controller deletes it.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: nvidia-dcgm-exporter-read-pods
labels:
app: nvidia-dcgm-exporter
# TODO: Add resourceSlices permissions when GPU Operator exposes DRA exporter support.
rules:
- apiGroups:
- ""
resources:
- pods
verbs:
- get
- list
- watch
14 changes: 14 additions & 0 deletions assets/state-dcgm-exporter/0310_clusterrolebinding.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# ClusterRoleBinding that attaches the nvidia-dcgm-exporter-read-pods
# ClusterRole to the nvidia-dcgm-exporter ServiceAccount. The subject
# namespace below is a placeholder; the controller overwrites it with the
# operator namespace before applying the object.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: nvidia-dcgm-exporter-read-pods
labels:
app: nvidia-dcgm-exporter
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: nvidia-dcgm-exporter-read-pods
subjects:
- kind: ServiceAccount
name: nvidia-dcgm-exporter
namespace: "FILLED BY THE OPERATOR"
1 change: 1 addition & 0 deletions assets/state-dcgm-exporter/0800_daemonset.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ spec:
effect: NoSchedule
priorityClassName: system-node-critical
serviceAccountName: nvidia-dcgm-exporter
automountServiceAccountToken: false
initContainers:
- name: toolkit-validation
image: "FILLED BY THE OPERATOR"
Expand Down
25 changes: 25 additions & 0 deletions bundle/manifests/nvidia.com_clusterpolicies.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -553,6 +553,14 @@ spec:
dcgmExporter:
description: DCGMExporter spec
properties:
annotations:
additionalProperties:
type: string
description: |-
Optional: Annotations is an unstructured key value map stored with a resource that may be
set by external tools to store and retrieve arbitrary metadata. They are not
queryable and should be preserved when modifying objects.
type: object
args:
description: 'Optional: List of arguments'
items:
Expand All @@ -567,6 +575,16 @@ spec:
metrics to be collected by NVIDIA DCGM Exporter
type: string
type: object
enablePodLabels:
description: |-
Enable Kubernetes pod labels as Prometheus label dimensions in DCGM exporter metrics.
(Requires cluster-level read access to pods.)
type: boolean
enablePodUID:
description: |-
Enable Kubernetes pod UID as a Prometheus label dimension in DCGM exporter metrics.
(Requires cluster-level read access to pods.)
type: boolean
enabled:
description: Enabled indicates if deployment of NVIDIA DCGM Exporter
through operator is enabled
Expand Down Expand Up @@ -620,6 +638,13 @@ spec:
items:
type: string
type: array
podLabelAllowlistRegex:
description: |-
Regex list for filtering which Kubernetes pod labels are included in DCGM exporter metrics.
Empty means all pod labels are included.
items:
type: string
type: array
repository:
description: NVIDIA DCGM Exporter image repository
type: string
Expand Down
25 changes: 25 additions & 0 deletions config/crd/bases/nvidia.com_clusterpolicies.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -553,6 +553,14 @@ spec:
dcgmExporter:
description: DCGMExporter spec
properties:
annotations:
additionalProperties:
type: string
description: |-
Optional: Annotations is an unstructured key value map stored with a resource that may be
set by external tools to store and retrieve arbitrary metadata. They are not
queryable and should be preserved when modifying objects.
type: object
args:
description: 'Optional: List of arguments'
items:
Expand All @@ -567,6 +575,16 @@ spec:
metrics to be collected by NVIDIA DCGM Exporter
type: string
type: object
enablePodLabels:
description: |-
Enable Kubernetes pod labels as Prometheus label dimensions in DCGM exporter metrics.
(Requires cluster-level read access to pods.)
type: boolean
enablePodUID:
description: |-
Enable Kubernetes pod UID as a Prometheus label dimension in DCGM exporter metrics.
(Requires cluster-level read access to pods.)
type: boolean
enabled:
description: Enabled indicates if deployment of NVIDIA DCGM Exporter
through operator is enabled
Expand Down Expand Up @@ -620,6 +638,13 @@ spec:
items:
type: string
type: array
podLabelAllowlistRegex:
description: |-
Regex list for filtering which Kubernetes pod labels are included in DCGM exporter metrics.
Empty means all pod labels are included.
items:
type: string
type: array
repository:
description: NVIDIA DCGM Exporter image repository
type: string
Expand Down
67 changes: 67 additions & 0 deletions controllers/object_controls.go
Original file line number Diff line number Diff line change
Expand Up @@ -445,6 +445,20 @@ func RoleBinding(n ClusterPolicyController) (gpuv1.State, error) {
return gpuv1.Ready, nil
}

// rbacGates associates the name of an RBAC object with a predicate that
// decides, from the ClusterPolicy spec, whether that object should exist.
var rbacGates = map[string]func(*gpuv1.ClusterPolicySpec) bool{
	// Pod read access is only needed when the exporter enriches metrics
	// with Kubernetes pod metadata.
	"nvidia-dcgm-exporter-read-pods": func(spec *gpuv1.ClusterPolicySpec) bool {
		wanted := spec.DCGMExporter.IsKubernetesPodMetadataEnabled()
		return wanted
	},
}

// isRBACEnabled reports whether the RBAC object called name should be present
// for the given ClusterPolicy spec. Objects with no entry in rbacGates are
// always considered enabled; gated objects defer to their predicate.
func isRBACEnabled(name string, config *gpuv1.ClusterPolicySpec) bool {
	if enabled, gated := rbacGates[name]; gated {
		return enabled(config)
	}
	// No gate registered for this name: nothing restricts it.
	return true
}

// ClusterRole creates ClusterRole resource
func ClusterRole(n ClusterPolicyController) (gpuv1.State, error) {
ctx := n.ctx
Expand All @@ -464,6 +478,15 @@ func ClusterRole(n ClusterPolicyController) (gpuv1.State, error) {
return gpuv1.Disabled, nil
}

if !isRBACEnabled(obj.Name, &n.singleton.Spec) {
err := n.client.Delete(ctx, obj)
if err != nil && !apierrors.IsNotFound(err) {
logger.Error(err, "Couldn't delete")
return gpuv1.NotReady, err
}
return gpuv1.Disabled, nil
}

if err := controllerutil.SetControllerReference(n.singleton, obj, n.scheme); err != nil {
return gpuv1.NotReady, err
}
Expand Down Expand Up @@ -505,6 +528,15 @@ func ClusterRoleBinding(n ClusterPolicyController) (gpuv1.State, error) {
return gpuv1.Disabled, nil
}

if !isRBACEnabled(obj.Name, &n.singleton.Spec) {
err := n.client.Delete(ctx, obj)
if err != nil && !apierrors.IsNotFound(err) {
logger.Error(err, "Couldn't delete")
return gpuv1.NotReady, err
}
return gpuv1.Disabled, nil
}

for idx := range obj.Subjects {
obj.Subjects[idx].Namespace = n.operatorNamespace
}
Expand Down Expand Up @@ -1755,6 +1787,12 @@ func TransformDCGMExporter(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpe
if len(config.DCGMExporter.ImagePullSecrets) > 0 {
addPullSecrets(&obj.Spec.Template.Spec, config.DCGMExporter.ImagePullSecrets)
}

// merge extra annotations at the pod template level
if len(config.DCGMExporter.Annotations) > 0 {
addExtraAnnotations(obj, config.DCGMExporter.Annotations)
}

// set resource limits
if config.DCGMExporter.Resources != nil {
// apply resource limits to all containers
Expand Down Expand Up @@ -1820,6 +1858,26 @@ func TransformDCGMExporter(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpe
obj.Spec.Template.Spec.Volumes = append(obj.Spec.Template.Spec.Volumes, jobMappingVol)
}

// Inject pod-metadata enrichment env vars; RBAC is provisioned via the
// 0210/0310 assets and the SA token is mounted below.
if config.DCGMExporter.IsPodLabelsEnabled() {
setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), "DCGM_EXPORTER_KUBERNETES_ENABLE_POD_LABELS", "true")
}
if config.DCGMExporter.IsPodUIDEnabled() {
setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), "DCGM_EXPORTER_KUBERNETES_ENABLE_POD_UID", "true")
}
if len(config.DCGMExporter.PodLabelAllowlistRegex) > 0 {
setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]),
"DCGM_EXPORTER_KUBERNETES_POD_LABEL_ALLOWLIST_REGEX",
strings.Join(config.DCGMExporter.PodLabelAllowlistRegex, ","))
}

// Override the base asset's automountServiceAccountToken=false when
// enrichment is on so the pod informer has client-go credentials.
if config.DCGMExporter.IsKubernetesPodMetadataEnabled() {
obj.Spec.Template.Spec.AutomountServiceAccountToken = ptr.To(true)
}

// mount configmap for custom metrics if provided by user
if config.DCGMExporter.MetricsConfig != nil && config.DCGMExporter.MetricsConfig.Name != "" {
metricsConfigVolMount := corev1.VolumeMount{Name: "metrics-config", ReadOnly: true, MountPath: MetricsConfigMountPath, SubPath: MetricsConfigFileName}
Expand Down Expand Up @@ -1851,6 +1909,15 @@ func TransformDCGMExporter(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpe
return nil
}

// addExtraAnnotations merges annotations into the DaemonSet's pod template
// annotations. The template's annotation map is created if it is nil, and a
// key that already exists on the template is overwritten by the supplied value.
func addExtraAnnotations(obj *appsv1.DaemonSet, annotations map[string]string) {
	podTemplate := &obj.Spec.Template
	if podTemplate.Annotations == nil {
		// Writing to a nil map panics, so allocate before merging.
		podTemplate.Annotations = make(map[string]string)
	}
	for key := range annotations {
		podTemplate.Annotations[key] = annotations[key]
	}
}

// TransformDCGM transforms dcgm daemonset with required config as per ClusterPolicy
func TransformDCGM(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, n ClusterPolicyController) error {
// update validation container
Expand Down
Loading
Loading